tree-sitter-0.20.1/.cargo_vcs_info.json0000644000000001120000000000100133550ustar { "git": { "sha1": "062421dece3315bd6f228ad6d468cba083d0a2d5" } } tree-sitter-0.20.1/Cargo.toml0000644000000023740000000000100113670ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "tree-sitter" version = "0.20.1" authors = ["Max Brunsfeld "] build = "binding_rust/build.rs" include = ["/binding_rust/*", "/Cargo.toml", "/include/*", "/src/*.h", "/src/*.c", "/src/unicode/*"] description = "Rust bindings to the Tree-sitter parsing library" readme = "binding_rust/README.md" keywords = ["incremental", "parsing"] categories = ["api-bindings", "parsing", "text-editors"] license = "MIT" repository = "https://github.com/tree-sitter/tree-sitter" [lib] path = "binding_rust/lib.rs" [dependencies.lazy_static] version = "1.2.0" optional = true [dependencies.regex] version = "1" [dependencies.spin] version = "0.7" optional = true [build-dependencies.cc] version = "^1.0.58" [features] allocation-tracking = ["lazy_static", "spin"] tree-sitter-0.20.1/Cargo.toml.orig000064400000000000000000000016620072674642500150770ustar 00000000000000[package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" version = "0.20.1" authors = ["Max Brunsfeld "] edition = "2018" license = "MIT" readme = "binding_rust/README.md" keywords = ["incremental", "parsing"] categories = ["api-bindings", "parsing", "text-editors"] repository = "https://github.com/tree-sitter/tree-sitter" build = "binding_rust/build.rs" include = [ "/binding_rust/*", 
"/Cargo.toml", "/include/*", "/src/*.h", "/src/*.c", "/src/unicode/*", ] [dependencies] lazy_static = { version="1.2.0", optional=true } regex = "1" spin = { version="0.7", optional=true } [build-dependencies] cc = "^1.0.58" [lib] path = "binding_rust/lib.rs" # This feature is only useful for testing the Tree-sitter library itself. # It is exposed because all of Tree-sitter's tests live in the Tree-sitter CLI crate. [features] allocation-tracking = ["lazy_static", "spin"] tree-sitter-0.20.1/binding_rust/README.md000064400000000000000000000066630072674642500161640ustar 00000000000000# Rust Tree-sitter [![Build Status](https://travis-ci.org/tree-sitter/tree-sitter.svg?branch=master)](https://travis-ci.org/tree-sitter/tree-sitter) [![Build status](https://ci.appveyor.com/api/projects/status/vtmbd6i92e97l55w/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/tree-sitter/branch/master) [![Crates.io](https://img.shields.io/crates/v/tree-sitter.svg)](https://crates.io/crates/tree-sitter) Rust bindings to the [Tree-sitter][] parsing library. ### Basic Usage First, create a parser: ```rust use tree_sitter::{Parser, Language}; let mut parser = Parser::new(); ``` Tree-sitter languages consist of generated C code. To make sure they're properly compiled and linked, you can create a [build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) like the following (assuming `tree-sitter-javascript` is in your root directory): ```rust use std::path::PathBuf; fn main() { let dir: PathBuf = ["tree-sitter-javascript", "src"].iter().collect(); cc::Build::new() .include(&dir) .file(dir.join("parser.c")) .file(dir.join("scanner.c")) .compile("tree-sitter-javascript"); } ``` Add the `cc` crate to your `Cargo.toml` under `[build-dependencies]`: ```toml [build-dependencies] cc="*" ``` To then use languages from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`. Then you can assign them to the parser. 
```rust extern "C" { fn tree_sitter_c() -> Language; } extern "C" { fn tree_sitter_rust() -> Language; } extern "C" { fn tree_sitter_javascript() -> Language; } let language = unsafe { tree_sitter_rust() }; parser.set_language(language).unwrap(); ``` Now you can parse source code: ```rust let source_code = "fn test() {}"; let mut tree = parser.parse(source_code, None).unwrap(); let root_node = tree.root_node(); assert_eq!(root_node.kind(), "source_file"); assert_eq!(root_node.start_position().column, 0); assert_eq!(root_node.end_position().column, 12); ``` ### Editing Once you have a syntax tree, you can update it when your source code changes. Passing in the previous edited tree makes `parse` run much more quickly: ```rust let new_source_code = "fn test(a: u32) {}"; tree.edit(&InputEdit { start_byte: 8, old_end_byte: 8, new_end_byte: 14, start_position: Point::new(0, 8), old_end_position: Point::new(0, 8), new_end_position: Point::new(0, 14), }); let new_tree = parser.parse(new_source_code, Some(&tree)); ``` ### Text Input The source code to parse can be provided either as a string, a slice, a vector, or as a function that returns a slice. The text can be encoded as either UTF8 or UTF16: ```rust // Store some source code in an array of lines. let lines = &[ "pub fn foo() {", " 1", "}", ]; // Parse the source code using a custom callback. The callback is called // with both a byte offset and a row/column offset. let tree = parser.parse_with(&mut |_byte: u32, position: Point| -> &[u8] { let row = position.row as usize; let column = position.column as usize; if row < lines.len() { if column < lines[row].as_bytes().len() { &lines[row].as_bytes()[column..] 
} else { "\n".as_bytes() } } else { &[] } }, None).unwrap(); assert_eq!( tree.root_node().to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))" ); ``` [tree-sitter]: https://github.com/tree-sitter/tree-sitter tree-sitter-0.20.1/binding_rust/allocations.rs000064400000000000000000000063010072674642500175500ustar 00000000000000use spin::Mutex; use std::{ collections::HashMap, os::raw::{c_ulong, c_void}, }; #[derive(Debug, PartialEq, Eq, Hash)] struct Allocation(*const c_void); unsafe impl Send for Allocation {} unsafe impl Sync for Allocation {} #[derive(Default)] struct AllocationRecorder { enabled: bool, allocation_count: u64, outstanding_allocations: HashMap, } thread_local! { static RECORDER: Mutex = Default::default(); } extern "C" { fn malloc(size: c_ulong) -> *mut c_void; fn calloc(count: c_ulong, size: c_ulong) -> *mut c_void; fn realloc(ptr: *mut c_void, size: c_ulong) -> *mut c_void; fn free(ptr: *mut c_void); } pub fn record(f: impl FnOnce() -> T) -> T { RECORDER.with(|recorder| { let mut recorder = recorder.lock(); recorder.enabled = true; recorder.allocation_count = 0; recorder.outstanding_allocations.clear(); }); let value = f(); let outstanding_allocation_indices = RECORDER.with(|recorder| { let mut recorder = recorder.lock(); recorder.enabled = false; recorder.allocation_count = 0; recorder .outstanding_allocations .drain() .map(|e| e.1) .collect::>() }); if !outstanding_allocation_indices.is_empty() { panic!( "Leaked allocation indices: {:?}", outstanding_allocation_indices ); } value } fn record_alloc(ptr: *mut c_void) { RECORDER.with(|recorder| { let mut recorder = recorder.lock(); if recorder.enabled { let count = recorder.allocation_count; recorder.allocation_count += 1; recorder .outstanding_allocations .insert(Allocation(ptr), count); } }); } fn record_dealloc(ptr: *mut c_void) { RECORDER.with(|recorder| { let mut recorder = recorder.lock(); if recorder.enabled { 
recorder.outstanding_allocations.remove(&Allocation(ptr)); } }); } #[no_mangle] pub extern "C" fn ts_record_malloc(size: c_ulong) -> *const c_void { let result = unsafe { malloc(size) }; record_alloc(result); result } #[no_mangle] pub extern "C" fn ts_record_calloc(count: c_ulong, size: c_ulong) -> *const c_void { let result = unsafe { calloc(count, size) }; record_alloc(result); result } #[no_mangle] pub extern "C" fn ts_record_realloc(ptr: *mut c_void, size: c_ulong) -> *const c_void { record_dealloc(ptr); let result = unsafe { realloc(ptr, size) }; record_alloc(result); result } // This needs to be unsafe because it's reexported as crate::util::free_ptr, which is mapped to // libc's `free` function when the allocation-tracking feature is disabled. Since `free` is // unsafe, this function needs to be too. #[no_mangle] pub unsafe extern "C" fn ts_record_free(ptr: *mut c_void) { record_dealloc(ptr); free(ptr); } #[no_mangle] pub extern "C" fn ts_toggle_allocation_recording(enabled: bool) -> bool { RECORDER.with(|recorder| { let mut recorder = recorder.lock(); let was_enabled = recorder.enabled; recorder.enabled = enabled; was_enabled }) } tree-sitter-0.20.1/binding_rust/bindings.rs000064400000000000000000001057550072674642500170520ustar 00000000000000/* automatically generated by rust-bindgen 0.59.1 */ pub type __darwin_size_t = ::std::os::raw::c_ulong; pub type FILE = [u64; 19usize]; pub type TSSymbol = u16; pub type TSFieldId = u16; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSLanguage { _unused: [u8; 0], } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSParser { _unused: [u8; 0], } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSTree { _unused: [u8; 0], } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSQuery { _unused: [u8; 0], } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSQueryCursor { _unused: [u8; 0], } pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0; pub const TSInputEncoding_TSInputEncodingUTF16: 
TSInputEncoding = 1; pub type TSInputEncoding = u32; pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0; pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1; pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2; pub type TSSymbolType = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSPoint { pub row: u32, pub column: u32, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSRange { pub start_point: TSPoint, pub end_point: TSPoint, pub start_byte: u32, pub end_byte: u32, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSInput { pub payload: *mut ::std::os::raw::c_void, pub read: ::std::option::Option< unsafe extern "C" fn( payload: *mut ::std::os::raw::c_void, byte_index: u32, position: TSPoint, bytes_read: *mut u32, ) -> *const ::std::os::raw::c_char, >, pub encoding: TSInputEncoding, } pub const TSLogType_TSLogTypeParse: TSLogType = 0; pub const TSLogType_TSLogTypeLex: TSLogType = 1; pub type TSLogType = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSLogger { pub payload: *mut ::std::os::raw::c_void, pub log: ::std::option::Option< unsafe extern "C" fn( payload: *mut ::std::os::raw::c_void, arg1: TSLogType, arg2: *const ::std::os::raw::c_char, ), >, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSInputEdit { pub start_byte: u32, pub old_end_byte: u32, pub new_end_byte: u32, pub start_point: TSPoint, pub old_end_point: TSPoint, pub new_end_point: TSPoint, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSNode { pub context: [u32; 4usize], pub id: *const ::std::os::raw::c_void, pub tree: *const TSTree, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSTreeCursor { pub tree: *const ::std::os::raw::c_void, pub id: *const ::std::os::raw::c_void, pub context: [u32; 2usize], } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSQueryCapture { pub node: TSNode, pub index: u32, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSQueryMatch { pub id: u32, pub pattern_index: u16, pub 
capture_count: u16, pub captures: *const TSQueryCapture, } pub const TSQueryPredicateStepType_TSQueryPredicateStepTypeDone: TSQueryPredicateStepType = 0; pub const TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture: TSQueryPredicateStepType = 1; pub const TSQueryPredicateStepType_TSQueryPredicateStepTypeString: TSQueryPredicateStepType = 2; pub type TSQueryPredicateStepType = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct TSQueryPredicateStep { pub type_: TSQueryPredicateStepType, pub value_id: u32, } pub const TSQueryError_TSQueryErrorNone: TSQueryError = 0; pub const TSQueryError_TSQueryErrorSyntax: TSQueryError = 1; pub const TSQueryError_TSQueryErrorNodeType: TSQueryError = 2; pub const TSQueryError_TSQueryErrorField: TSQueryError = 3; pub const TSQueryError_TSQueryErrorCapture: TSQueryError = 4; pub const TSQueryError_TSQueryErrorStructure: TSQueryError = 5; pub const TSQueryError_TSQueryErrorLanguage: TSQueryError = 6; pub type TSQueryError = u32; extern "C" { #[doc = " Create a new parser."] pub fn ts_parser_new() -> *mut TSParser; } extern "C" { #[doc = " Delete the parser, freeing all of the memory that it used."] pub fn ts_parser_delete(parser: *mut TSParser); } extern "C" { #[doc = " Set the language that the parser should use for parsing."] #[doc = ""] #[doc = " Returns a boolean indicating whether or not the language was successfully"] #[doc = " assigned. True means assignment succeeded. False means there was a version"] #[doc = " mismatch: the language was generated with an incompatible version of the"] #[doc = " Tree-sitter CLI. 
Check the language's version using `ts_language_version`"] #[doc = " and compare it to this library's `TREE_SITTER_LANGUAGE_VERSION` and"] #[doc = " `TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION` constants."] pub fn ts_parser_set_language(self_: *mut TSParser, language: *const TSLanguage) -> bool; } extern "C" { #[doc = " Get the parser's current language."] pub fn ts_parser_language(self_: *const TSParser) -> *const TSLanguage; } extern "C" { #[doc = " Set the ranges of text that the parser should include when parsing."] #[doc = ""] #[doc = " By default, the parser will always include entire documents. This function"] #[doc = " allows you to parse only a *portion* of a document but still return a syntax"] #[doc = " tree whose ranges match up with the document as a whole. You can also pass"] #[doc = " multiple disjoint ranges."] #[doc = ""] #[doc = " The second and third parameters specify the location and length of an array"] #[doc = " of ranges. The parser does *not* take ownership of these ranges; it copies"] #[doc = " the data, so it doesn't matter how these ranges are allocated."] #[doc = ""] #[doc = " If `length` is zero, then the entire document will be parsed. Otherwise,"] #[doc = " the given ranges must be ordered from earliest to latest in the document,"] #[doc = " and they must not overlap. That is, the following must hold for all"] #[doc = " `i` < `length - 1`: ranges[i].end_byte <= ranges[i + 1].start_byte"] #[doc = ""] #[doc = " If this requirement is not satisfied, the operation will fail, the ranges"] #[doc = " will not be assigned, and this function will return `false`. On success,"] #[doc = " this function returns `true`"] pub fn ts_parser_set_included_ranges( self_: *mut TSParser, ranges: *const TSRange, length: u32, ) -> bool; } extern "C" { #[doc = " Get the ranges of text that the parser will include when parsing."] #[doc = ""] #[doc = " The returned pointer is owned by the parser. The caller should not free it"] #[doc = " or write to it. 
The length of the array will be written to the given"] #[doc = " `length` pointer."] pub fn ts_parser_included_ranges(self_: *const TSParser, length: *mut u32) -> *const TSRange; } extern "C" { #[doc = " Use the parser to parse some source code and create a syntax tree."] #[doc = ""] #[doc = " If you are parsing this document for the first time, pass `NULL` for the"] #[doc = " `old_tree` parameter. Otherwise, if you have already parsed an earlier"] #[doc = " version of this document and the document has since been edited, pass the"] #[doc = " previous syntax tree so that the unchanged parts of it can be reused."] #[doc = " This will save time and memory. For this to work correctly, you must have"] #[doc = " already edited the old syntax tree using the `ts_tree_edit` function in a"] #[doc = " way that exactly matches the source code changes."] #[doc = ""] #[doc = " The `TSInput` parameter lets you specify how to read the text. It has the"] #[doc = " following three fields:"] #[doc = " 1. `read`: A function to retrieve a chunk of text at a given byte offset"] #[doc = " and (row, column) position. The function should return a pointer to the"] #[doc = " text and write its length to the `bytes_read` pointer. The parser does"] #[doc = " not take ownership of this buffer; it just borrows it until it has"] #[doc = " finished reading it. The function should write a zero value to the"] #[doc = " `bytes_read` pointer to indicate the end of the document."] #[doc = " 2. `payload`: An arbitrary pointer that will be passed to each invocation"] #[doc = " of the `read` function."] #[doc = " 3. `encoding`: An indication of how the text is encoded. Either"] #[doc = " `TSInputEncodingUTF8` or `TSInputEncodingUTF16`."] #[doc = ""] #[doc = " This function returns a syntax tree on success, and `NULL` on failure. There"] #[doc = " are three possible reasons for failure:"] #[doc = " 1. The parser does not have a language assigned. 
Check for this using the"] #[doc = "`ts_parser_language` function."] #[doc = " 2. Parsing was cancelled due to a timeout that was set by an earlier call to"] #[doc = " the `ts_parser_set_timeout_micros` function. You can resume parsing from"] #[doc = " where the parser left out by calling `ts_parser_parse` again with the"] #[doc = " same arguments. Or you can start parsing from scratch by first calling"] #[doc = " `ts_parser_reset`."] #[doc = " 3. Parsing was cancelled using a cancellation flag that was set by an"] #[doc = " earlier call to `ts_parser_set_cancellation_flag`. You can resume parsing"] #[doc = " from where the parser left out by calling `ts_parser_parse` again with"] #[doc = " the same arguments."] pub fn ts_parser_parse( self_: *mut TSParser, old_tree: *const TSTree, input: TSInput, ) -> *mut TSTree; } extern "C" { #[doc = " Use the parser to parse some source code stored in one contiguous buffer."] #[doc = " The first two parameters are the same as in the `ts_parser_parse` function"] #[doc = " above. The second two parameters indicate the location of the buffer and its"] #[doc = " length in bytes."] pub fn ts_parser_parse_string( self_: *mut TSParser, old_tree: *const TSTree, string: *const ::std::os::raw::c_char, length: u32, ) -> *mut TSTree; } extern "C" { #[doc = " Use the parser to parse some source code stored in one contiguous buffer with"] #[doc = " a given encoding. The first four parameters work the same as in the"] #[doc = " `ts_parser_parse_string` method above. 
The final parameter indicates whether"] #[doc = " the text is encoded as UTF8 or UTF16."] pub fn ts_parser_parse_string_encoding( self_: *mut TSParser, old_tree: *const TSTree, string: *const ::std::os::raw::c_char, length: u32, encoding: TSInputEncoding, ) -> *mut TSTree; } extern "C" { #[doc = " Instruct the parser to start the next parse from the beginning."] #[doc = ""] #[doc = " If the parser previously failed because of a timeout or a cancellation, then"] #[doc = " by default, it will resume where it left off on the next call to"] #[doc = " `ts_parser_parse` or other parsing functions. If you don't want to resume,"] #[doc = " and instead intend to use this parser to parse some other document, you must"] #[doc = " call `ts_parser_reset` first."] pub fn ts_parser_reset(self_: *mut TSParser); } extern "C" { #[doc = " Set the maximum duration in microseconds that parsing should be allowed to"] #[doc = " take before halting."] #[doc = ""] #[doc = " If parsing takes longer than this, it will halt early, returning NULL."] #[doc = " See `ts_parser_parse` for more information."] pub fn ts_parser_set_timeout_micros(self_: *mut TSParser, timeout: u64); } extern "C" { #[doc = " Get the duration in microseconds that parsing is allowed to take."] pub fn ts_parser_timeout_micros(self_: *const TSParser) -> u64; } extern "C" { #[doc = " Set the parser's current cancellation flag pointer."] #[doc = ""] #[doc = " If a non-null pointer is assigned, then the parser will periodically read"] #[doc = " from this pointer during parsing. If it reads a non-zero value, it will"] #[doc = " halt early, returning NULL. 
See `ts_parser_parse` for more information."] pub fn ts_parser_set_cancellation_flag(self_: *mut TSParser, flag: *const usize); } extern "C" { #[doc = " Get the parser's current cancellation flag pointer."] pub fn ts_parser_cancellation_flag(self_: *const TSParser) -> *const usize; } extern "C" { #[doc = " Set the logger that a parser should use during parsing."] #[doc = ""] #[doc = " The parser does not take ownership over the logger payload. If a logger was"] #[doc = " previously assigned, the caller is responsible for releasing any memory"] #[doc = " owned by the previous logger."] pub fn ts_parser_set_logger(self_: *mut TSParser, logger: TSLogger); } extern "C" { #[doc = " Get the parser's current logger."] pub fn ts_parser_logger(self_: *const TSParser) -> TSLogger; } extern "C" { #[doc = " Set the file descriptor to which the parser should write debugging graphs"] #[doc = " during parsing. The graphs are formatted in the DOT language. You may want"] #[doc = " to pipe these graphs directly to a `dot(1)` process in order to generate"] #[doc = " SVG output. You can turn off this logging by passing a negative number."] pub fn ts_parser_print_dot_graphs(self_: *mut TSParser, file: ::std::os::raw::c_int); } extern "C" { #[doc = " Create a shallow copy of the syntax tree. 
This is very fast."] #[doc = ""] #[doc = " You need to copy a syntax tree in order to use it on more than one thread at"] #[doc = " a time, as syntax trees are not thread safe."] pub fn ts_tree_copy(self_: *const TSTree) -> *mut TSTree; } extern "C" { #[doc = " Delete the syntax tree, freeing all of the memory that it used."] pub fn ts_tree_delete(self_: *mut TSTree); } extern "C" { #[doc = " Get the root node of the syntax tree."] pub fn ts_tree_root_node(self_: *const TSTree) -> TSNode; } extern "C" { #[doc = " Get the language that was used to parse the syntax tree."] pub fn ts_tree_language(arg1: *const TSTree) -> *const TSLanguage; } extern "C" { #[doc = " Edit the syntax tree to keep it in sync with source code that has been"] #[doc = " edited."] #[doc = ""] #[doc = " You must describe the edit both in terms of byte offsets and in terms of"] #[doc = " (row, column) coordinates."] pub fn ts_tree_edit(self_: *mut TSTree, edit: *const TSInputEdit); } extern "C" { #[doc = " Compare an old edited syntax tree to a new syntax tree representing the same"] #[doc = " document, returning an array of ranges whose syntactic structure has changed."] #[doc = ""] #[doc = " For this to work correctly, the old syntax tree must have been edited such"] #[doc = " that its ranges match up to the new tree. Generally, you'll want to call"] #[doc = " this function right after calling one of the `ts_parser_parse` functions."] #[doc = " You need to pass the old tree that was passed to parse, as well as the new"] #[doc = " tree that was returned from that function."] #[doc = ""] #[doc = " The returned array is allocated using `malloc` and the caller is responsible"] #[doc = " for freeing it using `free`. 
The length of the array will be written to the"] #[doc = " given `length` pointer."] pub fn ts_tree_get_changed_ranges( old_tree: *const TSTree, new_tree: *const TSTree, length: *mut u32, ) -> *mut TSRange; } extern "C" { #[doc = " Write a DOT graph describing the syntax tree to the given file."] pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); } extern "C" { #[doc = " Get the node's type as a null-terminated string."] pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; } extern "C" { #[doc = " Get the node's type as a numerical id."] pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; } extern "C" { #[doc = " Get the node's start byte."] pub fn ts_node_start_byte(arg1: TSNode) -> u32; } extern "C" { #[doc = " Get the node's start position in terms of rows and columns."] pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; } extern "C" { #[doc = " Get the node's end byte."] pub fn ts_node_end_byte(arg1: TSNode) -> u32; } extern "C" { #[doc = " Get the node's end position in terms of rows and columns."] pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; } extern "C" { #[doc = " Get an S-expression representing the node as a string."] #[doc = ""] #[doc = " This string is allocated with `malloc` and the caller is responsible for"] #[doc = " freeing it using `free`."] pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; } extern "C" { #[doc = " Check if the node is null. Functions like `ts_node_child` and"] #[doc = " `ts_node_next_sibling` will return a null node to indicate that no such node"] #[doc = " was found."] pub fn ts_node_is_null(arg1: TSNode) -> bool; } extern "C" { #[doc = " Check if the node is *named*. Named nodes correspond to named rules in the"] #[doc = " grammar, whereas *anonymous* nodes correspond to string literals in the"] #[doc = " grammar."] pub fn ts_node_is_named(arg1: TSNode) -> bool; } extern "C" { #[doc = " Check if the node is *missing*. 
Missing nodes are inserted by the parser in"] #[doc = " order to recover from certain kinds of syntax errors."] pub fn ts_node_is_missing(arg1: TSNode) -> bool; } extern "C" { #[doc = " Check if the node is *extra*. Extra nodes represent things like comments,"] #[doc = " which are not required the grammar, but can appear anywhere."] pub fn ts_node_is_extra(arg1: TSNode) -> bool; } extern "C" { #[doc = " Check if a syntax node has been edited."] pub fn ts_node_has_changes(arg1: TSNode) -> bool; } extern "C" { #[doc = " Check if the node is a syntax error or contains any syntax errors."] pub fn ts_node_has_error(arg1: TSNode) -> bool; } extern "C" { #[doc = " Get the node's immediate parent."] pub fn ts_node_parent(arg1: TSNode) -> TSNode; } extern "C" { #[doc = " Get the node's child at the given index, where zero represents the first"] #[doc = " child."] pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { #[doc = " Get the field name for node's child at the given index, where zero represents"] #[doc = " the first child. 
Returns NULL, if no field is found."] pub fn ts_node_field_name_for_child(arg1: TSNode, arg2: u32) -> *const ::std::os::raw::c_char; } extern "C" { #[doc = " Get the node's number of children."] pub fn ts_node_child_count(arg1: TSNode) -> u32; } extern "C" { #[doc = " Get the node's *named* child at the given index."] #[doc = ""] #[doc = " See also `ts_node_is_named`."] pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { #[doc = " Get the node's number of *named* children."] #[doc = ""] #[doc = " See also `ts_node_is_named`."] pub fn ts_node_named_child_count(arg1: TSNode) -> u32; } extern "C" { #[doc = " Get the node's child with the given field name."] pub fn ts_node_child_by_field_name( self_: TSNode, field_name: *const ::std::os::raw::c_char, field_name_length: u32, ) -> TSNode; } extern "C" { #[doc = " Get the node's child with the given numerical field id."] #[doc = ""] #[doc = " You can convert a field name to an id using the"] #[doc = " `ts_language_field_id_for_name` function."] pub fn ts_node_child_by_field_id(arg1: TSNode, arg2: TSFieldId) -> TSNode; } extern "C" { #[doc = " Get the node's next / previous sibling."] pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; } extern "C" { pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; } extern "C" { #[doc = " Get the node's next / previous *named* sibling."] pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; } extern "C" { pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; } extern "C" { #[doc = " Get the node's first child that extends beyond the given byte offset."] pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { #[doc = " Get the node's first named child that extends beyond the given byte offset."] pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; } extern "C" { #[doc = " Get the smallest node within this node that spans the given range of bytes"] #[doc = " or (row, column) positions."] pub fn 
ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; } extern "C" { pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) -> TSNode; } extern "C" { #[doc = " Get the smallest named node within this node that spans the given range of"] #[doc = " bytes or (row, column) positions."] pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; } extern "C" { pub fn ts_node_named_descendant_for_point_range( arg1: TSNode, arg2: TSPoint, arg3: TSPoint, ) -> TSNode; } extern "C" { #[doc = " Edit the node to keep it in-sync with source code that has been edited."] #[doc = ""] #[doc = " This function is only rarely needed. When you edit a syntax tree with the"] #[doc = " `ts_tree_edit` function, all of the nodes that you retrieve from the tree"] #[doc = " afterward will already reflect the edit. You only need to use `ts_node_edit`"] #[doc = " when you have a `TSNode` instance that you want to keep and continue to use"] #[doc = " after an edit."] pub fn ts_node_edit(arg1: *mut TSNode, arg2: *const TSInputEdit); } extern "C" { #[doc = " Check if two nodes are identical."] pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; } extern "C" { #[doc = " Create a new tree cursor starting from the given node."] #[doc = ""] #[doc = " A tree cursor allows you to walk a syntax tree more efficiently than is"] #[doc = " possible using the `TSNode` functions. 
It is a mutable object that is always"] #[doc = " on a certain syntax node, and can be moved imperatively to different nodes."] pub fn ts_tree_cursor_new(arg1: TSNode) -> TSTreeCursor; } extern "C" { #[doc = " Delete a tree cursor, freeing all of the memory that it used."] pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); } extern "C" { #[doc = " Re-initialize a tree cursor to start at a different node."] pub fn ts_tree_cursor_reset(arg1: *mut TSTreeCursor, arg2: TSNode); } extern "C" { #[doc = " Get the tree cursor's current node."] pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; } extern "C" { #[doc = " Get the field name of the tree cursor's current node."] #[doc = ""] #[doc = " This returns `NULL` if the current node doesn't have a field."] #[doc = " See also `ts_node_child_by_field_name`."] pub fn ts_tree_cursor_current_field_name( arg1: *const TSTreeCursor, ) -> *const ::std::os::raw::c_char; } extern "C" { #[doc = " Get the field id of the tree cursor's current node."] #[doc = ""] #[doc = " This returns zero if the current node doesn't have a field."] #[doc = " See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`."] pub fn ts_tree_cursor_current_field_id(arg1: *const TSTreeCursor) -> TSFieldId; } extern "C" { #[doc = " Move the cursor to the parent of its current node."] #[doc = ""] #[doc = " This returns `true` if the cursor successfully moved, and returns `false`"] #[doc = " if there was no parent node (the cursor was already on the root node)."] pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; } extern "C" { #[doc = " Move the cursor to the next sibling of its current node."] #[doc = ""] #[doc = " This returns `true` if the cursor successfully moved, and returns `false`"] #[doc = " if there was no next sibling node."] pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; } extern "C" { #[doc = " Move the cursor to the first child of its current node."] #[doc = ""] #[doc 
= " This returns `true` if the cursor successfully moved, and returns `false`"] #[doc = " if there were no children."] pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; } extern "C" { #[doc = " Move the cursor to the first child of its current node that extends beyond"] #[doc = " the given byte offset or point."] #[doc = ""] #[doc = " This returns the index of the child node if one was found, and returns -1"] #[doc = " if no such child was found."] pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; } extern "C" { pub fn ts_tree_cursor_goto_first_child_for_point(arg1: *mut TSTreeCursor, arg2: TSPoint) -> i64; } extern "C" { pub fn ts_tree_cursor_copy(arg1: *const TSTreeCursor) -> TSTreeCursor; } extern "C" { #[doc = " Create a new query from a string containing one or more S-expression"] #[doc = " patterns. The query is associated with a particular language, and can"] #[doc = " only be run on syntax nodes parsed with that language."] #[doc = ""] #[doc = " If all of the given patterns are valid, this returns a `TSQuery`."] #[doc = " If a pattern is invalid, this returns `NULL`, and provides two pieces"] #[doc = " of information about the problem:"] #[doc = " 1. The byte offset of the error is written to the `error_offset` parameter."] #[doc = " 2. 
The type of error is written to the `error_type` parameter."] pub fn ts_query_new( language: *const TSLanguage, source: *const ::std::os::raw::c_char, source_len: u32, error_offset: *mut u32, error_type: *mut TSQueryError, ) -> *mut TSQuery; } extern "C" { #[doc = " Delete a query, freeing all of the memory that it used."] pub fn ts_query_delete(arg1: *mut TSQuery); } extern "C" { #[doc = " Get the number of patterns, captures, or string literals in the query."] pub fn ts_query_pattern_count(arg1: *const TSQuery) -> u32; } extern "C" { pub fn ts_query_capture_count(arg1: *const TSQuery) -> u32; } extern "C" { pub fn ts_query_string_count(arg1: *const TSQuery) -> u32; } extern "C" { #[doc = " Get the byte offset where the given pattern starts in the query's source."] #[doc = ""] #[doc = " This can be useful when combining queries by concatenating their source"] #[doc = " code strings."] pub fn ts_query_start_byte_for_pattern(arg1: *const TSQuery, arg2: u32) -> u32; } extern "C" { #[doc = " Get all of the predicates for the given pattern in the query."] #[doc = ""] #[doc = " The predicates are represented as a single array of steps. There are three"] #[doc = " types of steps in this array, which correspond to the three legal values for"] #[doc = " the `type` field:"] #[doc = " - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names"] #[doc = " of captures. Their `value_id` can be used with the"] #[doc = " `ts_query_capture_name_for_id` function to obtain the name of the capture."] #[doc = " - `TSQueryPredicateStepTypeString` - Steps with this type represent literal"] #[doc = " strings. Their `value_id` can be used with the"] #[doc = " `ts_query_string_value_for_id` function to obtain their string value."] #[doc = " - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels*"] #[doc = " that represent the end of an individual predicate. 
If a pattern has two"] #[doc = " predicates, then there will be two steps with this `type` in the array."] pub fn ts_query_predicates_for_pattern( self_: *const TSQuery, pattern_index: u32, length: *mut u32, ) -> *const TSQueryPredicateStep; } extern "C" { pub fn ts_query_is_pattern_guaranteed_at_step(self_: *const TSQuery, byte_offset: u32) -> bool; } extern "C" { #[doc = " Get the name and length of one of the query's captures, or one of the"] #[doc = " query's string literals. Each capture and string is associated with a"] #[doc = " numeric id based on the order that it appeared in the query's source."] pub fn ts_query_capture_name_for_id( arg1: *const TSQuery, id: u32, length: *mut u32, ) -> *const ::std::os::raw::c_char; } extern "C" { pub fn ts_query_string_value_for_id( arg1: *const TSQuery, id: u32, length: *mut u32, ) -> *const ::std::os::raw::c_char; } extern "C" { #[doc = " Disable a certain capture within a query."] #[doc = ""] #[doc = " This prevents the capture from being returned in matches, and also avoids"] #[doc = " any resource usage associated with recording the capture. Currently, there"] #[doc = " is no way to undo this."] pub fn ts_query_disable_capture( arg1: *mut TSQuery, arg2: *const ::std::os::raw::c_char, arg3: u32, ); } extern "C" { #[doc = " Disable a certain pattern within a query."] #[doc = ""] #[doc = " This prevents the pattern from matching and removes most of the overhead"] #[doc = " associated with the pattern. Currently, there is no way to undo this."] pub fn ts_query_disable_pattern(arg1: *mut TSQuery, arg2: u32); } extern "C" { #[doc = " Create a new cursor for executing a given query."] #[doc = ""] #[doc = " The cursor stores the state that is needed to iteratively search"] #[doc = " for matches. To use the query cursor, first call `ts_query_cursor_exec`"] #[doc = " to start running a given query on a given syntax node. Then, there are"] #[doc = " two options for consuming the results of the query:"] #[doc = " 1. 
Repeatedly call `ts_query_cursor_next_match` to iterate over all of the"] #[doc = " *matches* in the order that they were found. Each match contains the"] #[doc = " index of the pattern that matched, and an array of captures. Because"] #[doc = " multiple patterns can match the same set of nodes, one match may contain"] #[doc = " captures that appear *before* some of the captures from a previous match."] #[doc = " 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the"] #[doc = " individual *captures* in the order that they appear. This is useful if"] #[doc = " don't care about which pattern matched, and just want a single ordered"] #[doc = " sequence of captures."] #[doc = ""] #[doc = " If you don't care about consuming all of the results, you can stop calling"] #[doc = " `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point."] #[doc = " You can then start executing another query on another node by calling"] #[doc = " `ts_query_cursor_exec` again."] pub fn ts_query_cursor_new() -> *mut TSQueryCursor; } extern "C" { #[doc = " Delete a query cursor, freeing all of the memory that it used."] pub fn ts_query_cursor_delete(arg1: *mut TSQueryCursor); } extern "C" { #[doc = " Start running a given query on a given node."] pub fn ts_query_cursor_exec(arg1: *mut TSQueryCursor, arg2: *const TSQuery, arg3: TSNode); } extern "C" { #[doc = " Manage the maximum number of in-progress matches allowed by this query"] #[doc = " cursor."] #[doc = ""] #[doc = " Query cursors have an optional maximum capacity for storing lists of"] #[doc = " in-progress captures. If this capacity is exceeded, then the"] #[doc = " earliest-starting match will silently be dropped to make room for further"] #[doc = " matches. 
This maximum capacity is optional — by default, query cursors allow"] #[doc = " any number of pending matches, dynamically allocating new space for them as"] #[doc = " needed as the query is executed."] pub fn ts_query_cursor_did_exceed_match_limit(arg1: *const TSQueryCursor) -> bool; } extern "C" { pub fn ts_query_cursor_match_limit(arg1: *const TSQueryCursor) -> u32; } extern "C" { pub fn ts_query_cursor_set_match_limit(arg1: *mut TSQueryCursor, arg2: u32); } extern "C" { #[doc = " Set the range of bytes or (row, column) positions in which the query"] #[doc = " will be executed."] pub fn ts_query_cursor_set_byte_range(arg1: *mut TSQueryCursor, arg2: u32, arg3: u32); } extern "C" { pub fn ts_query_cursor_set_point_range(arg1: *mut TSQueryCursor, arg2: TSPoint, arg3: TSPoint); } extern "C" { #[doc = " Advance to the next match of the currently running query."] #[doc = ""] #[doc = " If there is a match, write it to `*match` and return `true`."] #[doc = " Otherwise, return `false`."] pub fn ts_query_cursor_next_match(arg1: *mut TSQueryCursor, match_: *mut TSQueryMatch) -> bool; } extern "C" { pub fn ts_query_cursor_remove_match(arg1: *mut TSQueryCursor, id: u32); } extern "C" { #[doc = " Advance to the next capture of the currently running query."] #[doc = ""] #[doc = " If there is a capture, write its match to `*match` and its index within"] #[doc = " the matche's capture list to `*capture_index`. 
Otherwise, return `false`."] pub fn ts_query_cursor_next_capture( arg1: *mut TSQueryCursor, match_: *mut TSQueryMatch, capture_index: *mut u32, ) -> bool; } extern "C" { #[doc = " Get the number of distinct node types in the language."] pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; } extern "C" { #[doc = " Get a node type string for the given numerical id."] pub fn ts_language_symbol_name( arg1: *const TSLanguage, arg2: TSSymbol, ) -> *const ::std::os::raw::c_char; } extern "C" { #[doc = " Get the numerical id for the given node type string."] pub fn ts_language_symbol_for_name( self_: *const TSLanguage, string: *const ::std::os::raw::c_char, length: u32, is_named: bool, ) -> TSSymbol; } extern "C" { #[doc = " Get the number of distinct field names in the language."] pub fn ts_language_field_count(arg1: *const TSLanguage) -> u32; } extern "C" { #[doc = " Get the field name string for the given numerical id."] pub fn ts_language_field_name_for_id( arg1: *const TSLanguage, arg2: TSFieldId, ) -> *const ::std::os::raw::c_char; } extern "C" { #[doc = " Get the numerical id for the given field name string."] pub fn ts_language_field_id_for_name( arg1: *const TSLanguage, arg2: *const ::std::os::raw::c_char, arg3: u32, ) -> TSFieldId; } extern "C" { #[doc = " Check whether the given node type id belongs to named nodes, anonymous nodes,"] #[doc = " or a hidden nodes."] #[doc = ""] #[doc = " See also `ts_node_is_named`. Hidden nodes are never returned from the API."] pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; } extern "C" { #[doc = " Get the ABI version number for this language. 
This version number is used"] #[doc = " to ensure that languages were generated by a compatible version of"] #[doc = " Tree-sitter."] #[doc = ""] #[doc = " See also `ts_parser_set_language`."] pub fn ts_language_version(arg1: *const TSLanguage) -> u32; } pub const TREE_SITTER_LANGUAGE_VERSION: usize = 13; pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 13; tree-sitter-0.20.1/binding_rust/build.rs000064400000000000000000000033360072674642500163440ustar 00000000000000extern crate cc; use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { println!("cargo:rerun-if-env-changed=TREE_SITTER_STATIC_ANALYSIS"); if env::var("TREE_SITTER_STATIC_ANALYSIS").is_ok() { if let (Some(clang_path), Some(scan_build_path)) = (which("clang"), which("scan-build")) { let clang_path = clang_path.to_str().unwrap(); let scan_build_path = scan_build_path.to_str().unwrap(); env::set_var( "CC", &format!( "{} -analyze-headers --use-analyzer={} cc", scan_build_path, clang_path ), ); } } let mut config = cc::Build::new(); println!("cargo:rerun-if-env-changed=CARGO_FEATURE_ALLOCATION_TRACKING"); if env::var("CARGO_FEATURE_ALLOCATION_TRACKING").is_ok() { config.define("TREE_SITTER_ALLOCATION_TRACKING", ""); } let src_path = Path::new("src"); for entry in fs::read_dir(&src_path).unwrap() { let entry = entry.unwrap(); let path = src_path.join(entry.file_name()); println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); } config .flag_if_supported("-std=c99") .flag_if_supported("-Wno-unused-parameter") .include(src_path) .include("include") .file(src_path.join("lib.c")) .compile("tree-sitter"); } fn which(exe_name: impl AsRef) -> Option { env::var_os("PATH").and_then(|paths| { env::split_paths(&paths).find_map(|dir| { let full_path = dir.join(&exe_name); if full_path.is_file() { Some(full_path) } else { None } }) }) } tree-sitter-0.20.1/binding_rust/ffi.rs000064400000000000000000000003120072674642500160000ustar 00000000000000#![allow(dead_code)] 
#![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] include!("./bindings.rs"); extern "C" { pub(crate) fn dup(fd: std::os::raw::c_int) -> std::os::raw::c_int; } tree-sitter-0.20.1/binding_rust/lib.rs000064400000000000000000002261110072674642500160110ustar 00000000000000mod ffi; mod util; #[cfg(feature = "allocation-tracking")] pub mod allocations; #[cfg(unix)] use std::os::unix::io::AsRawFd; use std::{ char, error, ffi::CStr, fmt, hash, iter, marker::PhantomData, mem::MaybeUninit, ops, os::raw::{c_char, c_void}, ptr::{self, NonNull}, slice, str, sync::atomic::AtomicUsize, u16, }; /// The latest ABI version that is supported by the current version of the /// library. /// /// When Languages are generated by the Tree-sitter CLI, they are /// assigned an ABI version number that corresponds to the current CLI version. /// The Tree-sitter library is generally backwards-compatible with languages /// generated using older CLI versions, but is not forwards-compatible. pub const LANGUAGE_VERSION: usize = ffi::TREE_SITTER_LANGUAGE_VERSION; /// The earliest ABI version that is supported by the current version of the /// library. pub const MIN_COMPATIBLE_LANGUAGE_VERSION: usize = ffi::TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION; pub const PARSER_HEADER: &'static str = include_str!("../include/tree_sitter/parser.h"); /// An opaque object that defines how to parse a particular language. The code for each /// `Language` is generated by the Tree-sitter CLI. #[derive(Clone, Copy, Debug, PartialEq, Eq)] #[repr(transparent)] pub struct Language(*const ffi::TSLanguage); /// A tree that represents the syntactic structure of a source code file. pub struct Tree(NonNull); /// A position in a multi-line text document, in terms of rows and columns. /// /// Rows and columns are zero-based. 
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Point {
    pub row: usize,
    pub column: usize,
}

/// A range of positions in a multi-line text document, both in terms of bytes and of
/// rows and columns.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Range {
    pub start_byte: usize,
    pub end_byte: usize,
    pub start_point: Point,
    pub end_point: Point,
}

/// A summary of a change to a text document.
///
/// The change is described twice: once in byte offsets (`*_byte`) and once in
/// row/column coordinates (`*_position`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct InputEdit {
    pub start_byte: usize,
    pub old_end_byte: usize,
    pub new_end_byte: usize,
    pub start_position: Point,
    pub old_end_position: Point,
    pub new_end_position: Point,
}

/// A single node within a syntax `Tree`.
#[derive(Clone, Copy)]
#[repr(transparent)]
pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>);

// NOTE(review): several declarations below (`NonNull`, `Box`, `Vec`, `Option`,
// `Iterator`) appear to have lost their generic arguments -- confirm against the
// upstream crate before relying on these signatures.

/// A stateful object that is used to produce a `Tree` based on some source code.
pub struct Parser(NonNull);

/// A type of log message.
#[derive(Debug, PartialEq, Eq)]
pub enum LogType {
    Parse,
    Lex,
}

/// A callback that receives log messages during parsing.
type Logger<'a> = Box;

/// A stateful object for walking a syntax `Tree` efficiently.
pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>);

/// A set of patterns that match nodes in a syntax tree.
#[derive(Debug)]
pub struct Query {
    ptr: NonNull,
    capture_names: Vec,
    text_predicates: Vec>,
    property_settings: Vec>,
    property_predicates: Vec>,
    general_predicates: Vec>,
}

/// A stateful object for executing a `Query` on a syntax `Tree`.
pub struct QueryCursor {
    ptr: NonNull,
}

/// A key-value pair associated with a particular pattern in a `Query`.
#[derive(Debug, PartialEq, Eq)]
pub struct QueryProperty {
    pub key: Box,
    pub value: Option>,
    pub capture_id: Option,
}

/// A single argument of a `QueryPredicate`: either a capture (referenced by
/// number) or a string.
#[derive(Debug, PartialEq, Eq)]
pub enum QueryPredicateArg {
    Capture(u32),
    String(Box),
}

/// A predicate within a `Query`, consisting of an operator and a list of arguments.
#[derive(Debug, PartialEq, Eq)]
pub struct QueryPredicate {
    pub operator: Box,
    pub args: Vec,
}

/// A match of a `Query` to a particular set of `Node`s.
pub struct QueryMatch<'cursor, 'tree> {
    pub pattern_index: usize,
    pub captures: &'cursor [QueryCapture<'tree>],
    id: u32,
    cursor: *mut ffi::TSQueryCursor,
}

/// A sequence of `QueryMatch`es associated with a given `QueryCursor`.
pub struct QueryMatches<'a, 'tree: 'a, T: TextProvider<'a>> {
    ptr: *mut ffi::TSQueryCursor,
    query: &'a Query,
    text_provider: T,
    buffer1: Vec,
    buffer2: Vec,
    _tree: PhantomData<&'tree ()>,
}

/// A sequence of `QueryCapture`s associated with a given `QueryCursor`.
pub struct QueryCaptures<'a, 'tree: 'a, T: TextProvider<'a>> {
    ptr: *mut ffi::TSQueryCursor,
    query: &'a Query,
    text_provider: T,
    buffer1: Vec,
    buffer2: Vec,
    _tree: PhantomData<&'tree ()>,
}

/// A source of text for a given `Node`, as held by `QueryMatches` and
/// `QueryCaptures` in their `text_provider` field.
pub trait TextProvider<'a> {
    type I: Iterator + 'a;
    fn text(&mut self, node: Node) -> Self::I;
}

/// A particular `Node` that has been captured with a particular name within a `Query`.
#[derive(Clone, Copy, Debug)]
#[repr(C)]
pub struct QueryCapture<'a> {
    pub node: Node<'a>,
    pub index: u32,
}

/// An error that occurred when trying to assign an incompatible `Language` to a `Parser`.
#[derive(Debug, PartialEq, Eq)]
pub struct LanguageError {
    version: usize,
}

/// An error that occurred in `Parser::set_included_ranges`.
///
/// The wrapped value is an index into the slice of ranges that was passed in.
#[derive(Debug, PartialEq, Eq)]
pub struct IncludedRangesError(pub usize);

/// An error that occurred when trying to create a `Query`.
#[derive(Debug, PartialEq, Eq)]
pub struct QueryError {
    pub row: usize,
    pub column: usize,
    pub offset: usize,
    pub message: String,
    pub kind: QueryErrorKind,
}

/// The category of a `QueryError`.
#[derive(Debug, PartialEq, Eq)]
pub enum QueryErrorKind {
    Syntax,
    NodeType,
    Field,
    Capture,
    Predicate,
    Structure,
    Language,
}

#[derive(Debug)]
enum TextPredicate {
    CaptureEqString(u32, String, bool),
    CaptureEqCapture(u32, u32, bool),
    CaptureMatchString(u32, regex::bytes::Regex, bool),
}

// TODO: Remove this struct at some point.
If `core::str::lossy::Utf8Lossy` // is ever stabilized. pub struct LossyUtf8<'a> { bytes: &'a [u8], in_replacement: bool, } impl Language { /// Get the ABI version number that indicates which version of the Tree-sitter CLI /// that was used to generate this `Language`. pub fn version(&self) -> usize { unsafe { ffi::ts_language_version(self.0) as usize } } /// Get the number of distinct node types in this language. pub fn node_kind_count(&self) -> usize { unsafe { ffi::ts_language_symbol_count(self.0) as usize } } /// Get the name of the node kind for the given numerical id. pub fn node_kind_for_id(&self, id: u16) -> Option<&'static str> { let ptr = unsafe { ffi::ts_language_symbol_name(self.0, id) }; if ptr.is_null() { None } else { Some(unsafe { CStr::from_ptr(ptr) }.to_str().unwrap()) } } /// Get the numeric id for the given node kind. pub fn id_for_node_kind(&self, kind: &str, named: bool) -> u16 { unsafe { ffi::ts_language_symbol_for_name( self.0, kind.as_bytes().as_ptr() as *const c_char, kind.len() as u32, named, ) } } /// Check if the node type for the given numerical id is named (as opposed /// to an anonymous node type). pub fn node_kind_is_named(&self, id: u16) -> bool { unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } } pub fn node_kind_is_visible(&self, id: u16) -> bool { unsafe { ffi::ts_language_symbol_type(self.0, id) <= ffi::TSSymbolType_TSSymbolTypeAnonymous } } /// Get the number of distinct field names in this language. pub fn field_count(&self) -> usize { unsafe { ffi::ts_language_field_count(self.0) as usize } } /// Get the field names for the given numerical id. pub fn field_name_for_id(&self, field_id: u16) -> Option<&'static str> { let ptr = unsafe { ffi::ts_language_field_name_for_id(self.0, field_id) }; if ptr.is_null() { None } else { Some(unsafe { CStr::from_ptr(ptr) }.to_str().unwrap()) } } /// Get the numerical id for the given field name. 
pub fn field_id_for_name(&self, field_name: impl AsRef<[u8]>) -> Option<u16> {
        let field_name = field_name.as_ref();
        let id = unsafe {
            ffi::ts_language_field_id_for_name(
                self.0,
                field_name.as_ptr() as *const c_char,
                field_name.len() as u32,
            )
        };
        // An id of zero is mapped to "no such field".
        if id == 0 {
            None
        } else {
            Some(id)
        }
    }
}

impl Parser {
    /// Create a new parser.
    pub fn new() -> Parser {
        unsafe {
            let parser = ffi::ts_parser_new();
            // NOTE(review): assumes `ts_parser_new` never returns null -- confirm.
            Parser(NonNull::new_unchecked(parser))
        }
    }

    /// Set the language that the parser should use for parsing.
    ///
    /// Returns `Ok(())` if the language was successfully assigned.
    ///
    /// # Errors
    ///
    /// Returns a [LanguageError] if there was a version mismatch: the language
    /// was generated with an incompatible version of the Tree-sitter CLI. In that
    /// case the parser's language is left untouched. Check the language's version
    /// using [Language::version] and compare it to this library's
    /// [LANGUAGE_VERSION](LANGUAGE_VERSION) and
    /// [MIN_COMPATIBLE_LANGUAGE_VERSION](MIN_COMPATIBLE_LANGUAGE_VERSION) constants.
    pub fn set_language(&mut self, language: Language) -> Result<(), LanguageError> {
        let version = language.version();
        // Reject anything outside the supported ABI window before touching the parser.
        if version < MIN_COMPATIBLE_LANGUAGE_VERSION || version > LANGUAGE_VERSION {
            return Err(LanguageError { version });
        }
        unsafe {
            ffi::ts_parser_set_language(self.0.as_ptr(), language.0);
        }
        Ok(())
    }

    /// Get the parser's current language, or `None` if no language has been
    /// assigned (the underlying pointer is null).
    pub fn language(&self) -> Option<Language> {
        let ptr = unsafe { ffi::ts_parser_language(self.0.as_ptr()) };
        if ptr.is_null() {
            None
        } else {
            Some(Language(ptr))
        }
    }

    /// Get the parser's current logger, or `None` if the logger payload is null.
    pub fn logger(&self) -> Option<&Logger> {
        let logger = unsafe { ffi::ts_parser_logger(self.0.as_ptr()) };
        // The payload, when set, is the boxed `Logger` installed by `set_logger`.
        unsafe { (logger.payload as *mut Logger).as_ref() }
    }

    /// Set the logging callback that a parser should use during parsing.
pub fn set_logger(&mut self, logger: Option<Logger>) {
        // Trampoline handed to the C library: recovers the boxed Rust callback from
        // `payload` and forwards the message to it. Messages that are not valid
        // UTF-8 are silently skipped.
        unsafe extern "C" fn log(
            payload: *mut c_void,
            c_log_type: ffi::TSLogType,
            c_message: *const c_char,
        ) {
            let callback = (payload as *mut Logger).as_mut().unwrap();
            if let Ok(message) = CStr::from_ptr(c_message).to_str() {
                let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse {
                    LogType::Parse
                } else {
                    LogType::Lex
                };
                callback(log_type, message);
            }
        }

        // Free the previously installed callback, if any. Its payload was produced
        // by `Box::into_raw` below on an earlier call.
        let prev_logger = unsafe { ffi::ts_parser_logger(self.0.as_ptr()) };
        if !prev_logger.payload.is_null() {
            drop(unsafe { Box::from_raw(prev_logger.payload as *mut Logger) });
        }

        let c_logger = match logger {
            Some(logger) => {
                // Ownership of the box moves to the parser until the next
                // `set_logger` call reclaims it.
                let raw_container = Box::into_raw(Box::new(logger));
                ffi::TSLogger {
                    payload: raw_container as *mut c_void,
                    log: Some(log),
                }
            }
            None => ffi::TSLogger {
                payload: ptr::null_mut(),
                log: None,
            },
        };

        unsafe { ffi::ts_parser_set_logger(self.0.as_ptr(), c_logger) };
    }

    /// Set the destination to which the parser should write debugging graphs
    /// during parsing. The graphs are formatted in the DOT language. You may want
    /// to pipe these graphs directly to a `dot(1)` process in order to generate
    /// SVG output.
    ///
    /// The file descriptor of `file` is duplicated with `dup` before being handed
    /// to the parser, so the parser never receives the caller's own descriptor.
    #[cfg(unix)]
    pub fn print_dot_graphs(&mut self, file: &impl AsRawFd) {
        let fd = file.as_raw_fd();
        unsafe { ffi::ts_parser_print_dot_graphs(self.0.as_ptr(), ffi::dup(fd)) }
    }

    /// Stop the parser from printing debugging graphs while parsing.
    pub fn stop_printing_dot_graphs(&mut self) {
        // A descriptor of -1 is passed to turn graph output off.
        unsafe { ffi::ts_parser_print_dot_graphs(self.0.as_ptr(), -1) }
    }

    /// Parse a slice of UTF8 text.
    ///
    /// # Arguments:
    /// * `text` The UTF8-encoded text to parse.
    /// * `old_tree` A previous syntax tree parsed from the same document.
    ///   If the text of the document has changed since `old_tree` was
    ///   created, then you must edit `old_tree` to match the new text using
    ///   [Tree::edit].
/// /// Returns a [Tree] if parsing succeeded, or `None` if: /// * The parser has not yet had a language assigned with [Parser::set_language] /// * The timeout set with [Parser::set_timeout_micros] expired /// * The cancellation flag set with [Parser::set_cancellation_flag] was flipped pub fn parse(&mut self, text: impl AsRef<[u8]>, old_tree: Option<&Tree>) -> Option { let bytes = text.as_ref(); let len = bytes.len(); self.parse_with( &mut |i, _| if i < len { &bytes[i..] } else { &[] }, old_tree, ) } /// Parse a slice of UTF16 text. /// /// # Arguments: /// * `text` The UTF16-encoded text to parse. /// * `old_tree` A previous syntax tree parsed from the same document. /// If the text of the document has changed since `old_tree` was /// created, then you must edit `old_tree` to match the new text using /// [Tree::edit]. pub fn parse_utf16( &mut self, input: impl AsRef<[u16]>, old_tree: Option<&Tree>, ) -> Option { let code_points = input.as_ref(); let len = code_points.len(); self.parse_utf16_with( &mut |i, _| if i < len { &code_points[i..] } else { &[] }, old_tree, ) } /// Parse UTF8 text provided in chunks by a callback. /// /// # Arguments: /// * `callback` A function that takes a byte offset and position and /// returns a slice of UTF8-encoded text starting at that byte offset /// and position. The slices can be of any length. If the given position /// is at the end of the text, the callback should return an empty slice. /// * `old_tree` A previous syntax tree parsed from the same document. /// If the text of the document has changed since `old_tree` was /// created, then you must edit `old_tree` to match the new text using /// [Tree::edit]. pub fn parse_with<'a, T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>( &mut self, callback: &mut F, old_tree: Option<&Tree>, ) -> Option { // A pointer to this payload is passed on every call to the `read` C function. // The payload contains two things: // 1. A reference to the rust `callback`. // 2. 
The text that was returned from the previous call to `callback`. // This allows the callback to return owned values like vectors. let mut payload: (&mut F, Option) = (callback, None); // This C function is passed to Tree-sitter as the input callback. unsafe extern "C" fn read<'a, T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let (callback, text) = (payload as *mut (&mut F, Option)).as_mut().unwrap(); *text = Some(callback(byte_offset as usize, position.into())); let slice = text.as_ref().unwrap().as_ref(); *bytes_read = slice.len() as u32; return slice.as_ptr() as *const c_char; } let c_input = ffi::TSInput { payload: &mut payload as *mut (&mut F, Option) as *mut c_void, read: Some(read::), encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, }; let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr()); unsafe { let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input); NonNull::new(c_new_tree).map(Tree) } } /// Parse UTF16 text provided in chunks by a callback. /// /// # Arguments: /// * `callback` A function that takes a code point offset and position and /// returns a slice of UTF16-encoded text starting at that byte offset /// and position. The slices can be of any length. If the given position /// is at the end of the text, the callback should return an empty slice. /// * `old_tree` A previous syntax tree parsed from the same document. /// If the text of the document has changed since `old_tree` was /// created, then you must edit `old_tree` to match the new text using /// [Tree::edit]. pub fn parse_utf16_with<'a, T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>( &mut self, callback: &mut F, old_tree: Option<&Tree>, ) -> Option { // A pointer to this payload is passed on every call to the `read` C function. // The payload contains two things: // 1. A reference to the rust `callback`. // 2. 
The text that was returned from the previous call to `callback`. // This allows the callback to return owned values like vectors. let mut payload: (&mut F, Option) = (callback, None); // This C function is passed to Tree-sitter as the input callback. unsafe extern "C" fn read<'a, T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>( payload: *mut c_void, byte_offset: u32, position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { let (callback, text) = (payload as *mut (&mut F, Option)).as_mut().unwrap(); *text = Some(callback( (byte_offset / 2) as usize, Point { row: position.row as usize, column: position.column as usize / 2, }, )); let slice = text.as_ref().unwrap().as_ref(); *bytes_read = slice.len() as u32 * 2; slice.as_ptr() as *const c_char } let c_input = ffi::TSInput { payload: &mut payload as *mut (&mut F, Option) as *mut c_void, read: Some(read::), encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, }; let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr()); unsafe { let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input); NonNull::new(c_new_tree).map(Tree) } } /// Instruct the parser to start the next parse from the beginning. /// /// If the parser previously failed because of a timeout or a cancellation, then /// by default, it will resume where it left off on the next call to `parse` or /// other parsing functions. If you don't want to resume, and instead intend to /// use this parser to parse some other document, you must call `reset` first. pub fn reset(&mut self) { unsafe { ffi::ts_parser_reset(self.0.as_ptr()) } } /// Get the duration in microseconds that parsing is allowed to take. /// /// This is set via [set_timeout_micros](Parser::set_timeout_micros). pub fn timeout_micros(&self) -> u64 { unsafe { ffi::ts_parser_timeout_micros(self.0.as_ptr()) } } /// Set the maximum duration in microseconds that parsing should be allowed to /// take before halting. 
///
    /// If parsing takes longer than this, it will halt early, returning `None`.
    /// See `parse` for more information.
    pub fn set_timeout_micros(&mut self, timeout_micros: u64) {
        unsafe { ffi::ts_parser_set_timeout_micros(self.0.as_ptr(), timeout_micros) }
    }

    /// Set the ranges of text that the parser should include when parsing.
    ///
    /// By default, the parser will always include entire documents. This function
    /// allows you to parse only a *portion* of a document but still return a syntax
    /// tree whose ranges match up with the document as a whole. You can also pass
    /// multiple disjoint ranges.
    ///
    /// If `ranges` is empty, then the entire document will be parsed. Otherwise,
    /// the given ranges must be ordered from earliest to latest in the document,
    /// and they must not overlap. That is, the following must hold for all
    /// `i` < `length - 1`:
    /// ```text
    ///     ranges[i].end_byte <= ranges[i + 1].start_byte
    /// ```
    ///
    /// # Errors
    ///
    /// If this requirement is not satisfied, an [IncludedRangesError] is returned
    /// whose value is the index, within `ranges`, of the first incorrect range.
    pub fn set_included_ranges<'a>(
        &mut self,
        ranges: &'a [Range],
    ) -> Result<(), IncludedRangesError> {
        // Convert to the C representation; `Range` is `Copy`, so no cloning is needed.
        let ts_ranges: Vec<ffi::TSRange> = ranges.iter().copied().map(Into::into).collect();
        let accepted = unsafe {
            ffi::ts_parser_set_included_ranges(
                self.0.as_ptr(),
                ts_ranges.as_ptr(),
                ts_ranges.len() as u32,
            )
        };
        if accepted {
            return Ok(());
        }

        // The C call only reports failure, so re-scan the input to locate the first
        // range that starts before its predecessor ends or ends before it starts.
        let mut prev_end_byte = 0;
        for (i, range) in ranges.iter().enumerate() {
            if range.start_byte < prev_end_byte || range.end_byte < range.start_byte {
                return Err(IncludedRangesError(i));
            }
            prev_end_byte = range.end_byte;
        }
        // Rejected for a reason this scan does not detect: report index 0.
        Err(IncludedRangesError(0))
    }

    /// Get the parser's current cancellation flag pointer.
    ///
    /// Returns `None` when no flag is set (the stored pointer is null).
    ///
    /// # Safety
    ///
    /// The stored pointer is dereferenced, so it must still point to a live
    /// `AtomicUsize`.
    pub unsafe fn cancellation_flag(&self) -> Option<&AtomicUsize> {
        (ffi::ts_parser_cancellation_flag(self.0.as_ptr()) as *const AtomicUsize).as_ref()
    }

    /// Set the parser's current cancellation flag pointer.
/// /// If a pointer is assigned, then the parser will periodically read from /// this pointer during parsing. If it reads a non-zero value, it will halt early, /// returning `None`. See [parse](Parser::parse) for more information. pub unsafe fn set_cancellation_flag(&mut self, flag: Option<&AtomicUsize>) { if let Some(flag) = flag { ffi::ts_parser_set_cancellation_flag( self.0.as_ptr(), flag as *const AtomicUsize as *const usize, ); } else { ffi::ts_parser_set_cancellation_flag(self.0.as_ptr(), ptr::null()); } } } impl Drop for Parser { fn drop(&mut self) { self.stop_printing_dot_graphs(); self.set_logger(None); unsafe { ffi::ts_parser_delete(self.0.as_ptr()) } } } impl Tree { /// Get the root node of the syntax tree. pub fn root_node(&self) -> Node { Node::new(unsafe { ffi::ts_tree_root_node(self.0.as_ptr()) }).unwrap() } /// Get the language that was used to parse the syntax tree. pub fn language(&self) -> Language { Language(unsafe { ffi::ts_tree_language(self.0.as_ptr()) }) } /// Edit the syntax tree to keep it in sync with source code that has been /// edited. /// /// You must describe the edit both in terms of byte offsets and in terms of /// row/column coordinates. pub fn edit(&mut self, edit: &InputEdit) { let edit = edit.into(); unsafe { ffi::ts_tree_edit(self.0.as_ptr(), &edit) }; } /// Create a new [TreeCursor] starting from the root of the tree. pub fn walk(&self) -> TreeCursor { self.root_node().walk() } /// Compare this old edited syntax tree to a new syntax tree representing the same /// document, returning a sequence of ranges whose syntactic structure has changed. /// /// For this to work correctly, this syntax tree must have been edited such that its /// ranges match up to the new tree. Generally, you'll want to call this method right /// after calling one of the [Parser::parse] functions. Call it on the old tree that /// was passed to parse, and pass the new tree that was returned from `parse`. 
pub fn changed_ranges(&self, other: &Tree) -> impl ExactSizeIterator<Item = Range> {
        // The C function writes a `uint32_t` through this pointer, so the counter
        // is declared as `u32` and widened afterwards. Type-punning a wider integer
        // to `*mut u32` would read back the wrong value on big-endian targets.
        let mut count = 0u32;
        unsafe {
            let ptr = ffi::ts_tree_get_changed_ranges(
                self.0.as_ptr(),
                other.0.as_ptr(),
                &mut count as *mut u32,
            );
            util::CBufferIter::new(ptr, count as usize).map(|r| r.into())
        }
    }
}

impl fmt::Debug for Tree {
    /// Format the tree as `{Tree <root node>}`.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        write!(f, "{{Tree {:?}}}", self.root_node())
    }
}

impl Drop for Tree {
    /// Delete the underlying C tree.
    fn drop(&mut self) {
        unsafe { ffi::ts_tree_delete(self.0.as_ptr()) }
    }
}

impl Clone for Tree {
    /// Produce a new `Tree` handle via `ts_tree_copy`.
    fn clone(&self) -> Tree {
        // NOTE(review): assumes `ts_tree_copy` never returns null -- confirm.
        unsafe { Tree(NonNull::new_unchecked(ffi::ts_tree_copy(self.0.as_ptr()))) }
    }
}

impl<'tree> Node<'tree> {
    /// Wrap a raw `TSNode`, returning `None` when its `id` pointer is null.
    fn new(node: ffi::TSNode) -> Option<Self> {
        if node.id.is_null() {
            None
        } else {
            Some(Node(node, PhantomData))
        }
    }

    /// Get a numeric id for this node that is unique.
    ///
    /// Within a given syntax tree, no two nodes have the same id. However, if
    /// a new tree is created based on an older tree, and a node from the old
    /// tree is reused in the process, then that node will have the same id in
    /// both trees.
    pub fn id(&self) -> usize {
        self.0.id as usize
    }

    /// Get this node's type as a numerical id.
    pub fn kind_id(&self) -> u16 {
        unsafe { ffi::ts_node_symbol(self.0) }
    }

    /// Get this node's type as a string.
    pub fn kind(&self) -> &'static str {
        unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) }
            .to_str()
            .unwrap()
    }

    /// Get the [Language] that was used to parse this node's syntax tree.
    pub fn language(&self) -> Language {
        Language(unsafe { ffi::ts_tree_language(self.0.tree) })
    }

    /// Check if this node is *named*.
    ///
    /// Named nodes correspond to named rules in the grammar, whereas *anonymous* nodes
    /// correspond to string literals in the grammar.
    pub fn is_named(&self) -> bool {
        unsafe { ffi::ts_node_is_named(self.0) }
    }

    /// Check if this node is *extra*.
    ///
    /// Extra nodes represent things like comments, which are not required by the grammar,
    /// but can appear anywhere.
pub fn is_extra(&self) -> bool {
        unsafe { ffi::ts_node_is_extra(self.0) }
    }

    /// Check if this node has been edited.
    pub fn has_changes(&self) -> bool {
        unsafe { ffi::ts_node_has_changes(self.0) }
    }

    /// Check if this node represents a syntax error or contains any syntax errors anywhere
    /// within it.
    pub fn has_error(&self) -> bool {
        unsafe { ffi::ts_node_has_error(self.0) }
    }

    /// Check if this node represents a syntax error.
    ///
    /// Syntax errors represent parts of the code that could not be incorporated into a
    /// valid syntax tree.
    pub fn is_error(&self) -> bool {
        // This checks for the kind id `u16::MAX`.
        self.kind_id() == u16::MAX
    }

    /// Check if this node is *missing*.
    ///
    /// Missing nodes are inserted by the parser in order to recover from certain kinds of
    /// syntax errors.
    pub fn is_missing(&self) -> bool {
        unsafe { ffi::ts_node_is_missing(self.0) }
    }

    /// Get the byte offset where this node starts.
    pub fn start_byte(&self) -> usize {
        unsafe { ffi::ts_node_start_byte(self.0) as usize }
    }

    /// Get the byte offset where this node ends.
    pub fn end_byte(&self) -> usize {
        unsafe { ffi::ts_node_end_byte(self.0) as usize }
    }

    /// Get the byte range of source code that this node represents.
    ///
    /// The range runs from [Node::start_byte] (inclusive) to [Node::end_byte]
    /// (exclusive).
    pub fn byte_range(&self) -> std::ops::Range<usize> {
        self.start_byte()..self.end_byte()
    }

    /// Get the range of source code that this node represents, both in terms of raw bytes
    /// and of row/column coordinates.
    pub fn range(&self) -> Range {
        Range {
            start_byte: self.start_byte(),
            end_byte: self.end_byte(),
            start_point: self.start_position(),
            end_point: self.end_position(),
        }
    }

    /// Get this node's start position in terms of rows and columns.
    pub fn start_position(&self) -> Point {
        let result = unsafe { ffi::ts_node_start_point(self.0) };
        result.into()
    }

    /// Get this node's end position in terms of rows and columns.
    pub fn end_position(&self) -> Point {
        let result = unsafe { ffi::ts_node_end_point(self.0) };
        result.into()
    }

    /// Get the node's child at the given index, where zero represents the first
    /// child.
///
    /// This method is fairly fast, but its cost is technically log(i), so
    /// if you might be iterating over a long list of children, you should use
    /// [Node::children] instead.
    pub fn child(&self, i: usize) -> Option<Self> {
        Self::new(unsafe { ffi::ts_node_child(self.0, i as u32) })
    }

    /// Get this node's number of children.
    pub fn child_count(&self) -> usize {
        unsafe { ffi::ts_node_child_count(self.0) as usize }
    }

    /// Get this node's *named* child at the given index.
    ///
    /// See also [Node::is_named].
    /// This method is fairly fast, but its cost is technically log(i), so
    /// if you might be iterating over a long list of children, you should use
    /// [Node::named_children] instead.
    pub fn named_child<'a>(&'a self, i: usize) -> Option<Self> {
        Self::new(unsafe { ffi::ts_node_named_child(self.0, i as u32) })
    }

    /// Get this node's number of *named* children.
    ///
    /// See also [Node::is_named].
    pub fn named_child_count(&self) -> usize {
        unsafe { ffi::ts_node_named_child_count(self.0) as usize }
    }

    /// Get the first child with the given field name.
    ///
    /// If multiple children may have the same field name, access them using
    /// [children_by_field_name](Node::children_by_field_name)
    pub fn child_by_field_name(&self, field_name: impl AsRef<[u8]>) -> Option<Self> {
        let field_name = field_name.as_ref();
        // The name is passed as pointer + length, so it does not need to be
        // NUL-terminated.
        Self::new(unsafe {
            ffi::ts_node_child_by_field_name(
                self.0,
                field_name.as_ptr() as *const c_char,
                field_name.len() as u32,
            )
        })
    }

    /// Get this node's child with the given numerical field id.
    ///
    /// See also [child_by_field_name](Node::child_by_field_name). You can convert a field name to
    /// an id using [Language::field_id_for_name].
    pub fn child_by_field_id(&self, field_id: u16) -> Option<Self> {
        Self::new(unsafe { ffi::ts_node_child_by_field_id(self.0, field_id) })
    }

    /// Get the field name of this node's child at the given index.
pub fn field_name_for_child(&self, child_index: u32) -> Option<&'static str> {
        unsafe {
            let ptr = ffi::ts_node_field_name_for_child(self.0, child_index);
            if ptr.is_null() {
                None
            } else {
                Some(CStr::from_ptr(ptr).to_str().unwrap())
            }
        }
    }

    /// Iterate over this node's children.
    ///
    /// A [TreeCursor] is used to retrieve the children efficiently. Obtain
    /// a [TreeCursor] by calling [Tree::walk] or [Node::walk]. To avoid unnecessary
    /// allocations, you should reuse the same cursor for subsequent calls to
    /// this method.
    ///
    /// If you're walking the tree recursively, you may want to use the `TreeCursor`
    /// APIs directly instead.
    pub fn children<'a>(
        &self,
        cursor: &'a mut TreeCursor<'tree>,
    ) -> impl ExactSizeIterator<Item = Node<'tree>> + 'a {
        cursor.reset(*self);
        cursor.goto_first_child();
        // Driving the iteration from the child count (rather than from the cursor)
        // is what makes the returned iterator exact-sized.
        (0..self.child_count()).map(move |_| {
            let result = cursor.node();
            cursor.goto_next_sibling();
            result
        })
    }

    /// Iterate over this node's named children.
    ///
    /// See also [Node::children].
    pub fn named_children<'a>(
        &self,
        cursor: &'a mut TreeCursor<'tree>,
    ) -> impl ExactSizeIterator<Item = Node<'tree>> + 'a {
        cursor.reset(*self);
        cursor.goto_first_child();
        (0..self.named_child_count()).map(move |_| {
            // Skip over anonymous siblings until the cursor rests on a named node.
            while !cursor.node().is_named() {
                if !cursor.goto_next_sibling() {
                    break;
                }
            }
            let result = cursor.node();
            cursor.goto_next_sibling();
            result
        })
    }

    /// Iterate over this node's children with a given field name.
    ///
    /// See also [Node::children].
    pub fn children_by_field_name<'a>(
        &self,
        field_name: &str,
        cursor: &'a mut TreeCursor<'tree>,
    ) -> impl Iterator<Item = Node<'tree>> + 'a {
        let field_id = self.language().field_id_for_name(field_name);
        // An unknown field name falls back to id 0, which
        // `children_by_field_id` then searches for.
        self.children_by_field_id(field_id.unwrap_or(0), cursor)
    }

    /// Iterate over this node's children with a given field id.
    ///
    /// See also [Node::children_by_field_name].
pub fn children_by_field_id<'a>( &self, field_id: u16, cursor: &'a mut TreeCursor<'tree>, ) -> impl Iterator> + 'a { cursor.reset(*self); cursor.goto_first_child(); let mut done = false; iter::from_fn(move || { while !done { while cursor.field_id() != Some(field_id) { if !cursor.goto_next_sibling() { return None; } } let result = cursor.node(); if !cursor.goto_next_sibling() { done = true; } return Some(result); } None }) } /// Get this node's immediate parent. pub fn parent(&self) -> Option { Self::new(unsafe { ffi::ts_node_parent(self.0) }) } /// Get this node's next sibling. pub fn next_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_next_sibling(self.0) }) } /// Get this node's previous sibling. pub fn prev_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_prev_sibling(self.0) }) } /// Get this node's next named sibling. pub fn next_named_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_next_named_sibling(self.0) }) } /// Get this node's previous named sibling. pub fn prev_named_sibling(&self) -> Option { Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) } /// Get the smallest node within this node that spans the given range. pub fn descendant_for_byte_range(&self, start: usize, end: usize) -> Option { Self::new(unsafe { ffi::ts_node_descendant_for_byte_range(self.0, start as u32, end as u32) }) } /// Get the smallest named node within this node that spans the given range. pub fn named_descendant_for_byte_range(&self, start: usize, end: usize) -> Option { Self::new(unsafe { ffi::ts_node_named_descendant_for_byte_range(self.0, start as u32, end as u32) }) } /// Get the smallest node within this node that spans the given range. pub fn descendant_for_point_range(&self, start: Point, end: Point) -> Option { Self::new(unsafe { ffi::ts_node_descendant_for_point_range(self.0, start.into(), end.into()) }) } /// Get the smallest named node within this node that spans the given range. 
pub fn named_descendant_for_point_range(&self, start: Point, end: Point) -> Option<Self> {
        Self::new(unsafe {
            ffi::ts_node_named_descendant_for_point_range(self.0, start.into(), end.into())
        })
    }

    /// Render this node as a string using `ts_node_string`.
    ///
    /// # Panics
    ///
    /// Panics if the string produced by the C library is not valid UTF-8.
    pub fn to_sexp(&self) -> String {
        let c_string = unsafe { ffi::ts_node_string(self.0) };
        let result = unsafe { CStr::from_ptr(c_string) }
            .to_str()
            .unwrap()
            .to_string();
        // The buffer was allocated by the C library; release it once copied.
        unsafe { util::free_ptr(c_string as *mut c_void) };
        result
    }

    /// Get the slice of `source` covered by this node, validated as UTF-8.
    ///
    /// # Panics
    ///
    /// Panics if this node's byte range lies outside of `source`.
    pub fn utf8_text<'a>(&self, source: &'a [u8]) -> Result<&'a str, str::Utf8Error> {
        str::from_utf8(&source[self.start_byte()..self.end_byte()])
    }

    /// Get the slice of UTF-16 `source` covered by this node.
    ///
    /// # Panics
    ///
    /// Panics if this node's range lies outside of `source`.
    pub fn utf16_text<'a>(&self, source: &'a [u16]) -> &'a [u16] {
        // Node offsets are expressed in bytes, while `source` is indexed in
        // 16-bit code units, so each offset has to be halved.
        &source[self.start_byte() / 2..self.end_byte() / 2]
    }

    /// Create a new [TreeCursor] starting from this node.
    pub fn walk(&self) -> TreeCursor<'tree> {
        TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) }, PhantomData)
    }

    /// Edit this node to keep it in-sync with source code that has been edited.
    ///
    /// This function is only rarely needed. When you edit a syntax tree with the
    /// [Tree::edit] method, all of the nodes that you retrieve from the tree
    /// afterward will already reflect the edit. You only need to use [Node::edit]
    /// when you have a specific [Node] instance that you want to keep and continue
    /// to use after an edit.
pub fn edit(&mut self, edit: &InputEdit) {
        let edit = edit.into();
        unsafe { ffi::ts_node_edit(&mut self.0 as *mut ffi::TSNode, &edit) }
    }
}

impl<'a> PartialEq for Node<'a> {
    /// Two nodes are equal when they have the same `id`.
    fn eq(&self, other: &Self) -> bool {
        self.0.id == other.0.id
    }
}

impl<'a> Eq for Node<'a> {}

impl<'a> hash::Hash for Node<'a> {
    /// Hash exactly what `eq` compares: the node `id`.
    ///
    /// Feeding the `context` words into the hash as well would let two nodes that
    /// compare equal produce different hashes, which breaks the `Hash`/`Eq`
    /// contract that hash maps and sets rely on.
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        self.0.id.hash(state);
    }
}

impl<'a> fmt::Debug for Node<'a> {
    /// Format the node as `{Node <kind> <start> - <end>}`.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        write!(
            f,
            "{{Node {} {} - {}}}",
            self.kind(),
            self.start_position(),
            self.end_position()
        )
    }
}

impl<'a> TreeCursor<'a> {
    /// Get the tree cursor's current [Node].
    pub fn node(&self) -> Node<'a> {
        Node(
            unsafe { ffi::ts_tree_cursor_current_node(&self.0) },
            PhantomData,
        )
    }

    /// Get the numerical field id of this tree cursor's current node.
    ///
    /// Returns `None` when the C library reports the id `0`.
    ///
    /// See also [field_name](TreeCursor::field_name).
    pub fn field_id(&self) -> Option<u16> {
        unsafe {
            let id = ffi::ts_tree_cursor_current_field_id(&self.0);
            if id == 0 {
                None
            } else {
                Some(id)
            }
        }
    }

    /// Get the field name of this tree cursor's current node.
    pub fn field_name(&self) -> Option<&'static str> {
        unsafe {
            let ptr = ffi::ts_tree_cursor_current_field_name(&self.0);
            if ptr.is_null() {
                None
            } else {
                Some(CStr::from_ptr(ptr).to_str().unwrap())
            }
        }
    }

    /// Move this cursor to the first child of its current node.
    ///
    /// This returns `true` if the cursor successfully moved, and returns `false`
    /// if there were no children.
    pub fn goto_first_child(&mut self) -> bool {
        unsafe { ffi::ts_tree_cursor_goto_first_child(&mut self.0) }
    }

    /// Move this cursor to the parent of its current node.
    ///
    /// This returns `true` if the cursor successfully moved, and returns `false`
    /// if there was no parent node (the cursor was already on the root node).
pub fn goto_parent(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_parent(&mut self.0) }; } /// Move this cursor to the next sibling of its current node. /// /// This returns `true` if the cursor successfully moved, and returns `false` /// if there was no next sibling node. pub fn goto_next_sibling(&mut self) -> bool { return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) }; } /// Move this cursor to the first child of its current node that extends beyond /// the given byte offset. /// /// This returns the index of the child node if one was found, and returns `None` /// if no such child was found. pub fn goto_first_child_for_byte(&mut self, index: usize) -> Option { let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) }; if result < 0 { None } else { Some(result as usize) } } /// Move this cursor to the first child of its current node that extends beyond /// the given byte offset. /// /// This returns the index of the child node if one was found, and returns `None` /// if no such child was found. pub fn goto_first_child_for_point(&mut self, point: Point) -> Option { let result = unsafe { ffi::ts_tree_cursor_goto_first_child_for_point(&mut self.0, point.into()) }; if result < 0 { None } else { Some(result as usize) } } /// Re-initialize this tree cursor to start at a different node. pub fn reset(&mut self, node: Node<'a>) { unsafe { ffi::ts_tree_cursor_reset(&mut self.0, node.0) }; } } impl<'a> Clone for TreeCursor<'a> { fn clone(&self) -> Self { TreeCursor(unsafe { ffi::ts_tree_cursor_copy(&self.0) }, PhantomData) } } impl<'a> Drop for TreeCursor<'a> { fn drop(&mut self) { unsafe { ffi::ts_tree_cursor_delete(&mut self.0) } } } impl Query { /// Create a new query from a string containing one or more S-expression /// patterns. /// /// The query is associated with a particular language, and can only be run /// on syntax nodes parsed with that language. 
References to Queries can be /// shared between multiple threads. pub fn new(language: Language, source: &str) -> Result { let mut error_offset = 0u32; let mut error_type: ffi::TSQueryError = 0; let bytes = source.as_bytes(); // Compile the query. let ptr = unsafe { ffi::ts_query_new( language.0, bytes.as_ptr() as *const c_char, bytes.len() as u32, &mut error_offset as *mut u32, &mut error_type as *mut ffi::TSQueryError, ) }; // On failure, build an error based on the error code and offset. if ptr.is_null() { if error_type == ffi::TSQueryError_TSQueryErrorLanguage { return Err(QueryError { row: 0, column: 0, offset: 0, message: LanguageError { version: language.version(), } .to_string(), kind: QueryErrorKind::Language, }); } let offset = error_offset as usize; let mut line_start = 0; let mut row = 0; let mut line_containing_error = None; for line in source.split("\n") { let line_end = line_start + line.len() + 1; if line_end > offset { line_containing_error = Some(line); break; } line_start = line_end; row += 1; } let column = offset - line_start; let kind; let message; match error_type { // Error types that report names ffi::TSQueryError_TSQueryErrorNodeType | ffi::TSQueryError_TSQueryErrorField | ffi::TSQueryError_TSQueryErrorCapture => { let suffix = source.split_at(offset).1; let end_offset = suffix .find(|c| !char::is_alphanumeric(c) && c != '_' && c != '-') .unwrap_or(source.len()); message = suffix.split_at(end_offset).0.to_string(); kind = match error_type { ffi::TSQueryError_TSQueryErrorNodeType => QueryErrorKind::NodeType, ffi::TSQueryError_TSQueryErrorField => QueryErrorKind::Field, ffi::TSQueryError_TSQueryErrorCapture => QueryErrorKind::Capture, _ => unreachable!(), }; } // Error types that report positions _ => { message = if let Some(line) = line_containing_error { line.to_string() + "\n" + &" ".repeat(offset - line_start) + "^" } else { "Unexpected EOF".to_string() }; kind = match error_type { ffi::TSQueryError_TSQueryErrorStructure => 
QueryErrorKind::Structure, _ => QueryErrorKind::Syntax, }; } }; return Err(QueryError { row, column, offset, kind, message, }); } let string_count = unsafe { ffi::ts_query_string_count(ptr) }; let capture_count = unsafe { ffi::ts_query_capture_count(ptr) }; let pattern_count = unsafe { ffi::ts_query_pattern_count(ptr) as usize }; let mut result = Query { ptr: unsafe { NonNull::new_unchecked(ptr) }, capture_names: Vec::with_capacity(capture_count as usize), text_predicates: Vec::with_capacity(pattern_count), property_predicates: Vec::with_capacity(pattern_count), property_settings: Vec::with_capacity(pattern_count), general_predicates: Vec::with_capacity(pattern_count), }; // Build a vector of strings to store the capture names. for i in 0..capture_count { unsafe { let mut length = 0u32; let name = ffi::ts_query_capture_name_for_id(ptr, i, &mut length as *mut u32) as *const u8; let name = slice::from_raw_parts(name, length as usize); let name = str::from_utf8_unchecked(name); result.capture_names.push(name.to_string()); } } // Build a vector of strings to represent literal values used in predicates. let string_values = (0..string_count) .map(|i| unsafe { let mut length = 0u32; let value = ffi::ts_query_string_value_for_id(ptr, i as u32, &mut length as *mut u32) as *const u8; let value = slice::from_raw_parts(value, length as usize); let value = str::from_utf8_unchecked(value); value.to_string() }) .collect::>(); // Build a vector of predicates for each pattern. 
for i in 0..pattern_count { let predicate_steps = unsafe { let mut length = 0u32; let raw_predicates = ffi::ts_query_predicates_for_pattern(ptr, i as u32, &mut length as *mut u32); if length > 0 { slice::from_raw_parts(raw_predicates, length as usize) } else { &[] } }; let byte_offset = unsafe { ffi::ts_query_start_byte_for_pattern(ptr, i as u32) }; let row = source .char_indices() .take_while(|(i, _)| *i < byte_offset as usize) .filter(|(_, c)| *c == '\n') .count(); let type_done = ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeDone; let type_capture = ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture; let type_string = ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeString; let mut text_predicates = Vec::new(); let mut property_predicates = Vec::new(); let mut property_settings = Vec::new(); let mut general_predicates = Vec::new(); for p in predicate_steps.split(|s| s.type_ == type_done) { if p.is_empty() { continue; } if p[0].type_ != type_string { return Err(predicate_error( row, format!( "Expected predicate to start with a function name. Got @{}.", result.capture_names[p[0].value_id as usize], ), )); } // Build a predicate for each of the known predicate function names. let operator_name = &string_values[p[0].value_id as usize]; match operator_name.as_str() { "eq?" | "not-eq?" => { if p.len() != 3 { return Err(predicate_error( row, format!( "Wrong number of arguments to #eq? predicate. Expected 2, got {}.", p.len() - 1 ), )); } if p[1].type_ != type_capture { return Err(predicate_error(row, format!( "First argument to #eq? predicate must be a capture name. Got literal \"{}\".", string_values[p[1].value_id as usize], ))); } let is_positive = operator_name == "eq?"; text_predicates.push(if p[2].type_ == type_capture { TextPredicate::CaptureEqCapture( p[1].value_id, p[2].value_id, is_positive, ) } else { TextPredicate::CaptureEqString( p[1].value_id, string_values[p[2].value_id as usize].clone(), is_positive, ) }); } "match?" 
| "not-match?" => { if p.len() != 3 { return Err(predicate_error(row, format!( "Wrong number of arguments to #match? predicate. Expected 2, got {}.", p.len() - 1 ))); } if p[1].type_ != type_capture { return Err(predicate_error(row, format!( "First argument to #match? predicate must be a capture name. Got literal \"{}\".", string_values[p[1].value_id as usize], ))); } if p[2].type_ == type_capture { return Err(predicate_error(row, format!( "Second argument to #match? predicate must be a literal. Got capture @{}.", result.capture_names[p[2].value_id as usize], ))); } let is_positive = operator_name == "match?"; let regex = &string_values[p[2].value_id as usize]; text_predicates.push(TextPredicate::CaptureMatchString( p[1].value_id, regex::bytes::Regex::new(regex).map_err(|_| { predicate_error(row, format!("Invalid regex '{}'", regex)) })?, is_positive, )); } "set!" => property_settings.push(Self::parse_property( row, &operator_name, &result.capture_names, &string_values, &p[1..], )?), "is?" | "is-not?" => property_predicates.push(( Self::parse_property( row, &operator_name, &result.capture_names, &string_values, &p[1..], )?, operator_name == "is?", )), _ => general_predicates.push(QueryPredicate { operator: operator_name.clone().into_boxed_str(), args: p[1..] .iter() .map(|a| { if a.type_ == type_capture { QueryPredicateArg::Capture(a.value_id) } else { QueryPredicateArg::String( string_values[a.value_id as usize].clone().into_boxed_str(), ) } }) .collect(), }), } } result .text_predicates .push(text_predicates.into_boxed_slice()); result .property_predicates .push(property_predicates.into_boxed_slice()); result .property_settings .push(property_settings.into_boxed_slice()); result .general_predicates .push(general_predicates.into_boxed_slice()); } Ok(result) } /// Get the byte offset where the given pattern starts in the query's source. 
pub fn start_byte_for_pattern(&self, pattern_index: usize) -> usize {
        // `text_predicates` holds one entry per pattern, so its length is the
        // pattern count.
        assert!(
            pattern_index < self.text_predicates.len(),
            "Pattern index is {} but the pattern count is {}",
            pattern_index,
            self.text_predicates.len(),
        );
        unsafe {
            ffi::ts_query_start_byte_for_pattern(self.ptr.as_ptr(), pattern_index as u32) as usize
        }
    }

    /// Get the number of patterns in the query.
    pub fn pattern_count(&self) -> usize {
        unsafe { ffi::ts_query_pattern_count(self.ptr.as_ptr()) as usize }
    }

    /// Get the names of the captures used in the query.
    pub fn capture_names(&self) -> &[String] {
        &self.capture_names
    }

    /// Get the index for a given capture name.
    pub fn capture_index_for_name(&self, name: &str) -> Option<u32> {
        self.capture_names
            .iter()
            .position(|n| n == name)
            .map(|ix| ix as u32)
    }

    /// Get the properties that are checked for the given pattern index.
    ///
    /// This includes predicates with the operators `is?` and `is-not?`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is not a valid pattern index.
    pub fn property_predicates(&self, index: usize) -> &[(QueryProperty, bool)] {
        &self.property_predicates[index]
    }

    /// Get the properties that are set for the given pattern index.
    ///
    /// This includes predicates with the operator `set!`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is not a valid pattern index.
    pub fn property_settings(&self, index: usize) -> &[QueryProperty] {
        &self.property_settings[index]
    }

    /// Get the other user-defined predicates associated with the given index.
    ///
    /// This includes predicates with operators other than:
    /// * `match?` and `not-match?`
    /// * `eq?` and `not-eq?`
    /// * `is?` and `is-not?`
    /// * `set!`
    ///
    /// # Panics
    ///
    /// Panics if `index` is not a valid pattern index.
    pub fn general_predicates(&self, index: usize) -> &[QueryPredicate] {
        &self.general_predicates[index]
    }

    /// Disable a certain capture within a query.
    ///
    /// This prevents the capture from being returned in matches, and also avoids any
    /// resource usage associated with recording the capture.
    pub fn disable_capture(&mut self, name: &str) {
        unsafe {
            ffi::ts_query_disable_capture(
                self.ptr.as_ptr(),
                name.as_bytes().as_ptr() as *const c_char,
                name.len() as u32,
            );
        }
    }

    /// Disable a certain pattern within a query.
/// /// This prevents the pattern from matching, and also avoids any resource usage /// associated with the pattern. pub fn disable_pattern(&mut self, index: usize) { unsafe { ffi::ts_query_disable_pattern(self.ptr.as_ptr(), index as u32) } } /// Check if a given step in a query is 'definite'. /// /// A query step is 'definite' if its parent pattern will be guaranteed to match /// successfully once it reaches the step. pub fn is_pattern_guaranteed_at_step(&self, byte_offset: usize) -> bool { unsafe { ffi::ts_query_is_pattern_guaranteed_at_step(self.ptr.as_ptr(), byte_offset as u32) } } fn parse_property( row: usize, function_name: &str, capture_names: &[String], string_values: &[String], args: &[ffi::TSQueryPredicateStep], ) -> Result { if args.len() == 0 || args.len() > 3 { return Err(predicate_error( row, format!( "Wrong number of arguments to {} predicate. Expected 1 to 3, got {}.", function_name, args.len(), ), )); } let mut capture_id = None; let mut key = None; let mut value = None; for arg in args { if arg.type_ == ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture { if capture_id.is_some() { return Err(predicate_error( row, format!( "Invalid arguments to {} predicate. Unexpected second capture name @{}", function_name, capture_names[arg.value_id as usize] ), )); } capture_id = Some(arg.value_id as usize); } else if key.is_none() { key = Some(&string_values[arg.value_id as usize]); } else if value.is_none() { value = Some(string_values[arg.value_id as usize].as_str()); } else { return Err(predicate_error( row, format!( "Invalid arguments to {} predicate. Unexpected third argument @{}", function_name, string_values[arg.value_id as usize] ), )); } } if let Some(key) = key { Ok(QueryProperty::new(key, value, capture_id)) } else { return Err(predicate_error( row, format!( "Invalid arguments to {} predicate. Missing key argument", function_name, ), )); } } } impl QueryCursor { /// Create a new cursor for executing a given query. 
///
    /// The cursor stores the state that is needed to iteratively search for matches.
    pub fn new() -> Self {
        let raw = unsafe { ffi::ts_query_cursor_new() };
        QueryCursor {
            ptr: unsafe { NonNull::new_unchecked(raw) },
        }
    }

    /// Return the maximum number of in-progress matches for this cursor.
    pub fn match_limit(&self) -> u32 {
        let cursor = self.ptr.as_ptr();
        unsafe { ffi::ts_query_cursor_match_limit(cursor) }
    }

    /// Set the maximum number of in-progress matches for this cursor. The limit must be > 0 and
    /// <= 65536.
    pub fn set_match_limit(&mut self, limit: u32) {
        let cursor = self.ptr.as_ptr();
        unsafe { ffi::ts_query_cursor_set_match_limit(cursor, limit) };
    }

    /// Check if, on its last execution, this cursor exceeded its maximum number of
    /// in-progress matches.
    pub fn did_exceed_match_limit(&self) -> bool {
        let cursor = self.ptr.as_ptr();
        unsafe { ffi::ts_query_cursor_did_exceed_match_limit(cursor) }
    }

    /// Iterate over all of the matches in the order that they were found.
    ///
    /// Each match contains the index of the pattern that matched, and a list of captures.
    /// Because multiple patterns can match the same set of nodes, one match may contain
    /// captures that appear *before* some of the captures from a previous match.
    pub fn matches<'a, 'tree: 'a, T: TextProvider<'a> + 'a>(
        &'a mut self,
        query: &'a Query,
        node: Node<'tree>,
        text_provider: T,
    ) -> QueryMatches<'a, 'tree, T> {
        // Start executing the query on `node`; the returned iterator then pulls
        // matches from the same underlying cursor.
        let cursor = self.ptr.as_ptr();
        unsafe {
            ffi::ts_query_cursor_exec(cursor, query.ptr.as_ptr(), node.0);
        }
        QueryMatches {
            ptr: cursor,
            query,
            text_provider,
            buffer1: Default::default(),
            buffer2: Default::default(),
            _tree: PhantomData,
        }
    }

    /// Iterate over all of the individual captures in the order that they appear.
    ///
    /// This is useful if you don't care about which pattern matched, and just want a single,
    /// ordered sequence of captures.
pub fn captures<'a, 'tree: 'a, T: TextProvider<'a> + 'a>(
        &'a mut self,
        query: &'a Query,
        node: Node<'tree>,
        text_provider: T,
    ) -> QueryCaptures<'a, 'tree, T> {
        let ptr = self.ptr.as_ptr();
        unsafe { ffi::ts_query_cursor_exec(ptr, query.ptr.as_ptr(), node.0) };
        QueryCaptures {
            ptr,
            query,
            text_provider,
            buffer1: Default::default(),
            buffer2: Default::default(),
            _tree: PhantomData,
        }
    }

    /// Set the range in which the query will be executed, in terms of byte offsets.
    pub fn set_byte_range(&mut self, range: ops::Range<usize>) -> &mut Self {
        unsafe {
            ffi::ts_query_cursor_set_byte_range(
                self.ptr.as_ptr(),
                range.start as u32,
                range.end as u32,
            );
        }
        self
    }

    /// Set the range in which the query will be executed, in terms of rows and columns.
    pub fn set_point_range(&mut self, range: ops::Range<Point>) -> &mut Self {
        unsafe {
            ffi::ts_query_cursor_set_point_range(
                self.ptr.as_ptr(),
                range.start.into(),
                range.end.into(),
            );
        }
        self
    }
}

impl<'a, 'tree> QueryMatch<'a, 'tree> {
    /// Get the id of this match.
    pub fn id(&self) -> u32 {
        self.id
    }

    /// Remove this match from the cursor that produced it.
    pub fn remove(self) {
        unsafe { ffi::ts_query_cursor_remove_match(self.cursor, self.id) }
    }

    /// Iterate over the nodes captured under the given capture index.
    pub fn nodes_for_capture_index(
        &self,
        capture_ix: u32,
    ) -> impl Iterator<Item = Node<'tree>> + '_ {
        self.captures.iter().filter_map(move |capture| {
            if capture.index == capture_ix {
                Some(capture.node)
            } else {
                None
            }
        })
    }

    /// Wrap a raw `TSQueryMatch` coming from `cursor`.
    fn new(m: ffi::TSQueryMatch, cursor: *mut ffi::TSQueryCursor) -> Self {
        QueryMatch {
            cursor,
            id: m.id,
            pattern_index: m.pattern_index as usize,
            // Avoid building a slice from the raw pointer when there are no
            // captures.
            captures: if m.capture_count > 0 {
                unsafe {
                    slice::from_raw_parts(
                        m.captures as *const QueryCapture<'tree>,
                        m.capture_count as usize,
                    )
                }
            } else {
                &[]
            },
        }
    }

    /// Check this match against every text predicate of its pattern.
    ///
    /// `buffer1` and `buffer2` are scratch space for node text that arrives in
    /// more than one chunk. A predicate whose capture is absent from the match is
    /// treated as satisfied.
    fn satisfies_text_predicates(
        &self,
        query: &Query,
        buffer1: &mut Vec<u8>,
        buffer2: &mut Vec<u8>,
        text_provider: &mut impl TextProvider<'a>,
    ) -> bool {
        /// Return the text as one slice, copying into `buffer` only when the
        /// provider yields more than one chunk.
        fn get_text<'a, 'b: 'a, I: Iterator<Item = &'b [u8]>>(
            buffer: &'a mut Vec<u8>,
            mut chunks: I,
        ) -> &'a [u8] {
            let first_chunk = chunks.next().unwrap_or(&[]);
            if let Some(next_chunk) = chunks.next() {
                buffer.clear();
                buffer.extend_from_slice(first_chunk);
                buffer.extend_from_slice(next_chunk);
                for chunk in chunks {
                    buffer.extend_from_slice(chunk);
                }
                buffer.as_slice()
            } else {
                first_chunk
            }
        }

        query.text_predicates[self.pattern_index]
            .iter()
            .all(|predicate| match predicate {
                TextPredicate::CaptureEqCapture(i, j, is_positive) => {
                    let node1 = self.nodes_for_capture_index(*i).next();
                    let node2 = self.nodes_for_capture_index(*j).next();
                    match (node1, node2) {
                        (Some(node1), Some(node2)) => {
                            let text1 = get_text(buffer1, text_provider.text(node1));
                            let text2 = get_text(buffer2, text_provider.text(node2));
                            (text1 == text2) == *is_positive
                        }
                        _ => true,
                    }
                }
                TextPredicate::CaptureEqString(i, s, is_positive) => {
                    let node = self.nodes_for_capture_index(*i).next();
                    match node {
                        Some(node) => {
                            let text = get_text(buffer1, text_provider.text(node));
                            (text == s.as_bytes()) == *is_positive
                        }
                        None => true,
                    }
                }
                TextPredicate::CaptureMatchString(i, r, is_positive) => {
                    let node = self.nodes_for_capture_index(*i).next();
                    match node {
                        Some(node) => {
                            let text = get_text(buffer1, text_provider.text(node));
                            r.is_match(text) == *is_positive
                        }
                        None => true,
                    }
                }
            })
    }
}

impl QueryProperty {
    /// Build a property from a key, an optional value and an optional capture id.
    pub fn new(key: &str, value: Option<&str>, capture_id: Option<usize>) -> Self {
        QueryProperty {
            capture_id,
            key: key.to_string().into_boxed_str(),
            value: value.map(|s| s.to_string().into_boxed_str()),
        }
    }
}

impl<'a, 'tree, T: TextProvider<'a>> Iterator for QueryMatches<'a, 'tree, T> {
    type Item = QueryMatch<'a, 'tree>;

    /// Yield the next match that satisfies its text predicates.
    fn next(&mut self) -> Option<Self::Item> {
        unsafe {
            loop {
                let mut m = MaybeUninit::<ffi::TSQueryMatch>::uninit();
                if ffi::ts_query_cursor_next_match(self.ptr, m.as_mut_ptr()) {
                    let result = QueryMatch::new(m.assume_init(), self.ptr);
                    if result.satisfies_text_predicates(
                        self.query,
                        &mut self.buffer1,
                        &mut self.buffer2,
                        &mut self.text_provider,
                    ) {
                        return Some(result);
                    }
                } else {
                    return None;
                }
            }
        }
    }
}

impl<'a, 'tree, T: TextProvider<'a>> Iterator for QueryCaptures<'a, 'tree, T> {
    type Item = (QueryMatch<'a, 'tree>, usize);

    /// Yield the next capture, paired with the match it belongs to.
    fn next(&mut self) -> Option<Self::Item> {
        unsafe {
            loop {
                let mut capture_index = 0u32;
                let mut m = MaybeUninit::<ffi::TSQueryMatch>::uninit();
                if ffi::ts_query_cursor_next_capture(
                    self.ptr,
                    m.as_mut_ptr(),
                    &mut capture_index as *mut u32,
                ) {
                    let result = QueryMatch::new(m.assume_init(), self.ptr);
                    if result.satisfies_text_predicates(
                        self.query,
                        &mut self.buffer1,
                        &mut self.buffer2,
                        &mut self.text_provider,
                    ) {
                        return Some((result, capture_index as usize));
                    } else {
                        // Drop the rejected match from the cursor before asking
                        // for the next capture.
                        result.remove();
                    }
                } else {
                    return None;
                }
            }
        }
    }
}

impl<'a, 'tree, T: TextProvider<'a>> QueryMatches<'a, 'tree, T> {
    /// Set the byte range of the underlying cursor.
    pub fn set_byte_range(&mut self, range: ops::Range<usize>) {
        unsafe {
            ffi::ts_query_cursor_set_byte_range(self.ptr, range.start as u32, range.end as u32);
        }
    }

    /// Set the row/column range of the underlying cursor.
    pub fn set_point_range(&mut self, range: ops::Range<Point>) {
        unsafe {
            ffi::ts_query_cursor_set_point_range(self.ptr, range.start.into(), range.end.into());
        }
    }
}

impl<'a, 'tree, T: TextProvider<'a>> QueryCaptures<'a, 'tree, T> {
    /// Set the byte range of the underlying cursor.
    pub fn set_byte_range(&mut self, range: ops::Range<usize>) {
        unsafe {
            ffi::ts_query_cursor_set_byte_range(self.ptr, range.start as u32, range.end as u32);
        }
    }

    /// Set the row/column range of the underlying cursor.
    pub fn set_point_range(&mut self, range: ops::Range<Point>) {
        unsafe {
            ffi::ts_query_cursor_set_point_range(self.ptr, range.start.into(), range.end.into());
        }
    }
}

impl<'cursor, 'tree> fmt::Debug for QueryMatch<'cursor, 'tree> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "QueryMatch {{ id: {}, pattern_index: {}, captures: {:?} }}",
            self.id, self.pattern_index, self.captures
        )
    }
}

/// Any closure from a node to an iterator of byte chunks can provide text.
impl<'a, F, I> TextProvider<'a> for F
where
    F: FnMut(Node) -> I,
    I: Iterator<Item = &'a [u8]> + 'a,
{
    type I = I;

    fn text(&mut self, node: Node) -> Self::I {
        (self)(node)
    }
}

/// A byte slice provides each node's text as a single chunk.
impl<'a> TextProvider<'a> for &'a [u8] {
    type I = iter::Once<&'a [u8]>;

    fn text(&mut self, node: Node) -> Self::I {
        iter::once(&self[node.byte_range()])
    }
}

impl PartialEq for Query {
    /// Queries are equal only when they wrap the same underlying pointer.
    fn eq(&self, other: &Self) -> bool {
        self.ptr == other.ptr
    }
}

impl Drop for Query {
    fn drop(&mut self) {
        unsafe { ffi::ts_query_delete(self.ptr.as_ptr()) }
    }
}

impl Drop for QueryCursor {
    fn drop(&mut self) {
        unsafe {
ffi::ts_query_cursor_delete(self.ptr.as_ptr())
        }
    }
}

impl Point {
    /// Create a point from a row and a column.
    pub fn new(row: usize, column: usize) -> Self {
        Point { row, column }
    }
}

impl fmt::Display for Point {
    /// Format the point as `(row, column)`.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        write!(f, "({}, {})", self.row, self.column)
    }
}

// The conversions below are written as `From` impls; the matching `Into`
// impls that callers use (`x.into()`) come from the standard blanket impl.
// All of them narrow or widen between `usize` and the `u32` fields of the C
// structs with plain `as` casts.

impl From<Point> for ffi::TSPoint {
    fn from(point: Point) -> Self {
        ffi::TSPoint {
            row: point.row as u32,
            column: point.column as u32,
        }
    }
}

impl From<ffi::TSPoint> for Point {
    fn from(point: ffi::TSPoint) -> Self {
        Self {
            row: point.row as usize,
            column: point.column as usize,
        }
    }
}

impl From<Range> for ffi::TSRange {
    fn from(range: Range) -> Self {
        ffi::TSRange {
            start_byte: range.start_byte as u32,
            end_byte: range.end_byte as u32,
            start_point: range.start_point.into(),
            end_point: range.end_point.into(),
        }
    }
}

impl From<ffi::TSRange> for Range {
    fn from(range: ffi::TSRange) -> Self {
        Self {
            start_byte: range.start_byte as usize,
            end_byte: range.end_byte as usize,
            start_point: range.start_point.into(),
            end_point: range.end_point.into(),
        }
    }
}

impl<'a> From<&'a InputEdit> for ffi::TSInputEdit {
    fn from(edit: &'a InputEdit) -> Self {
        ffi::TSInputEdit {
            start_byte: edit.start_byte as u32,
            old_end_byte: edit.old_end_byte as u32,
            new_end_byte: edit.new_end_byte as u32,
            start_point: edit.start_position.into(),
            old_end_point: edit.old_end_position.into(),
            new_end_point: edit.new_end_position.into(),
        }
    }
}

impl<'a> LossyUtf8<'a> {
    /// Create an iterator over the valid UTF-8 pieces of `bytes`.
    pub fn new(bytes: &'a [u8]) -> Self {
        LossyUtf8 {
            bytes,
            in_replacement: false,
        }
    }
}

impl<'a> Iterator for LossyUtf8<'a> {
    type Item = &'a str;

    /// Yield the next run of valid UTF-8, or `"\u{fffd}"` in place of each
    /// invalid sequence.
    fn next(&mut self) -> Option<&'a str> {
        // A pending replacement character has to be emitted before the
        // emptiness check: when the invalid bytes were the last bytes of the
        // input, `bytes` is already empty and the replacement would be lost.
        if self.in_replacement {
            self.in_replacement = false;
            return Some("\u{fffd}");
        }
        if self.bytes.is_empty() {
            return None;
        }
        match std::str::from_utf8(self.bytes) {
            Ok(valid) => {
                self.bytes = &[];
                Some(valid)
            }
            Err(error) => {
                let error_start = error.valid_up_to();
                // `error_len()` is `None` when the input ends in the middle of a
                // multi-byte sequence. Treat the truncated tail as one invalid
                // sequence so the valid text before it is still yielded.
                let error_len = error
                    .error_len()
                    .unwrap_or(self.bytes.len() - error_start);
                if error_start > 0 {
                    // SAFETY: `from_utf8` reported the first `error_start`
                    // bytes as valid UTF-8.
                    let result =
                        unsafe { std::str::from_utf8_unchecked(&self.bytes[..error_start]) };
                    self.bytes = &self.bytes[(error_start + error_len)..];
                    self.in_replacement = true;
                    Some(result)
                } else {
                    self.bytes = &self.bytes[error_len..];
                    Some("\u{fffd}")
                }
            }
        }
    }
}

/// Build a [QueryError] of kind `Predicate` for the given row.
fn predicate_error(row: usize, message: String) -> QueryError {
    QueryError {
        kind: QueryErrorKind::Predicate,
        row,
        column: 0,
        offset: 0,
        message,
    }
}

impl fmt::Display for IncludedRangesError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Incorrect range by index: {}", self.0)
    }
}

impl fmt::Display for LanguageError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "Incompatible language version {}. Expected minimum {}, maximum {}",
            self.version, MIN_COMPATIBLE_LANGUAGE_VERSION, LANGUAGE_VERSION,
        )
    }
}

impl fmt::Display for QueryError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let msg = match self.kind {
            QueryErrorKind::Field => "Invalid field name ",
            QueryErrorKind::NodeType => "Invalid node type ",
            QueryErrorKind::Capture => "Invalid capture name ",
            QueryErrorKind::Predicate => "Invalid predicate: ",
            QueryErrorKind::Structure => "Impossible pattern:\n",
            QueryErrorKind::Syntax => "Invalid syntax:\n",
            QueryErrorKind::Language => "",
        };
        // Language errors carry no position, so only the message is printed.
        if !msg.is_empty() {
            write!(
                f,
                "Query error at {}:{}. {}{}",
                self.row + 1,
                self.column + 1,
                msg,
                self.message
            )
        } else {
            write!(f, "{}", self.message)
        }
    }
}

impl error::Error for IncludedRangesError {}
impl error::Error for LanguageError {}
impl error::Error for QueryError {}

// NOTE(review): these impls assert that the wrapped C objects may be moved to
// and shared between threads; nothing in this file verifies that claim.
unsafe impl Send for Language {}
unsafe impl Send for Parser {}
unsafe impl Send for Query {}
unsafe impl Send for QueryCursor {}
unsafe impl Send for Tree {}
unsafe impl Sync for Language {}
unsafe impl Sync for Parser {}
unsafe impl Sync for Query {}
unsafe impl Sync for QueryCursor {}
unsafe impl Sync for Tree {}
tree-sitter-0.20.1/binding_rust/util.rs000064400000000000000000000030040072674642500162120ustar 00000000000000
use std::os::raw::c_void;

#[cfg(not(feature = "allocation-tracking"))]
extern "C" {
    /// Normally, use `free(3)` to free memory allocated from C.
#[link_name = "free"] pub fn free_ptr(ptr: *mut c_void); }

/// When the `allocation-tracking` feature is enabled, the C library is compiled with
/// the `TREE_SITTER_ALLOCATION_TRACKING` macro (the name `src/alloc.h` checks), so all
/// calls to `malloc`, `free`, etc are linked against wrapper functions called
/// `ts_record_malloc`, `ts_record_free`, etc.
/// When freeing buffers allocated from C, use the wrapper `free` function.
#[cfg(feature = "allocation-tracking")]
pub use crate::allocations::ts_record_free as free_ptr;

/// A raw pointer and a length, exposed as an iterator.
///
/// The buffer is released with `free_ptr` when the iterator is dropped.
pub struct CBufferIter<T> {
    ptr: *mut T,
    count: usize,
    i: usize,
}

impl<T> CBufferIter<T> {
    /// Wraps a buffer of `count` elements starting at `ptr`.
    ///
    /// # Safety
    ///
    /// `ptr` must point to at least `count` initialized values of `T`, and it
    /// must be valid to pass `ptr` to `free_ptr`, because dropping the
    /// iterator does exactly that.
    pub unsafe fn new(ptr: *mut T, count: usize) -> Self {
        Self { ptr, count, i: 0 }
    }
}

impl<T: Copy> Iterator for CBufferIter<T> {
    type Item = T;

    fn next(&mut self) -> Option<Self::Item> {
        let i = self.i;
        if i >= self.count {
            None
        } else {
            self.i += 1;
            // SAFETY: `i < self.count`, and the contract of `new` guarantees
            // `count` readable elements starting at `ptr`.
            Some(unsafe { *self.ptr.add(i) })
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // `i` never exceeds `count`, so this cannot underflow.
        let remaining = self.count - self.i;
        (remaining, Some(remaining))
    }
}

impl<T: Copy> ExactSizeIterator for CBufferIter<T> {}

impl<T> Drop for CBufferIter<T> {
    fn drop(&mut self) {
        // SAFETY: the contract of `new` requires that `ptr` may be released
        // with `free_ptr`.
        unsafe {
            free_ptr(self.ptr as *mut c_void);
        }
    }
}
tree-sitter-0.20.1/include/tree_sitter/api.h000064400000000000000000000657020072674642500171130ustar 00000000000000#ifndef TREE_SITTER_API_H_ #define TREE_SITTER_API_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include #include /****************************/ /* Section - ABI Versioning */ /****************************/ /** * The latest ABI version that is supported by the current version of the * library. When Languages are generated by the Tree-sitter CLI, they are * assigned an ABI version number that corresponds to the current CLI version. * The Tree-sitter library is generally backwards-compatible with languages * generated using older CLI versions, but is not forwards-compatible. */ #define TREE_SITTER_LANGUAGE_VERSION 13 /** * The earliest ABI version that is supported by the current version of the * library. 
*/ #define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 13 /*******************/ /* Section - Types */ /*******************/ typedef uint16_t TSSymbol; typedef uint16_t TSFieldId; typedef struct TSLanguage TSLanguage; typedef struct TSParser TSParser; typedef struct TSTree TSTree; typedef struct TSQuery TSQuery; typedef struct TSQueryCursor TSQueryCursor; typedef enum { TSInputEncodingUTF8, TSInputEncodingUTF16, } TSInputEncoding; typedef enum { TSSymbolTypeRegular, TSSymbolTypeAnonymous, TSSymbolTypeAuxiliary, } TSSymbolType; typedef struct { uint32_t row; uint32_t column; } TSPoint; typedef struct { TSPoint start_point; TSPoint end_point; uint32_t start_byte; uint32_t end_byte; } TSRange; typedef struct { void *payload; const char *(*read)(void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read); TSInputEncoding encoding; } TSInput; typedef enum { TSLogTypeParse, TSLogTypeLex, } TSLogType; typedef struct { void *payload; void (*log)(void *payload, TSLogType, const char *); } TSLogger; typedef struct { uint32_t start_byte; uint32_t old_end_byte; uint32_t new_end_byte; TSPoint start_point; TSPoint old_end_point; TSPoint new_end_point; } TSInputEdit; typedef struct { uint32_t context[4]; const void *id; const TSTree *tree; } TSNode; typedef struct { const void *tree; const void *id; uint32_t context[2]; } TSTreeCursor; typedef struct { TSNode node; uint32_t index; } TSQueryCapture; typedef struct { uint32_t id; uint16_t pattern_index; uint16_t capture_count; const TSQueryCapture *captures; } TSQueryMatch; typedef enum { TSQueryPredicateStepTypeDone, TSQueryPredicateStepTypeCapture, TSQueryPredicateStepTypeString, } TSQueryPredicateStepType; typedef struct { TSQueryPredicateStepType type; uint32_t value_id; } TSQueryPredicateStep; typedef enum { TSQueryErrorNone = 0, TSQueryErrorSyntax, TSQueryErrorNodeType, TSQueryErrorField, TSQueryErrorCapture, TSQueryErrorStructure, TSQueryErrorLanguage, } TSQueryError; /********************/ /* Section - Parser 
*/ /********************/ /** * Create a new parser. */ TSParser *ts_parser_new(void); /** * Delete the parser, freeing all of the memory that it used. */ void ts_parser_delete(TSParser *parser); /** * Set the language that the parser should use for parsing. * * Returns a boolean indicating whether or not the language was successfully * assigned. True means assignment succeeded. False means there was a version * mismatch: the language was generated with an incompatible version of the * Tree-sitter CLI. Check the language's version using `ts_language_version` * and compare it to this library's `TREE_SITTER_LANGUAGE_VERSION` and * `TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION` constants. */ bool ts_parser_set_language(TSParser *self, const TSLanguage *language); /** * Get the parser's current language. */ const TSLanguage *ts_parser_language(const TSParser *self); /** * Set the ranges of text that the parser should include when parsing. * * By default, the parser will always include entire documents. This function * allows you to parse only a *portion* of a document but still return a syntax * tree whose ranges match up with the document as a whole. You can also pass * multiple disjoint ranges. * * The second and third parameters specify the location and length of an array * of ranges. The parser does *not* take ownership of these ranges; it copies * the data, so it doesn't matter how these ranges are allocated. * * If `length` is zero, then the entire document will be parsed. Otherwise, * the given ranges must be ordered from earliest to latest in the document, * and they must not overlap. That is, the following must hold for all * `i` < `length - 1`: ranges[i].end_byte <= ranges[i + 1].start_byte * * If this requirement is not satisfied, the operation will fail, the ranges * will not be assigned, and this function will return `false`. 
On success, * this function returns `true` */ bool ts_parser_set_included_ranges( TSParser *self, const TSRange *ranges, uint32_t length ); /** * Get the ranges of text that the parser will include when parsing. * * The returned pointer is owned by the parser. The caller should not free it * or write to it. The length of the array will be written to the given * `length` pointer. */ const TSRange *ts_parser_included_ranges( const TSParser *self, uint32_t *length ); /** * Use the parser to parse some source code and create a syntax tree. * * If you are parsing this document for the first time, pass `NULL` for the * `old_tree` parameter. Otherwise, if you have already parsed an earlier * version of this document and the document has since been edited, pass the * previous syntax tree so that the unchanged parts of it can be reused. * This will save time and memory. For this to work correctly, you must have * already edited the old syntax tree using the `ts_tree_edit` function in a * way that exactly matches the source code changes. * * The `TSInput` parameter lets you specify how to read the text. It has the * following three fields: * 1. `read`: A function to retrieve a chunk of text at a given byte offset * and (row, column) position. The function should return a pointer to the * text and write its length to the `bytes_read` pointer. The parser does * not take ownership of this buffer; it just borrows it until it has * finished reading it. The function should write a zero value to the * `bytes_read` pointer to indicate the end of the document. * 2. `payload`: An arbitrary pointer that will be passed to each invocation * of the `read` function. * 3. `encoding`: An indication of how the text is encoded. Either * `TSInputEncodingUTF8` or `TSInputEncodingUTF16`. * * This function returns a syntax tree on success, and `NULL` on failure. There * are three possible reasons for failure: * 1. The parser does not have a language assigned. 
Check for this using the `ts_parser_language` function. * 2. Parsing was cancelled due to a timeout that was set by an earlier call to * the `ts_parser_set_timeout_micros` function. You can resume parsing from * where the parser left off by calling `ts_parser_parse` again with the * same arguments. Or you can start parsing from scratch by first calling * `ts_parser_reset`. * 3. Parsing was cancelled using a cancellation flag that was set by an * earlier call to `ts_parser_set_cancellation_flag`. You can resume parsing * from where the parser left off by calling `ts_parser_parse` again with * the same arguments. */ TSTree *ts_parser_parse( TSParser *self, const TSTree *old_tree, TSInput input ); /** * Use the parser to parse some source code stored in one contiguous buffer. * The first two parameters are the same as in the `ts_parser_parse` function * above. The second two parameters indicate the location of the buffer and its * length in bytes. */ TSTree *ts_parser_parse_string( TSParser *self, const TSTree *old_tree, const char *string, uint32_t length ); /** * Use the parser to parse some source code stored in one contiguous buffer with * a given encoding. The first four parameters work the same as in the * `ts_parser_parse_string` method above. The final parameter indicates whether * the text is encoded as UTF8 or UTF16. */ TSTree *ts_parser_parse_string_encoding( TSParser *self, const TSTree *old_tree, const char *string, uint32_t length, TSInputEncoding encoding ); /** * Instruct the parser to start the next parse from the beginning. * * If the parser previously failed because of a timeout or a cancellation, then * by default, it will resume where it left off on the next call to * `ts_parser_parse` or other parsing functions. If you don't want to resume, * and instead intend to use this parser to parse some other document, you must * call `ts_parser_reset` first. 
*/ void ts_parser_reset(TSParser *self); /** * Set the maximum duration in microseconds that parsing should be allowed to * take before halting. * * If parsing takes longer than this, it will halt early, returning NULL. * See `ts_parser_parse` for more information. */ void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout); /** * Get the duration in microseconds that parsing is allowed to take. */ uint64_t ts_parser_timeout_micros(const TSParser *self); /** * Set the parser's current cancellation flag pointer. * * If a non-null pointer is assigned, then the parser will periodically read * from this pointer during parsing. If it reads a non-zero value, it will * halt early, returning NULL. See `ts_parser_parse` for more information. */ void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag); /** * Get the parser's current cancellation flag pointer. */ const size_t *ts_parser_cancellation_flag(const TSParser *self); /** * Set the logger that a parser should use during parsing. * * The parser does not take ownership over the logger payload. If a logger was * previously assigned, the caller is responsible for releasing any memory * owned by the previous logger. */ void ts_parser_set_logger(TSParser *self, TSLogger logger); /** * Get the parser's current logger. */ TSLogger ts_parser_logger(const TSParser *self); /** * Set the file descriptor to which the parser should write debugging graphs * during parsing. The graphs are formatted in the DOT language. You may want * to pipe these graphs directly to a `dot(1)` process in order to generate * SVG output. You can turn off this logging by passing a negative number. */ void ts_parser_print_dot_graphs(TSParser *self, int file); /******************/ /* Section - Tree */ /******************/ /** * Create a shallow copy of the syntax tree. This is very fast. * * You need to copy a syntax tree in order to use it on more than one thread at * a time, as syntax trees are not thread safe. 
*/ TSTree *ts_tree_copy(const TSTree *self); /** * Delete the syntax tree, freeing all of the memory that it used. */ void ts_tree_delete(TSTree *self); /** * Get the root node of the syntax tree. */ TSNode ts_tree_root_node(const TSTree *self); /** * Get the language that was used to parse the syntax tree. */ const TSLanguage *ts_tree_language(const TSTree *); /** * Edit the syntax tree to keep it in sync with source code that has been * edited. * * You must describe the edit both in terms of byte offsets and in terms of * (row, column) coordinates. */ void ts_tree_edit(TSTree *self, const TSInputEdit *edit); /** * Compare an old edited syntax tree to a new syntax tree representing the same * document, returning an array of ranges whose syntactic structure has changed. * * For this to work correctly, the old syntax tree must have been edited such * that its ranges match up to the new tree. Generally, you'll want to call * this function right after calling one of the `ts_parser_parse` functions. * You need to pass the old tree that was passed to parse, as well as the new * tree that was returned from that function. * * The returned array is allocated using `malloc` and the caller is responsible * for freeing it using `free`. The length of the array will be written to the * given `length` pointer. */ TSRange *ts_tree_get_changed_ranges( const TSTree *old_tree, const TSTree *new_tree, uint32_t *length ); /** * Write a DOT graph describing the syntax tree to the given file. */ void ts_tree_print_dot_graph(const TSTree *, FILE *); /******************/ /* Section - Node */ /******************/ /** * Get the node's type as a null-terminated string. */ const char *ts_node_type(TSNode); /** * Get the node's type as a numerical id. */ TSSymbol ts_node_symbol(TSNode); /** * Get the node's start byte. */ uint32_t ts_node_start_byte(TSNode); /** * Get the node's start position in terms of rows and columns. */ TSPoint ts_node_start_point(TSNode); /** * Get the node's end byte. 
*/ uint32_t ts_node_end_byte(TSNode); /** * Get the node's end position in terms of rows and columns. */ TSPoint ts_node_end_point(TSNode); /** * Get an S-expression representing the node as a string. * * This string is allocated with `malloc` and the caller is responsible for * freeing it using `free`. */ char *ts_node_string(TSNode); /** * Check if the node is null. Functions like `ts_node_child` and * `ts_node_next_sibling` will return a null node to indicate that no such node * was found. */ bool ts_node_is_null(TSNode); /** * Check if the node is *named*. Named nodes correspond to named rules in the * grammar, whereas *anonymous* nodes correspond to string literals in the * grammar. */ bool ts_node_is_named(TSNode); /** * Check if the node is *missing*. Missing nodes are inserted by the parser in * order to recover from certain kinds of syntax errors. */ bool ts_node_is_missing(TSNode); /** * Check if the node is *extra*. Extra nodes represent things like comments, * which are not required by the grammar, but can appear anywhere. */ bool ts_node_is_extra(TSNode); /** * Check if a syntax node has been edited. */ bool ts_node_has_changes(TSNode); /** * Check if the node is a syntax error or contains any syntax errors. */ bool ts_node_has_error(TSNode); /** * Get the node's immediate parent. */ TSNode ts_node_parent(TSNode); /** * Get the node's child at the given index, where zero represents the first * child. */ TSNode ts_node_child(TSNode, uint32_t); /** * Get the field name for the node's child at the given index, where zero * represents the first child. Returns NULL if no field is found. */ const char *ts_node_field_name_for_child(TSNode, uint32_t); /** * Get the node's number of children. */ uint32_t ts_node_child_count(TSNode); /** * Get the node's *named* child at the given index. * * See also `ts_node_is_named`. */ TSNode ts_node_named_child(TSNode, uint32_t); /** * Get the node's number of *named* children. * * See also `ts_node_is_named`. 
*/ uint32_t ts_node_named_child_count(TSNode); /** * Get the node's child with the given field name. */ TSNode ts_node_child_by_field_name( TSNode self, const char *field_name, uint32_t field_name_length ); /** * Get the node's child with the given numerical field id. * * You can convert a field name to an id using the * `ts_language_field_id_for_name` function. */ TSNode ts_node_child_by_field_id(TSNode, TSFieldId); /** * Get the node's next / previous sibling. */ TSNode ts_node_next_sibling(TSNode); TSNode ts_node_prev_sibling(TSNode); /** * Get the node's next / previous *named* sibling. */ TSNode ts_node_next_named_sibling(TSNode); TSNode ts_node_prev_named_sibling(TSNode); /** * Get the node's first child that extends beyond the given byte offset. */ TSNode ts_node_first_child_for_byte(TSNode, uint32_t); /** * Get the node's first named child that extends beyond the given byte offset. */ TSNode ts_node_first_named_child_for_byte(TSNode, uint32_t); /** * Get the smallest node within this node that spans the given range of bytes * or (row, column) positions. */ TSNode ts_node_descendant_for_byte_range(TSNode, uint32_t, uint32_t); TSNode ts_node_descendant_for_point_range(TSNode, TSPoint, TSPoint); /** * Get the smallest named node within this node that spans the given range of * bytes or (row, column) positions. */ TSNode ts_node_named_descendant_for_byte_range(TSNode, uint32_t, uint32_t); TSNode ts_node_named_descendant_for_point_range(TSNode, TSPoint, TSPoint); /** * Edit the node to keep it in-sync with source code that has been edited. * * This function is only rarely needed. When you edit a syntax tree with the * `ts_tree_edit` function, all of the nodes that you retrieve from the tree * afterward will already reflect the edit. You only need to use `ts_node_edit` * when you have a `TSNode` instance that you want to keep and continue to use * after an edit. */ void ts_node_edit(TSNode *, const TSInputEdit *); /** * Check if two nodes are identical. 
*/ bool ts_node_eq(TSNode, TSNode); /************************/ /* Section - TreeCursor */ /************************/ /** * Create a new tree cursor starting from the given node. * * A tree cursor allows you to walk a syntax tree more efficiently than is * possible using the `TSNode` functions. It is a mutable object that is always * on a certain syntax node, and can be moved imperatively to different nodes. */ TSTreeCursor ts_tree_cursor_new(TSNode); /** * Delete a tree cursor, freeing all of the memory that it used. */ void ts_tree_cursor_delete(TSTreeCursor *); /** * Re-initialize a tree cursor to start at a different node. */ void ts_tree_cursor_reset(TSTreeCursor *, TSNode); /** * Get the tree cursor's current node. */ TSNode ts_tree_cursor_current_node(const TSTreeCursor *); /** * Get the field name of the tree cursor's current node. * * This returns `NULL` if the current node doesn't have a field. * See also `ts_node_child_by_field_name`. */ const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); /** * Get the field id of the tree cursor's current node. * * This returns zero if the current node doesn't have a field. * See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`. */ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *); /** * Move the cursor to the parent of its current node. * * This returns `true` if the cursor successfully moved, and returns `false` * if there was no parent node (the cursor was already on the root node). */ bool ts_tree_cursor_goto_parent(TSTreeCursor *); /** * Move the cursor to the next sibling of its current node. * * This returns `true` if the cursor successfully moved, and returns `false` * if there was no next sibling node. */ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); /** * Move the cursor to the first child of its current node. * * This returns `true` if the cursor successfully moved, and returns `false` * if there were no children. 
*/ bool ts_tree_cursor_goto_first_child(TSTreeCursor *); /** * Move the cursor to the first child of its current node that extends beyond * the given byte offset or point. * * This returns the index of the child node if one was found, and returns -1 * if no such child was found. */ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t); int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *, TSPoint); TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *); /*******************/ /* Section - Query */ /*******************/ /** * Create a new query from a string containing one or more S-expression * patterns. The query is associated with a particular language, and can * only be run on syntax nodes parsed with that language. * * If all of the given patterns are valid, this returns a `TSQuery`. * If a pattern is invalid, this returns `NULL`, and provides two pieces * of information about the problem: * 1. The byte offset of the error is written to the `error_offset` parameter. * 2. The type of error is written to the `error_type` parameter. */ TSQuery *ts_query_new( const TSLanguage *language, const char *source, uint32_t source_len, uint32_t *error_offset, TSQueryError *error_type ); /** * Delete a query, freeing all of the memory that it used. */ void ts_query_delete(TSQuery *); /** * Get the number of patterns, captures, or string literals in the query. */ uint32_t ts_query_pattern_count(const TSQuery *); uint32_t ts_query_capture_count(const TSQuery *); uint32_t ts_query_string_count(const TSQuery *); /** * Get the byte offset where the given pattern starts in the query's source. * * This can be useful when combining queries by concatenating their source * code strings. */ uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t); /** * Get all of the predicates for the given pattern in the query. * * The predicates are represented as a single array of steps. 
There are three * types of steps in this array, which correspond to the three legal values for * the `type` field: * - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names * of captures. Their `value_id` can be used with the * `ts_query_capture_name_for_id` function to obtain the name of the capture. * - `TSQueryPredicateStepTypeString` - Steps with this type represent literal * strings. Their `value_id` can be used with the * `ts_query_string_value_for_id` function to obtain their string value. * - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels* * that represent the end of an individual predicate. If a pattern has two * predicates, then there will be two steps with this `type` in the array. */ const TSQueryPredicateStep *ts_query_predicates_for_pattern( const TSQuery *self, uint32_t pattern_index, uint32_t *length ); bool ts_query_is_pattern_guaranteed_at_step( const TSQuery *self, uint32_t byte_offset ); /** * Get the name and length of one of the query's captures, or one of the * query's string literals. Each capture and string is associated with a * numeric id based on the order that it appeared in the query's source. */ const char *ts_query_capture_name_for_id( const TSQuery *, uint32_t id, uint32_t *length ); const char *ts_query_string_value_for_id( const TSQuery *, uint32_t id, uint32_t *length ); /** * Disable a certain capture within a query. * * This prevents the capture from being returned in matches, and also avoids * any resource usage associated with recording the capture. Currently, there * is no way to undo this. */ void ts_query_disable_capture(TSQuery *, const char *, uint32_t); /** * Disable a certain pattern within a query. * * This prevents the pattern from matching and removes most of the overhead * associated with the pattern. Currently, there is no way to undo this. */ void ts_query_disable_pattern(TSQuery *, uint32_t); /** * Create a new cursor for executing a given query. 
* * The cursor stores the state that is needed to iteratively search * for matches. To use the query cursor, first call `ts_query_cursor_exec` * to start running a given query on a given syntax node. Then, there are * two options for consuming the results of the query: * 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the * *matches* in the order that they were found. Each match contains the * index of the pattern that matched, and an array of captures. Because * multiple patterns can match the same set of nodes, one match may contain * captures that appear *before* some of the captures from a previous match. * 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the * individual *captures* in the order that they appear. This is useful if * you don't care about which pattern matched, and just want a single ordered * sequence of captures. * * If you don't care about consuming all of the results, you can stop calling * `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point. * You can then start executing another query on another node by calling * `ts_query_cursor_exec` again. */ TSQueryCursor *ts_query_cursor_new(void); /** * Delete a query cursor, freeing all of the memory that it used. */ void ts_query_cursor_delete(TSQueryCursor *); /** * Start running a given query on a given node. */ void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode); /** * Manage the maximum number of in-progress matches allowed by this query * cursor. * * Query cursors have an optional maximum capacity for storing lists of * in-progress captures. If this capacity is exceeded, then the * earliest-starting match will silently be dropped to make room for further * matches. This maximum capacity is optional — by default, query cursors allow * any number of pending matches, dynamically allocating new space for them as * needed as the query is executed. 
*/ bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *); uint32_t ts_query_cursor_match_limit(const TSQueryCursor *); void ts_query_cursor_set_match_limit(TSQueryCursor *, uint32_t); /** * Set the range of bytes or (row, column) positions in which the query * will be executed. */ void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t); void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint); /** * Advance to the next match of the currently running query. * * If there is a match, write it to `*match` and return `true`. * Otherwise, return `false`. */ bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match); void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id); /** * Advance to the next capture of the currently running query. * * If there is a capture, write its match to `*match` and its index within * the match's capture list to `*capture_index`. Otherwise, return `false`. */ bool ts_query_cursor_next_capture( TSQueryCursor *, TSQueryMatch *match, uint32_t *capture_index ); /**********************/ /* Section - Language */ /**********************/ /** * Get the number of distinct node types in the language. */ uint32_t ts_language_symbol_count(const TSLanguage *); /** * Get a node type string for the given numerical id. */ const char *ts_language_symbol_name(const TSLanguage *, TSSymbol); /** * Get the numerical id for the given node type string. */ TSSymbol ts_language_symbol_for_name( const TSLanguage *self, const char *string, uint32_t length, bool is_named ); /** * Get the number of distinct field names in the language. */ uint32_t ts_language_field_count(const TSLanguage *); /** * Get the field name string for the given numerical id. */ const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId); /** * Get the numerical id for the given field name string. 
*/ TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t); /** * Check whether the given node type id belongs to named nodes, anonymous nodes, * or hidden nodes. * * See also `ts_node_is_named`. Hidden nodes are never returned from the API. */ TSSymbolType ts_language_symbol_type(const TSLanguage *, TSSymbol); /** * Get the ABI version number for this language. This version number is used * to ensure that languages were generated by a compatible version of * Tree-sitter. * * See also `ts_parser_set_language`. */ uint32_t ts_language_version(const TSLanguage *); #ifdef __cplusplus } #endif #endif // TREE_SITTER_API_H_ tree-sitter-0.20.1/include/tree_sitter/parser.h000064400000000000000000000123340072674642500176270ustar 00000000000000#ifndef TREE_SITTER_PARSER_H_ #define TREE_SITTER_PARSER_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include #define ts_builtin_sym_error ((TSSymbol)-1) #define ts_builtin_sym_end 0 #define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 typedef uint16_t TSStateId; #ifndef TREE_SITTER_API_H_ typedef uint16_t TSSymbol; typedef uint16_t TSFieldId; typedef struct TSLanguage TSLanguage; #endif typedef struct { TSFieldId field_id; uint8_t child_index; bool inherited; } TSFieldMapEntry; typedef struct { uint16_t index; uint16_t length; } TSFieldMapSlice; typedef struct { bool visible; bool named; bool supertype; } TSSymbolMetadata; typedef struct TSLexer TSLexer; struct TSLexer { int32_t lookahead; TSSymbol result_symbol; void (*advance)(TSLexer *, bool); void (*mark_end)(TSLexer *); uint32_t (*get_column)(TSLexer *); bool (*is_at_included_range_start)(const TSLexer *); bool (*eof)(const TSLexer *); }; typedef enum { TSParseActionTypeShift, TSParseActionTypeReduce, TSParseActionTypeAccept, TSParseActionTypeRecover, } TSParseActionType; typedef union { struct { uint8_t type; TSStateId state; bool extra; bool repetition; } shift; struct { uint8_t type; uint8_t child_count; TSSymbol symbol; int16_t 
dynamic_precedence; uint16_t production_id; } reduce; uint8_t type; } TSParseAction; typedef struct { uint16_t lex_state; uint16_t external_lex_state; } TSLexMode; typedef union { TSParseAction action; struct { uint8_t count; bool reusable; } entry; } TSParseActionEntry; struct TSLanguage { uint32_t version; uint32_t symbol_count; uint32_t alias_count; uint32_t token_count; uint32_t external_token_count; uint32_t state_count; uint32_t large_state_count; uint32_t production_id_count; uint32_t field_count; uint16_t max_alias_sequence_length; const uint16_t *parse_table; const uint16_t *small_parse_table; const uint32_t *small_parse_table_map; const TSParseActionEntry *parse_actions; const char * const *symbol_names; const char * const *field_names; const TSFieldMapSlice *field_map_slices; const TSFieldMapEntry *field_map_entries; const TSSymbolMetadata *symbol_metadata; const TSSymbol *public_symbol_map; const uint16_t *alias_map; const TSSymbol *alias_sequences; const TSLexMode *lex_modes; bool (*lex_fn)(TSLexer *, TSStateId); bool (*keyword_lex_fn)(TSLexer *, TSStateId); TSSymbol keyword_capture_token; struct { const bool *states; const TSSymbol *symbol_map; void *(*create)(void); void (*destroy)(void *); bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); unsigned (*serialize)(void *, char *); void (*deserialize)(void *, const char *, unsigned); } external_scanner; }; /* * Lexer Macros */ #define START_LEXER() \ bool result = false; \ bool skip = false; \ bool eof = false; \ int32_t lookahead; \ goto start; \ next_state: \ lexer->advance(lexer, skip); \ start: \ skip = false; \ lookahead = lexer->lookahead; #define ADVANCE(state_value) \ { \ state = state_value; \ goto next_state; \ } #define SKIP(state_value) \ { \ skip = true; \ state = state_value; \ goto next_state; \ } #define ACCEPT_TOKEN(symbol_value) \ result = true; \ lexer->result_symbol = symbol_value; \ lexer->mark_end(lexer); #define END_STATE() return result; /* * Parse Table Macros */ 
#define SMALL_STATE(id) id - LARGE_STATE_COUNT #define STATE(id) id #define ACTIONS(id) id #define SHIFT(state_value) \ {{ \ .shift = { \ .type = TSParseActionTypeShift, \ .state = state_value \ } \ }} #define SHIFT_REPEAT(state_value) \ {{ \ .shift = { \ .type = TSParseActionTypeShift, \ .state = state_value, \ .repetition = true \ } \ }} #define SHIFT_EXTRA() \ {{ \ .shift = { \ .type = TSParseActionTypeShift, \ .extra = true \ } \ }} #define REDUCE(symbol_val, child_count_val, ...) \ {{ \ .reduce = { \ .type = TSParseActionTypeReduce, \ .symbol = symbol_val, \ .child_count = child_count_val, \ __VA_ARGS__ \ }, \ }} #define RECOVER() \ {{ \ .type = TSParseActionTypeRecover \ }} #define ACCEPT_INPUT() \ {{ \ .type = TSParseActionTypeAccept \ }} #ifdef __cplusplus } #endif #endif // TREE_SITTER_PARSER_H_ tree-sitter-0.20.1/src/alloc.h000064400000000000000000000034330072674642500142400ustar 00000000000000#ifndef TREE_SITTER_ALLOC_H_ #define TREE_SITTER_ALLOC_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include #if defined(TREE_SITTER_ALLOCATION_TRACKING) void *ts_record_malloc(size_t); void *ts_record_calloc(size_t, size_t); void *ts_record_realloc(void *, size_t); void ts_record_free(void *); bool ts_toggle_allocation_recording(bool); #define ts_malloc ts_record_malloc #define ts_calloc ts_record_calloc #define ts_realloc ts_record_realloc #define ts_free ts_record_free #else // Allow clients to override allocation functions #ifndef ts_malloc #define ts_malloc ts_malloc_default #endif #ifndef ts_calloc #define ts_calloc ts_calloc_default #endif #ifndef ts_realloc #define ts_realloc ts_realloc_default #endif #ifndef ts_free #define ts_free ts_free_default #endif #include static inline bool ts_toggle_allocation_recording(bool value) { (void)value; return false; } static inline void *ts_malloc_default(size_t size) { void *result = malloc(size); if (size > 0 && !result) { fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size); exit(1); } 
return result; } static inline void *ts_calloc_default(size_t count, size_t size) { void *result = calloc(count, size); if (count > 0 && !result) { fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size); exit(1); } return result; } static inline void *ts_realloc_default(void *buffer, size_t size) { void *result = realloc(buffer, size); if (size > 0 && !result) { fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size); exit(1); } return result; } static inline void ts_free_default(void *buffer) { free(buffer); } #endif #ifdef __cplusplus } #endif #endif // TREE_SITTER_ALLOC_H_ tree-sitter-0.20.1/src/array.h000064400000000000000000000206160072674642500142660ustar 00000000000000#ifndef TREE_SITTER_ARRAY_H_ #define TREE_SITTER_ARRAY_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include #include #include #include "./alloc.h" #define Array(T) \ struct { \ T *contents; \ uint32_t size; \ uint32_t capacity; \ } #define array_init(self) \ ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) #define array_new() \ { NULL, 0, 0 } #define array_get(self, index) \ (assert((uint32_t)index < (self)->size), &(self)->contents[index]) #define array_front(self) array_get(self, 0) #define array_back(self) array_get(self, (self)->size - 1) #define array_clear(self) ((self)->size = 0) #define array_reserve(self, new_capacity) \ array__reserve((VoidArray *)(self), array__elem_size(self), new_capacity) // Free any memory allocated for this array. #define array_delete(self) array__delete((VoidArray *)self) #define array_push(self, element) \ (array__grow((VoidArray *)(self), 1, array__elem_size(self)), \ (self)->contents[(self)->size++] = (element)) // Increase the array's size by a given number of elements, reallocating // if necessary. New elements are zero-initialized. 
// NOTE: `count` is expanded more than once below, so it must be an
// expression without side effects.
#define array_grow_by(self, count) \
  (array__grow((VoidArray *)(self), count, array__elem_size(self)), \
   memset((self)->contents + (self)->size, 0, (count) * array__elem_size(self)), \
   (self)->size += (count))

// Append every element of the array `other` to the end of `self`.
#define array_push_all(self, other) \
  array_extend((self), (other)->size, (other)->contents)

// Append `count` elements to the end of the array, reading their values from the
// `contents` pointer.
#define array_extend(self, count, contents) \
  array__splice( \
    (VoidArray *)(self), array__elem_size(self), (self)->size, \
    0, count, contents \
  )

// Remove `old_count` elements from the array starting at the given `index`. At
// the same index, insert `new_count` new elements, reading their values from the
// `new_contents` pointer.
#define array_splice(self, index, old_count, new_count, new_contents) \
  array__splice( \
    (VoidArray *)(self), array__elem_size(self), index, \
    old_count, new_count, new_contents \
  )

// Insert one `element` into the array at the given `index`.
//
// The address of `element` is taken, so it must be an lvalue. The argument is
// parenthesized so that expressions such as `cond ? a : b` bind correctly.
#define array_insert(self, index, element) \
  array__splice((VoidArray *)(self), array__elem_size(self), index, 0, 1, &(element))

// Remove one `element` from the array at the given `index`.
#define array_erase(self, index) \
  array__erase((VoidArray *)(self), array__elem_size(self), index)

// Remove the last element of the array and evaluate to it. No bounds check is
// performed, so the array must not be empty.
#define array_pop(self) ((self)->contents[--(self)->size])

// Overwrite the contents of `self` with a copy of the contents of `other`.
#define array_assign(self, other) \
  array__assign((VoidArray *)(self), (const VoidArray *)(other), array__elem_size(self))

// Exchange the contents, size and capacity of the two arrays.
#define array_swap(self, other) \
  array__swap((VoidArray *)(self), (VoidArray *)(other))

// Search a sorted array for a given `needle` value, using the given `compare`
// callback to determine the order.
//
// If an existing element is found to be equal to `needle`, then the `index`
// out-parameter is set to the existing value's index, and the `exists`
// out-parameter is set to true. Otherwise, `index` is set to an index where
// `needle` should be inserted in order to preserve the sorting, and `exists`
// is set to false.
#define array_search_sorted_with(self, compare, needle, index, exists) \
  array__search_sorted(self, 0, compare, , needle, index, exists)

// Search a sorted array for a given `needle` value, using integer comparisons
// of a given struct field (specified with a leading dot) to determine the order.
//
// See also `array_search_sorted_with`.
#define array_search_sorted_by(self, field, needle, index, exists) \
  array__search_sorted(self, 0, _compare_int, field, needle, index, exists)

// Insert a given `value` into a sorted array, using the given `compare`
// callback to determine the order. Nothing is inserted if an element that
// compares equal is already present.
//
// The address of `value` is taken, so it must be an lvalue. The temporaries
// are named `_index` / `_exists` so that they cannot capture identifiers
// called `index` or `exists` appearing in the caller's `self` or `value`
// expressions.
#define array_insert_sorted_with(self, compare, value) \
  do { \
    unsigned _index, _exists; \
    array_search_sorted_with(self, compare, &(value), &_index, &_exists); \
    if (!_exists) array_insert(self, _index, value); \
  } while (0)

// Insert a given `value` into a sorted array, using integer comparisons of
// a given struct field (specified with a leading dot) to determine the order.
//
// See also `array_search_sorted_by`.
#define array_insert_sorted_by(self, field, value) \ do { \ unsigned index, exists; \ array_search_sorted_by(self, field, (value) field, &index, &exists); \ if (!exists) array_insert(self, index, value); \ } while (0) // Private typedef Array(void) VoidArray; #define array__elem_size(self) sizeof(*(self)->contents) static inline void array__delete(VoidArray *self) { ts_free(self->contents); self->contents = NULL; self->size = 0; self->capacity = 0; } static inline void array__erase(VoidArray *self, size_t element_size, uint32_t index) { assert(index < self->size); char *contents = (char *)self->contents; memmove(contents + index * element_size, contents + (index + 1) * element_size, (self->size - index - 1) * element_size); self->size--; } static inline void array__reserve(VoidArray *self, size_t element_size, uint32_t new_capacity) { if (new_capacity > self->capacity) { if (self->contents) { self->contents = ts_realloc(self->contents, new_capacity * element_size); } else { self->contents = ts_malloc(new_capacity * element_size); } self->capacity = new_capacity; } } static inline void array__assign(VoidArray *self, const VoidArray *other, size_t element_size) { array__reserve(self, element_size, other->size); self->size = other->size; memcpy(self->contents, other->contents, self->size * element_size); } static inline void array__swap(VoidArray *self, VoidArray *other) { VoidArray swap = *other; *other = *self; *self = swap; } static inline void array__grow(VoidArray *self, size_t count, size_t element_size) { size_t new_size = self->size + count; if (new_size > self->capacity) { size_t new_capacity = self->capacity * 2; if (new_capacity < 8) new_capacity = 8; if (new_capacity < new_size) new_capacity = new_size; array__reserve(self, element_size, new_capacity); } } static inline void array__splice(VoidArray *self, size_t element_size, uint32_t index, uint32_t old_count, uint32_t new_count, const void *elements) { uint32_t new_size = self->size + new_count - 
old_count; uint32_t old_end = index + old_count; uint32_t new_end = index + new_count; assert(old_end <= self->size); array__reserve(self, element_size, new_size); char *contents = (char *)self->contents; if (self->size > old_end) { memmove( contents + new_end * element_size, contents + old_end * element_size, (self->size - old_end) * element_size ); } if (new_count > 0) { if (elements) { memcpy( (contents + index * element_size), elements, new_count * element_size ); } else { memset( (contents + index * element_size), 0, new_count * element_size ); } } self->size += new_count - old_count; } // A binary search routine, based on Rust's `std::slice::binary_search_by`. #define array__search_sorted(self, start, compare, suffix, needle, index, exists) \ do { \ *(index) = start; \ *(exists) = false; \ uint32_t size = (self)->size - *(index); \ if (size == 0) break; \ int comparison; \ while (size > 1) { \ uint32_t half_size = size / 2; \ uint32_t mid_index = *(index) + half_size; \ comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ if (comparison <= 0) *(index) = mid_index; \ size -= half_size; \ } \ comparison = compare(&((self)->contents[*(index)] suffix), (needle)); \ if (comparison == 0) *(exists) = true; \ else if (comparison < 0) *(index) += 1; \ } while (0) // Helper macro for the `_sorted_by` routines below. This takes the left (existing) // parameter by reference in order to work with the generic sorting function above. 
#define _compare_int(a, b) ((int)*(a) - (int)(b)) #ifdef __cplusplus } #endif #endif // TREE_SITTER_ARRAY_H_ tree-sitter-0.20.1/src/atomic.h000064400000000000000000000021660072674642500144240ustar 00000000000000#ifndef TREE_SITTER_ATOMIC_H_ #define TREE_SITTER_ATOMIC_H_ #include #ifdef __TINYC__ static inline size_t atomic_load(const volatile size_t *p) { return *p; } static inline uint32_t atomic_inc(volatile uint32_t *p) { *p += 1; return *p; } static inline uint32_t atomic_dec(volatile uint32_t *p) { *p-= 1; return *p; } #elif defined(_WIN32) #include static inline size_t atomic_load(const volatile size_t *p) { return *p; } static inline uint32_t atomic_inc(volatile uint32_t *p) { return InterlockedIncrement((long volatile *)p); } static inline uint32_t atomic_dec(volatile uint32_t *p) { return InterlockedDecrement((long volatile *)p); } #else static inline size_t atomic_load(const volatile size_t *p) { #ifdef __ATOMIC_RELAXED return __atomic_load_n(p, __ATOMIC_RELAXED); #else return __sync_fetch_and_add((volatile size_t *)p, 0); #endif } static inline uint32_t atomic_inc(volatile uint32_t *p) { return __sync_add_and_fetch(p, 1u); } static inline uint32_t atomic_dec(volatile uint32_t *p) { return __sync_sub_and_fetch(p, 1u); } #endif #endif // TREE_SITTER_ATOMIC_H_ tree-sitter-0.20.1/src/clock.h000064400000000000000000000065470072674642500142520ustar 00000000000000#ifndef TREE_SITTER_CLOCK_H_ #define TREE_SITTER_CLOCK_H_ #include typedef uint64_t TSDuration; #ifdef _WIN32 // Windows: // * Represent a time as a performance counter value. // * Represent a duration as a number of performance counter ticks. 
#include typedef uint64_t TSClock; static inline TSDuration duration_from_micros(uint64_t micros) { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); return micros * (uint64_t)frequency.QuadPart / 1000000; } static inline uint64_t duration_to_micros(TSDuration self) { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); return self * 1000000 / (uint64_t)frequency.QuadPart; } static inline TSClock clock_null(void) { return 0; } static inline TSClock clock_now(void) { LARGE_INTEGER result; QueryPerformanceCounter(&result); return (uint64_t)result.QuadPart; } static inline TSClock clock_after(TSClock base, TSDuration duration) { return base + duration; } static inline bool clock_is_null(TSClock self) { return !self; } static inline bool clock_is_gt(TSClock self, TSClock other) { return self > other; } #elif defined(CLOCK_MONOTONIC) && !defined(__APPLE__) // POSIX with monotonic clock support (Linux) // * Represent a time as a monotonic (seconds, nanoseconds) pair. // * Represent a duration as a number of microseconds. // // On these platforms, parse timeouts will correspond accurately to // real time, regardless of what other processes are running. 
#include typedef struct timespec TSClock; static inline TSDuration duration_from_micros(uint64_t micros) { return micros; } static inline uint64_t duration_to_micros(TSDuration self) { return self; } static inline TSClock clock_now(void) { TSClock result; clock_gettime(CLOCK_MONOTONIC, &result); return result; } static inline TSClock clock_null(void) { return (TSClock) {0, 0}; } static inline TSClock clock_after(TSClock base, TSDuration duration) { TSClock result = base; result.tv_sec += duration / 1000000; result.tv_nsec += (duration % 1000000) * 1000; return result; } static inline bool clock_is_null(TSClock self) { return !self.tv_sec; } static inline bool clock_is_gt(TSClock self, TSClock other) { if (self.tv_sec > other.tv_sec) return true; if (self.tv_sec < other.tv_sec) return false; return self.tv_nsec > other.tv_nsec; } #else // macOS or POSIX without monotonic clock support // * Represent a time as a process clock value. // * Represent a duration as a number of process clock ticks. // // On these platforms, parse timeouts may be affected by other processes, // which is not ideal, but is better than using a non-monotonic time API // like `gettimeofday`. 
#include typedef uint64_t TSClock; static inline TSDuration duration_from_micros(uint64_t micros) { return micros * (uint64_t)CLOCKS_PER_SEC / 1000000; } static inline uint64_t duration_to_micros(TSDuration self) { return self * 1000000 / (uint64_t)CLOCKS_PER_SEC; } static inline TSClock clock_null(void) { return 0; } static inline TSClock clock_now(void) { return (uint64_t)clock(); } static inline TSClock clock_after(TSClock base, TSDuration duration) { return base + duration; } static inline bool clock_is_null(TSClock self) { return !self; } static inline bool clock_is_gt(TSClock self, TSClock other) { return self > other; } #endif #endif // TREE_SITTER_CLOCK_H_ tree-sitter-0.20.1/src/error_costs.h000064400000000000000000000004460072674642500155130ustar 00000000000000#ifndef TREE_SITTER_ERROR_COSTS_H_ #define TREE_SITTER_ERROR_COSTS_H_ #define ERROR_STATE 0 #define ERROR_COST_PER_RECOVERY 500 #define ERROR_COST_PER_MISSING_TREE 110 #define ERROR_COST_PER_SKIPPED_TREE 100 #define ERROR_COST_PER_SKIPPED_LINE 30 #define ERROR_COST_PER_SKIPPED_CHAR 1 #endif tree-sitter-0.20.1/src/get_changed_ranges.c000064400000000000000000000364360072674642500167410ustar 00000000000000#include "./get_changed_ranges.h" #include "./subtree.h" #include "./language.h" #include "./error_costs.h" #include "./tree_cursor.h" #include // #define DEBUG_GET_CHANGED_RANGES static void ts_range_array_add(TSRangeArray *self, Length start, Length end) { if (self->size > 0) { TSRange *last_range = array_back(self); if (start.bytes <= last_range->end_byte) { last_range->end_byte = end.bytes; last_range->end_point = end.extent; return; } } if (start.bytes < end.bytes) { TSRange range = { start.extent, end.extent, start.bytes, end.bytes }; array_push(self, range); } } bool ts_range_array_intersects(const TSRangeArray *self, unsigned start_index, uint32_t start_byte, uint32_t end_byte) { for (unsigned i = start_index; i < self->size; i++) { TSRange *range = &self->contents[i]; if (range->end_byte > 
start_byte) { if (range->start_byte >= end_byte) break; return true; } } return false; } void ts_range_array_get_changed_ranges( const TSRange *old_ranges, unsigned old_range_count, const TSRange *new_ranges, unsigned new_range_count, TSRangeArray *differences ) { unsigned new_index = 0; unsigned old_index = 0; Length current_position = length_zero(); bool in_old_range = false; bool in_new_range = false; while (old_index < old_range_count || new_index < new_range_count) { const TSRange *old_range = &old_ranges[old_index]; const TSRange *new_range = &new_ranges[new_index]; Length next_old_position; if (in_old_range) { next_old_position = (Length) {old_range->end_byte, old_range->end_point}; } else if (old_index < old_range_count) { next_old_position = (Length) {old_range->start_byte, old_range->start_point}; } else { next_old_position = LENGTH_MAX; } Length next_new_position; if (in_new_range) { next_new_position = (Length) {new_range->end_byte, new_range->end_point}; } else if (new_index < new_range_count) { next_new_position = (Length) {new_range->start_byte, new_range->start_point}; } else { next_new_position = LENGTH_MAX; } if (next_old_position.bytes < next_new_position.bytes) { if (in_old_range != in_new_range) { ts_range_array_add(differences, current_position, next_old_position); } if (in_old_range) old_index++; current_position = next_old_position; in_old_range = !in_old_range; } else if (next_new_position.bytes < next_old_position.bytes) { if (in_old_range != in_new_range) { ts_range_array_add(differences, current_position, next_new_position); } if (in_new_range) new_index++; current_position = next_new_position; in_new_range = !in_new_range; } else { if (in_old_range != in_new_range) { ts_range_array_add(differences, current_position, next_new_position); } if (in_old_range) old_index++; if (in_new_range) new_index++; in_old_range = !in_old_range; in_new_range = !in_new_range; current_position = next_new_position; } } } typedef struct { TreeCursor cursor; 
const TSLanguage *language; unsigned visible_depth; bool in_padding; } Iterator; static Iterator iterator_new(TreeCursor *cursor, const Subtree *tree, const TSLanguage *language) { array_clear(&cursor->stack); array_push(&cursor->stack, ((TreeCursorEntry){ .subtree = tree, .position = length_zero(), .child_index = 0, .structural_child_index = 0, })); return (Iterator) { .cursor = *cursor, .language = language, .visible_depth = 1, .in_padding = false, }; } static bool iterator_done(Iterator *self) { return self->cursor.stack.size == 0; } static Length iterator_start_position(Iterator *self) { TreeCursorEntry entry = *array_back(&self->cursor.stack); if (self->in_padding) { return entry.position; } else { return length_add(entry.position, ts_subtree_padding(*entry.subtree)); } } static Length iterator_end_position(Iterator *self) { TreeCursorEntry entry = *array_back(&self->cursor.stack); Length result = length_add(entry.position, ts_subtree_padding(*entry.subtree)); if (self->in_padding) { return result; } else { return length_add(result, ts_subtree_size(*entry.subtree)); } } static bool iterator_tree_is_visible(const Iterator *self) { TreeCursorEntry entry = *array_back(&self->cursor.stack); if (ts_subtree_visible(*entry.subtree)) return true; if (self->cursor.stack.size > 1) { Subtree parent = *self->cursor.stack.contents[self->cursor.stack.size - 2].subtree; return ts_language_alias_at( self->language, parent.ptr->production_id, entry.structural_child_index ) != 0; } return false; } static void iterator_get_visible_state( const Iterator *self, Subtree *tree, TSSymbol *alias_symbol, uint32_t *start_byte ) { uint32_t i = self->cursor.stack.size - 1; if (self->in_padding) { if (i == 0) return; i--; } for (; i + 1 > 0; i--) { TreeCursorEntry entry = self->cursor.stack.contents[i]; if (i > 0) { const Subtree *parent = self->cursor.stack.contents[i - 1].subtree; *alias_symbol = ts_language_alias_at( self->language, parent->ptr->production_id, 
entry.structural_child_index ); } if (ts_subtree_visible(*entry.subtree) || *alias_symbol) { *tree = *entry.subtree; *start_byte = entry.position.bytes; break; } } } static void iterator_ascend(Iterator *self) { if (iterator_done(self)) return; if (iterator_tree_is_visible(self) && !self->in_padding) self->visible_depth--; if (array_back(&self->cursor.stack)->child_index > 0) self->in_padding = false; self->cursor.stack.size--; } static bool iterator_descend(Iterator *self, uint32_t goal_position) { if (self->in_padding) return false; bool did_descend; do { did_descend = false; TreeCursorEntry entry = *array_back(&self->cursor.stack); Length position = entry.position; uint32_t structural_child_index = 0; for (uint32_t i = 0, n = ts_subtree_child_count(*entry.subtree); i < n; i++) { const Subtree *child = &ts_subtree_children(*entry.subtree)[i]; Length child_left = length_add(position, ts_subtree_padding(*child)); Length child_right = length_add(child_left, ts_subtree_size(*child)); if (child_right.bytes > goal_position) { array_push(&self->cursor.stack, ((TreeCursorEntry){ .subtree = child, .position = position, .child_index = i, .structural_child_index = structural_child_index, })); if (iterator_tree_is_visible(self)) { if (child_left.bytes > goal_position) { self->in_padding = true; } else { self->visible_depth++; } return true; } did_descend = true; break; } position = child_right; if (!ts_subtree_extra(*child)) structural_child_index++; } } while (did_descend); return false; } static void iterator_advance(Iterator *self) { if (self->in_padding) { self->in_padding = false; if (iterator_tree_is_visible(self)) { self->visible_depth++; } else { iterator_descend(self, 0); } return; } for (;;) { if (iterator_tree_is_visible(self)) self->visible_depth--; TreeCursorEntry entry = array_pop(&self->cursor.stack); if (iterator_done(self)) return; const Subtree *parent = array_back(&self->cursor.stack)->subtree; uint32_t child_index = entry.child_index + 1; if 
(ts_subtree_child_count(*parent) > child_index) { Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); uint32_t structural_child_index = entry.structural_child_index; if (!ts_subtree_extra(*entry.subtree)) structural_child_index++; const Subtree *next_child = &ts_subtree_children(*parent)[child_index]; array_push(&self->cursor.stack, ((TreeCursorEntry){ .subtree = next_child, .position = position, .child_index = child_index, .structural_child_index = structural_child_index, })); if (iterator_tree_is_visible(self)) { if (ts_subtree_padding(*next_child).bytes > 0) { self->in_padding = true; } else { self->visible_depth++; } } else { iterator_descend(self, 0); } break; } } } typedef enum { IteratorDiffers, IteratorMayDiffer, IteratorMatches, } IteratorComparison; static IteratorComparison iterator_compare(const Iterator *old_iter, const Iterator *new_iter) { Subtree old_tree = NULL_SUBTREE; Subtree new_tree = NULL_SUBTREE; uint32_t old_start = 0; uint32_t new_start = 0; TSSymbol old_alias_symbol = 0; TSSymbol new_alias_symbol = 0; iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start); iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start); if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches; if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers; if ( old_alias_symbol == new_alias_symbol && ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree) ) { if (old_start == new_start && !ts_subtree_has_changes(old_tree) && ts_subtree_symbol(old_tree) != ts_builtin_sym_error && ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes && ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE && ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE && (ts_subtree_parse_state(old_tree) == ERROR_STATE) == (ts_subtree_parse_state(new_tree) == ERROR_STATE)) { return IteratorMatches; } else { return IteratorMayDiffer; } } return IteratorDiffers; } #ifdef DEBUG_GET_CHANGED_RANGES 
static inline void iterator_print_state(Iterator *self) { TreeCursorEntry entry = *array_back(&self->cursor.stack); TSPoint start = iterator_start_position(self).extent; TSPoint end = iterator_end_position(self).extent; const char *name = ts_language_symbol_name(self->language, ts_subtree_symbol(*entry.subtree)); printf( "(%-25s %s\t depth:%u [%u, %u] - [%u, %u])", name, self->in_padding ? "(p)" : " ", self->visible_depth, start.row + 1, start.column, end.row + 1, end.column ); } #endif unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *new_tree, TreeCursor *cursor1, TreeCursor *cursor2, const TSLanguage *language, const TSRangeArray *included_range_differences, TSRange **ranges) { TSRangeArray results = array_new(); Iterator old_iter = iterator_new(cursor1, old_tree, language); Iterator new_iter = iterator_new(cursor2, new_tree, language); unsigned included_range_difference_index = 0; Length position = iterator_start_position(&old_iter); Length next_position = iterator_start_position(&new_iter); if (position.bytes < next_position.bytes) { ts_range_array_add(&results, position, next_position); position = next_position; } else if (position.bytes > next_position.bytes) { ts_range_array_add(&results, next_position, position); next_position = position; } do { #ifdef DEBUG_GET_CHANGED_RANGES printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column); iterator_print_state(&old_iter); printf("\tvs\t"); iterator_print_state(&new_iter); puts(""); #endif // Compare the old and new subtrees. IteratorComparison comparison = iterator_compare(&old_iter, &new_iter); // Even if the two subtrees appear to be identical, they could differ // internally if they contain a range of text that was previously // excluded from the parse, and is now included, or vice-versa. 
if (comparison == IteratorMatches && ts_range_array_intersects( included_range_differences, included_range_difference_index, position.bytes, iterator_end_position(&old_iter).bytes )) { comparison = IteratorMayDiffer; } bool is_changed = false; switch (comparison) { // If the subtrees are definitely identical, move to the end // of both subtrees. case IteratorMatches: next_position = iterator_end_position(&old_iter); break; // If the subtrees might differ internally, descend into both // subtrees, finding the first child that spans the current position. case IteratorMayDiffer: if (iterator_descend(&old_iter, position.bytes)) { if (!iterator_descend(&new_iter, position.bytes)) { is_changed = true; next_position = iterator_end_position(&old_iter); } } else if (iterator_descend(&new_iter, position.bytes)) { is_changed = true; next_position = iterator_end_position(&new_iter); } else { next_position = length_min( iterator_end_position(&old_iter), iterator_end_position(&new_iter) ); } break; // If the subtrees are different, record a change and then move // to the end of both subtrees. case IteratorDiffers: is_changed = true; next_position = length_min( iterator_end_position(&old_iter), iterator_end_position(&new_iter) ); break; } // Ensure that both iterators are caught up to the current position. while ( !iterator_done(&old_iter) && iterator_end_position(&old_iter).bytes <= next_position.bytes ) iterator_advance(&old_iter); while ( !iterator_done(&new_iter) && iterator_end_position(&new_iter).bytes <= next_position.bytes ) iterator_advance(&new_iter); // Ensure that both iterators are at the same depth in the tree. 
while (old_iter.visible_depth > new_iter.visible_depth) { iterator_ascend(&old_iter); } while (new_iter.visible_depth > old_iter.visible_depth) { iterator_ascend(&new_iter); } if (is_changed) { #ifdef DEBUG_GET_CHANGED_RANGES printf( " change: [[%u, %u] - [%u, %u]]\n", position.extent.row + 1, position.extent.column, next_position.extent.row + 1, next_position.extent.column ); #endif ts_range_array_add(&results, position, next_position); } position = next_position; // Keep track of the current position in the included range differences // array in order to avoid scanning the entire array on each iteration. while (included_range_difference_index < included_range_differences->size) { const TSRange *range = &included_range_differences->contents[ included_range_difference_index ]; if (range->end_byte <= position.bytes) { included_range_difference_index++; } else { break; } } } while (!iterator_done(&old_iter) && !iterator_done(&new_iter)); Length old_size = ts_subtree_total_size(*old_tree); Length new_size = ts_subtree_total_size(*new_tree); if (old_size.bytes < new_size.bytes) { ts_range_array_add(&results, old_size, new_size); } else if (new_size.bytes < old_size.bytes) { ts_range_array_add(&results, new_size, old_size); } *cursor1 = old_iter.cursor; *cursor2 = new_iter.cursor; *ranges = results.contents; return results.size; } tree-sitter-0.20.1/src/get_changed_ranges.h000064400000000000000000000015040072674642500167320ustar 00000000000000#ifndef TREE_SITTER_GET_CHANGED_RANGES_H_ #define TREE_SITTER_GET_CHANGED_RANGES_H_ #ifdef __cplusplus extern "C" { #endif #include "./tree_cursor.h" #include "./subtree.h" typedef Array(TSRange) TSRangeArray; void ts_range_array_get_changed_ranges( const TSRange *old_ranges, unsigned old_range_count, const TSRange *new_ranges, unsigned new_range_count, TSRangeArray *differences ); bool ts_range_array_intersects( const TSRangeArray *self, unsigned start_index, uint32_t start_byte, uint32_t end_byte ); unsigned 
ts_subtree_get_changed_ranges( const Subtree *old_tree, const Subtree *new_tree, TreeCursor *cursor1, TreeCursor *cursor2, const TSLanguage *language, const TSRangeArray *included_range_differences, TSRange **ranges ); #ifdef __cplusplus } #endif #endif // TREE_SITTER_GET_CHANGED_RANGES_H_ tree-sitter-0.20.1/src/language.c000064400000000000000000000070410072674642500147230ustar 00000000000000#include "./language.h" #include "./subtree.h" #include "./error_costs.h" #include uint32_t ts_language_symbol_count(const TSLanguage *self) { return self->symbol_count + self->alias_count; } uint32_t ts_language_version(const TSLanguage *self) { return self->version; } uint32_t ts_language_field_count(const TSLanguage *self) { return self->field_count; } void ts_language_table_entry( const TSLanguage *self, TSStateId state, TSSymbol symbol, TableEntry *result ) { if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { result->action_count = 0; result->is_reusable = false; result->actions = NULL; } else { assert(symbol < self->token_count); uint32_t action_index = ts_language_lookup(self, state, symbol); const TSParseActionEntry *entry = &self->parse_actions[action_index]; result->action_count = entry->entry.count; result->is_reusable = entry->entry.reusable; result->actions = (const TSParseAction *)(entry + 1); } } TSSymbolMetadata ts_language_symbol_metadata( const TSLanguage *self, TSSymbol symbol ) { if (symbol == ts_builtin_sym_error) { return (TSSymbolMetadata){.visible = true, .named = true}; } else if (symbol == ts_builtin_sym_error_repeat) { return (TSSymbolMetadata){.visible = false, .named = false}; } else { return self->symbol_metadata[symbol]; } } TSSymbol ts_language_public_symbol( const TSLanguage *self, TSSymbol symbol ) { if (symbol == ts_builtin_sym_error) return symbol; return self->public_symbol_map[symbol]; } const char *ts_language_symbol_name( const TSLanguage *self, TSSymbol symbol ) { if (symbol == ts_builtin_sym_error) { return 
"ERROR"; } else if (symbol == ts_builtin_sym_error_repeat) { return "_ERROR"; } else if (symbol < ts_language_symbol_count(self)) { return self->symbol_names[symbol]; } else { return NULL; } } TSSymbol ts_language_symbol_for_name( const TSLanguage *self, const char *string, uint32_t length, bool is_named ) { if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error; uint32_t count = ts_language_symbol_count(self); for (TSSymbol i = 0; i < count; i++) { TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i); if ((!metadata.visible && !metadata.supertype) || metadata.named != is_named) continue; const char *symbol_name = self->symbol_names[i]; if (!strncmp(symbol_name, string, length) && !symbol_name[length]) { return self->public_symbol_map[i]; } } return 0; } TSSymbolType ts_language_symbol_type( const TSLanguage *self, TSSymbol symbol ) { TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol); if (metadata.named && metadata.visible) { return TSSymbolTypeRegular; } else if (metadata.visible) { return TSSymbolTypeAnonymous; } else { return TSSymbolTypeAuxiliary; } } const char *ts_language_field_name_for_id( const TSLanguage *self, TSFieldId id ) { uint32_t count = ts_language_field_count(self); if (count && id <= count) { return self->field_names[id]; } else { return NULL; } } TSFieldId ts_language_field_id_for_name( const TSLanguage *self, const char *name, uint32_t name_length ) { uint32_t count = ts_language_field_count(self); for (TSSymbol i = 1; i < count + 1; i++) { switch (strncmp(name, self->field_names[i], name_length)) { case 0: if (self->field_names[i][name_length] == 0) return i; break; case -1: return 0; default: break; } } return 0; } tree-sitter-0.20.1/src/language.h000064400000000000000000000172760072674642500147430ustar 00000000000000#ifndef TREE_SITTER_LANGUAGE_H_ #define TREE_SITTER_LANGUAGE_H_ #ifdef __cplusplus extern "C" { #endif #include "./subtree.h" #include "tree_sitter/parser.h" #define 
ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) typedef struct { const TSParseAction *actions; uint32_t action_count; bool is_reusable; } TableEntry; typedef struct { const TSLanguage *language; const uint16_t *data; const uint16_t *group_end; TSStateId state; uint16_t table_value; uint16_t section_index; uint16_t group_count; bool is_small_state; const TSParseAction *actions; TSSymbol symbol; TSStateId next_state; uint16_t action_count; } LookaheadIterator; void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *); TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol); static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { return 0 < symbol && symbol < self->external_token_count + 1; } static inline const TSParseAction *ts_language_actions( const TSLanguage *self, TSStateId state, TSSymbol symbol, uint32_t *count ) { TableEntry entry; ts_language_table_entry(self, state, symbol, &entry); *count = entry.action_count; return entry.actions; } static inline bool ts_language_has_reduce_action( const TSLanguage *self, TSStateId state, TSSymbol symbol ) { TableEntry entry; ts_language_table_entry(self, state, symbol, &entry); return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce; } // Lookup the table value for a given symbol and state. // // For non-terminal symbols, the table value represents a successor state. // For terminal symbols, it represents an index in the actions table. // For 'large' parse states, this is a direct lookup. For 'small' parse // states, this requires searching through the symbol groups to find // the given symbol. 
static inline uint16_t ts_language_lookup( const TSLanguage *self, TSStateId state, TSSymbol symbol ) { if (state >= self->large_state_count) { uint32_t index = self->small_parse_table_map[state - self->large_state_count]; const uint16_t *data = &self->small_parse_table[index]; uint16_t group_count = *(data++); for (unsigned i = 0; i < group_count; i++) { uint16_t section_value = *(data++); uint16_t symbol_count = *(data++); for (unsigned i = 0; i < symbol_count; i++) { if (*(data++) == symbol) return section_value; } } return 0; } else { return self->parse_table[state * self->symbol_count + symbol]; } } static inline bool ts_language_has_actions( const TSLanguage *self, TSStateId state, TSSymbol symbol ) { return ts_language_lookup(self, state, symbol) != 0; } // Iterate over all of the symbols that are valid in the given state. // // For 'large' parse states, this just requires iterating through // all possible symbols and checking the parse table for each one. // For 'small' parse states, this exploits the structure of the // table to only visit the valid symbols. static inline LookaheadIterator ts_language_lookaheads( const TSLanguage *self, TSStateId state ) { bool is_small_state = state >= self->large_state_count; const uint16_t *data; const uint16_t *group_end = NULL; uint16_t group_count = 0; if (is_small_state) { uint32_t index = self->small_parse_table_map[state - self->large_state_count]; data = &self->small_parse_table[index]; group_end = data + 1; group_count = *data; } else { data = &self->parse_table[state * self->symbol_count] - 1; } return (LookaheadIterator) { .language = self, .data = data, .group_end = group_end, .group_count = group_count, .is_small_state = is_small_state, .symbol = UINT16_MAX, .next_state = 0, }; } static inline bool ts_lookahead_iterator_next(LookaheadIterator *self) { // For small parse states, valid symbols are listed explicitly, // grouped by their value. 
There's no need to look up the actions // again until moving to the next group. if (self->is_small_state) { self->data++; if (self->data == self->group_end) { if (self->group_count == 0) return false; self->group_count--; self->table_value = *(self->data++); unsigned symbol_count = *(self->data++); self->group_end = self->data + symbol_count; self->symbol = *self->data; } else { self->symbol = *self->data; return true; } } // For large parse states, iterate through every symbol until one // is found that has valid actions. else { do { self->data++; self->symbol++; if (self->symbol >= self->language->symbol_count) return false; self->table_value = *self->data; } while (!self->table_value); } // Depending on if the symbols is terminal or non-terminal, the table value either // represents a list of actions or a successor state. if (self->symbol < self->language->token_count) { const TSParseActionEntry *entry = &self->language->parse_actions[self->table_value]; self->action_count = entry->entry.count; self->actions = (const TSParseAction *)(entry + 1); self->next_state = 0; } else { self->action_count = 0; self->next_state = self->table_value; } return true; } static inline TSStateId ts_language_next_state( const TSLanguage *self, TSStateId state, TSSymbol symbol ) { if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { return 0; } else if (symbol < self->token_count) { uint32_t count; const TSParseAction *actions = ts_language_actions(self, state, symbol, &count); if (count > 0) { TSParseAction action = actions[count - 1]; if (action.type == TSParseActionTypeShift) { return action.shift.extra ? 
state : action.shift.state; } } return 0; } else { return ts_language_lookup(self, state, symbol); } } static inline const bool *ts_language_enabled_external_tokens( const TSLanguage *self, unsigned external_scanner_state ) { if (external_scanner_state == 0) { return NULL; } else { return self->external_scanner.states + self->external_token_count * external_scanner_state; } } static inline const TSSymbol *ts_language_alias_sequence( const TSLanguage *self, uint32_t production_id ) { return production_id ? &self->alias_sequences[production_id * self->max_alias_sequence_length] : NULL; } static inline TSSymbol ts_language_alias_at( const TSLanguage *self, uint32_t production_id, uint32_t child_index ) { return production_id ? self->alias_sequences[production_id * self->max_alias_sequence_length + child_index] : 0; } static inline void ts_language_field_map( const TSLanguage *self, uint32_t production_id, const TSFieldMapEntry **start, const TSFieldMapEntry **end ) { if (self->field_count == 0) { *start = NULL; *end = NULL; return; } TSFieldMapSlice slice = self->field_map_slices[production_id]; *start = &self->field_map_entries[slice.index]; *end = &self->field_map_entries[slice.index] + slice.length; } static inline void ts_language_aliases_for_symbol( const TSLanguage *self, TSSymbol original_symbol, const TSSymbol **start, const TSSymbol **end ) { *start = &self->public_symbol_map[original_symbol]; *end = *start + 1; unsigned i = 0; for (;;) { TSSymbol symbol = self->alias_map[i++]; if (symbol == 0 || symbol > original_symbol) break; uint16_t count = self->alias_map[i++]; if (symbol == original_symbol) { *start = &self->alias_map[i]; *end = &self->alias_map[i + count]; break; } i += count; } } #ifdef __cplusplus } #endif #endif // TREE_SITTER_LANGUAGE_H_ tree-sitter-0.20.1/src/length.h000064400000000000000000000020430072674642500144230ustar 00000000000000#ifndef TREE_SITTER_LENGTH_H_ #define TREE_SITTER_LENGTH_H_ #include #include #include "./point.h" #include 
"tree_sitter/api.h" typedef struct { uint32_t bytes; TSPoint extent; } Length; static const Length LENGTH_UNDEFINED = {0, {0, 1}}; static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}}; static inline bool length_is_undefined(Length length) { return length.bytes == 0 && length.extent.column != 0; } static inline Length length_min(Length len1, Length len2) { return (len1.bytes < len2.bytes) ? len1 : len2; } static inline Length length_add(Length len1, Length len2) { Length result; result.bytes = len1.bytes + len2.bytes; result.extent = point_add(len1.extent, len2.extent); return result; } static inline Length length_sub(Length len1, Length len2) { Length result; result.bytes = len1.bytes - len2.bytes; result.extent = point_sub(len1.extent, len2.extent); return result; } static inline Length length_zero(void) { Length result = {0, {0, 0}}; return result; } #endif tree-sitter-0.20.1/src/lexer.c000064400000000000000000000271340072674642500142640ustar 00000000000000#include #include "./lexer.h" #include "./subtree.h" #include "./length.h" #include "./unicode.h" #define LOG(message, character) \ if (self->logger.log) { \ snprintf( \ self->debug_buffer, \ TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \ 32 <= character && character < 127 ? \ message " character:'%c'" : \ message " character:%d", \ character \ ); \ self->logger.log( \ self->logger.payload, \ TSLogTypeLex, \ self->debug_buffer \ ); \ } static const int32_t BYTE_ORDER_MARK = 0xFEFF; static const TSRange DEFAULT_RANGE = { .start_point = { .row = 0, .column = 0, }, .end_point = { .row = UINT32_MAX, .column = UINT32_MAX, }, .start_byte = 0, .end_byte = UINT32_MAX }; // Check if the lexer has reached EOF. This state is stored // by setting the lexer's `current_included_range_index` such that // it has consumed all of its available ranges. 
static bool ts_lexer__eof(const TSLexer *_self) { Lexer *self = (Lexer *)_self; return self->current_included_range_index == self->included_range_count; } // Clear the currently stored chunk of source code, because the lexer's // position has changed. static void ts_lexer__clear_chunk(Lexer *self) { self->chunk = NULL; self->chunk_size = 0; self->chunk_start = 0; } // Call the lexer's input callback to obtain a new chunk of source code // for the current position. static void ts_lexer__get_chunk(Lexer *self) { self->chunk_start = self->current_position.bytes; self->chunk = self->input.read( self->input.payload, self->current_position.bytes, self->current_position.extent, &self->chunk_size ); if (!self->chunk_size) { self->current_included_range_index = self->included_range_count; self->chunk = NULL; } } // Decode the next unicode character in the current chunk of source code. // This assumes that the lexer has already retrieved a chunk of source // code that spans the current position. static void ts_lexer__get_lookahead(Lexer *self) { uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start; uint32_t size = self->chunk_size - position_in_chunk; if (size == 0) { self->lookahead_size = 1; self->data.lookahead = '\0'; return; } const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 : ts_decode_utf16; self->lookahead_size = decode(chunk, size, &self->data.lookahead); // If this chunk ended in the middle of a multi-byte character, // try again with a fresh chunk. 
if (self->data.lookahead == TS_DECODE_ERROR && size < 4) { ts_lexer__get_chunk(self); chunk = (const uint8_t *)self->chunk; size = self->chunk_size; self->lookahead_size = decode(chunk, size, &self->data.lookahead); } if (self->data.lookahead == TS_DECODE_ERROR) { self->lookahead_size = 1; } } static void ts_lexer_goto(Lexer *self, Length position) { self->current_position = position; bool found_included_range = false; // Move to the first valid position at or after the given position. for (unsigned i = 0; i < self->included_range_count; i++) { TSRange *included_range = &self->included_ranges[i]; if (included_range->end_byte > position.bytes) { if (included_range->start_byte >= position.bytes) { self->current_position = (Length) { .bytes = included_range->start_byte, .extent = included_range->start_point, }; } self->current_included_range_index = i; found_included_range = true; break; } } if (found_included_range) { // If the current position is outside of the current chunk of text, // then clear out the current chunk of text. if (self->chunk && ( position.bytes < self->chunk_start || position.bytes >= self->chunk_start + self->chunk_size )) { ts_lexer__clear_chunk(self); } self->lookahead_size = 0; self->data.lookahead = '\0'; } // If the given position is beyond any of included ranges, move to the EOF // state - past the end of the included ranges. else { self->current_included_range_index = self->included_range_count; TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1]; self->current_position = (Length) { .bytes = last_included_range->end_byte, .extent = last_included_range->end_point, }; ts_lexer__clear_chunk(self); self->lookahead_size = 1; self->data.lookahead = '\0'; } } // Advance to the next character in the source code, retrieving a new // chunk of source code if needed. 
static void ts_lexer__advance(TSLexer *_self, bool skip) { Lexer *self = (Lexer *)_self; if (!self->chunk) return; if (skip) { LOG("skip", self->data.lookahead); } else { LOG("consume", self->data.lookahead); } if (self->lookahead_size) { self->current_position.bytes += self->lookahead_size; if (self->data.lookahead == '\n') { self->current_position.extent.row++; self->current_position.extent.column = 0; } else { self->current_position.extent.column += self->lookahead_size; } } const TSRange *current_range = NULL; if (self->current_included_range_index < self->included_range_count) { current_range = &self->included_ranges[self->current_included_range_index]; if (self->current_position.bytes == current_range->end_byte) { self->current_included_range_index++; if (self->current_included_range_index < self->included_range_count) { current_range++; self->current_position = (Length) { current_range->start_byte, current_range->start_point, }; } else { current_range = NULL; } } } if (skip) self->token_start_position = self->current_position; if (current_range) { if (self->current_position.bytes >= self->chunk_start + self->chunk_size) { ts_lexer__get_chunk(self); } ts_lexer__get_lookahead(self); } else { ts_lexer__clear_chunk(self); self->data.lookahead = '\0'; self->lookahead_size = 1; } } // Mark that a token match has completed. This can be called multiple // times if a longer match is found later. static void ts_lexer__mark_end(TSLexer *_self) { Lexer *self = (Lexer *)_self; if (!ts_lexer__eof(&self->data)) { // If the lexer is right at the beginning of included range, // then the token should be considered to end at the *end* of the // previous included range, rather than here. 
TSRange *current_included_range = &self->included_ranges[ self->current_included_range_index ]; if ( self->current_included_range_index > 0 && self->current_position.bytes == current_included_range->start_byte ) { TSRange *previous_included_range = current_included_range - 1; self->token_end_position = (Length) { previous_included_range->end_byte, previous_included_range->end_point, }; return; } } self->token_end_position = self->current_position; } static uint32_t ts_lexer__get_column(TSLexer *_self) { Lexer *self = (Lexer *)_self; self->did_get_column = true; return self->current_position.extent.column; } // Is the lexer at a boundary between two disjoint included ranges of // source code? This is exposed as an API because some languages' external // scanners need to perform custom actions at these boundaries. static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) { const Lexer *self = (const Lexer *)_self; if (self->current_included_range_index < self->included_range_count) { TSRange *current_range = &self->included_ranges[self->current_included_range_index]; return self->current_position.bytes == current_range->start_byte; } else { return false; } } void ts_lexer_init(Lexer *self) { *self = (Lexer) { .data = { // The lexer's methods are stored as struct fields so that generated // parsers can call them without needing to be linked against this // library. 
.advance = ts_lexer__advance, .mark_end = ts_lexer__mark_end, .get_column = ts_lexer__get_column, .is_at_included_range_start = ts_lexer__is_at_included_range_start, .eof = ts_lexer__eof, .lookahead = 0, .result_symbol = 0, }, .chunk = NULL, .chunk_size = 0, .chunk_start = 0, .current_position = {0, {0, 0}}, .logger = { .payload = NULL, .log = NULL }, .included_ranges = NULL, .included_range_count = 0, .current_included_range_index = 0, }; ts_lexer_set_included_ranges(self, NULL, 0); } void ts_lexer_delete(Lexer *self) { ts_free(self->included_ranges); } void ts_lexer_set_input(Lexer *self, TSInput input) { self->input = input; ts_lexer__clear_chunk(self); ts_lexer_goto(self, self->current_position); } // Move the lexer to the given position. This doesn't do any work // if the parser is already at the given position. void ts_lexer_reset(Lexer *self, Length position) { if (position.bytes != self->current_position.bytes) { ts_lexer_goto(self, position); } } void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; self->token_end_position = LENGTH_UNDEFINED; self->data.result_symbol = 0; self->did_get_column = false; if (!ts_lexer__eof(&self->data)) { if (!self->chunk_size) ts_lexer__get_chunk(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self); if ( self->current_position.bytes == 0 && self->data.lookahead == BYTE_ORDER_MARK ) ts_lexer__advance(&self->data, true); } } void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { if (length_is_undefined(self->token_end_position)) { ts_lexer__mark_end(&self->data); } uint32_t current_lookahead_end_byte = self->current_position.bytes + 1; // In order to determine that a byte sequence is invalid UTF8 or UTF16, // the character decoding algorithm may have looked at the following byte. // Therefore, the next byte *after* the current (invalid) character // affects the interpretation of the current character. 
if (self->data.lookahead == TS_DECODE_ERROR) { current_lookahead_end_byte++; } if (current_lookahead_end_byte > *lookahead_end_byte) { *lookahead_end_byte = current_lookahead_end_byte; } } void ts_lexer_advance_to_end(Lexer *self) { while (self->chunk) { ts_lexer__advance(&self->data, false); } } void ts_lexer_mark_end(Lexer *self) { ts_lexer__mark_end(&self->data); } bool ts_lexer_set_included_ranges( Lexer *self, const TSRange *ranges, uint32_t count ) { if (count == 0 || !ranges) { ranges = &DEFAULT_RANGE; count = 1; } else { uint32_t previous_byte = 0; for (unsigned i = 0; i < count; i++) { const TSRange *range = &ranges[i]; if ( range->start_byte < previous_byte || range->end_byte < range->start_byte ) return false; previous_byte = range->end_byte; } } size_t size = count * sizeof(TSRange); self->included_ranges = ts_realloc(self->included_ranges, size); memcpy(self->included_ranges, ranges, size); self->included_range_count = count; ts_lexer_goto(self, self->current_position); return true; } TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) { *count = self->included_range_count; return self->included_ranges; } #undef LOG tree-sitter-0.20.1/src/lexer.h000064400000000000000000000022110072674642500142560ustar 00000000000000#ifndef TREE_SITTER_LEXER_H_ #define TREE_SITTER_LEXER_H_ #ifdef __cplusplus extern "C" { #endif #include "./length.h" #include "./subtree.h" #include "tree_sitter/api.h" #include "tree_sitter/parser.h" typedef struct { TSLexer data; Length current_position; Length token_start_position; Length token_end_position; TSRange *included_ranges; const char *chunk; TSInput input; TSLogger logger; uint32_t included_range_count; uint32_t current_included_range_index; uint32_t chunk_start; uint32_t chunk_size; uint32_t lookahead_size; bool did_get_column; char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; } Lexer; void ts_lexer_init(Lexer *); void ts_lexer_delete(Lexer *); void ts_lexer_set_input(Lexer *, TSInput); void 
ts_lexer_reset(Lexer *, Length); void ts_lexer_start(Lexer *); void ts_lexer_finish(Lexer *, uint32_t *); void ts_lexer_advance_to_end(Lexer *); void ts_lexer_mark_end(Lexer *); bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count); TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count); #ifdef __cplusplus } #endif #endif // TREE_SITTER_LEXER_H_ tree-sitter-0.20.1/src/lib.c000064400000000000000000000006500072674642500137050ustar 00000000000000// The Tree-sitter library can be built by compiling this one source file. // // The following directories must be added to the include path: // - include #define _POSIX_C_SOURCE 200112L #include "./get_changed_ranges.c" #include "./language.c" #include "./lexer.c" #include "./node.c" #include "./parser.c" #include "./query.c" #include "./stack.c" #include "./subtree.c" #include "./tree_cursor.c" #include "./tree.c" tree-sitter-0.20.1/src/node.c000064400000000000000000000454350072674642500140760ustar 00000000000000#include #include "./subtree.h" #include "./tree.h" #include "./language.h" typedef struct { Subtree parent; const TSTree *tree; Length position; uint32_t child_index; uint32_t structural_child_index; const TSSymbol *alias_sequence; } NodeChildIterator; // TSNode - constructors TSNode ts_node_new( const TSTree *tree, const Subtree *subtree, Length position, TSSymbol alias ) { return (TSNode) { {position.bytes, position.extent.row, position.extent.column, alias}, subtree, tree, }; } static inline TSNode ts_node__null(void) { return ts_node_new(NULL, NULL, length_zero(), 0); } // TSNode - accessors uint32_t ts_node_start_byte(TSNode self) { return self.context[0]; } TSPoint ts_node_start_point(TSNode self) { return (TSPoint) {self.context[1], self.context[2]}; } static inline uint32_t ts_node__alias(const TSNode *self) { return self->context[3]; } static inline Subtree ts_node__subtree(TSNode self) { return *(const Subtree *)self.id; } // NodeChildIterator static inline 
NodeChildIterator ts_node_iterate_children(const TSNode *node) { Subtree subtree = ts_node__subtree(*node); if (ts_subtree_child_count(subtree) == 0) { return (NodeChildIterator) {NULL_SUBTREE, node->tree, length_zero(), 0, 0, NULL}; } const TSSymbol *alias_sequence = ts_language_alias_sequence( node->tree->language, subtree.ptr->production_id ); return (NodeChildIterator) { .tree = node->tree, .parent = subtree, .position = {ts_node_start_byte(*node), ts_node_start_point(*node)}, .child_index = 0, .structural_child_index = 0, .alias_sequence = alias_sequence, }; } static inline bool ts_node_child_iterator_done(NodeChildIterator *self) { return self->child_index == self->parent.ptr->child_count; } static inline bool ts_node_child_iterator_next( NodeChildIterator *self, TSNode *result ) { if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false; const Subtree *child = &ts_subtree_children(self->parent)[self->child_index]; TSSymbol alias_symbol = 0; if (!ts_subtree_extra(*child)) { if (self->alias_sequence) { alias_symbol = self->alias_sequence[self->structural_child_index]; } self->structural_child_index++; } if (self->child_index > 0) { self->position = length_add(self->position, ts_subtree_padding(*child)); } *result = ts_node_new( self->tree, child, self->position, alias_symbol ); self->position = length_add(self->position, ts_subtree_size(*child)); self->child_index++; return true; } // TSNode - private static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) { Subtree tree = ts_node__subtree(self); if (include_anonymous) { return ts_subtree_visible(tree) || ts_node__alias(&self); } else { TSSymbol alias = ts_node__alias(&self); if (alias) { return ts_language_symbol_metadata(self.tree->language, alias).named; } else { return ts_subtree_visible(tree) && ts_subtree_named(tree); } } } static inline uint32_t ts_node__relevant_child_count( TSNode self, bool include_anonymous ) { Subtree tree = ts_node__subtree(self); if 
(ts_subtree_child_count(tree) > 0) { if (include_anonymous) { return tree.ptr->visible_child_count; } else { return tree.ptr->named_child_count; } } else { return 0; } } static inline TSNode ts_node__child( TSNode self, uint32_t child_index, bool include_anonymous ) { TSNode result = self; bool did_descend = true; while (did_descend) { did_descend = false; TSNode child; uint32_t index = 0; NodeChildIterator iterator = ts_node_iterate_children(&result); while (ts_node_child_iterator_next(&iterator, &child)) { if (ts_node__is_relevant(child, include_anonymous)) { if (index == child_index) { return child; } index++; } else { uint32_t grandchild_index = child_index - index; uint32_t grandchild_count = ts_node__relevant_child_count(child, include_anonymous); if (grandchild_index < grandchild_count) { did_descend = true; result = child; child_index = grandchild_index; break; } index += grandchild_count; } } } return ts_node__null(); } static bool ts_subtree_has_trailing_empty_descendant( Subtree self, Subtree other ) { for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) { Subtree child = ts_subtree_children(self)[i]; if (ts_subtree_total_bytes(child) > 0) break; if (child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) { return true; } } return false; } static inline TSNode ts_node__prev_sibling(TSNode self, bool include_anonymous) { Subtree self_subtree = ts_node__subtree(self); bool self_is_empty = ts_subtree_total_bytes(self_subtree) == 0; uint32_t target_end_byte = ts_node_end_byte(self); TSNode node = ts_node_parent(self); TSNode earlier_node = ts_node__null(); bool earlier_node_is_relevant = false; while (!ts_node_is_null(node)) { TSNode earlier_child = ts_node__null(); bool earlier_child_is_relevant = false; bool found_child_containing_target = false; TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { if (child.id == self.id) break; if 
(iterator.position.bytes > target_end_byte) { found_child_containing_target = true; break; } if (iterator.position.bytes == target_end_byte && (!self_is_empty || ts_subtree_has_trailing_empty_descendant(ts_node__subtree(child), self_subtree))) { found_child_containing_target = true; break; } if (ts_node__is_relevant(child, include_anonymous)) { earlier_child = child; earlier_child_is_relevant = true; } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { earlier_child = child; earlier_child_is_relevant = false; } } if (found_child_containing_target) { if (!ts_node_is_null(earlier_child)) { earlier_node = earlier_child; earlier_node_is_relevant = earlier_child_is_relevant; } node = child; } else if (earlier_child_is_relevant) { return earlier_child; } else if (!ts_node_is_null(earlier_child)) { node = earlier_child; } else if (earlier_node_is_relevant) { return earlier_node; } else { node = earlier_node; } } return ts_node__null(); } static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) { uint32_t target_end_byte = ts_node_end_byte(self); TSNode node = ts_node_parent(self); TSNode later_node = ts_node__null(); bool later_node_is_relevant = false; while (!ts_node_is_null(node)) { TSNode later_child = ts_node__null(); bool later_child_is_relevant = false; TSNode child_containing_target = ts_node__null(); TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { if (iterator.position.bytes < target_end_byte) continue; if (ts_node_start_byte(child) <= ts_node_start_byte(self)) { if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) { child_containing_target = child; } } else if (ts_node__is_relevant(child, include_anonymous)) { later_child = child; later_child_is_relevant = true; break; } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { later_child = child; later_child_is_relevant = false; break; } } if 
(!ts_node_is_null(child_containing_target)) { if (!ts_node_is_null(later_child)) { later_node = later_child; later_node_is_relevant = later_child_is_relevant; } node = child_containing_target; } else if (later_child_is_relevant) { return later_child; } else if (!ts_node_is_null(later_child)) { node = later_child; } else if (later_node_is_relevant) { return later_node; } else { node = later_node; } } return ts_node__null(); } static inline TSNode ts_node__first_child_for_byte( TSNode self, uint32_t goal, bool include_anonymous ) { TSNode node = self; bool did_descend = true; while (did_descend) { did_descend = false; TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { if (ts_node_end_byte(child) > goal) { if (ts_node__is_relevant(child, include_anonymous)) { return child; } else if (ts_node_child_count(child) > 0) { did_descend = true; node = child; break; } } } } return ts_node__null(); } static inline TSNode ts_node__descendant_for_byte_range( TSNode self, uint32_t range_start, uint32_t range_end, bool include_anonymous ) { TSNode node = self; TSNode last_visible_node = self; bool did_descend = true; while (did_descend) { did_descend = false; TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { uint32_t node_end = iterator.position.bytes; // The end of this node must extend far enough forward to touch // the end of the range and exceed the start of the range. if (node_end < range_end) continue; if (node_end <= range_start) continue; // The start of this node must extend far enough backward to // touch the start of the range. 
if (range_start < ts_node_start_byte(child)) break; node = child; if (ts_node__is_relevant(node, include_anonymous)) { last_visible_node = node; } did_descend = true; break; } } return last_visible_node; } static inline TSNode ts_node__descendant_for_point_range( TSNode self, TSPoint range_start, TSPoint range_end, bool include_anonymous ) { TSNode node = self; TSNode last_visible_node = self; bool did_descend = true; while (did_descend) { did_descend = false; TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { TSPoint node_end = iterator.position.extent; // The end of this node must extend far enough forward to touch // the end of the range and exceed the start of the range. if (point_lt(node_end, range_end)) continue; if (point_lte(node_end, range_start)) continue; // The start of this node must extend far enough backward to // touch the start of the range. if (point_lt(range_start, ts_node_start_point(child))) break; node = child; if (ts_node__is_relevant(node, include_anonymous)) { last_visible_node = node; } did_descend = true; break; } } return last_visible_node; } // TSNode - public uint32_t ts_node_end_byte(TSNode self) { return ts_node_start_byte(self) + ts_subtree_size(ts_node__subtree(self)).bytes; } TSPoint ts_node_end_point(TSNode self) { return point_add(ts_node_start_point(self), ts_subtree_size(ts_node__subtree(self)).extent); } TSSymbol ts_node_symbol(TSNode self) { TSSymbol symbol = ts_node__alias(&self); if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); return ts_language_public_symbol(self.tree->language, symbol); } const char *ts_node_type(TSNode self) { TSSymbol symbol = ts_node__alias(&self); if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); return ts_language_symbol_name(self.tree->language, symbol); } char *ts_node_string(TSNode self) { return ts_subtree_string(ts_node__subtree(self), self.tree->language, false); } bool 
ts_node_eq(TSNode self, TSNode other) { return self.tree == other.tree && self.id == other.id; } bool ts_node_is_null(TSNode self) { return self.id == 0; } bool ts_node_is_extra(TSNode self) { return ts_subtree_extra(ts_node__subtree(self)); } bool ts_node_is_named(TSNode self) { TSSymbol alias = ts_node__alias(&self); return alias ? ts_language_symbol_metadata(self.tree->language, alias).named : ts_subtree_named(ts_node__subtree(self)); } bool ts_node_is_missing(TSNode self) { return ts_subtree_missing(ts_node__subtree(self)); } bool ts_node_has_changes(TSNode self) { return ts_subtree_has_changes(ts_node__subtree(self)); } bool ts_node_has_error(TSNode self) { return ts_subtree_error_cost(ts_node__subtree(self)) > 0; } TSNode ts_node_parent(TSNode self) { TSNode node = ts_tree_root_node(self.tree); uint32_t end_byte = ts_node_end_byte(self); if (node.id == self.id) return ts_node__null(); TSNode last_visible_node = node; bool did_descend = true; while (did_descend) { did_descend = false; TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { if ( ts_node_start_byte(child) > ts_node_start_byte(self) || child.id == self.id ) break; if (iterator.position.bytes >= end_byte) { node = child; if (ts_node__is_relevant(child, true)) { last_visible_node = node; } did_descend = true; break; } } } return last_visible_node; } TSNode ts_node_child(TSNode self, uint32_t child_index) { return ts_node__child(self, child_index, true); } TSNode ts_node_named_child(TSNode self, uint32_t child_index) { return ts_node__child(self, child_index, false); } TSNode ts_node_child_by_field_id(TSNode self, TSFieldId field_id) { recur: if (!field_id || ts_node_child_count(self) == 0) return ts_node__null(); const TSFieldMapEntry *field_map, *field_map_end; ts_language_field_map( self.tree->language, ts_node__subtree(self).ptr->production_id, &field_map, &field_map_end ); if (field_map == field_map_end) return 
ts_node__null(); // The field mappings are sorted by their field id. Scan all // the mappings to find the ones for the given field id. while (field_map->field_id < field_id) { field_map++; if (field_map == field_map_end) return ts_node__null(); } while (field_map_end[-1].field_id > field_id) { field_map_end--; if (field_map == field_map_end) return ts_node__null(); } TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&self); while (ts_node_child_iterator_next(&iterator, &child)) { if (!ts_subtree_extra(ts_node__subtree(child))) { uint32_t index = iterator.structural_child_index - 1; if (index < field_map->child_index) continue; // Hidden nodes' fields are "inherited" by their visible parent. if (field_map->inherited) { // If this is the *last* possible child node for this field, // then perform a tail call to avoid recursion. if (field_map + 1 == field_map_end) { self = child; goto recur; } // Otherwise, descend into this child, but if it doesn't contain // the field, continue searching subsequent children. else { TSNode result = ts_node_child_by_field_id(child, field_id); if (result.id) return result; field_map++; if (field_map == field_map_end) return ts_node__null(); } } else if (ts_node__is_relevant(child, true)) { return child; } // If the field refers to a hidden node with visible children, // return the first visible child. else if (ts_node_child_count(child) > 0 ) { return ts_node_child(child, 0); } // Otherwise, continue searching subsequent children. 
else { field_map++; if (field_map == field_map_end) return ts_node__null(); } } } return ts_node__null(); } const char *ts_node_field_name_for_child(TSNode self, uint32_t child_index) { const TSFieldMapEntry *field_map_start = NULL, *field_map_end = NULL; ts_language_field_map( self.tree->language, ts_node__subtree(self).ptr->production_id, &field_map_start, &field_map_end ); for (const TSFieldMapEntry *i = field_map_start; i < field_map_end; i++) { if (i->child_index == child_index) { return self.tree->language->field_names[i->field_id]; } } return NULL; } TSNode ts_node_child_by_field_name( TSNode self, const char *name, uint32_t name_length ) { TSFieldId field_id = ts_language_field_id_for_name( self.tree->language, name, name_length ); return ts_node_child_by_field_id(self, field_id); } uint32_t ts_node_child_count(TSNode self) { Subtree tree = ts_node__subtree(self); if (ts_subtree_child_count(tree) > 0) { return tree.ptr->visible_child_count; } else { return 0; } } uint32_t ts_node_named_child_count(TSNode self) { Subtree tree = ts_node__subtree(self); if (ts_subtree_child_count(tree) > 0) { return tree.ptr->named_child_count; } else { return 0; } } TSNode ts_node_next_sibling(TSNode self) { return ts_node__next_sibling(self, true); } TSNode ts_node_next_named_sibling(TSNode self) { return ts_node__next_sibling(self, false); } TSNode ts_node_prev_sibling(TSNode self) { return ts_node__prev_sibling(self, true); } TSNode ts_node_prev_named_sibling(TSNode self) { return ts_node__prev_sibling(self, false); } TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte) { return ts_node__first_child_for_byte(self, byte, true); } TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte) { return ts_node__first_child_for_byte(self, byte, false); } TSNode ts_node_descendant_for_byte_range( TSNode self, uint32_t start, uint32_t end ) { return ts_node__descendant_for_byte_range(self, start, end, true); } TSNode ts_node_named_descendant_for_byte_range( 
TSNode self, uint32_t start, uint32_t end ) { return ts_node__descendant_for_byte_range(self, start, end, false); } TSNode ts_node_descendant_for_point_range( TSNode self, TSPoint start, TSPoint end ) { return ts_node__descendant_for_point_range(self, start, end, true); } TSNode ts_node_named_descendant_for_point_range( TSNode self, TSPoint start, TSPoint end ) { return ts_node__descendant_for_point_range(self, start, end, false); } void ts_node_edit(TSNode *self, const TSInputEdit *edit) { uint32_t start_byte = ts_node_start_byte(*self); TSPoint start_point = ts_node_start_point(*self); if (start_byte >= edit->old_end_byte) { start_byte = edit->new_end_byte + (start_byte - edit->old_end_byte); start_point = point_add(edit->new_end_point, point_sub(start_point, edit->old_end_point)); } else if (start_byte > edit->start_byte) { start_byte = edit->new_end_byte; start_point = edit->new_end_point; } self->context[0] = start_byte; self->context[1] = start_point.row; self->context[2] = start_point.column; } tree-sitter-0.20.1/src/parser.c000064400000000000000000002043040072674642500144350ustar 00000000000000#include #include #include #include #include #include "tree_sitter/api.h" #include "./alloc.h" #include "./array.h" #include "./atomic.h" #include "./clock.h" #include "./error_costs.h" #include "./get_changed_ranges.h" #include "./language.h" #include "./length.h" #include "./lexer.h" #include "./reduce_action.h" #include "./reusable_node.h" #include "./stack.h" #include "./subtree.h" #include "./tree.h" #define LOG(...) 
// Log the name and size of a freshly lexed lookahead token.
//
// The message is assembled by hand in `self->lexer.debug_buffer` so that
// control characters and backslashes in the symbol name can be written as
// two-character escapes (`\t`, `\n`, `\v`, `\f`, `\r`, `\\`).
//
// The copy loop stops while at least two bytes plus a terminator still fit
// (`off + 2 < TREE_SITTER_SERIALIZATION_BUFFER_SIZE`). An escape writes two
// bytes per input character, so the previous `off < SIZE` guard could write
// one byte past the buffer and then hand `snprintf` a zero or negative
// remaining size, leaving the buffer overrun or without a terminator.
// Over-long symbol names are truncated instead.
//
// Expects a `TSParser *self` to be in scope at the expansion site.
#define LOG_LOOKAHEAD(symbol_name, size)                      \
  if (self->lexer.logger.log || self->dot_graph_file) {       \
    char *buf = self->lexer.debug_buffer;                     \
    const char *symbol = symbol_name;                         \
    int off = sprintf(buf, "lexed_lookahead sym:");           \
    for (                                                     \
      int i = 0;                                              \
      symbol[i] != '\0'                                       \
      && off + 2 < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;     \
      i++                                                     \
    ) {                                                       \
      switch (symbol[i]) {                                    \
      case '\t': buf[off++] = '\\'; buf[off++] = 't'; break;  \
      case '\n': buf[off++] = '\\'; buf[off++] = 'n'; break;  \
      case '\v': buf[off++] = '\\'; buf[off++] = 'v'; break;  \
      case '\f': buf[off++] = '\\'; buf[off++] = 'f'; break;  \
      case '\r': buf[off++] = '\\'; buf[off++] = 'r'; break;  \
      case '\\': buf[off++] = '\\'; buf[off++] = '\\'; break; \
      default:   buf[off++] = symbol[i]; break;               \
      }                                                       \
    }                                                         \
    snprintf(                                                 \
      buf + off,                                              \
      TREE_SITTER_SERIALIZATION_BUFFER_SIZE - off,            \
      ", size:%u",                                            \
      size                                                    \
    );                                                        \
    ts_parser__log(self);                                     \
  }
ReduceActionSet reduce_actions; Subtree finished_tree; SubtreeArray trailing_extras; SubtreeArray trailing_extras2; SubtreeArray scratch_trees; TokenCache token_cache; ReusableNode reusable_node; void *external_scanner_payload; FILE *dot_graph_file; TSClock end_clock; TSDuration timeout_duration; unsigned accept_count; unsigned operation_count; const volatile size_t *cancellation_flag; Subtree old_tree; TSRangeArray included_range_differences; unsigned included_range_difference_index; }; typedef struct { unsigned cost; unsigned node_count; int dynamic_precedence; bool is_in_error; } ErrorStatus; typedef enum { ErrorComparisonTakeLeft, ErrorComparisonPreferLeft, ErrorComparisonNone, ErrorComparisonPreferRight, ErrorComparisonTakeRight, } ErrorComparison; typedef struct { const char *string; uint32_t length; } TSStringInput; // StringInput static const char *ts_string_input_read( void *_self, uint32_t byte, TSPoint pt, uint32_t *length ) { (void)pt; TSStringInput *self = (TSStringInput *)_self; if (byte >= self->length) { *length = 0; return ""; } else { *length = self->length - byte; return self->string + byte; } } // Parser - Private static void ts_parser__log(TSParser *self) { if (self->lexer.logger.log) { self->lexer.logger.log( self->lexer.logger.payload, TSLogTypeParse, self->lexer.debug_buffer ); } if (self->dot_graph_file) { fprintf(self->dot_graph_file, "graph {\nlabel=\""); for (char *c = &self->lexer.debug_buffer[0]; *c != 0; c++) { if (*c == '"') fputc('\\', self->dot_graph_file); fputc(*c, self->dot_graph_file); } fprintf(self->dot_graph_file, "\"\n}\n\n"); } } static bool ts_parser__breakdown_top_of_stack( TSParser *self, StackVersion version ) { bool did_break_down = false; bool pending = false; do { StackSliceArray pop = ts_stack_pop_pending(self->stack, version); if (!pop.size) break; did_break_down = true; pending = false; for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; TSStateId state = ts_stack_state(self->stack, 
slice.version); Subtree parent = *array_front(&slice.subtrees); for (uint32_t j = 0, n = ts_subtree_child_count(parent); j < n; j++) { Subtree child = ts_subtree_children(parent)[j]; pending = ts_subtree_child_count(child) > 0; if (ts_subtree_is_error(child)) { state = ERROR_STATE; } else if (!ts_subtree_extra(child)) { state = ts_language_next_state(self->language, state, ts_subtree_symbol(child)); } ts_subtree_retain(child); ts_stack_push(self->stack, slice.version, child, pending, state); } for (uint32_t j = 1; j < slice.subtrees.size; j++) { Subtree tree = slice.subtrees.contents[j]; ts_stack_push(self->stack, slice.version, tree, false, state); } ts_subtree_release(&self->tree_pool, parent); array_delete(&slice.subtrees); LOG("breakdown_top_of_stack tree:%s", TREE_NAME(parent)); LOG_STACK(); } } while (pending); return did_break_down; } static void ts_parser__breakdown_lookahead( TSParser *self, Subtree *lookahead, TSStateId state, ReusableNode *reusable_node ) { bool did_descend = false; Subtree tree = reusable_node_tree(reusable_node); while (ts_subtree_child_count(tree) > 0 && ts_subtree_parse_state(tree) != state) { LOG("state_mismatch sym:%s", TREE_NAME(tree)); reusable_node_descend(reusable_node); tree = reusable_node_tree(reusable_node); did_descend = true; } if (did_descend) { ts_subtree_release(&self->tree_pool, *lookahead); *lookahead = tree; ts_subtree_retain(*lookahead); } } static ErrorComparison ts_parser__compare_versions( TSParser *self, ErrorStatus a, ErrorStatus b ) { (void)self; if (!a.is_in_error && b.is_in_error) { if (a.cost < b.cost) { return ErrorComparisonTakeLeft; } else { return ErrorComparisonPreferLeft; } } if (a.is_in_error && !b.is_in_error) { if (b.cost < a.cost) { return ErrorComparisonTakeRight; } else { return ErrorComparisonPreferRight; } } if (a.cost < b.cost) { if ((b.cost - a.cost) * (1 + a.node_count) > MAX_COST_DIFFERENCE) { return ErrorComparisonTakeLeft; } else { return ErrorComparisonPreferLeft; } } if (b.cost < 
a.cost) { if ((a.cost - b.cost) * (1 + b.node_count) > MAX_COST_DIFFERENCE) { return ErrorComparisonTakeRight; } else { return ErrorComparisonPreferRight; } } if (a.dynamic_precedence > b.dynamic_precedence) return ErrorComparisonPreferLeft; if (b.dynamic_precedence > a.dynamic_precedence) return ErrorComparisonPreferRight; return ErrorComparisonNone; } static ErrorStatus ts_parser__version_status( TSParser *self, StackVersion version ) { unsigned cost = ts_stack_error_cost(self->stack, version); bool is_paused = ts_stack_is_paused(self->stack, version); if (is_paused) cost += ERROR_COST_PER_SKIPPED_TREE; return (ErrorStatus) { .cost = cost, .node_count = ts_stack_node_count_since_error(self->stack, version), .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), .is_in_error = is_paused || ts_stack_state(self->stack, version) == ERROR_STATE }; } static bool ts_parser__better_version_exists( TSParser *self, StackVersion version, bool is_in_error, unsigned cost ) { if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) <= cost) { return true; } Length position = ts_stack_position(self->stack, version); ErrorStatus status = { .cost = cost, .is_in_error = is_in_error, .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), .node_count = ts_stack_node_count_since_error(self->stack, version), }; for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { if (i == version || !ts_stack_is_active(self->stack, i) || ts_stack_position(self->stack, i).bytes < position.bytes) continue; ErrorStatus status_i = ts_parser__version_status(self, i); switch (ts_parser__compare_versions(self, status, status_i)) { case ErrorComparisonTakeRight: return true; case ErrorComparisonPreferRight: if (ts_stack_can_merge(self->stack, i, version)) return true; break; default: break; } } return false; } static void ts_parser__restore_external_scanner( TSParser *self, Subtree external_token ) { if (external_token.ptr) { 
self->language->external_scanner.deserialize( self->external_scanner_payload, ts_external_scanner_state_data(&external_token.ptr->external_scanner_state), external_token.ptr->external_scanner_state.length ); } else { self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); } } static bool ts_parser__can_reuse_first_leaf( TSParser *self, TSStateId state, Subtree tree, TableEntry *table_entry ) { TSLexMode current_lex_mode = self->language->lex_modes[state]; TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree); TSStateId leaf_state = ts_subtree_leaf_parse_state(tree); TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state]; // At the end of a non-terminal extra node, the lexer normally returns // NULL, which indicates that the parser should look for a reduce action // at symbol `0`. Avoid reusing tokens in this situation to ensure that // the same thing happens when incrementally reparsing. if (current_lex_mode.lex_state == (uint16_t)(-1)) return false; // If the token was created in a state with the same set of lookaheads, it is reusable. if ( table_entry->action_count > 0 && memcmp(&leaf_lex_mode, ¤t_lex_mode, sizeof(TSLexMode)) == 0 && ( leaf_symbol != self->language->keyword_capture_token || (!ts_subtree_is_keyword(tree) && ts_subtree_parse_state(tree) == state) ) ) return true; // Empty tokens are not reusable in states with different lookaheads. if (ts_subtree_size(tree).bytes == 0 && leaf_symbol != ts_builtin_sym_end) return false; // If the current state allows external tokens or other tokens that conflict with this // token, this token is not reusable. 
return current_lex_mode.external_lex_state == 0 && table_entry->is_reusable; } static Subtree ts_parser__lex( TSParser *self, StackVersion version, TSStateId parse_state ) { TSLexMode lex_mode = self->language->lex_modes[parse_state]; if (lex_mode.lex_state == (uint16_t)-1) { LOG("no_lookahead_after_non_terminal_extra"); return NULL_SUBTREE; } Length start_position = ts_stack_position(self->stack, version); Subtree external_token = ts_stack_last_external_token(self->stack, version); const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state ); bool found_external_token = false; bool error_mode = parse_state == ERROR_STATE; bool skipped_error = false; bool called_get_column = false; int32_t first_error_character = 0; Length error_start_position = length_zero(); Length error_end_position = length_zero(); uint32_t lookahead_end_byte = 0; ts_lexer_reset(&self->lexer, start_position); for (;;) { Length current_position = self->lexer.current_position; if (valid_external_tokens) { LOG( "lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state, current_position.extent.row, current_position.extent.column ); ts_lexer_start(&self->lexer); ts_parser__restore_external_scanner(self, external_token); bool found_token = self->language->external_scanner.scan( self->external_scanner_payload, &self->lexer.data, valid_external_tokens ); ts_lexer_finish(&self->lexer, &lookahead_end_byte); // Zero-length external tokens are generally allowed, but they're not // allowed right after a syntax error. This is for two reasons: // 1. After a syntax error, the lexer is looking for any possible token, // as opposed to the specific set of tokens that are valid in some // parse state. In this situation, it's very easy for an external // scanner to produce unwanted zero-length tokens. // 2. The parser sometimes inserts *missing* tokens to recover from // errors. These tokens are also zero-length. 
If we allow more // zero-length tokens to be created after missing tokens, it // can lead to infinite loops. Forbidding zero-length tokens // right at the point of error recovery is a conservative strategy // for preventing this kind of infinite loop. if (found_token && ( self->lexer.token_end_position.bytes > current_position.bytes || (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) )) { found_external_token = true; called_get_column = self->lexer.did_get_column; break; } ts_lexer_reset(&self->lexer, current_position); } LOG( "lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state, current_position.extent.row, current_position.extent.column ); ts_lexer_start(&self->lexer); bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); ts_lexer_finish(&self->lexer, &lookahead_end_byte); if (found_token) break; if (!error_mode) { error_mode = true; lex_mode = self->language->lex_modes[ERROR_STATE]; valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state ); ts_lexer_reset(&self->lexer, start_position); continue; } if (!skipped_error) { LOG("skip_unrecognized_character"); skipped_error = true; error_start_position = self->lexer.token_start_position; error_end_position = self->lexer.token_start_position; first_error_character = self->lexer.data.lookahead; } if (self->lexer.current_position.bytes == error_end_position.bytes) { if (self->lexer.data.eof(&self->lexer.data)) { self->lexer.data.result_symbol = ts_builtin_sym_error; break; } self->lexer.data.advance(&self->lexer.data, false); } error_end_position = self->lexer.current_position; } Subtree result; if (skipped_error) { Length padding = length_sub(error_start_position, start_position); Length size = length_sub(error_end_position, error_start_position); uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes; result = ts_subtree_new_error( &self->tree_pool, first_error_character, padding, size, 
lookahead_bytes, parse_state, self->language ); LOG_LOOKAHEAD( SYM_NAME(ts_subtree_symbol(result)), ts_subtree_total_size(result).bytes ); } else { if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { self->lexer.token_start_position = self->lexer.token_end_position; } bool is_keyword = false; TSSymbol symbol = self->lexer.data.result_symbol; Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes; if (found_external_token) { symbol = self->language->external_scanner.symbol_map[symbol]; } else if (symbol == self->language->keyword_capture_token && symbol != 0) { uint32_t end_byte = self->lexer.token_end_position.bytes; ts_lexer_reset(&self->lexer, self->lexer.token_start_position); ts_lexer_start(&self->lexer); if ( self->language->keyword_lex_fn(&self->lexer.data, 0) && self->lexer.token_end_position.bytes == end_byte && ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) ) { is_keyword = true; symbol = self->lexer.data.result_symbol; } } result = ts_subtree_new_leaf( &self->tree_pool, symbol, padding, size, lookahead_bytes, parse_state, found_external_token, called_get_column, is_keyword, self->language ); if (found_external_token) { unsigned length = self->language->external_scanner.serialize( self->external_scanner_payload, self->lexer.debug_buffer ); ts_external_scanner_state_init( &((SubtreeHeapData *)result.ptr)->external_scanner_state, self->lexer.debug_buffer, length ); } LOG_LOOKAHEAD( SYM_NAME(ts_subtree_symbol(result)), ts_subtree_total_size(result).bytes ); } return result; } static Subtree ts_parser__get_cached_token( TSParser *self, TSStateId state, size_t position, Subtree last_external_token, TableEntry *table_entry ) { TokenCache *cache = &self->token_cache; if ( cache->token.ptr && 
// Try to find a subtree from the old tree that can be reused as the next
// lookahead at byte offset `position`.
//
// Walks `self->reusable_node` until a candidate starting exactly at
// `position` passes every check, or until the walk reaches a node that starts
// after `position` or runs out of nodes.
//
// - `version`: stack version being parsed; its top may be broken down when a
//   rejected candidate cannot be descended into.
// - `state`: in/out parse state; refreshed from the stack after such a
//   breakdown.
// - `last_external_token`: compared against the reusable node's own last
//   external token via `ts_subtree_external_scanner_state_eq`.
// - `table_entry`: out parameter filled with the table entry for the
//   candidate's first leaf symbol in `*state`.
//
// Returns the reusable subtree with an extra reference retained, or
// `NULL_SUBTREE` when nothing can be reused.
static Subtree ts_parser__reuse_node(
  TSParser *self,
  StackVersion version,
  TSStateId *state,
  uint32_t position,
  Subtree last_external_token,
  TableEntry *table_entry
) {
  Subtree result;
  while ((result = reusable_node_tree(&self->reusable_node)).ptr) {
    uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node);
    uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result);

    // Do not reuse an EOF node if the included ranges array has changes
    // later on in the file.
    if (ts_subtree_is_eof(result)) end_byte_offset = UINT32_MAX;

    // The candidate starts after the current position: stop looking.
    if (byte_offset > position) {
      LOG("before_reusable_node symbol:%s", TREE_NAME(result));
      break;
    }

    // The candidate starts before the current position. Skip it entirely if
    // it also ends at or before the position (or has nothing to descend
    // into); otherwise descend to look at its children.
    if (byte_offset < position) {
      LOG("past_reusable_node symbol:%s", TREE_NAME(result));
      if (end_byte_offset <= position || !reusable_node_descend(&self->reusable_node)) {
        reusable_node_advance(&self->reusable_node);
      }
      continue;
    }

    // From here on the candidate starts exactly at `position`.
    if (!ts_subtree_external_scanner_state_eq(self->reusable_node.last_external_token, last_external_token)) {
      LOG("reusable_node_has_different_external_scanner_state symbol:%s", TREE_NAME(result));
      reusable_node_advance(&self->reusable_node);
      continue;
    }

    // Collect the first reason (if any) why this node cannot be reused as a
    // whole; the reason string is only used for logging.
    const char *reason = NULL;
    if (ts_subtree_has_changes(result)) {
      reason = "has_changes";
    } else if (ts_subtree_is_error(result)) {
      reason = "is_error";
    } else if (ts_subtree_missing(result)) {
      reason = "is_missing";
    } else if (ts_subtree_is_fragile(result)) {
      reason = "is_fragile";
    } else if (ts_parser__has_included_range_difference(self, byte_offset, end_byte_offset)) {
      reason = "contains_different_included_range";
    }

    if (reason) {
      LOG("cant_reuse_node_%s tree:%s", reason, TREE_NAME(result));
      // Prefer trying the node's children. If it has none to descend into,
      // move past it, break down the top of the stack, and re-read the parse
      // state, which the breakdown may have changed.
      if (!reusable_node_descend(&self->reusable_node)) {
        reusable_node_advance(&self->reusable_node);
        ts_parser__breakdown_top_of_stack(self, version);
        *state = ts_stack_state(self->stack, version);
      }
      continue;
    }

    // The node itself is acceptable; make sure its first leaf is a valid
    // lookahead in the current state.
    TSSymbol leaf_symbol = ts_subtree_leaf_symbol(result);
    ts_language_table_entry(self->language, *state, leaf_symbol, table_entry);
    if (!ts_parser__can_reuse_first_leaf(self, *state, result, table_entry)) {
      LOG(
        "cant_reuse_node symbol:%s, first_leaf_symbol:%s",
        TREE_NAME(result),
        SYM_NAME(leaf_symbol)
      );
      reusable_node_advance_past_leaf(&self->reusable_node);
      break;
    }

    LOG("reuse_node symbol:%s", TREE_NAME(result));
    // The caller receives its own reference to the reused subtree.
    ts_subtree_retain(result);
    return result;
  }

  return NULL_SUBTREE;
}
// // The decision is based on the trees' error costs (if any), their dynamic precedence, // and finally, as a default, by a recursive comparison of the trees' symbols. static bool ts_parser__select_tree(TSParser *self, Subtree left, Subtree right) { if (!left.ptr) return true; if (!right.ptr) return false; if (ts_subtree_error_cost(right) < ts_subtree_error_cost(left)) { LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(right), TREE_NAME(left)); return true; } if (ts_subtree_error_cost(left) < ts_subtree_error_cost(right)) { LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); return false; } if (ts_subtree_dynamic_precedence(right) > ts_subtree_dynamic_precedence(left)) { LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", TREE_NAME(right), ts_subtree_dynamic_precedence(right), TREE_NAME(left), ts_subtree_dynamic_precedence(left)); return true; } if (ts_subtree_dynamic_precedence(left) > ts_subtree_dynamic_precedence(right)) { LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", TREE_NAME(left), ts_subtree_dynamic_precedence(left), TREE_NAME(right), ts_subtree_dynamic_precedence(right)); return false; } if (ts_subtree_error_cost(left) > 0) return true; int comparison = ts_subtree_compare(left, right); switch (comparison) { case -1: LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); return false; break; case 1: LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(right), TREE_NAME(left)); return true; default: LOG("select_existing symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); return false; } } // Determine if a given tree's children should be replaced by an alternative // array of children. static bool ts_parser__select_children( TSParser *self, Subtree left, const SubtreeArray *children ) { array_assign(&self->scratch_trees, children); // Create a temporary subtree using the scratch trees array. 
This node does // not perform any allocation except for possibly growing the array to make // room for its own heap data. The scratch tree is never explicitly released, // so the same 'scratch trees' array can be reused again later. MutableSubtree scratch_tree = ts_subtree_new_node( ts_subtree_symbol(left), &self->scratch_trees, 0, self->language ); return ts_parser__select_tree( self, left, ts_subtree_from_mut(scratch_tree) ); } static void ts_parser__shift( TSParser *self, StackVersion version, TSStateId state, Subtree lookahead, bool extra ) { bool is_leaf = ts_subtree_child_count(lookahead) == 0; Subtree subtree_to_push = lookahead; if (extra != ts_subtree_extra(lookahead) && is_leaf) { MutableSubtree result = ts_subtree_make_mut(&self->tree_pool, lookahead); ts_subtree_set_extra(&result, extra); subtree_to_push = ts_subtree_from_mut(result); } ts_stack_push(self->stack, version, subtree_to_push, !is_leaf, state); if (ts_subtree_has_external_tokens(subtree_to_push)) { ts_stack_set_last_external_token( self->stack, version, ts_subtree_last_external_token(subtree_to_push) ); } } static StackVersion ts_parser__reduce( TSParser *self, StackVersion version, TSSymbol symbol, uint32_t count, int dynamic_precedence, uint16_t production_id, bool is_fragile, bool end_of_non_terminal_extra ) { uint32_t initial_version_count = ts_stack_version_count(self->stack); // Pop the given number of nodes from the given version of the parse stack. // If stack versions have previously merged, then there may be more than one // path back through the stack. For each path, create a new parent node to // contain the popped children, and push it onto the stack in place of the // children. 
StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); uint32_t removed_version_count = 0; for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; StackVersion slice_version = slice.version - removed_version_count; // This is where new versions are added to the parse stack. The versions // will all be sorted and truncated at the end of the outer parsing loop. // Allow the maximum version count to be temporarily exceeded, but only // by a limited threshold. if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; while (i + 1 < pop.size) { StackSlice next_slice = pop.contents[i + 1]; if (next_slice.version != slice.version) break; ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); i++; } continue; } // Extra tokens on top of the stack should not be included in this new parent // node. They will be re-pushed onto the stack after the parent node is // created and pushed. SubtreeArray children = slice.subtrees; ts_subtree_array_remove_trailing_extras(&children, &self->trailing_extras); MutableSubtree parent = ts_subtree_new_node( symbol, &children, production_id, self->language ); // This pop operation may have caused multiple stack versions to collapse // into one, because they all diverged from a common state. In that case, // choose one of the arrays of trees to be the parent node's children, and // delete the rest of the tree arrays. 
while (i + 1 < pop.size) { StackSlice next_slice = pop.contents[i + 1]; if (next_slice.version != slice.version) break; i++; SubtreeArray children = next_slice.subtrees; ts_subtree_array_remove_trailing_extras(&children, &self->trailing_extras2); if (ts_parser__select_children( self, ts_subtree_from_mut(parent), &children )) { ts_subtree_array_clear(&self->tree_pool, &self->trailing_extras); ts_subtree_release(&self->tree_pool, ts_subtree_from_mut(parent)); array_swap(&self->trailing_extras, &self->trailing_extras2); parent = ts_subtree_new_node( symbol, &children, production_id, self->language ); } else { array_clear(&self->trailing_extras2); ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); } } TSStateId state = ts_stack_state(self->stack, slice_version); TSStateId next_state = ts_language_next_state(self->language, state, symbol); if (end_of_non_terminal_extra && next_state == state) { parent.ptr->extra = true; } if (is_fragile || pop.size > 1 || initial_version_count > 1) { parent.ptr->fragile_left = true; parent.ptr->fragile_right = true; parent.ptr->parse_state = TS_TREE_STATE_NONE; } else { parent.ptr->parse_state = state; } parent.ptr->dynamic_precedence += dynamic_precedence; // Push the parent node onto the stack, along with any extra tokens that // were previously on top of the stack. ts_stack_push(self->stack, slice_version, ts_subtree_from_mut(parent), false, next_state); for (uint32_t j = 0; j < self->trailing_extras.size; j++) { ts_stack_push(self->stack, slice_version, self->trailing_extras.contents[j], false, next_state); } for (StackVersion j = 0; j < slice_version; j++) { if (j == version) continue; if (ts_stack_merge(self->stack, j, slice_version)) { removed_version_count++; break; } } } // Return the first new stack version that was created. return ts_stack_version_count(self->stack) > initial_version_count ? 
// Finish parsing on the given stack version.
//
// Pushes the EOF lookahead, pops the entire stack, and for each popped slice
// builds a root node from it. The new root is then either stored as
// `self->finished_tree` or discarded, as decided by
// `ts_parser__select_tree`. Finally the popped version is removed and
// `version` is halted.
static void ts_parser__accept(
  TSParser *self,
  StackVersion version,
  Subtree lookahead
) {
  assert(ts_subtree_is_eof(lookahead));
  ts_stack_push(self->stack, version, lookahead, false, 1);
  StackSliceArray pop = ts_stack_pop_all(self->stack, version);
  for (uint32_t i = 0; i < pop.size; i++) {
    SubtreeArray trees = pop.contents[i].subtrees;

    // Scan from the end for the last non-extra subtree. `j + 1 > 0` is the
    // unsigned-safe way of writing `j >= 0` for a loop counting down to zero.
    Subtree root = NULL_SUBTREE;
    for (uint32_t j = trees.size - 1; j + 1 > 0; j--) {
      Subtree tree = trees.contents[j];
      if (!ts_subtree_extra(tree)) {
        assert(!tree.data.is_inline);
        uint32_t child_count = ts_subtree_child_count(tree);
        const Subtree *children = ts_subtree_children(tree);
        // The children are about to be referenced by the new root as well,
        // so each one gets an additional reference.
        for (uint32_t k = 0; k < child_count; k++) {
          ts_subtree_retain(children[k]);
        }
        // Replace that subtree, in place, with its own children, so that the
        // array holds the surrounding extras plus those children; then wrap
        // the whole array in a new node with the same symbol and production.
        array_splice(&trees, j, 1, child_count, children);
        root = ts_subtree_from_mut(ts_subtree_new_node(
          ts_subtree_symbol(tree),
          &trees,
          tree.ptr->production_id,
          self->language
        ));
        ts_subtree_release(&self->tree_pool, tree);
        break;
      }
    }

    assert(root.ptr);
    self->accept_count++;

    // Keep whichever of the existing finished tree and the new root is
    // selected, releasing the other.
    if (self->finished_tree.ptr) {
      if (ts_parser__select_tree(self, self->finished_tree, root)) {
        ts_subtree_release(&self->tree_pool, self->finished_tree);
        self->finished_tree = root;
      } else {
        ts_subtree_release(&self->tree_pool, root);
      }
    } else {
      self->finished_tree = root;
    }
  }

  // NOTE(review): this reads `pop.contents[0]` unconditionally, so it assumes
  // `ts_stack_pop_all` always yields at least one slice here — confirm.
  ts_stack_remove_version(self->stack, pop.contents[0].version);
  ts_stack_halt(self->stack, version);
}
// Begin error recovery on the given stack version after encountering a
// lookahead symbol with no valid action.
//
// Steps, in order:
//   1. Perform every reduction possible in the current state, which may
//      create additional stack versions.
//   2. For `version` and each newly created version, try (until it succeeds
//      once) to insert a single *missing* token that would let the lookahead
//      be handled, then push an error-state marker onto that version.
//   3. Merge all of the versions created in step 1 back into `version`.
//   4. Record a summary of the stack for later recovery.
static void ts_parser__handle_error(
  TSParser *self,
  StackVersion version,
  TSSymbol lookahead_symbol
) {
  uint32_t previous_version_count = ts_stack_version_count(self->stack);

  // Perform any reductions that can happen in this state, regardless of the lookahead. After
  // skipping one or more invalid tokens, the parser might find a token that would have allowed
  // a reduction to take place.
  ts_parser__do_all_potential_reductions(self, version, 0);
  uint32_t version_count = ts_stack_version_count(self->stack);
  Length position = ts_stack_position(self->stack, version);

  // Push a discontinuity onto the stack. Merge all of the stack versions that
  // were created in the previous step.
  bool did_insert_missing_token = false;
  for (StackVersion v = version; v < version_count;) {
    if (!did_insert_missing_token) {
      TSStateId state = ts_stack_state(self->stack, v);
      for (TSSymbol missing_symbol = 1;
           missing_symbol < self->language->token_count;
           missing_symbol++) {
        TSStateId state_after_missing_symbol = ts_language_next_state(
          self->language, state, missing_symbol
        );
        // Skip symbols that lead nowhere (state 0) or leave the state
        // unchanged.
        if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) {
          continue;
        }

        if (ts_language_has_reduce_action(
          self->language,
          state_after_missing_symbol,
          lookahead_symbol
        )) {
          // In case the parser is currently outside of any included range, the lexer will
          // snap to the beginning of the next included range. The missing token's padding
          // must be assigned to position it within the next included range.
          ts_lexer_reset(&self->lexer, position);
          ts_lexer_mark_end(&self->lexer);
          Length padding = length_sub(self->lexer.token_end_position, position);

          // Try the missing token on a copy of this version, so that the
          // original version is left untouched.
          StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v);
          Subtree missing_tree = ts_subtree_new_missing_leaf(
            &self->tree_pool, missing_symbol, padding, self->language
          );
          ts_stack_push(
            self->stack, version_with_missing_tree,
            missing_tree, false,
            state_after_missing_symbol
          );

          // The insertion counts as successful only if, after reducing, the
          // lookahead symbol can actually be shifted.
          if (ts_parser__do_all_potential_reductions(
            self, version_with_missing_tree,
            lookahead_symbol
          )) {
            LOG(
              "recover_with_missing symbol:%s, state:%u",
              SYM_NAME(missing_symbol),
              ts_stack_state(self->stack, version_with_missing_tree)
            );
            did_insert_missing_token = true;
            break;
          }
        }
      }
    }

    ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE);
    // Visit `version` first, then jump to the versions created by the
    // reductions above (indices `previous_version_count` and up).
    v = (v == version) ? previous_version_count : v + 1;
  }

  // Fold every version created by the reductions back into `version`. The
  // merge target index stays fixed at `previous_version_count` on every
  // iteration. The merge call sits outside the assert, so it still runs when
  // assertions are compiled out.
  for (unsigned i = previous_version_count; i < version_count; i++) {
    bool did_merge = ts_stack_merge(self->stack, version, previous_version_count);
    assert(did_merge);
  }

  ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);
  LOG_STACK();
}
// Handle one lookahead token while the given stack version is in the error
// state.
//
// Parameters:
//  - `self`      - the parser whose stack is being manipulated.
//  - `version`   - the stack version that is currently recovering.
//  - `lookahead` - the token to recover with. On every path it is either
//                  released, handed to `ts_parser__accept`, or pushed onto
//                  the stack wrapped in an error node.
//
// The function first tries to jump back to an earlier state recorded in the
// stack summary (strategy 1 below), then - unless an early exit applies -
// skips the lookahead by wrapping it in an ERROR node (strategy 2 below).
static void ts_parser__recover(
  TSParser *self,
  StackVersion version,
  Subtree lookahead
) {
  bool did_recover = false;
  unsigned previous_version_count = ts_stack_version_count(self->stack);
  Length position = ts_stack_position(self->stack, version);
  StackSummary *summary = ts_stack_get_summary(self->stack, version);
  unsigned node_count_since_error = ts_stack_node_count_since_error(self->stack, version);
  unsigned current_error_cost = ts_stack_error_cost(self->stack, version);

  // When the parser is in the error state, there are two strategies for recovering with a
  // given lookahead token:
  // 1. Find a previous state on the stack in which that lookahead token would be valid. Then,
  //    create a new stack version that is in that state again. This entails popping all of the
  //    subtrees that have been pushed onto the stack since that previous state, and wrapping
  //    them in an ERROR node.
  // 2. Wrap the lookahead token in an ERROR node, push that ERROR node onto the stack, and
  //    move on to the next lookahead token, remaining in the error state.
  //
  // First, try the strategy 1. Upon entering the error state, the parser recorded a summary
  // of the previous parse states and their depths. Look at each state in the summary, to see
  // if the current lookahead token would be valid in that state.
  if (summary && !ts_subtree_is_error(lookahead)) {
    for (unsigned i = 0; i < summary->size; i++) {
      StackSummaryEntry entry = summary->contents[i];

      // Skip entries that are themselves error states, and entries located at
      // the current byte position.
      if (entry.state == ERROR_STATE) continue;
      if (entry.position.bytes == position.bytes) continue;

      // When nodes have been pushed since the error, one additional entry is
      // popped.
      unsigned depth = entry.depth;
      if (node_count_since_error > 0) depth++;

      // Do not recover in ways that create redundant stack versions.
      bool would_merge = false;
      for (unsigned j = 0; j < previous_version_count; j++) {
        if (
          ts_stack_state(self->stack, j) == entry.state &&
          ts_stack_position(self->stack, j).bytes == position.bytes
        ) {
          would_merge = true;
          break;
        }
      }

      if (would_merge) continue;

      // Do not recover if the result would clearly be worse than some existing stack version.
      unsigned new_cost =
        current_error_cost +
        entry.depth * ERROR_COST_PER_SKIPPED_TREE +
        (position.bytes - entry.position.bytes) * ERROR_COST_PER_SKIPPED_CHAR +
        (position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE;
      if (ts_parser__better_version_exists(self, version, false, new_cost)) break;

      // If the current lookahead token is valid in some previous state, recover to that state.
      // Then stop looking for further recoveries.
      if (ts_language_has_actions(self->language, entry.state, ts_subtree_symbol(lookahead))) {
        if (ts_parser__recover_to_state(self, version, depth, entry.state)) {
          did_recover = true;
          LOG("recover_to_previous state:%u, depth:%u", entry.state, depth);
          LOG_STACK();
          break;
        }
      }
    }
  }

  // In the process of attempting to recover, some stack versions may have been created
  // and subsequently halted. Remove those versions.
  for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) {
    if (!ts_stack_is_active(self->stack, i)) {
      // Post-decrement so that the loop's `i++` revisits the same index after
      // the removal.
      ts_stack_remove_version(self->stack, i--);
    }
  }

  // If strategy 1 succeeded, a new stack version will have been created which is able to handle
  // the current lookahead token. Now, in addition, try strategy 2 described above: skip the
  // current lookahead token by wrapping it in an ERROR node.

  // Don't pursue this additional strategy if there are already too many stack versions.
  if (did_recover && ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) {
    ts_stack_halt(self->stack, version);
    ts_subtree_release(&self->tree_pool, lookahead);
    return;
  }

  // If the parser is still in the error state at the end of the file, just wrap everything
  // in an ERROR node and terminate.
  if (ts_subtree_is_eof(lookahead)) {
    LOG("recover_eof");
    SubtreeArray children = array_new();
    Subtree parent = ts_subtree_new_error_node(&children, false, self->language);
    ts_stack_push(self->stack, version, parent, false, 1);
    ts_parser__accept(self, version, lookahead);
    return;
  }

  // Do not recover if the result would clearly be worse than some existing stack version.
  unsigned new_cost =
    current_error_cost + ERROR_COST_PER_SKIPPED_TREE +
    ts_subtree_total_bytes(lookahead) * ERROR_COST_PER_SKIPPED_CHAR +
    ts_subtree_total_size(lookahead).extent.row * ERROR_COST_PER_SKIPPED_LINE;
  if (ts_parser__better_version_exists(self, version, false, new_cost)) {
    ts_stack_halt(self->stack, version);
    ts_subtree_release(&self->tree_pool, lookahead);
    return;
  }

  // If the current lookahead token is an extra token, mark it as extra. This means it won't
  // be counted in error cost calculations.
  // The actions are looked up in state 1; only the last action of the entry is inspected.
  unsigned n;
  const TSParseAction *actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n);
  if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].shift.extra) {
    MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead);
    ts_subtree_set_extra(&mutable_lookahead, true);
    lookahead = ts_subtree_from_mut(mutable_lookahead);
  }

  // Wrap the lookahead token in an ERROR.
  LOG("skip_token symbol:%s", TREE_NAME(lookahead));
  SubtreeArray children = array_new();
  array_reserve(&children, 1);
  array_push(&children, lookahead);
  MutableSubtree error_repeat = ts_subtree_new_node(
    ts_builtin_sym_error_repeat,
    &children,
    0,
    self->language
  );

  // If other tokens have already been skipped, so there is already an ERROR at the top of the
  // stack, then pop that ERROR off the stack and wrap the two ERRORs together into one larger
  // ERROR.
  if (node_count_since_error > 0) {
    StackSliceArray pop = ts_stack_pop_count(self->stack, version, 1);

    // TODO: Figure out how to make this condition occur.
    // See https://github.com/atom/atom/issues/18450#issuecomment-439579778
    // If multiple stack versions have merged at this point, just pick one of the errors
    // arbitrarily and discard the rest.
    if (pop.size > 1) {
      for (unsigned i = 1; i < pop.size; i++) {
        ts_subtree_array_delete(&self->tree_pool, &pop.contents[i].subtrees);
      }
      // Remove every stack version that follows the first popped slice's version.
      while (ts_stack_version_count(self->stack) > pop.contents[0].version + 1) {
        ts_stack_remove_version(self->stack, pop.contents[0].version + 1);
      }
    }

    // Move the popped version back into the `version` slot, then append the
    // new error node to the popped subtrees and wrap them all together.
    ts_stack_renumber_version(self->stack, pop.contents[0].version, version);
    array_push(&pop.contents[0].subtrees, ts_subtree_from_mut(error_repeat));
    error_repeat = ts_subtree_new_node(
      ts_builtin_sym_error_repeat,
      &pop.contents[0].subtrees,
      0,
      self->language
    );
  }

  // Push the new ERROR onto the stack.
  ts_stack_push(self->stack, version, ts_subtree_from_mut(error_repeat), false, ERROR_STATE);

  // Carry the lookahead's last external token over to the stack version.
  if (ts_subtree_has_external_tokens(lookahead)) {
    ts_stack_set_last_external_token(
      self->stack, version, ts_subtree_last_external_token(lookahead)
    );
  }
}
if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { self->operation_count = 0; } if ( self->operation_count == 0 && ((self->cancellation_flag && atomic_load(self->cancellation_flag)) || (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock))) ) { ts_subtree_release(&self->tree_pool, lookahead); return false; } // Process each parse action for the current lookahead token in // the current state. If there are multiple actions, then this is // an ambiguous state. REDUCE actions always create a new stack // version, whereas SHIFT actions update the existing stack version // and terminate this loop. StackVersion last_reduction_version = STACK_VERSION_NONE; for (uint32_t i = 0; i < table_entry.action_count; i++) { TSParseAction action = table_entry.actions[i]; switch (action.type) { case TSParseActionTypeShift: { if (action.shift.repetition) break; TSStateId next_state; if (action.shift.extra) { next_state = state; LOG("shift_extra"); } else { next_state = action.shift.state; LOG("shift state:%u", next_state); } if (ts_subtree_child_count(lookahead) > 0) { ts_parser__breakdown_lookahead(self, &lookahead, state, &self->reusable_node); next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead)); } ts_parser__shift(self, version, next_state, lookahead, action.shift.extra); if (did_reuse) reusable_node_advance(&self->reusable_node); return true; } case TSParseActionTypeReduce: { bool is_fragile = table_entry.action_count > 1; bool end_of_non_terminal_extra = lookahead.ptr == NULL; LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.reduce.symbol), action.reduce.child_count); StackVersion reduction_version = ts_parser__reduce( self, version, action.reduce.symbol, action.reduce.child_count, action.reduce.dynamic_precedence, action.reduce.production_id, is_fragile, end_of_non_terminal_extra ); if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; } break; } case 
TSParseActionTypeAccept: { LOG("accept"); ts_parser__accept(self, version, lookahead); return true; } case TSParseActionTypeRecover: { if (ts_subtree_child_count(lookahead) > 0) { ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node); } ts_parser__recover(self, version, lookahead); if (did_reuse) reusable_node_advance(&self->reusable_node); return true; } } } // If a reduction was performed, then replace the current stack version // with one of the stack versions created by a reduction, and continue // processing this version of the stack with the same lookahead symbol. if (last_reduction_version != STACK_VERSION_NONE) { ts_stack_renumber_version(self->stack, last_reduction_version, version); LOG_STACK(); state = ts_stack_state(self->stack, version); // At the end of a non-terminal extra rule, the lexer will return a // null subtree, because the parser needs to perform a fixed reduction // regardless of the lookahead node. After performing that reduction, // (and completing the non-terminal extra rule) run the lexer again based // on the current parse state. if (!lookahead.ptr) { needs_lex = true; continue; } ts_language_table_entry( self->language, state, ts_subtree_leaf_symbol(lookahead), &table_entry ); continue; } if (!lookahead.ptr) { ts_stack_pause(self->stack, version, ts_builtin_sym_end); return true; } // If there were no parse actions for the current lookahead token, then // it is not valid in this state. If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that // token is valid in this state. 
if ( ts_subtree_is_keyword(lookahead) && ts_subtree_symbol(lookahead) != self->language->keyword_capture_token ) { ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry); if (table_entry.action_count > 0) { LOG( "switch from_keyword:%s, to_word_token:%s", TREE_NAME(lookahead), SYM_NAME(self->language->keyword_capture_token) ); MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); ts_subtree_set_symbol(&mutable_lookahead, self->language->keyword_capture_token, self->language); lookahead = ts_subtree_from_mut(mutable_lookahead); continue; } } // If the current lookahead token is not valid and the parser is // already in the error state, restart the error recovery process. // TODO - can this be unified with the other `RECOVER` case above? if (state == ERROR_STATE) { ts_parser__recover(self, version, lookahead); return true; } // If the current lookahead token is not valid and the previous // subtree on the stack was reused from an old tree, it isn't actually // valid to reuse it. Remove it from the stack, and in its place, // push each of its children. Then try again to process the current // lookahead. if (ts_parser__breakdown_top_of_stack(self, version)) { state = ts_stack_state(self->stack, version); ts_subtree_release(&self->tree_pool, lookahead); needs_lex = true; continue; } // At this point, the current lookahead token is definitely not valid // for this parse stack version. Mark this version as paused and continue // processing any other stack versions that might exist. If some other // version advances successfully, then this version can simply be removed. // But if all versions end up paused, then error recovery is needed. 
// Prune, merge and reorder the stack versions after a round of advancing.
//
// Halted versions are removed, every pair of versions is compared so that
// clearly worse ones are dropped and mergeable ones are merged, the number of
// versions is capped at MAX_VERSION_COUNT, and finally paused versions are
// either resumed (entering error recovery) or removed.
//
// Returns the smallest error cost among versions that are not in an error
// state (UINT_MAX if there is none); when a paused version is resumed, the
// returned value is that version's error cost instead.
static unsigned ts_parser__condense_stack(TSParser *self) {
  bool made_changes = false;
  unsigned min_error_cost = UINT_MAX;
  for (StackVersion i = 0; i < ts_stack_version_count(self->stack); i++) {
    // Prune any versions that have been marked for removal.
    if (ts_stack_is_halted(self->stack, i)) {
      ts_stack_remove_version(self->stack, i);
      // Step back so that the loop's `i++` lands on the same index again.
      // NOTE(review): this relies on StackVersion wrapping when `i` is 0 -
      // presumably it is an unsigned type; confirm against its typedef.
      i--;
      continue;
    }

    // Keep track of the minimum error cost of any stack version so
    // that it can be returned.
    ErrorStatus status_i = ts_parser__version_status(self, i);
    if (!status_i.is_in_error && status_i.cost < min_error_cost) {
      min_error_cost = status_i.cost;
    }

    // Examine each pair of stack versions, removing any versions that
    // are clearly worse than another version. Ensure that the versions
    // are ordered from most promising to least promising.
    for (StackVersion j = 0; j < i; j++) {
      ErrorStatus status_j = ts_parser__version_status(self, j);

      switch (ts_parser__compare_versions(self, status_j, status_i)) {
        case ErrorComparisonTakeLeft:
          // Version `i` is dropped. Setting `j = i` after the decrement ends
          // the inner loop, and the outer `i++` then re-examines this slot.
          made_changes = true;
          ts_stack_remove_version(self->stack, i);
          i--;
          j = i;
          break;

        case ErrorComparisonPreferLeft:
        case ErrorComparisonNone:
          // Only a successful merge counts as a change here.
          if (ts_stack_merge(self->stack, j, i)) {
            made_changes = true;
            i--;
            j = i;
          }
          break;

        case ErrorComparisonPreferRight:
          // Merge if possible; otherwise swap so that the preferred version
          // comes first.
          made_changes = true;
          if (ts_stack_merge(self->stack, j, i)) {
            i--;
            j = i;
          } else {
            ts_stack_swap_versions(self->stack, i, j);
          }
          break;

        case ErrorComparisonTakeRight:
          // Version `j` is dropped; both indices step back because the
          // versions after `j` shift down by one.
          made_changes = true;
          ts_stack_remove_version(self->stack, j);
          i--;
          j--;
          break;
      }
    }
  }

  // Enforce a hard upper bound on the number of stack versions by
  // discarding the least promising versions.
  while (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) {
    ts_stack_remove_version(self->stack, MAX_VERSION_COUNT);
    made_changes = true;
  }

  // If the best-performing stack version is currently paused, or all
  // versions are paused, then resume the best paused version and begin
  // the error recovery process. Otherwise, remove the paused versions.
  if (ts_stack_version_count(self->stack) > 0) {
    bool has_unpaused_version = false;
    for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) {
      if (ts_stack_is_paused(self->stack, i)) {
        // Resuming is also skipped once `accept_count` has reached
        // MAX_VERSION_COUNT.
        if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT) {
          LOG("resume version:%u", i);
          min_error_cost = ts_stack_error_cost(self->stack, i);
          TSSymbol lookahead_symbol = ts_stack_resume(self->stack, i);
          ts_parser__handle_error(self, i, lookahead_symbol);
          has_unpaused_version = true;
        } else {
          // `n` was captured before the loop, so it is adjusted by hand
          // together with `i` after a removal.
          ts_stack_remove_version(self->stack, i);
          i--;
          n--;
        }
      } else {
        has_unpaused_version = true;
      }
    }
  }

  if (made_changes) {
    LOG("condense");
    LOG_STACK();
  }

  return min_error_cost;
}
return; ts_parser_set_language(self, NULL); ts_stack_delete(self->stack); if (self->reduce_actions.contents) { array_delete(&self->reduce_actions); } if (self->included_range_differences.contents) { array_delete(&self->included_range_differences); } if (self->old_tree.ptr) { ts_subtree_release(&self->tree_pool, self->old_tree); self->old_tree = NULL_SUBTREE; } ts_lexer_delete(&self->lexer); ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); ts_subtree_pool_delete(&self->tree_pool); reusable_node_delete(&self->reusable_node); array_delete(&self->trailing_extras); array_delete(&self->trailing_extras2); array_delete(&self->scratch_trees); ts_free(self); } const TSLanguage *ts_parser_language(const TSParser *self) { return self->language; } bool ts_parser_set_language(TSParser *self, const TSLanguage *language) { if (language) { if (language->version > TREE_SITTER_LANGUAGE_VERSION) return false; if (language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION) return false; } if (self->external_scanner_payload && self->language->external_scanner.destroy) { self->language->external_scanner.destroy(self->external_scanner_payload); } if (language && language->external_scanner.create) { self->external_scanner_payload = language->external_scanner.create(); } else { self->external_scanner_payload = NULL; } self->language = language; ts_parser_reset(self); return true; } TSLogger ts_parser_logger(const TSParser *self) { return self->lexer.logger; } void ts_parser_set_logger(TSParser *self, TSLogger logger) { self->lexer.logger = logger; } void ts_parser_print_dot_graphs(TSParser *self, int fd) { if (self->dot_graph_file) { fclose(self->dot_graph_file); } if (fd >= 0) { self->dot_graph_file = fdopen(fd, "a"); } else { self->dot_graph_file = NULL; } } const size_t *ts_parser_cancellation_flag(const TSParser *self) { return (const size_t *)self->cancellation_flag; } void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag) { 
self->cancellation_flag = (const volatile size_t *)flag; } uint64_t ts_parser_timeout_micros(const TSParser *self) { return duration_to_micros(self->timeout_duration); } void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout_micros) { self->timeout_duration = duration_from_micros(timeout_micros); } bool ts_parser_set_included_ranges( TSParser *self, const TSRange *ranges, uint32_t count ) { return ts_lexer_set_included_ranges(&self->lexer, ranges, count); } const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count) { return ts_lexer_included_ranges(&self->lexer, count); } void ts_parser_reset(TSParser *self) { if (self->language && self->language->external_scanner.deserialize) { self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); } if (self->old_tree.ptr) { ts_subtree_release(&self->tree_pool, self->old_tree); self->old_tree = NULL_SUBTREE; } reusable_node_clear(&self->reusable_node); ts_lexer_reset(&self->lexer, length_zero()); ts_stack_clear(self->stack); ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); if (self->finished_tree.ptr) { ts_subtree_release(&self->tree_pool, self->finished_tree); self->finished_tree = NULL_SUBTREE; } self->accept_count = 0; } TSTree *ts_parser_parse( TSParser *self, const TSTree *old_tree, TSInput input ) { if (!self->language || !input.read) return NULL; ts_lexer_set_input(&self->lexer, input); array_clear(&self->included_range_differences); self->included_range_difference_index = 0; if (ts_parser_has_outstanding_parse(self)) { LOG("resume_parsing"); } else if (old_tree) { ts_subtree_retain(old_tree->root); self->old_tree = old_tree->root; ts_range_array_get_changed_ranges( old_tree->included_ranges, old_tree->included_range_count, self->lexer.included_ranges, self->lexer.included_range_count, &self->included_range_differences ); reusable_node_reset(&self->reusable_node, old_tree->root); LOG("parse_after_edit"); LOG_TREE(self->old_tree); for 
(unsigned i = 0; i < self->included_range_differences.size; i++) { TSRange *range = &self->included_range_differences.contents[i]; LOG("different_included_range %u - %u", range->start_byte, range->end_byte); } } else { reusable_node_clear(&self->reusable_node); LOG("new_parse"); } uint32_t position = 0, last_position = 0, version_count = 0; self->operation_count = 0; if (self->timeout_duration) { self->end_clock = clock_after(clock_now(), self->timeout_duration); } else { self->end_clock = clock_null(); } do { for (StackVersion version = 0; version_count = ts_stack_version_count(self->stack), version < version_count; version++) { bool allow_node_reuse = version_count == 1; while (ts_stack_is_active(self->stack, version)) { LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", version, ts_stack_version_count(self->stack), ts_stack_state(self->stack, version), ts_stack_position(self->stack, version).extent.row, ts_stack_position(self->stack, version).extent.column); if (!ts_parser__advance(self, version, allow_node_reuse)) return NULL; LOG_STACK(); position = ts_stack_position(self->stack, version).bytes; if (position > last_position || (version > 0 && position == last_position)) { last_position = position; break; } } } unsigned min_error_cost = ts_parser__condense_stack(self); if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) < min_error_cost) { break; } while (self->included_range_difference_index < self->included_range_differences.size) { TSRange *range = &self->included_range_differences.contents[self->included_range_difference_index]; if (range->end_byte <= position) { self->included_range_difference_index++; } else { break; } } } while (version_count != 0); ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); LOG("done"); LOG_TREE(self->finished_tree); TSTree *result = ts_tree_new( self->finished_tree, self->language, self->lexer.included_ranges, self->lexer.included_range_count ); 
self->finished_tree = NULL_SUBTREE; ts_parser_reset(self); return result; } TSTree *ts_parser_parse_string( TSParser *self, const TSTree *old_tree, const char *string, uint32_t length ) { return ts_parser_parse_string_encoding(self, old_tree, string, length, TSInputEncodingUTF8); } TSTree *ts_parser_parse_string_encoding(TSParser *self, const TSTree *old_tree, const char *string, uint32_t length, TSInputEncoding encoding) { TSStringInput input = {string, length}; return ts_parser_parse(self, old_tree, (TSInput) { &input, ts_string_input_read, encoding, }); } #undef LOG tree-sitter-0.20.1/src/point.h000064400000000000000000000026740072674642500143050ustar 00000000000000#ifndef TREE_SITTER_POINT_H_ #define TREE_SITTER_POINT_H_ #include "tree_sitter/api.h" #define POINT_ZERO ((TSPoint) {0, 0}) #define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX}) static inline TSPoint point__new(unsigned row, unsigned column) { TSPoint result = {row, column}; return result; } static inline TSPoint point_add(TSPoint a, TSPoint b) { if (b.row > 0) return point__new(a.row + b.row, b.column); else return point__new(a.row, a.column + b.column); } static inline TSPoint point_sub(TSPoint a, TSPoint b) { if (a.row > b.row) return point__new(a.row - b.row, a.column); else return point__new(0, a.column - b.column); } static inline bool point_lte(TSPoint a, TSPoint b) { return (a.row < b.row) || (a.row == b.row && a.column <= b.column); } static inline bool point_lt(TSPoint a, TSPoint b) { return (a.row < b.row) || (a.row == b.row && a.column < b.column); } static inline bool point_gt(TSPoint a, TSPoint b) { return (a.row > b.row) || (a.row == b.row && a.column > b.column); } static inline bool point_eq(TSPoint a, TSPoint b) { return a.row == b.row && a.column == b.column; } static inline TSPoint point_min(TSPoint a, TSPoint b) { if (a.row < b.row || (a.row == b.row && a.column < b.column)) return a; else return b; } static inline TSPoint point_max(TSPoint a, TSPoint b) { if (a.row > b.row || 
(a.row == b.row && a.column > b.column)) return a; else return b; } #endif tree-sitter-0.20.1/src/query.c000064400000000000000000003417260072674642500143200ustar 00000000000000#include "tree_sitter/api.h" #include "./alloc.h" #include "./array.h" #include "./language.h" #include "./point.h" #include "./tree_cursor.h" #include "./unicode.h" #include // #define DEBUG_ANALYZE_QUERY // #define DEBUG_EXECUTE_QUERY #define MAX_STEP_CAPTURE_COUNT 3 #define MAX_STATE_PREDECESSOR_COUNT 100 #define MAX_ANALYSIS_STATE_DEPTH 8 #define MAX_NEGATED_FIELD_COUNT 8 /* * Stream - A sequence of unicode characters derived from a UTF8 string. * This struct is used in parsing queries from S-expressions. */ typedef struct { const char *input; const char *start; const char *end; int32_t next; uint8_t next_size; } Stream; /* * QueryStep - A step in the process of matching a query. Each node within * a query S-expression corresponds to one of these steps. An entire pattern * is represented as a sequence of these steps. The basic properties of a * node are represented by these fields: * - `symbol` - The grammar symbol to match. A zero value represents the * wildcard symbol, '_'. * - `field` - The field name to match. A zero value means that a field name * was not specified. * - `capture_ids` - An array of integers representing the names of captures * associated with this node in the pattern, terminated by a `NONE` value. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. * - `negated_field_list_id` - An id representing a set of fields that must * that must not be present on a node matching this step. * * Steps have some additional fields in order to handle the `.` (or "anchor") operator, * which forbids additional child nodes: * - `is_immediate` - Indicates that the node matching this step cannot be preceded * by other sibling nodes that weren't specified in the pattern. 
* - `is_last_child` - Indicates that the node matching this step cannot have any * subsequent named siblings. * * For simple patterns, steps are matched in sequential order. But in order to * handle alternative/repeated/optional sub-patterns, query steps are not always * structured as a linear sequence; they sometimes need to split and merge. This * is done using the following fields: * - `alternative_index` - The index of a different query step that serves as * an alternative to this step. A `NONE` value represents no alternative. * When a query state reaches a step with an alternative index, the state * is duplicated, with one copy remaining at the original step, and one copy * moving to the alternative step. The alternative may have its own alternative * step, so this splitting is an iterative process. * - `is_dead_end` - Indicates that this state cannot be passed directly, and * exists only in order to redirect to an alternative index, with no splitting. * - `is_pass_through` - Indicates that state has no matching logic of its own, * and exists only to split a state. One copy of the state advances immediately * to the next step, and one moves to the alternative step. * - `alternative_is_immediate` - Indicates that this step's alternative step * should be treated as if `is_immediate` is true. * * Steps also store some derived state that summarizes how they relate to other * steps within the same pattern. This is used to optimize the matching process: * - `contains_captures` - Indicates that this step or one of its child steps * has a non-empty `capture_ids` list. * - `parent_pattern_guaranteed` - Indicates that if this step is reached, then * it and all of its subsequent sibling steps within the same parent pattern * are guaranteed to match. * - `root_pattern_guaranteed` - Similar to `parent_pattern_guaranteed`, but * for the entire top-level pattern. 
When iterating through a query's
 *     captures using `ts_query_cursor_next_capture`, this field is used to
 *     detect that a capture can safely be returned from a match that has not
 *     even completed yet.
 */
typedef struct {
  TSSymbol symbol;
  // NOTE(review): `supertype_symbol` and `is_named` are not described in the
  // comment above; confirm their meaning against the query parser.
  TSSymbol supertype_symbol;
  TSFieldId field;
  uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
  uint16_t depth;
  uint16_t alternative_index;
  uint16_t negated_field_list_id;
  bool is_named: 1;
  bool is_immediate: 1;
  bool is_last_child: 1;
  bool is_pass_through: 1;
  bool is_dead_end: 1;
  bool alternative_is_immediate: 1;
  bool contains_captures: 1;
  bool root_pattern_guaranteed: 1;
  bool parent_pattern_guaranteed: 1;
} QueryStep;

/*
 * Slice - A slice of an external array. Within a query, capture names,
 * literal string values, and predicate step information are stored in three
 * contiguous arrays. Individual captures, string values, and predicates are
 * represented as slices of these three arrays.
 */
typedef struct {
  uint32_t offset;
  uint32_t length;
} Slice;

/*
 * SymbolTable - a two-way mapping of strings to ids.
 *
 * The names are stored back to back in `characters`; `slices[id]` gives the
 * offset and length of the name with that id.
 */
typedef struct {
  Array(char) characters;
  Array(Slice) slices;
} SymbolTable;

/*
 * PatternEntry - Information about the starting point for matching a particular
 * pattern. These entries are stored in a 'pattern map' - a sorted array that
 * makes it possible to efficiently lookup patterns based on the symbol for their
 * first step. The entry consists of the following fields:
 * - `pattern_index` - the index of the pattern within the query
 * - `step_index` - the index of the pattern's first step in the shared `steps` array
 * - `is_rooted` - whether or not the pattern has a single root node. This property
 *   affects decisions about whether or not to start the pattern for nodes outside
 *   of a QueryCursor's range restriction.
 */
typedef struct {
  uint16_t step_index;
  uint16_t pattern_index;
  bool is_rooted;
} PatternEntry;

/*
 * QueryPattern - Per-pattern bookkeeping: the slices of the shared step and
 * predicate-step arrays that belong to the pattern.
 * NOTE(review): `start_byte` is presumably the pattern's byte offset in the
 * query source - confirm against the parser.
 */
typedef struct {
  Slice steps;
  Slice predicate_steps;
  uint32_t start_byte;
} QueryPattern;

/*
 * StepOffset - Associates a step index with a byte offset. Query analysis
 * looks these up by `step_index` to report the `byte_offset` of a step at
 * which a pattern is found to be invalid.
 */
typedef struct {
  uint32_t byte_offset;
  uint16_t step_index;
} StepOffset;

/*
 * QueryState - The state of an in-progress match of a particular pattern
 * in a query. While executing, a `TSQueryCursor` must keep track of a number
 * of possible in-progress matches. Each of those possible matches is
 * represented as one of these states. Fields:
 * - `id` - A numeric id that is exposed to the public API. This allows the
 *    caller to remove a given match, preventing any more of its captures
 *    from being returned.
 * - `start_depth` - The depth in the tree where the first step of the state's
 *    pattern was matched.
 * - `pattern_index` - The pattern that the state is matching.
 * - `consumed_capture_count` - The number of captures from this match that
 *    have already been returned.
 * - `capture_list_id` - A numeric id that can be used to retrieve the state's
 *    list of captures from the `CaptureListPool`.
 * - `seeking_immediate_match` - A flag that indicates that the state's next
 *    step must be matched by the very next sibling. This is used when
 *    processing repetitions.
 * - `has_in_progress_alternatives` - A flag that indicates that there are
 *    other states that have the same captures as this state, but are at
 *    different steps in their pattern. This means that in order to obey the
 *    'longest-match' rule, this state should not be returned as a match until
 *    it is clear that there can be no other alternative match with more captures.
 *
 * NOTE(review): `step_index`, `dead` and `needs_parent` are not described
 * above; confirm their meaning against the cursor's matching loop.
 */
typedef struct {
  uint32_t id;
  uint32_t capture_list_id;
  uint16_t start_depth;
  uint16_t step_index;
  uint16_t pattern_index;
  uint16_t consumed_capture_count: 12;
  bool seeking_immediate_match: 1;
  bool has_in_progress_alternatives: 1;
  bool dead: 1;
  bool needs_parent: 1;
} QueryState;

typedef Array(TSQueryCapture) CaptureList;

/*
 * CaptureListPool - A collection of *lists* of captures. Each query state needs
 * to maintain its own list of captures. To avoid repeated allocations, this struct
 * maintains a pool of capture lists (bounded by `max_capture_list_count`), and
 * keeps track of which ones are currently in use by a query state.
 */
typedef struct {
  Array(CaptureList) list;
  CaptureList empty_list;
  // The maximum number of capture lists that we are allowed to allocate. We
  // never allow `list` to allocate more entries than this, dropping pending
  // matches if needed to stay under the limit.
  uint32_t max_capture_list_count;
  // The number of capture lists allocated in `list` that are not currently in
  // use. We reuse those existing-but-unused capture lists before trying to
  // allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
  // list's length to indicate that it's not in use.
  uint32_t free_capture_list_count;
} CaptureListPool;

/*
 * AnalysisState - The state needed for walking the parse table when analyzing
 * a query pattern, to determine at which steps the pattern might fail to match.
 *
 * Each AnalysisStateEntry is one level of the hypothetical syntax tree being
 * walked; an AnalysisState is a stack of up to MAX_ANALYSIS_STATE_DEPTH such
 * entries plus the index of the query step being considered.
 */
typedef struct {
  TSStateId parse_state;
  TSSymbol parent_symbol;
  uint16_t child_index;
  TSFieldId field_id: 15;
  bool done: 1;
} AnalysisStateEntry;

typedef struct {
  AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH];
  uint16_t depth;
  uint16_t step_index;
} AnalysisState;

typedef Array(AnalysisState) AnalysisStateSet;

/*
 * AnalysisSubgraph - A subset of the states in the parse table that are used
 * in constructing nodes with a certain symbol. Each state is accompanied by
 * some information about the possible node that could be produced in
 * downstream states.
 */
typedef struct {
  TSStateId state;
  uint8_t production_id;
  uint8_t child_index: 7;
  bool done: 1;
} AnalysisSubgraphNode;

typedef struct {
  TSSymbol symbol;
  Array(TSStateId) start_states;
  Array(AnalysisSubgraphNode) nodes;
} AnalysisSubgraph;

/*
 * StatePredecessorMap - A map that stores the predecessors of each parse state.
 * This is used during query analysis to determine which parse states can lead
 * to which reduce actions.
 *
 * `contents` is a flat table with (MAX_STATE_PREDECESSOR_COUNT + 1) slots per
 * parse state: the first slot holds the predecessor count, the remaining slots
 * hold the predecessors themselves.
 */
typedef struct {
  TSStateId *contents;
} StatePredecessorMap;

/*
 * TSQuery - A tree query, compiled from a string of S-expressions. The query
 * itself is immutable. The mutable state used in the process of executing the
 * query is stored in a `TSQueryCursor`.
 */
struct TSQuery {
  SymbolTable captures;
  SymbolTable predicate_values;
  Array(QueryStep) steps;
  Array(PatternEntry) pattern_map;
  Array(TSQueryPredicateStep) predicate_steps;
  Array(QueryPattern) patterns;
  Array(StepOffset) step_offsets;
  Array(TSFieldId) negated_fields;
  Array(char) string_buffer;
  const TSLanguage *language;
  // Number of entries at the front of `pattern_map` that are skipped by the
  // symbol-keyed binary search.
  uint16_t wildcard_root_pattern_count;
};

/*
 * TSQueryCursor - A stateful struct used to execute a query on a tree.
 */
struct TSQueryCursor {
  const TSQuery *query;
  TSTreeCursor cursor;
  Array(QueryState) states;
  Array(QueryState) finished_states;
  CaptureListPool capture_list_pool;
  uint32_t depth;
  uint32_t start_byte;
  uint32_t end_byte;
  TSPoint start_point;
  TSPoint end_point;
  uint32_t next_state_id;
  bool ascending;
  bool halted;
  bool did_exceed_match_limit;
};

// NOTE(review): PARENT_DONE is an out-of-band TSQueryError value; its use is
// not visible here - presumably an internal parser signal, confirm.
static const TSQueryError PARENT_DONE = -1;
// Depth value carried by the step that terminates each pattern.
static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX;
// Sentinel for "no value" in uint16_t ids and indices (capture ids,
// alternative indices, capture list ids).
static const uint16_t NONE = UINT16_MAX;
// The symbol value that represents the wildcard, '_'.
static const TSSymbol WILDCARD_SYMBOL = 0;

/**********
 * Stream
 **********/

// Advance to the next unicode code point in the stream.
static bool stream_advance(Stream *self) { self->input += self->next_size; if (self->input < self->end) { uint32_t size = ts_decode_utf8( (const uint8_t *)self->input, self->end - self->input, &self->next ); if (size > 0) { self->next_size = size; return true; } } else { self->next_size = 0; self->next = '\0'; } return false; } // Reset the stream to the given input position, represented as a pointer // into the input string. static void stream_reset(Stream *self, const char *input) { self->input = input; self->next_size = 0; stream_advance(self); } static Stream stream_new(const char *string, uint32_t length) { Stream self = { .next = 0, .input = string, .start = string, .end = string + length, }; stream_advance(&self); return self; } static void stream_skip_whitespace(Stream *self) { for (;;) { if (iswspace(self->next)) { stream_advance(self); } else if (self->next == ';') { // skip over comments stream_advance(self); while (self->next && self->next != '\n') { if (!stream_advance(self)) break; } } else { break; } } } static bool stream_is_ident_start(Stream *self) { return iswalnum(self->next) || self->next == '_' || self->next == '-'; } static void stream_scan_identifier(Stream *stream) { do { stream_advance(stream); } while ( iswalnum(stream->next) || stream->next == '_' || stream->next == '-' || stream->next == '.' || stream->next == '?' || stream->next == '!' ); } static uint32_t stream_offset(Stream *self) { return self->input - self->start; } /****************** * CaptureListPool ******************/ static CaptureListPool capture_list_pool_new(void) { return (CaptureListPool) { .list = array_new(), .empty_list = array_new(), .max_capture_list_count = UINT32_MAX, .free_capture_list_count = 0, }; } static void capture_list_pool_reset(CaptureListPool *self) { for (uint16_t i = 0; i < self->list.size; i++) { // This invalid size means that the list is not in use. 
self->list.contents[i].size = UINT32_MAX; } self->free_capture_list_count = self->list.size; } static void capture_list_pool_delete(CaptureListPool *self) { for (uint16_t i = 0; i < self->list.size; i++) { array_delete(&self->list.contents[i]); } array_delete(&self->list); } static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) { if (id >= self->list.size) return &self->empty_list; return &self->list.contents[id]; } static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) { assert(id < self->list.size); return &self->list.contents[id]; } static bool capture_list_pool_is_empty(const CaptureListPool *self) { // The capture list pool is empty if all allocated lists are in use, and we // have reached the maximum allowed number of allocated lists. return self->free_capture_list_count == 0 && self->list.size >= self->max_capture_list_count; } static uint16_t capture_list_pool_acquire(CaptureListPool *self) { // First see if any already allocated capture list is currently unused. if (self->free_capture_list_count > 0) { for (uint16_t i = 0; i < self->list.size; i++) { if (self->list.contents[i].size == UINT32_MAX) { array_clear(&self->list.contents[i]); self->free_capture_list_count--; return i; } } } // Otherwise allocate and initialize a new capture list, as long as that // doesn't put us over the requested maximum. 
uint32_t i = self->list.size; if (i >= self->max_capture_list_count) { return NONE; } CaptureList list; array_init(&list); array_push(&self->list, list); return i; } static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { if (id >= self->list.size) return; self->list.contents[id].size = UINT32_MAX; self->free_capture_list_count++; } /************** * SymbolTable **************/ static SymbolTable symbol_table_new(void) { return (SymbolTable) { .characters = array_new(), .slices = array_new(), }; } static void symbol_table_delete(SymbolTable *self) { array_delete(&self->characters); array_delete(&self->slices); } static int symbol_table_id_for_name( const SymbolTable *self, const char *name, uint32_t length ) { for (unsigned i = 0; i < self->slices.size; i++) { Slice slice = self->slices.contents[i]; if ( slice.length == length && !strncmp(&self->characters.contents[slice.offset], name, length) ) return i; } return -1; } static const char *symbol_table_name_for_id( const SymbolTable *self, uint16_t id, uint32_t *length ) { Slice slice = self->slices.contents[id]; *length = slice.length; return &self->characters.contents[slice.offset]; } static uint16_t symbol_table_insert_name( SymbolTable *self, const char *name, uint32_t length ) { int id = symbol_table_id_for_name(self, name, length); if (id >= 0) return (uint16_t)id; Slice slice = { .offset = self->characters.size, .length = length, }; array_grow_by(&self->characters, length + 1); memcpy(&self->characters.contents[slice.offset], name, length); self->characters.contents[self->characters.size - 1] = 0; array_push(&self->slices, slice); return self->slices.size - 1; } /************ * QueryStep ************/ static QueryStep query_step__new( TSSymbol symbol, uint16_t depth, bool is_immediate ) { return (QueryStep) { .symbol = symbol, .depth = depth, .field = 0, .capture_ids = {NONE, NONE, NONE}, .alternative_index = NONE, .negated_field_list_id = 0, .contains_captures = false, .is_last_child = 
false, .is_named = false, .is_pass_through = false, .is_dead_end = false, .root_pattern_guaranteed = false, .is_immediate = is_immediate, .alternative_is_immediate = false, }; } static void query_step__add_capture(QueryStep *self, uint16_t capture_id) { for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { if (self->capture_ids[i] == NONE) { self->capture_ids[i] = capture_id; break; } } } static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) { for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { if (self->capture_ids[i] == capture_id) { self->capture_ids[i] = NONE; while (i + 1 < MAX_STEP_CAPTURE_COUNT) { if (self->capture_ids[i + 1] == NONE) break; self->capture_ids[i] = self->capture_ids[i + 1]; self->capture_ids[i + 1] = NONE; i++; } break; } } } /********************** * StatePredecessorMap **********************/ static inline StatePredecessorMap state_predecessor_map_new( const TSLanguage *language ) { return (StatePredecessorMap) { .contents = ts_calloc( language->state_count * (MAX_STATE_PREDECESSOR_COUNT + 1), sizeof(TSStateId) ), }; } static inline void state_predecessor_map_delete(StatePredecessorMap *self) { ts_free(self->contents); } static inline void state_predecessor_map_add( StatePredecessorMap *self, TSStateId state, TSStateId predecessor ) { unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1); TSStateId *count = &self->contents[index]; if ( *count == 0 || (*count < MAX_STATE_PREDECESSOR_COUNT && self->contents[index + *count] != predecessor) ) { (*count)++; self->contents[index + *count] = predecessor; } } static inline const TSStateId *state_predecessor_map_get( const StatePredecessorMap *self, TSStateId state, unsigned *count ) { unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1); *count = self->contents[index]; return &self->contents[index + 1]; } /**************** * AnalysisState ****************/ static unsigned analysis_state__recursion_depth(const AnalysisState *self) { unsigned result = 0; for 
(unsigned i = 0; i < self->depth; i++) { TSSymbol symbol = self->stack[i].parent_symbol; for (unsigned j = 0; j < i; j++) { if (self->stack[j].parent_symbol == symbol) { result++; break; } } } return result; } static inline int analysis_state__compare_position( const AnalysisState *self, const AnalysisState *other ) { for (unsigned i = 0; i < self->depth; i++) { if (i >= other->depth) return -1; if (self->stack[i].child_index < other->stack[i].child_index) return -1; if (self->stack[i].child_index > other->stack[i].child_index) return 1; } if (self->depth < other->depth) return 1; return 0; } static inline int analysis_state__compare( const AnalysisState *self, const AnalysisState *other ) { int result = analysis_state__compare_position(self, other); if (result != 0) return result; for (unsigned i = 0; i < self->depth; i++) { if (self->stack[i].parent_symbol < other->stack[i].parent_symbol) return -1; if (self->stack[i].parent_symbol > other->stack[i].parent_symbol) return 1; if (self->stack[i].parse_state < other->stack[i].parse_state) return -1; if (self->stack[i].parse_state > other->stack[i].parse_state) return 1; if (self->stack[i].field_id < other->stack[i].field_id) return -1; if (self->stack[i].field_id > other->stack[i].field_id) return 1; } if (self->step_index < other->step_index) return -1; if (self->step_index > other->step_index) return 1; return 0; } static inline AnalysisStateEntry *analysis_state__top(AnalysisState *self) { return &self->stack[self->depth - 1]; } static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol symbol) { for (unsigned i = 0; i < self->depth; i++) { if (self->stack[i].parent_symbol == symbol) return true; } return false; } /*********************** * AnalysisSubgraphNode ***********************/ static inline int analysis_subgraph_node__compare(const AnalysisSubgraphNode *self, const AnalysisSubgraphNode *other) { if (self->state < other->state) return -1; if (self->state > other->state) return 1; if 
(self->child_index < other->child_index) return -1; if (self->child_index > other->child_index) return 1; if (self->done < other->done) return -1; if (self->done > other->done) return 1; if (self->production_id < other->production_id) return -1; if (self->production_id > other->production_id) return 1; return 0; } /********* * Query *********/ // The `pattern_map` contains a mapping from TSSymbol values to indices in the // `steps` array. For a given syntax node, the `pattern_map` makes it possible // to quickly find the starting steps of all of the patterns whose root matches // that node. Each entry has two fields: a `pattern_index`, which identifies one // of the patterns in the query, and a `step_index`, which indicates the start // offset of that pattern's steps within the `steps` array. // // The entries are sorted by the patterns' root symbols, and lookups use a // binary search. This ensures that the cost of this initial lookup step // scales logarithmically with the number of patterns in the query. // // This returns `true` if the symbol is present and `false` otherwise. // If the symbol is not present `*result` is set to the index where the // symbol should be inserted. 
static inline bool ts_query__pattern_map_search(
  const TSQuery *self,
  TSSymbol needle,
  uint32_t *result
) {
  // The first `wildcard_root_pattern_count` entries are excluded from the
  // search; only the entries after them are examined.
  uint32_t base_index = self->wildcard_root_pattern_count;
  uint32_t size = self->pattern_map.size - base_index;
  if (size == 0) {
    *result = base_index;
    return false;
  }

  // Halve the search window until a single candidate entry remains at
  // `base_index`.
  while (size > 1) {
    uint32_t half_size = size / 2;
    uint32_t mid_index = base_index + half_size;
    TSSymbol mid_symbol = self->steps.contents[
      self->pattern_map.contents[mid_index].step_index
    ].symbol;
    if (needle > mid_symbol) base_index = mid_index;
    size -= half_size;
  }

  TSSymbol symbol = self->steps.contents[
    self->pattern_map.contents[base_index].step_index
  ].symbol;

  // If the candidate's symbol is still smaller than the needle, the needle
  // belongs just after it. When that position is past the end of the map,
  // `symbol` keeps the smaller value, so the function reports "not found".
  if (needle > symbol) {
    base_index++;
    if (base_index < self->pattern_map.size) {
      symbol = self->steps.contents[
        self->pattern_map.contents[base_index].step_index
      ].symbol;
    }
  }

  *result = base_index;
  return needle == symbol;
}

// Insert a new pattern's start index into the pattern map, maintaining
// the pattern map's ordering invariant.
static inline void ts_query__pattern_map_insert(
  TSQuery *self,
  TSSymbol symbol,
  PatternEntry new_entry
) {
  // Whether the symbol is already present does not matter here; only the
  // insertion position reported through `index` is used.
  uint32_t index;
  ts_query__pattern_map_search(self, symbol, &index);

  // Ensure that the entries are sorted not only by symbol, but also
  // by pattern_index. This way, states for earlier patterns will be
  // initiated first, which allows the ordering of the states array
  // to be maintained more efficiently.
  while (index < self->pattern_map.size) {
    PatternEntry *entry = &self->pattern_map.contents[index];
    if (
      self->steps.contents[entry->step_index].symbol == symbol &&
      entry->pattern_index < new_entry.pattern_index
    ) {
      index++;
    } else {
      break;
    }
  }

  array_insert(&self->pattern_map, index, new_entry);
}

static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
  // Walk forward through all of the steps in the query, computing some
  // basic information about each step. Mark all of the steps that contain
  // captures, and record the indices of all of the steps that have child steps.
Array(uint32_t) parent_step_indices = array_new(); for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; if (step->depth == PATTERN_DONE_MARKER) { step->parent_pattern_guaranteed = true; step->root_pattern_guaranteed = true; continue; } bool has_children = false; bool is_wildcard = step->symbol == WILDCARD_SYMBOL; step->contains_captures = step->capture_ids[0] != NONE; for (unsigned j = i + 1; j < self->steps.size; j++) { QueryStep *next_step = &self->steps.contents[j]; if ( next_step->depth == PATTERN_DONE_MARKER || next_step->depth <= step->depth ) break; if (next_step->capture_ids[0] != NONE) { step->contains_captures = true; } if (!is_wildcard) { next_step->root_pattern_guaranteed = true; next_step->parent_pattern_guaranteed = true; } has_children = true; } if (has_children && !is_wildcard) { array_push(&parent_step_indices, i); } } // For every parent symbol in the query, initialize an 'analysis subgraph'. // This subgraph lists all of the states in the parse table that are directly // involved in building subtrees for this symbol. // // In addition to the parent symbols in the query, construct subgraphs for all // of the hidden symbols in the grammar, because these might occur within // one of the parent nodes, such that their children appear to belong to the // parent. 
Array(AnalysisSubgraph) subgraphs = array_new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint32_t parent_step_index = parent_step_indices.contents[i]; TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; AnalysisSubgraph subgraph = { .symbol = parent_symbol }; array_insert_sorted_by(&subgraphs, .symbol, subgraph); } for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) { if (!ts_language_symbol_metadata(self->language, sym).visible) { AnalysisSubgraph subgraph = { .symbol = sym }; array_insert_sorted_by(&subgraphs, .symbol, subgraph); } } // Scan the parse table to find the data needed to populate these subgraphs. // Collect three things during this scan: // 1) All of the parse states where one of these symbols can start. // 2) All of the parse states where one of these symbols can end, along // with information about the node that would be created. // 3) A list of predecessor states for each state. StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language); for (TSStateId state = 1; state < self->language->state_count; state++) { unsigned subgraph_index, exists; LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state); while (ts_lookahead_iterator_next(&lookahead_iterator)) { if (lookahead_iterator.action_count) { for (unsigned i = 0; i < lookahead_iterator.action_count; i++) { const TSParseAction *action = &lookahead_iterator.actions[i]; if (action->type == TSParseActionTypeReduce) { const TSSymbol *aliases, *aliases_end; ts_language_aliases_for_symbol( self->language, action->reduce.symbol, &aliases, &aliases_end ); for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) { array_search_sorted_by( &subgraphs, .symbol, *symbol, &subgraph_index, &exists ); if (exists) { AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) { 
array_push(&subgraph->nodes, ((AnalysisSubgraphNode) { .state = state, .production_id = action->reduce.production_id, .child_index = action->reduce.child_count, .done = true, })); } } } } else if (action->type == TSParseActionTypeShift && !action->shift.extra) { TSStateId next_state = action->shift.state; state_predecessor_map_add(&predecessor_map, next_state, state); } } } else if (lookahead_iterator.next_state != 0) { if (lookahead_iterator.next_state != state) { state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state); } const TSSymbol *aliases, *aliases_end; ts_language_aliases_for_symbol( self->language, lookahead_iterator.symbol, &aliases, &aliases_end ); for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) { array_search_sorted_by( &subgraphs, .symbol, *symbol, &subgraph_index, &exists ); if (exists) { AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; if ( subgraph->start_states.size == 0 || *array_back(&subgraph->start_states) != state ) array_push(&subgraph->start_states, state); } } } } } // For each subgraph, compute the preceding states by walking backward // from the end states using the predecessor map. 
Array(AnalysisSubgraphNode) next_nodes = array_new(); for (unsigned i = 0; i < subgraphs.size; i++) { AnalysisSubgraph *subgraph = &subgraphs.contents[i]; if (subgraph->nodes.size == 0) { array_delete(&subgraph->start_states); array_erase(&subgraphs, i); i--; continue; } array_assign(&next_nodes, &subgraph->nodes); while (next_nodes.size > 0) { AnalysisSubgraphNode node = array_pop(&next_nodes); if (node.child_index > 1) { unsigned predecessor_count; const TSStateId *predecessors = state_predecessor_map_get( &predecessor_map, node.state, &predecessor_count ); for (unsigned j = 0; j < predecessor_count; j++) { AnalysisSubgraphNode predecessor_node = { .state = predecessors[j], .child_index = node.child_index - 1, .production_id = node.production_id, .done = false, }; unsigned index, exists; array_search_sorted_with( &subgraph->nodes, analysis_subgraph_node__compare, &predecessor_node, &index, &exists ); if (!exists) { array_insert(&subgraph->nodes, index, predecessor_node); array_push(&next_nodes, predecessor_node); } } } } } #ifdef DEBUG_ANALYZE_QUERY printf("\nSubgraphs:\n"); for (unsigned i = 0; i < subgraphs.size; i++) { AnalysisSubgraph *subgraph = &subgraphs.contents[i]; printf(" %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol)); for (unsigned j = 0; j < subgraph->start_states.size; j++) { printf( " {state: %u}\n", subgraph->start_states.contents[j] ); } for (unsigned j = 0; j < subgraph->nodes.size; j++) { AnalysisSubgraphNode *node = &subgraph->nodes.contents[j]; printf( " {state: %u, child_index: %u, production_id: %u, done: %d}\n", node->state, node->child_index, node->production_id, node->done ); } printf("\n"); } #endif // For each non-terminal pattern, determine if the pattern can successfully match, // and identify all of the possible children within the pattern where matching could fail. 
bool all_patterns_are_valid = true; AnalysisStateSet states = array_new(); AnalysisStateSet next_states = array_new(); AnalysisStateSet deeper_states = array_new(); Array(uint16_t) final_step_indices = array_new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint16_t parent_step_index = parent_step_indices.contents[i]; uint16_t parent_depth = self->steps.contents[parent_step_index].depth; TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; if (parent_symbol == ts_builtin_sym_error) continue; // Find the subgraph that corresponds to this pattern's root symbol. If the pattern's // root symbol is a terminal, then return an error. unsigned subgraph_index, exists; array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); if (!exists) { unsigned first_child_step_index = parent_step_index + 1; uint32_t i, exists; array_search_sorted_by(&self->step_offsets, .step_index, first_child_step_index, &i, &exists); assert(exists); *error_offset = self->step_offsets.contents[i].byte_offset; all_patterns_are_valid = false; break; } // Initialize an analysis state at every parse state in the table where // this parent symbol can occur. AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; array_clear(&states); array_clear(&deeper_states); for (unsigned j = 0; j < subgraph->start_states.size; j++) { TSStateId parse_state = subgraph->start_states.contents[j]; array_push(&states, ((AnalysisState) { .step_index = parent_step_index + 1, .stack = { [0] = { .parse_state = parse_state, .parent_symbol = parent_symbol, .child_index = 0, .field_id = 0, .done = false, }, }, .depth = 1, })); } // Walk the subgraph for this non-terminal, tracking all of the possible // sequences of progress within the pattern. 
bool can_finish_pattern = false; bool did_exceed_max_depth = false; unsigned recursion_depth_limit = 0; unsigned prev_final_step_count = 0; array_clear(&final_step_indices); for (;;) { #ifdef DEBUG_ANALYZE_QUERY printf("Final step indices:"); for (unsigned j = 0; j < final_step_indices.size; j++) { printf(" %4u", final_step_indices.contents[j]); } printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol)); for (unsigned j = 0; j < states.size; j++) { AnalysisState *state = &states.contents[j]; printf(" %3u: step: %u, stack: [", j, state->step_index); for (unsigned k = 0; k < state->depth; k++) { printf( " {%s, child: %u, state: %4u", self->language->symbol_names[state->stack[k].parent_symbol], state->stack[k].child_index, state->stack[k].parse_state ); if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]); if (state->stack[k].done) printf(", DONE"); printf("}"); } printf(" ]\n"); } #endif // If no further progress can be made within the current recursion depth limit, then // bump the depth limit by one, and continue to process the states the exceeded the // limit. But only allow this if progress has been made since the last time the depth // limit was increased. if (states.size == 0) { if (deeper_states.size > 0 && final_step_indices.size > prev_final_step_count) { #ifdef DEBUG_ANALYZE_QUERY printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); #endif prev_final_step_count = final_step_indices.size; recursion_depth_limit++; AnalysisStateSet _states = states; states = deeper_states; deeper_states = _states; continue; } break; } array_clear(&next_states); for (unsigned j = 0; j < states.size; j++) { AnalysisState * const state = &states.contents[j]; // For efficiency, it's important to avoid processing the same analysis state more // than once. To achieve this, keep the states in order of ascending position within // their hypothetical syntax trees. 
In each iteration of this loop, start by advancing // the states that have made the least progress. Avoid advancing states that have already // made more progress. if (next_states.size > 0) { int comparison = analysis_state__compare_position(state, array_back(&next_states)); if (comparison == 0) { array_insert_sorted_with(&next_states, analysis_state__compare, *state); continue; } else if (comparison > 0) { while (j < states.size) { array_push(&next_states, states.contents[j]); j++; } break; } } const TSStateId parse_state = analysis_state__top(state)->parse_state; const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; const TSFieldId parent_field_id = analysis_state__top(state)->field_id; const unsigned child_index = analysis_state__top(state)->child_index; const QueryStep * const step = &self->steps.contents[state->step_index]; unsigned subgraph_index, exists; array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); if (!exists) continue; const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; // Follow every possible path in the parse table, but only visit states that // are part of the subgraph for the current symbol. 
LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state); while (ts_lookahead_iterator_next(&lookahead_iterator)) { TSSymbol sym = lookahead_iterator.symbol; AnalysisSubgraphNode successor = { .state = parse_state, .child_index = child_index, }; if (lookahead_iterator.action_count) { const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1]; if (action->type == TSParseActionTypeShift) { if (!action->shift.extra) { successor.state = action->shift.state; successor.child_index++; } } else { continue; } } else if (lookahead_iterator.next_state != 0) { successor.state = lookahead_iterator.next_state; successor.child_index++; } else { continue; } unsigned node_index; array_search_sorted_with( &subgraph->nodes, analysis_subgraph_node__compare, &successor, &node_index, &exists ); while (node_index < subgraph->nodes.size) { AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; if (node->state != successor.state || node->child_index != successor.child_index) break; // Use the subgraph to determine what alias and field will eventually be applied // to this child node. TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); TSSymbol visible_symbol = alias ? alias : self->language->symbol_metadata[sym].visible ? self->language->public_symbol_map[sym] : 0; TSFieldId field_id = parent_field_id; if (!field_id) { const TSFieldMapEntry *field_map, *field_map_end; ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end); for (; field_map != field_map_end; field_map++) { if (!field_map->inherited && field_map->child_index == child_index) { field_id = field_map->field_id; break; } } } // Create a new state that has advanced past this hypothetical subtree. 
AnalysisState next_state = *state; AnalysisStateEntry *next_state_top = analysis_state__top(&next_state); next_state_top->child_index = successor.child_index; next_state_top->parse_state = successor.state; if (node->done) next_state_top->done = true; // Determine if this hypothetical child node would match the current step // of the query pattern. bool does_match = false; if (visible_symbol) { does_match = true; if (step->symbol == WILDCARD_SYMBOL) { if ( step->is_named && !self->language->symbol_metadata[visible_symbol].named ) does_match = false; } else if (step->symbol != visible_symbol) { does_match = false; } if (step->field && step->field != field_id) { does_match = false; } if ( step->supertype_symbol && !analysis_state__has_supertype(state, step->supertype_symbol) ) does_match = false; } // If this child is hidden, then descend into it and walk through its children. // If the top entry of the stack is at the end of its rule, then that entry can // be replaced. Otherwise, push a new entry onto the stack. else if (sym >= self->language->token_count) { if (!next_state_top->done) { if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) { #ifdef DEBUG_ANALYZE_QUERY printf("Exceeded depth limit for state %u\n", j); #endif did_exceed_max_depth = true; continue; } next_state.depth++; next_state_top = analysis_state__top(&next_state); } *next_state_top = (AnalysisStateEntry) { .parse_state = parse_state, .parent_symbol = sym, .child_index = 0, .field_id = field_id, .done = false, }; if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { array_insert_sorted_with(&deeper_states, analysis_state__compare, next_state); continue; } } // Pop from the stack when this state reached the end of its current syntax node. 
while (next_state.depth > 0 && next_state_top->done) { next_state.depth--; next_state_top = analysis_state__top(&next_state); } // If this hypothetical child did match the current step of the query pattern, // then advance to the next step at the current depth. This involves skipping // over any descendant steps of the current child. const QueryStep *next_step = step; if (does_match) { for (;;) { next_state.step_index++; next_step = &self->steps.contents[next_state.step_index]; if ( next_step->depth == PATTERN_DONE_MARKER || next_step->depth <= parent_depth + 1 ) break; } } else if (successor.state == parse_state) { continue; } for (;;) { // Skip pass-through states. Although these states have alternatives, they are only // used to implement repetitions, and query analysis does not need to process // repetitions in order to determine whether steps are possible and definite. if (next_step->is_pass_through) { next_state.step_index++; next_step++; continue; } // If the pattern is finished or hypothetical parent node is complete, then // record that matching can terminate at this step of the pattern. Otherwise, // add this state to the list of states to process on the next iteration. if (!next_step->is_dead_end) { bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1; if (did_finish_pattern) can_finish_pattern = true; if (did_finish_pattern || next_state.depth == 0) { array_insert_sorted_by(&final_step_indices, , next_state.step_index); } else { array_insert_sorted_with(&next_states, analysis_state__compare, next_state); } } // If the state has advanced to a step with an alternative step, then add another state // at that alternative step. This process is simpler than the process of actually matching a // pattern during query exection, because for the purposes of query analysis, there is no // need to process repetitions. 
if ( does_match && next_step->alternative_index != NONE && next_step->alternative_index > next_state.step_index ) { next_state.step_index = next_step->alternative_index; next_step = &self->steps.contents[next_state.step_index]; } else { break; } } } } } AnalysisStateSet _states = states; states = next_states; next_states = _states; } // Mark as indefinite any step where a match terminated. // Later, this property will be propagated to all of the step's predecessors. for (unsigned j = 0; j < final_step_indices.size; j++) { uint32_t final_step_index = final_step_indices.contents[j]; QueryStep *step = &self->steps.contents[final_step_index]; if ( step->depth != PATTERN_DONE_MARKER && step->depth > parent_depth && !step->is_dead_end ) { step->parent_pattern_guaranteed = false; step->root_pattern_guaranteed = false; } } if (did_exceed_max_depth) { for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) { QueryStep *step = &self->steps.contents[j]; if ( step->depth <= parent_depth || step->depth == PATTERN_DONE_MARKER ) break; if (!step->is_dead_end) { step->parent_pattern_guaranteed = false; step->root_pattern_guaranteed = false; } } } // If this pattern cannot match, store the pattern index so that it can be // returned to the caller. if (all_patterns_are_valid && !can_finish_pattern && !did_exceed_max_depth) { assert(final_step_indices.size > 0); uint16_t impossible_step_index = *array_back(&final_step_indices); uint32_t i, exists; array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists); if (i >= self->step_offsets.size) i = self->step_offsets.size - 1; *error_offset = self->step_offsets.contents[i].byte_offset; all_patterns_are_valid = false; break; } } // Mark as indefinite any step with captures that are used in predicates. 
Array(uint16_t) predicate_capture_ids = array_new(); for (unsigned i = 0; i < self->patterns.size; i++) { QueryPattern *pattern = &self->patterns.contents[i]; // Gather all of the captures that are used in predicates for this pattern. array_clear(&predicate_capture_ids); for ( unsigned start = pattern->predicate_steps.offset, end = start + pattern->predicate_steps.length, j = start; j < end; j++ ) { TSQueryPredicateStep *step = &self->predicate_steps.contents[j]; if (step->type == TSQueryPredicateStepTypeCapture) { array_insert_sorted_by(&predicate_capture_ids, , step->value_id); } } // Find all of the steps that have these captures. for ( unsigned start = pattern->steps.offset, end = start + pattern->steps.length, j = start; j < end; j++ ) { QueryStep *step = &self->steps.contents[j]; for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) { uint16_t capture_id = step->capture_ids[k]; if (capture_id == NONE) break; unsigned index, exists; array_search_sorted_by(&predicate_capture_ids, , capture_id, &index, &exists); if (exists) { step->root_pattern_guaranteed = false; break; } } } } // Propagate fallibility. If a pattern is fallible at a given step, then it is // fallible at all of its preceding steps. bool done = self->steps.size == 0; while (!done) { done = true; for (unsigned i = self->steps.size - 1; i > 0; i--) { QueryStep *step = &self->steps.contents[i]; if (step->depth == PATTERN_DONE_MARKER) continue; // Determine if this step is definite or has definite alternatives. bool parent_pattern_guaranteed = false; for (;;) { if (step->root_pattern_guaranteed) { parent_pattern_guaranteed = true; break; } if (step->alternative_index == NONE || step->alternative_index < i) { break; } step = &self->steps.contents[step->alternative_index]; } // If not, mark its predecessor as indefinite. 
if (!parent_pattern_guaranteed) { QueryStep *prev_step = &self->steps.contents[i - 1]; if ( !prev_step->is_dead_end && prev_step->depth != PATTERN_DONE_MARKER && prev_step->root_pattern_guaranteed ) { prev_step->root_pattern_guaranteed = false; done = false; } } } } #ifdef DEBUG_ANALYZE_QUERY printf("Steps:\n"); for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; if (step->depth == PATTERN_DONE_MARKER) { printf(" %u: DONE\n", i); } else { printf( " %u: {symbol: %s, field: %s, depth: %u, parent_pattern_guaranteed: %d, root_pattern_guaranteed: %d}\n", i, (step->symbol == WILDCARD_SYMBOL) ? "ANY" : ts_language_symbol_name(self->language, step->symbol), (step->field ? ts_language_field_name_for_id(self->language, step->field) : "-"), step->depth, step->parent_pattern_guaranteed, step->root_pattern_guaranteed ); } } #endif // Cleanup for (unsigned i = 0; i < subgraphs.size; i++) { array_delete(&subgraphs.contents[i].start_states); array_delete(&subgraphs.contents[i].nodes); } array_delete(&subgraphs); array_delete(&next_nodes); array_delete(&states); array_delete(&next_states); array_delete(&deeper_states); array_delete(&final_step_indices); array_delete(&parent_step_indices); array_delete(&predicate_capture_ids); state_predecessor_map_delete(&predecessor_map); return all_patterns_are_valid; } static void ts_query__add_negated_fields( TSQuery *self, uint16_t step_index, TSFieldId *field_ids, uint16_t field_count ) { QueryStep *step = &self->steps.contents[step_index]; // The negated field array stores a list of field lists, separated by zeros. // Try to find the start index of an existing list that matches this new list. bool failed_match = false; unsigned match_count = 0; unsigned start_i = 0; for (unsigned i = 0; i < self->negated_fields.size; i++) { TSFieldId existing_field_id = self->negated_fields.contents[i]; // At each zero value, terminate the match attempt. 
If we've exactly // matched the new field list, then reuse this index. Otherwise, // start over the matching process. if (existing_field_id == 0) { if (match_count == field_count) { step->negated_field_list_id = start_i; return; } else { start_i = i + 1; match_count = 0; failed_match = false; } } // If the existing list matches our new list so far, then advance // to the next element of the new list. else if ( match_count < field_count && existing_field_id == field_ids[match_count] && !failed_match ) { match_count++; } // Otherwise, this existing list has failed to match. else { match_count = 0; failed_match = true; } } step->negated_field_list_id = self->negated_fields.size; array_extend(&self->negated_fields, field_count, field_ids); array_push(&self->negated_fields, 0); } static TSQueryError ts_query__parse_string_literal( TSQuery *self, Stream *stream ) { const char *string_start = stream->input; if (stream->next != '"') return TSQueryErrorSyntax; stream_advance(stream); const char *prev_position = stream->input; bool is_escaped = false; array_clear(&self->string_buffer); for (;;) { if (is_escaped) { is_escaped = false; switch (stream->next) { case 'n': array_push(&self->string_buffer, '\n'); break; case 'r': array_push(&self->string_buffer, '\r'); break; case 't': array_push(&self->string_buffer, '\t'); break; case '0': array_push(&self->string_buffer, '\0'); break; default: array_extend(&self->string_buffer, stream->next_size, stream->input); break; } prev_position = stream->input + stream->next_size; } else { if (stream->next == '\\') { array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); prev_position = stream->input + 1; is_escaped = true; } else if (stream->next == '"') { array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); stream_advance(stream); return TSQueryErrorNone; } else if (stream->next == '\n') { stream_reset(stream, string_start); return TSQueryErrorSyntax; } } if 
(!stream_advance(stream)) { stream_reset(stream, string_start); return TSQueryErrorSyntax; } } } // Parse a single predicate associated with a pattern, adding it to the // query's internal `predicate_steps` array. Predicates are arbitrary // S-expressions associated with a pattern which are meant to be handled at // a higher level of abstraction, such as the Rust/JavaScript bindings. They // can contain '@'-prefixed capture names, double-quoted strings, and bare // symbols, which also represent strings. static TSQueryError ts_query__parse_predicate( TSQuery *self, Stream *stream ) { if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; const char *predicate_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - predicate_name; uint16_t id = symbol_table_insert_name( &self->predicate_values, predicate_name, length ); array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); stream_skip_whitespace(stream); for (;;) { if (stream->next == ')') { stream_advance(stream); stream_skip_whitespace(stream); array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeDone, .value_id = 0, })); break; } // Parse an '@'-prefixed capture name else if (stream->next == '@') { stream_advance(stream); // Parse the capture name if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; const char *capture_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - capture_name; // Add the capture id to the first step of the pattern int capture_id = symbol_table_id_for_name( &self->captures, capture_name, length ); if (capture_id == -1) { stream_reset(stream, capture_name); return TSQueryErrorCapture; } array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeCapture, .value_id = capture_id, })); } // Parse a string literal else if (stream->next == '"') { TSQueryError e = 
ts_query__parse_string_literal(self, stream); if (e) return e; uint16_t id = symbol_table_insert_name( &self->predicate_values, self->string_buffer.contents, self->string_buffer.size ); array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); } // Parse a bare symbol else if (stream_is_ident_start(stream)) { const char *symbol_start = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - symbol_start; uint16_t id = symbol_table_insert_name( &self->predicate_values, symbol_start, length ); array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); } else { return TSQueryErrorSyntax; } stream_skip_whitespace(stream); } return 0; } // Read one S-expression pattern from the stream, and incorporate it into // the query's internal state machine representation. For nested patterns, // this function calls itself recursively. static TSQueryError ts_query__parse_pattern( TSQuery *self, Stream *stream, uint32_t depth, bool is_immediate ) { if (stream->next == 0) return TSQueryErrorSyntax; if (stream->next == ')' || stream->next == ']') return PARENT_DONE; const uint32_t starting_step_index = self->steps.size; // Store the byte offset of each step in the query. if ( self->step_offsets.size == 0 || array_back(&self->step_offsets)->step_index != starting_step_index ) { array_push(&self->step_offsets, ((StepOffset) { .step_index = starting_step_index, .byte_offset = stream_offset(stream), })); } // An open bracket is the start of an alternation. if (stream->next == '[') { stream_advance(stream); stream_skip_whitespace(stream); // Parse each branch, and add a placeholder step in between the branches. 
Array(uint32_t) branch_step_indices = array_new(); for (;;) { uint32_t start_index = self->steps.size; TSQueryError e = ts_query__parse_pattern( self, stream, depth, is_immediate ); if (e == PARENT_DONE) { if (stream->next == ']' && branch_step_indices.size > 0) { stream_advance(stream); break; } e = TSQueryErrorSyntax; } if (e) { array_delete(&branch_step_indices); return e; } array_push(&branch_step_indices, start_index); array_push(&self->steps, query_step__new(0, depth, false)); } (void)array_pop(&self->steps); // For all of the branches except for the last one, add the subsequent branch as an // alternative, and link the end of the branch to the current end of the steps. for (unsigned i = 0; i < branch_step_indices.size - 1; i++) { uint32_t step_index = branch_step_indices.contents[i]; uint32_t next_step_index = branch_step_indices.contents[i + 1]; QueryStep *start_step = &self->steps.contents[step_index]; QueryStep *end_step = &self->steps.contents[next_step_index - 1]; start_step->alternative_index = next_step_index; end_step->alternative_index = self->steps.size; end_step->is_dead_end = true; } array_delete(&branch_step_indices); } // An open parenthesis can be the start of three possible constructs: // * A grouped sequence // * A predicate // * A named node else if (stream->next == '(') { stream_advance(stream); stream_skip_whitespace(stream); // If this parenthesis is followed by a node, then it represents a grouped sequence. if (stream->next == '(' || stream->next == '"' || stream->next == '[') { bool child_is_immediate = false; for (;;) { if (stream->next == '.') { child_is_immediate = true; stream_advance(stream); stream_skip_whitespace(stream); } TSQueryError e = ts_query__parse_pattern( self, stream, depth, child_is_immediate ); if (e == PARENT_DONE) { if (stream->next == ')') { stream_advance(stream); break; } e = TSQueryErrorSyntax; } if (e) return e; child_is_immediate = false; } } // A dot/pound character indicates the start of a predicate. 
else if (stream->next == '.' || stream->next == '#') { stream_advance(stream); return ts_query__parse_predicate(self, stream); } // Otherwise, this parenthesis is the start of a named node. else { TSSymbol symbol; // Parse a normal node name if (stream_is_ident_start(stream)) { const char *node_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - node_name; // TODO - remove. // For temporary backward compatibility, handle predicates without the leading '#' sign. if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) { stream_reset(stream, node_name); return ts_query__parse_predicate(self, stream); } // Parse the wildcard symbol else if (length == 1 && node_name[0] == '_') { symbol = WILDCARD_SYMBOL; } else { symbol = ts_language_symbol_for_name( self->language, node_name, length, true ); if (!symbol) { stream_reset(stream, node_name); return TSQueryErrorNodeType; } } } else { return TSQueryErrorSyntax; } // Add a step for the node. 
array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); QueryStep *step = array_back(&self->steps); if (ts_language_symbol_metadata(self->language, symbol).supertype) { step->supertype_symbol = step->symbol; step->symbol = WILDCARD_SYMBOL; } if (symbol == WILDCARD_SYMBOL) { step->is_named = true; } stream_skip_whitespace(stream); if (stream->next == '/') { stream_advance(stream); if (!stream_is_ident_start(stream)) { return TSQueryErrorSyntax; } const char *node_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - node_name; step->symbol = ts_language_symbol_for_name( self->language, node_name, length, true ); if (!step->symbol) { stream_reset(stream, node_name); return TSQueryErrorNodeType; } stream_skip_whitespace(stream); } // Parse the child patterns bool child_is_immediate = false; uint16_t last_child_step_index = 0; uint16_t negated_field_count = 0; TSFieldId negated_field_ids[MAX_NEGATED_FIELD_COUNT]; for (;;) { // Parse a negated field assertion if (stream->next == '!') { stream_advance(stream); stream_skip_whitespace(stream); if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; const char *field_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - field_name; stream_skip_whitespace(stream); TSFieldId field_id = ts_language_field_id_for_name( self->language, field_name, length ); if (!field_id) { stream->input = field_name; return TSQueryErrorField; } // Keep the field ids sorted. 
if (negated_field_count < MAX_NEGATED_FIELD_COUNT) { negated_field_ids[negated_field_count] = field_id; negated_field_count++; } continue; } // Parse a sibling anchor if (stream->next == '.') { child_is_immediate = true; stream_advance(stream); stream_skip_whitespace(stream); } uint16_t step_index = self->steps.size; TSQueryError e = ts_query__parse_pattern( self, stream, depth + 1, child_is_immediate ); if (e == PARENT_DONE) { if (stream->next == ')') { if (child_is_immediate) { if (last_child_step_index == 0) return TSQueryErrorSyntax; self->steps.contents[last_child_step_index].is_last_child = true; } if (negated_field_count) { ts_query__add_negated_fields( self, starting_step_index, negated_field_ids, negated_field_count ); } stream_advance(stream); break; } e = TSQueryErrorSyntax; } if (e) return e; last_child_step_index = step_index; child_is_immediate = false; } } } // Parse a wildcard pattern else if (stream->next == '_') { stream_advance(stream); stream_skip_whitespace(stream); // Add a step that matches any kind of node array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); } // Parse a double-quoted anonymous leaf node expression else if (stream->next == '"') { const char *string_start = stream->input; TSQueryError e = ts_query__parse_string_literal(self, stream); if (e) return e; // Add a step for the node TSSymbol symbol = ts_language_symbol_for_name( self->language, self->string_buffer.contents, self->string_buffer.size, false ); if (!symbol) { stream_reset(stream, string_start + 1); return TSQueryErrorNodeType; } array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); } // Parse a field-prefixed pattern else if (stream_is_ident_start(stream)) { // Parse the field name const char *field_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - field_name; stream_skip_whitespace(stream); if (stream->next != ':') { stream_reset(stream, field_name); return TSQueryErrorSyntax; } 
stream_advance(stream); stream_skip_whitespace(stream); // Parse the pattern TSQueryError e = ts_query__parse_pattern( self, stream, depth, is_immediate ); if (e == PARENT_DONE) return TSQueryErrorSyntax; if (e) return e; // Add the field name to the first step of the pattern TSFieldId field_id = ts_language_field_id_for_name( self->language, field_name, length ); if (!field_id) { stream->input = field_name; return TSQueryErrorField; } uint32_t step_index = starting_step_index; QueryStep *step = &self->steps.contents[step_index]; for (;;) { step->field = field_id; if ( step->alternative_index != NONE && step->alternative_index > step_index && step->alternative_index < self->steps.size ) { step_index = step->alternative_index; step = &self->steps.contents[step_index]; } else { break; } } } else { return TSQueryErrorSyntax; } stream_skip_whitespace(stream); // Parse suffixes modifiers for this pattern for (;;) { // Parse the one-or-more operator. if (stream->next == '+') { stream_advance(stream); stream_skip_whitespace(stream); QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); repeat_step.alternative_index = starting_step_index; repeat_step.is_pass_through = true; repeat_step.alternative_is_immediate = true; array_push(&self->steps, repeat_step); } // Parse the zero-or-more repetition operator. else if (stream->next == '*') { stream_advance(stream); stream_skip_whitespace(stream); QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); repeat_step.alternative_index = starting_step_index; repeat_step.is_pass_through = true; repeat_step.alternative_is_immediate = true; array_push(&self->steps, repeat_step); QueryStep *step = &self->steps.contents[starting_step_index]; while (step->alternative_index != NONE) { step = &self->steps.contents[step->alternative_index]; } step->alternative_index = self->steps.size; } // Parse the optional operator. 
// Compile the query text `source` (of length `source_len`) for `language`.
//
// On success, returns a newly allocated TSQuery. On failure, returns NULL and
// stores the reason in `*error_type`:
//   * TSQueryErrorLanguage  - `language` is NULL, or its version lies outside
//     [TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION, TREE_SITTER_LANGUAGE_VERSION].
//     `*error_offset` is not written on this path.
//   * the error produced by ts_query__parse_pattern (with PARENT_DONE mapped
//     to TSQueryErrorSyntax) - `*error_offset` is set to the stream offset at
//     which parsing stopped.
//   * TSQueryErrorStructure - ts_query__analyze_patterns rejected the query;
//     `error_offset` is handed to that function.
// In the last two cases the partially built query is released with
// ts_query_delete before returning.
TSQuery *ts_query_new(
  const TSLanguage *language,
  const char *source,
  uint32_t source_len,
  uint32_t *error_offset,
  TSQueryError *error_type
) {
  if (
    !language ||
    language->version > TREE_SITTER_LANGUAGE_VERSION ||
    language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION
  ) {
    *error_type = TSQueryErrorLanguage;
    return NULL;
  }

  TSQuery *self = ts_malloc(sizeof(TSQuery));
  *self = (TSQuery) {
    .steps = array_new(),
    .pattern_map = array_new(),
    .captures = symbol_table_new(),
    .predicate_values = symbol_table_new(),
    .predicate_steps = array_new(),
    .patterns = array_new(),
    .step_offsets = array_new(),
    .string_buffer = array_new(),
    .negated_fields = array_new(),
    .wildcard_root_pattern_count = 0,
    .language = language,
  };

  // Seed the negated-field list with a single 0 entry.
  // NOTE(review): presumably this reserves list id 0 to mean "no negated
  // fields", since the cursor only reads the list when
  // `negated_field_list_id` is non-zero - confirm against the parser.
  array_push(&self->negated_fields, 0);

  // Parse all of the S-expressions in the given string.
  Stream stream = stream_new(source, source_len);
  stream_skip_whitespace(&stream);
  while (stream.input < stream.end) {
    // Remember where this pattern's steps and predicate steps begin, so that
    // their lengths can be filled in once the pattern has been parsed.
    uint32_t pattern_index = self->patterns.size;
    uint32_t start_step_index = self->steps.size;
    uint32_t start_predicate_step_index = self->predicate_steps.size;
    array_push(&self->patterns, ((QueryPattern) {
      .steps = (Slice) {.offset = start_step_index},
      .predicate_steps = (Slice) {.offset = start_predicate_step_index},
      .start_byte = stream_offset(&stream),
    }));
    *error_type = ts_query__parse_pattern(self, &stream, 0, false);

    // Terminate the pattern with a sentinel step whose depth is
    // PATTERN_DONE_MARKER. This is pushed whether or not parsing succeeded.
    array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false));

    QueryPattern *pattern = array_back(&self->patterns);
    pattern->steps.length = self->steps.size - start_step_index;
    pattern->predicate_steps.length = self->predicate_steps.size - start_predicate_step_index;

    // If any pattern could not be parsed, then report the error information
    // and terminate.
    if (*error_type) {
      if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax;
      *error_offset = stream_offset(&stream);
      ts_query_delete(self);
      return NULL;
    }

    // Maintain a map that can look up patterns for a given root symbol.
    // Each iteration inserts one entry; the loop repeats for every
    // alternative reachable from the pattern's root step.
    uint16_t wildcard_root_alternative_index = NONE;
    for (;;) {
      QueryStep *step = &self->steps.contents[start_step_index];

      // If a pattern has a wildcard at its root, but it has a non-wildcard child,
      // then optimize the matching process by skipping matching the wildcard.
      // Later, during the matching process, the query cursor will check that
      // there is a parent node, and capture it if necessary.
      if (step->symbol == WILDCARD_SYMBOL && step->depth == 0 && !step->field) {
        QueryStep *second_step = &self->steps.contents[start_step_index + 1];
        if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1) {
          // Save the skipped wildcard's alternative so that it is still
          // visited after the entry for the second step has been inserted.
          wildcard_root_alternative_index = step->alternative_index;
          start_step_index += 1;
          step = second_step;
        }
      }

      // Determine whether the pattern has a single root node. This affects
      // decisions about whether or not to start matching the pattern when
      // a query cursor has a range restriction.
      bool is_rooted = true;
      uint32_t start_depth = step->depth;
      for (uint32_t step_index = start_step_index + 1; step_index < self->steps.size; step_index++) {
        QueryStep *step = &self->steps.contents[step_index];
        if (step->depth == start_depth) {
          is_rooted = false;
          break;
        }
      }

      ts_query__pattern_map_insert(self, step->symbol, (PatternEntry) {
        .step_index = start_step_index,
        .pattern_index = pattern_index,
        .is_rooted = is_rooted
      });
      // Count the entries whose (possibly adjusted) first step is a wildcard.
      if (step->symbol == WILDCARD_SYMBOL) {
        self->wildcard_root_pattern_count++;
      }

      // If there are alternatives or options at the root of the pattern,
      // then add multiple entries to the pattern map.
      if (step->alternative_index != NONE) {
        // The root step's alternative link is cleared once it has been
        // turned into its own pattern-map entry.
        start_step_index = step->alternative_index;
        step->alternative_index = NONE;
      } else if (wildcard_root_alternative_index != NONE) {
        start_step_index = wildcard_root_alternative_index;
        wildcard_root_alternative_index = NONE;
      } else {
        break;
      }
    }
  }

  if (!ts_query__analyze_patterns(self, error_offset)) {
    *error_type = TSQueryErrorStructure;
    ts_query_delete(self);
    return NULL;
  }

  // The string buffer is released before the finished query is returned.
  // NOTE(review): presumably it is only scratch space for parsing - confirm.
  array_delete(&self->string_buffer);
  return self;
}
symbol_table_name_for_id(&self->captures, index, length); } const char *ts_query_string_value_for_id( const TSQuery *self, uint32_t index, uint32_t *length ) { return symbol_table_name_for_id(&self->predicate_values, index, length); } const TSQueryPredicateStep *ts_query_predicates_for_pattern( const TSQuery *self, uint32_t pattern_index, uint32_t *step_count ) { Slice slice = self->patterns.contents[pattern_index].predicate_steps; *step_count = slice.length; if (self->predicate_steps.contents == NULL) { return NULL; } return &self->predicate_steps.contents[slice.offset]; } uint32_t ts_query_start_byte_for_pattern( const TSQuery *self, uint32_t pattern_index ) { return self->patterns.contents[pattern_index].start_byte; } bool ts_query_is_pattern_guaranteed_at_step( const TSQuery *self, uint32_t byte_offset ) { uint32_t step_index = UINT32_MAX; for (unsigned i = 0; i < self->step_offsets.size; i++) { StepOffset *step_offset = &self->step_offsets.contents[i]; if (step_offset->byte_offset > byte_offset) break; step_index = step_offset->step_index; } if (step_index < self->steps.size) { return self->steps.contents[step_index].root_pattern_guaranteed; } else { return false; } } bool ts_query__step_is_fallible( const TSQuery *self, uint16_t step_index ) { assert((uint32_t)step_index + 1 < self->steps.size); QueryStep *step = &self->steps.contents[step_index]; QueryStep *next_step = &self->steps.contents[step_index + 1]; return ( next_step->depth != PATTERN_DONE_MARKER && next_step->depth > step->depth && !next_step->parent_pattern_guaranteed ); } void ts_query_disable_capture( TSQuery *self, const char *name, uint32_t length ) { // Remove capture information for any pattern step that previously // captured with the given name. 
int id = symbol_table_id_for_name(&self->captures, name, length); if (id != -1) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; query_step__remove_capture(step, id); } } } void ts_query_disable_pattern( TSQuery *self, uint32_t pattern_index ) { // Remove the given pattern from the pattern map. Its steps will still // be in the `steps` array, but they will never be read. for (unsigned i = 0; i < self->pattern_map.size; i++) { PatternEntry *pattern = &self->pattern_map.contents[i]; if (pattern->pattern_index == pattern_index) { array_erase(&self->pattern_map, i); i--; } } } /*************** * QueryCursor ***************/ TSQueryCursor *ts_query_cursor_new(void) { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { .did_exceed_match_limit = false, .ascending = false, .halted = false, .states = array_new(), .finished_states = array_new(), .capture_list_pool = capture_list_pool_new(), .start_byte = 0, .end_byte = UINT32_MAX, .start_point = {0, 0}, .end_point = POINT_MAX, }; array_reserve(&self->states, 8); array_reserve(&self->finished_states, 8); return self; } void ts_query_cursor_delete(TSQueryCursor *self) { array_delete(&self->states); array_delete(&self->finished_states); ts_tree_cursor_delete(&self->cursor); capture_list_pool_delete(&self->capture_list_pool); ts_free(self); } bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) { return self->did_exceed_match_limit; } uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self) { return self->capture_list_pool.max_capture_list_count; } void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) { self->capture_list_pool.max_capture_list_count = limit; } void ts_query_cursor_exec( TSQueryCursor *self, const TSQuery *query, TSNode node ) { array_clear(&self->states); array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); capture_list_pool_reset(&self->capture_list_pool); 
self->next_state_id = 0; self->depth = 0; self->ascending = false; self->halted = false; self->query = query; self->did_exceed_match_limit = false; } void ts_query_cursor_set_byte_range( TSQueryCursor *self, uint32_t start_byte, uint32_t end_byte ) { if (end_byte == 0) { end_byte = UINT32_MAX; } self->start_byte = start_byte; self->end_byte = end_byte; } void ts_query_cursor_set_point_range( TSQueryCursor *self, TSPoint start_point, TSPoint end_point ) { if (end_point.row == 0 && end_point.column == 0) { end_point = POINT_MAX; } self->start_point = start_point; self->end_point = end_point; } // Search through all of the in-progress states, and find the captured // node that occurs earliest in the document. static bool ts_query_cursor__first_in_progress_capture( TSQueryCursor *self, uint32_t *state_index, uint32_t *byte_offset, uint32_t *pattern_index, bool *root_pattern_guaranteed ) { bool result = false; *state_index = UINT32_MAX; *byte_offset = UINT32_MAX; *pattern_index = UINT32_MAX; for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i]; if (state->dead) continue; const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); if (state->consumed_capture_count >= captures->size) { continue; } TSNode node = captures->contents[state->consumed_capture_count].node; if ( ts_node_end_byte(node) <= self->start_byte || point_lte(ts_node_end_point(node), self->start_point) ) { state->consumed_capture_count++; i--; continue; } uint32_t node_start_byte = ts_node_start_byte(node); if ( !result || node_start_byte < *byte_offset || (node_start_byte == *byte_offset && state->pattern_index < *pattern_index) ) { QueryStep *step = &self->query->steps.contents[state->step_index]; if (root_pattern_guaranteed) { *root_pattern_guaranteed = step->root_pattern_guaranteed; } else if (step->root_pattern_guaranteed) { continue; } result = true; *state_index = i; *byte_offset = node_start_byte; 
*pattern_index = state->pattern_index; } } return result; } // Determine which node is first in a depth-first traversal int ts_query_cursor__compare_nodes(TSNode left, TSNode right) { if (left.id != right.id) { uint32_t left_start = ts_node_start_byte(left); uint32_t right_start = ts_node_start_byte(right); if (left_start < right_start) return -1; if (left_start > right_start) return 1; uint32_t left_node_count = ts_node_end_byte(left); uint32_t right_node_count = ts_node_end_byte(right); if (left_node_count > right_node_count) return -1; if (left_node_count < right_node_count) return 1; } return 0; } // Determine if either state contains a superset of the other state's captures. void ts_query_cursor__compare_captures( TSQueryCursor *self, QueryState *left_state, QueryState *right_state, bool *left_contains_right, bool *right_contains_left ) { const CaptureList *left_captures = capture_list_pool_get( &self->capture_list_pool, left_state->capture_list_id ); const CaptureList *right_captures = capture_list_pool_get( &self->capture_list_pool, right_state->capture_list_id ); *left_contains_right = true; *right_contains_left = true; unsigned i = 0, j = 0; for (;;) { if (i < left_captures->size) { if (j < right_captures->size) { TSQueryCapture *left = &left_captures->contents[i]; TSQueryCapture *right = &right_captures->contents[j]; if (left->node.id == right->node.id && left->index == right->index) { i++; j++; } else { switch (ts_query_cursor__compare_nodes(left->node, right->node)) { case -1: *right_contains_left = false; i++; break; case 1: *left_contains_right = false; j++; break; default: *right_contains_left = false; *left_contains_right = false; i++; j++; break; } } } else { *right_contains_left = false; break; } } else { if (j < right_captures->size) { *left_contains_right = false; } break; } } } #ifdef DEBUG_EXECUTE_QUERY #define LOG(...) fprintf(stderr, __VA_ARGS__) #else #define LOG(...) 
#endif static void ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { QueryStep *step = &self->query->steps.contents[pattern->step_index]; uint32_t start_depth = self->depth - step->depth; // Keep the states array in ascending order of start_depth and pattern_index, // so that it can be processed more efficiently elsewhere. Usually, there is // no work to do here because of two facts: // * States with lower start_depth are naturally added first due to the // order in which nodes are visited. // * Earlier patterns are naturally added first because of the ordering of the // pattern_map data structure that's used to initiate matches. // // This loop is only needed in cases where two conditions hold: // * A pattern consists of more than one sibling node, so that its states // remain in progress after exiting the node that started the match. // * The first node in the pattern matches against multiple nodes at the // same depth. // // An example of this is the pattern '((comment)* (function))'. If multiple // `comment` nodes appear in a row, then we may initiate a new state for this // pattern while another state for the same pattern is already in progress. // If there are multiple patterns like this in a query, then this loop will // need to execute in order to keep the states ordered by pattern_index. uint32_t index = self->states.size; while (index > 0) { QueryState *prev_state = &self->states.contents[index - 1]; if (prev_state->start_depth < start_depth) break; if (prev_state->start_depth == start_depth) { // Avoid inserting an unnecessary duplicate state, which would be // immediately pruned by the longest-match criteria. if ( prev_state->pattern_index == pattern->pattern_index && prev_state->step_index == pattern->step_index ) return; if (prev_state->pattern_index <= pattern->pattern_index) break; } index--; } LOG( " start state. 
pattern:%u, step:%u\n", pattern->pattern_index, pattern->step_index ); array_insert(&self->states, index, ((QueryState) { .id = UINT32_MAX, .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, .start_depth = start_depth, .consumed_capture_count = 0, .seeking_immediate_match = true, .has_in_progress_alternatives = false, .needs_parent = step->depth == 1, .dead = false, })); } // Acquire a capture list for this state. If there are no capture lists left in the // pool, this will steal the capture list from another existing state, and mark that // other state as 'dead'. static CaptureList *ts_query_cursor__prepare_to_capture( TSQueryCursor *self, QueryState *state, unsigned state_index_to_preserve ) { if (state->capture_list_id == NONE) { state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); // If there are no capture lists left in the pool, then terminate whichever // state has captured the earliest node in the document, and steal its // capture list. if (state->capture_list_id == NONE) { self->did_exceed_match_limit = true; uint32_t state_index, byte_offset, pattern_index; if ( ts_query_cursor__first_in_progress_capture( self, &state_index, &byte_offset, &pattern_index, NULL ) && state_index != state_index_to_preserve ) { LOG( " abandon state. 
index:%u, pattern:%u, offset:%u.\n", state_index, pattern_index, byte_offset ); QueryState *other_state = &self->states.contents[state_index]; state->capture_list_id = other_state->capture_list_id; other_state->capture_list_id = NONE; other_state->dead = true; CaptureList *list = capture_list_pool_get_mut( &self->capture_list_pool, state->capture_list_id ); array_clear(list); return list; } else { LOG(" ran out of capture lists"); return NULL; } } } return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id); } static void ts_query_cursor__capture( TSQueryCursor *self, QueryState *state, QueryStep *step, TSNode node ) { if (state->dead) return; CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX); if (!capture_list) { state->dead = true; return; } for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; array_push(capture_list, ((TSQueryCapture) { node, capture_id })); LOG( " capture node. type:%s, pattern:%u, capture_id:%u, capture_count:%u\n", ts_node_type(node), state->pattern_index, capture_id, capture_list->size ); } } // Duplicate the given state and insert the newly-created state immediately after // the given state in the `states` array. Ensures that the given state reference is // still valid, even if the states array is reallocated. static QueryState *ts_query_cursor__copy_state( TSQueryCursor *self, QueryState **state_ref ) { const QueryState *state = *state_ref; uint32_t state_index = state - self->states.contents; QueryState copy = *state; copy.capture_list_id = NONE; // If the state has captures, copy its capture list. 
if (state->capture_list_id != NONE) { CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, ©, state_index); if (!new_captures) return NULL; const CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); array_push_all(new_captures, old_captures); } array_insert(&self->states, state_index + 1, copy); *state_ref = &self->states.contents[state_index]; return &self->states.contents[state_index + 1]; } // Walk the tree, processing patterns until at least one pattern finishes, // If one or more patterns finish, return `true` and store their states in the // `finished_states` array. Multiple patterns can finish on the same node. If // there are no more matches, return `false`. static inline bool ts_query_cursor__advance( TSQueryCursor *self, bool stop_on_definite_step ) { bool did_match = false; for (;;) { if (self->halted) { while (self->states.size > 0) { QueryState state = array_pop(&self->states); capture_list_pool_release( &self->capture_list_pool, state.capture_list_id ); } } if (did_match || self->halted) return did_match; // Exit the current node. if (self->ascending) { LOG( "leave node. depth:%u, type:%s\n", self->depth, ts_node_type(ts_tree_cursor_current_node(&self->cursor)) ); // Leave this node by stepping to its next sibling or to its parent. if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = false; } else if (ts_tree_cursor_goto_parent(&self->cursor)) { self->depth--; } else { LOG("halt at root\n"); self->halted = true; } // After leaving a node, remove any states that cannot make further progress. uint32_t deleted_count = 0; for (unsigned i = 0, n = self->states.size; i < n; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; // If a state completed its pattern inside of this node, but was deferred from finishing // in order to search for longer matches, mark it as finished. 
if (step->depth == PATTERN_DONE_MARKER) { if (state->start_depth > self->depth || self->halted) { LOG(" finish pattern %u\n", state->pattern_index); array_push(&self->finished_states, *state); did_match = true; deleted_count++; continue; } } // If a state needed to match something within this node, then remove that state // as it has failed to match. else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { LOG( " failed to match. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); deleted_count++; continue; } if (deleted_count > 0) { self->states.contents[i - deleted_count] = *state; } } self->states.size -= deleted_count; } // Enter a new node. else { // Get the properties of the current node. TSNode node = ts_tree_cursor_current_node(&self->cursor); TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor); TSSymbol symbol = ts_node_symbol(node); bool is_named = ts_node_is_named(node); bool has_later_siblings; bool has_later_named_siblings; bool can_have_later_siblings_with_this_field; TSFieldId field_id = 0; TSSymbol supertypes[8] = {0}; unsigned supertype_count = 8; ts_tree_cursor_current_status( &self->cursor, &field_id, &has_later_siblings, &has_later_named_siblings, &can_have_later_siblings_with_this_field, supertypes, &supertype_count ); LOG( "enter node. 
depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", self->depth, ts_node_type(node), ts_language_field_name_for_id(self->query->language, field_id), ts_node_start_point(node).row, self->states.size, self->finished_states.size ); bool node_intersects_range = ( ts_node_end_byte(node) > self->start_byte && ts_node_start_byte(node) < self->end_byte && point_gt(ts_node_end_point(node), self->start_point) && point_lt(ts_node_start_point(node), self->end_point) ); bool parent_intersects_range = ts_node_is_null(parent_node) || ( ts_node_end_byte(parent_node) > self->start_byte && ts_node_start_byte(parent_node) < self->end_byte && point_gt(ts_node_end_point(parent_node), self->start_point) && point_lt(ts_node_start_point(parent_node), self->end_point) ); // Add new states for any patterns whose root node is a wildcard. for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { PatternEntry *pattern = &self->query->pattern_map.contents[i]; // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. QueryStep *step = &self->query->steps.contents[pattern->step_index]; if ( (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) && (!step->field || field_id == step->field) && (!step->supertype_symbol || supertype_count > 0) ) { ts_query_cursor__add_state(self, pattern); } } // Add new states for any patterns whose root node matches this node. unsigned i; if (ts_query__pattern_map_search(self->query, symbol, &i)) { PatternEntry *pattern = &self->query->pattern_map.contents[i]; QueryStep *step = &self->query->steps.contents[pattern->step_index]; do { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. 
if ( (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) && (!step->field || field_id == step->field) ) { ts_query_cursor__add_state(self, pattern); } // Advance to the next pattern whose root node matches this node. i++; if (i == self->query->pattern_map.size) break; pattern = &self->query->pattern_map.contents[i]; step = &self->query->steps.contents[pattern->step_index]; } while (step->symbol == symbol); } // Update all of the in-progress states with current node. for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; state->has_in_progress_alternatives = false; copy_count = 0; // Check that the node matches all of the criteria for the next // step of the pattern. if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; // Determine if this node matches this step of the pattern, and also // if this node can have later siblings that match this step of the // pattern. 
bool node_does_match = false; if (step->symbol == WILDCARD_SYMBOL) { node_does_match = is_named || !step->is_named; } else { node_does_match = symbol == step->symbol; } bool later_sibling_can_match = has_later_siblings; if ((step->is_immediate && is_named) || state->seeking_immediate_match) { later_sibling_can_match = false; } if (step->is_last_child && has_later_named_siblings) { node_does_match = false; } if (step->supertype_symbol) { bool has_supertype = false; for (unsigned j = 0; j < supertype_count; j++) { if (supertypes[j] == step->supertype_symbol) { has_supertype = true; break; } } if (!has_supertype) node_does_match = false; } if (step->field) { if (step->field == field_id) { if (!can_have_later_siblings_with_this_field) { later_sibling_can_match = false; } } else { node_does_match = false; } } if (step->negated_field_list_id) { TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id]; for (;;) { TSFieldId negated_field_id = *negated_field_ids; if (negated_field_id) { negated_field_ids++; if (ts_node_child_by_field_id(node, negated_field_id).id) { node_does_match = false; break; } } else { break; } } } // Remove states immediately if it is ever clear that they cannot match. if (!node_does_match) { if (!later_sibling_can_match) { LOG( " discard state. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); array_erase(&self->states, i); i--; } continue; } // Some patterns can match their root node in multiple ways, capturing different // children. If this pattern step could match later children within the same // parent, then this query state cannot simply be updated in place. It must be // split into two states: one that matches this node, and one which skips over // this node, to preserve the possibility of matching later siblings. 
if (later_sibling_can_match && ( step->contains_captures || ts_query__step_is_fallible(self->query, state->step_index) )) { if (ts_query_cursor__copy_state(self, &state)) { LOG( " split state for capture. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); copy_count++; } } // If this pattern started with a wildcard, such that the pattern map // actually points to the *second* step of the pattern, then check // that the node has a parent, and capture the parent node if necessary. if (state->needs_parent) { TSNode parent = ts_tree_cursor_parent_node(&self->cursor); if (ts_node_is_null(parent)) { LOG(" missing parent node\n"); state->dead = true; } else { state->needs_parent = false; QueryStep *skipped_wildcard_step = step; do { skipped_wildcard_step--; } while ( skipped_wildcard_step->is_dead_end || skipped_wildcard_step->is_pass_through || skipped_wildcard_step->depth > 0 ); if (skipped_wildcard_step->capture_ids[0] != NONE) { LOG(" capture wildcard parent\n"); ts_query_cursor__capture( self, state, skipped_wildcard_step, parent ); } } } // If the current node is captured in this pattern, add it to the capture list. if (step->capture_ids[0] != NONE) { ts_query_cursor__capture(self, state, step, node); } if (state->dead) { array_erase(&self->states, i); i--; continue; } // Advance this state to the next step of its pattern. state->step_index++; state->seeking_immediate_match = false; LOG( " advance state. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); QueryStep *next_step = &self->query->steps.contents[state->step_index]; if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true; // If this state's next step has an alternative step, then copy the state in order // to pursue both alternatives. The alternative step itself may have an alternative, // so this is an interative process. 
unsigned end_index = i + 1; for (unsigned j = i; j < end_index; j++) { QueryState *state = &self->states.contents[j]; QueryStep *next_step = &self->query->steps.contents[state->step_index]; if (next_step->alternative_index != NONE) { // A "dead-end" step exists only to add a non-sequential jump into the step sequence, // via its alternative index. When a state reaches a dead-end step, it jumps straight // to the step's alternative. if (next_step->is_dead_end) { state->step_index = next_step->alternative_index; j--; continue; } // A "pass-through" step exists only to add a branch into the step sequence, // via its alternative_index. When a state reaches a pass-through step, it splits // in order to process the alternative step, and then it advances to the next step. if (next_step->is_pass_through) { state->step_index++; j--; } QueryState *copy = ts_query_cursor__copy_state(self, &state); if (copy) { LOG( " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", copy->pattern_index, copy->step_index, next_step->alternative_index, next_step->alternative_is_immediate, capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size ); end_index++; copy_count++; copy->step_index = next_step->alternative_index; if (next_step->alternative_is_immediate) { copy->seeking_immediate_match = true; } } } } } for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i]; if (state->dead) { array_erase(&self->states, i); i--; continue; } // Enfore the longest-match criteria. When a query pattern contains optional or // repeated nodes, this is necessary to avoid multiple redundant states, where // one state has a strict subset of another state's captures. bool did_remove = false; for (unsigned j = i + 1; j < self->states.size; j++) { QueryState *other_state = &self->states.contents[j]; // Query states are kept in ascending order of start_depth and pattern_index. 
// Since the longest-match criteria is only used for deduping matches of the same // pattern and root node, we only need to perform pairwise comparisons within a // small slice of the states array. if ( other_state->start_depth != state->start_depth || other_state->pattern_index != state->pattern_index ) break; bool left_contains_right, right_contains_left; ts_query_cursor__compare_captures( self, state, other_state, &left_contains_right, &right_contains_left ); if (left_contains_right) { if (state->step_index == other_state->step_index) { LOG( " drop shorter state. pattern: %u, step_index: %u\n", state->pattern_index, state->step_index ); capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); array_erase(&self->states, j); j--; continue; } other_state->has_in_progress_alternatives = true; } if (right_contains_left) { if (state->step_index == other_state->step_index) { LOG( " drop shorter state. pattern: %u, step_index: %u\n", state->pattern_index, state->step_index ); capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); array_erase(&self->states, i); i--; did_remove = true; break; } state->has_in_progress_alternatives = true; } } // If the state is at the end of its pattern, remove it from the list // of in-progress states and add it to the list of finished states. if (!did_remove) { LOG( " keep state. 
pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", state->pattern_index, state->start_depth, state->step_index, capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size ); QueryStep *next_step = &self->query->steps.contents[state->step_index]; if (next_step->depth == PATTERN_DONE_MARKER) { if (state->has_in_progress_alternatives) { LOG(" defer finishing pattern %u\n", state->pattern_index); } else { LOG(" finish pattern %u\n", state->pattern_index); array_push(&self->finished_states, *state); array_erase(&self->states, state - self->states.contents); did_match = true; i--; } } } } // When the current node ends prior to the desired start offset, // only descend for the purpose of continuing in-progress matches. bool should_descend = node_intersects_range; if (!should_descend) { for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i];; QueryStep *next_step = &self->query->steps.contents[state->step_index]; if ( next_step->depth != PATTERN_DONE_MARKER && state->start_depth + next_step->depth > self->depth ) { should_descend = true; break; } } } if (!should_descend) { LOG( " not descending. 
node end byte: %u, start byte: %u\n", ts_node_end_byte(node), self->start_byte ); } if (should_descend && ts_tree_cursor_goto_first_child(&self->cursor)) { self->depth++; } else { self->ascending = true; } } } } bool ts_query_cursor_next_match( TSQueryCursor *self, TSQueryMatch *match ) { if (self->finished_states.size == 0) { if (!ts_query_cursor__advance(self, false)) { return false; } } QueryState *state = &self->finished_states.contents[0]; if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); match->captures = captures->contents; match->capture_count = captures->size; capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); array_erase(&self->finished_states, 0); return true; } void ts_query_cursor_remove_match( TSQueryCursor *self, uint32_t match_id ) { for (unsigned i = 0; i < self->finished_states.size; i++) { const QueryState *state = &self->finished_states.contents[i]; if (state->id == match_id) { capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); array_erase(&self->finished_states, i); return; } } } bool ts_query_cursor_next_capture( TSQueryCursor *self, TSQueryMatch *match, uint32_t *capture_index ) { // The goal here is to return captures in order, even though they may not // be discovered in order, because patterns can overlap. Search for matches // until there is a finished capture that is before any unfinished capture. for (;;) { // First, find the earliest capture in an unfinished match. 
uint32_t first_unfinished_capture_byte; uint32_t first_unfinished_pattern_index; uint32_t first_unfinished_state_index; bool first_unfinished_state_is_definite = false; ts_query_cursor__first_in_progress_capture( self, &first_unfinished_state_index, &first_unfinished_capture_byte, &first_unfinished_pattern_index, &first_unfinished_state_is_definite ); // Then find the earliest capture in a finished match. It must occur // before the first capture in an *unfinished* match. QueryState *first_finished_state = NULL; uint32_t first_finished_capture_byte = first_unfinished_capture_byte; uint32_t first_finished_pattern_index = first_unfinished_pattern_index; for (unsigned i = 0; i < self->finished_states.size;) { QueryState *state = &self->finished_states.contents[i]; const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); // Remove states whose captures are all consumed. if (state->consumed_capture_count >= captures->size) { capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); array_erase(&self->finished_states, i); continue; } // Skip captures that precede the cursor's start byte. TSNode node = captures->contents[state->consumed_capture_count].node; if (ts_node_end_byte(node) <= self->start_byte) { state->consumed_capture_count++; continue; } uint32_t node_start_byte = ts_node_start_byte(node); if ( node_start_byte < first_finished_capture_byte || ( node_start_byte == first_finished_capture_byte && state->pattern_index < first_finished_pattern_index ) ) { first_finished_state = state; first_finished_capture_byte = node_start_byte; first_finished_pattern_index = state->pattern_index; } i++; } // If there is finished capture that is clearly before any unfinished // capture, then return its match, and its capture index. Internally // record the fact that the capture has been 'consumed'. 
QueryState *state; if (first_finished_state) { state = first_finished_state; } else if (first_unfinished_state_is_definite) { state = &self->states.contents[first_unfinished_state_index]; } else { state = NULL; } if (state) { if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); match->captures = captures->contents; match->capture_count = captures->size; *capture_index = state->consumed_capture_count; state->consumed_capture_count++; return true; } if (capture_list_pool_is_empty(&self->capture_list_pool)) { LOG( " abandon state. index:%u, pattern:%u, offset:%u.\n", first_unfinished_state_index, first_unfinished_pattern_index, first_unfinished_capture_byte ); capture_list_pool_release( &self->capture_list_pool, self->states.contents[first_unfinished_state_index].capture_list_id ); array_erase(&self->states, first_unfinished_state_index); } // If there are no finished matches that are ready to be returned, then // continue finding more matches. 
if ( !ts_query_cursor__advance(self, true) && self->finished_states.size == 0 ) return false; } } #undef LOG tree-sitter-0.20.1/src/reduce_action.h000064400000000000000000000014040072674642500157460ustar 00000000000000#ifndef TREE_SITTER_REDUCE_ACTION_H_ #define TREE_SITTER_REDUCE_ACTION_H_ #ifdef __cplusplus extern "C" { #endif #include "./array.h" #include "tree_sitter/api.h" typedef struct { uint32_t count; TSSymbol symbol; int dynamic_precedence; unsigned short production_id; } ReduceAction; typedef Array(ReduceAction) ReduceActionSet; static inline void ts_reduce_action_set_add(ReduceActionSet *self, ReduceAction new_action) { for (uint32_t i = 0; i < self->size; i++) { ReduceAction action = self->contents[i]; if (action.symbol == new_action.symbol && action.count == new_action.count) return; } array_push(self, new_action); } #ifdef __cplusplus } #endif #endif // TREE_SITTER_REDUCE_ACTION_H_ tree-sitter-0.20.1/src/reusable_node.h000064400000000000000000000052730072674642500157610ustar 00000000000000#include "./subtree.h" typedef struct { Subtree tree; uint32_t child_index; uint32_t byte_offset; } StackEntry; typedef struct { Array(StackEntry) stack; Subtree last_external_token; } ReusableNode; static inline ReusableNode reusable_node_new(void) { return (ReusableNode) {array_new(), NULL_SUBTREE}; } static inline void reusable_node_clear(ReusableNode *self) { array_clear(&self->stack); self->last_external_token = NULL_SUBTREE; } static inline Subtree reusable_node_tree(ReusableNode *self) { return self->stack.size > 0 ? self->stack.contents[self->stack.size - 1].tree : NULL_SUBTREE; } static inline uint32_t reusable_node_byte_offset(ReusableNode *self) { return self->stack.size > 0 ? 
self->stack.contents[self->stack.size - 1].byte_offset : UINT32_MAX; } static inline void reusable_node_delete(ReusableNode *self) { array_delete(&self->stack); } static inline void reusable_node_advance(ReusableNode *self) { StackEntry last_entry = *array_back(&self->stack); uint32_t byte_offset = last_entry.byte_offset + ts_subtree_total_bytes(last_entry.tree); if (ts_subtree_has_external_tokens(last_entry.tree)) { self->last_external_token = ts_subtree_last_external_token(last_entry.tree); } Subtree tree; uint32_t next_index; do { StackEntry popped_entry = array_pop(&self->stack); next_index = popped_entry.child_index + 1; if (self->stack.size == 0) return; tree = array_back(&self->stack)->tree; } while (ts_subtree_child_count(tree) <= next_index); array_push(&self->stack, ((StackEntry) { .tree = ts_subtree_children(tree)[next_index], .child_index = next_index, .byte_offset = byte_offset, })); } static inline bool reusable_node_descend(ReusableNode *self) { StackEntry last_entry = *array_back(&self->stack); if (ts_subtree_child_count(last_entry.tree) > 0) { array_push(&self->stack, ((StackEntry) { .tree = ts_subtree_children(last_entry.tree)[0], .child_index = 0, .byte_offset = last_entry.byte_offset, })); return true; } else { return false; } } static inline void reusable_node_advance_past_leaf(ReusableNode *self) { while (reusable_node_descend(self)) {} reusable_node_advance(self); } static inline void reusable_node_reset(ReusableNode *self, Subtree tree) { reusable_node_clear(self); array_push(&self->stack, ((StackEntry) { .tree = tree, .child_index = 0, .byte_offset = 0, })); // Never reuse the root node, because it has a non-standard internal structure // due to transformations that are applied when it is accepted: adding the EOF // child and any extra children. 
if (!reusable_node_descend(self)) { reusable_node_clear(self); } } tree-sitter-0.20.1/src/stack.c000064400000000000000000000665450072674642500142630ustar 00000000000000#include "./alloc.h" #include "./language.h" #include "./subtree.h" #include "./array.h" #include "./stack.h" #include "./length.h" #include #include #define MAX_LINK_COUNT 8 #define MAX_NODE_POOL_SIZE 50 #define MAX_ITERATOR_COUNT 64 #if defined _WIN32 && !defined __GNUC__ #define inline __forceinline #else #define inline static inline __attribute__((always_inline)) #endif typedef struct StackNode StackNode; typedef struct { StackNode *node; Subtree subtree; bool is_pending; } StackLink; struct StackNode { TSStateId state; Length position; StackLink links[MAX_LINK_COUNT]; short unsigned int link_count; uint32_t ref_count; unsigned error_cost; unsigned node_count; int dynamic_precedence; }; typedef struct { StackNode *node; SubtreeArray subtrees; uint32_t subtree_count; bool is_pending; } StackIterator; typedef struct { void *payload; StackIterateCallback callback; } StackIterateSession; typedef Array(StackNode *) StackNodeArray; typedef enum { StackStatusActive, StackStatusPaused, StackStatusHalted, } StackStatus; typedef struct { StackNode *node; Subtree last_external_token; StackSummary *summary; unsigned node_count_at_last_error; TSSymbol lookahead_when_paused; StackStatus status; } StackHead; struct Stack { Array(StackHead) heads; StackSliceArray slices; Array(StackIterator) iterators; StackNodeArray node_pool; StackNode *base_node; SubtreePool *subtree_pool; }; typedef unsigned StackAction; enum { StackActionNone, StackActionStop = 1, StackActionPop = 2, }; typedef StackAction (*StackCallback)(void *, const StackIterator *); static void stack_node_retain(StackNode *self) { if (!self) return; assert(self->ref_count > 0); self->ref_count++; assert(self->ref_count != 0); } static void stack_node_release(StackNode *self, StackNodeArray *pool, SubtreePool *subtree_pool) { recur: 
assert(self->ref_count != 0); self->ref_count--; if (self->ref_count > 0) return; StackNode *first_predecessor = NULL; if (self->link_count > 0) { for (unsigned i = self->link_count - 1; i > 0; i--) { StackLink link = self->links[i]; if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); stack_node_release(link.node, pool, subtree_pool); } StackLink link = self->links[0]; if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); first_predecessor = self->links[0].node; } if (pool->size < MAX_NODE_POOL_SIZE) { array_push(pool, self); } else { ts_free(self); } if (first_predecessor) { self = first_predecessor; goto recur; } } static StackNode *stack_node_new(StackNode *previous_node, Subtree subtree, bool is_pending, TSStateId state, StackNodeArray *pool) { StackNode *node = pool->size > 0 ? array_pop(pool) : ts_malloc(sizeof(StackNode)); *node = (StackNode){.ref_count = 1, .link_count = 0, .state = state}; if (previous_node) { node->link_count = 1; node->links[0] = (StackLink){ .node = previous_node, .subtree = subtree, .is_pending = is_pending, }; node->position = previous_node->position; node->error_cost = previous_node->error_cost; node->dynamic_precedence = previous_node->dynamic_precedence; node->node_count = previous_node->node_count; if (subtree.ptr) { node->error_cost += ts_subtree_error_cost(subtree); node->position = length_add(node->position, ts_subtree_total_size(subtree)); node->node_count += ts_subtree_node_count(subtree); node->dynamic_precedence += ts_subtree_dynamic_precedence(subtree); } } else { node->position = length_zero(); node->error_cost = 0; } return node; } static bool stack__subtree_is_equivalent(Subtree left, Subtree right) { return left.ptr == right.ptr || (left.ptr && right.ptr && ts_subtree_symbol(left) == ts_subtree_symbol(right) && ((ts_subtree_error_cost(left) > 0 && ts_subtree_error_cost(right) > 0) || (ts_subtree_padding(left).bytes == ts_subtree_padding(right).bytes && ts_subtree_size(left).bytes == 
ts_subtree_size(right).bytes && ts_subtree_child_count(left) == ts_subtree_child_count(right) && ts_subtree_extra(left) == ts_subtree_extra(right) && ts_subtree_external_scanner_state_eq(left, right)))); } static void stack_node_add_link(StackNode *self, StackLink link, SubtreePool *subtree_pool) { if (link.node == self) return; for (int i = 0; i < self->link_count; i++) { StackLink *existing_link = &self->links[i]; if (stack__subtree_is_equivalent(existing_link->subtree, link.subtree)) { // In general, we preserve ambiguities until they are removed from the stack // during a pop operation where multiple paths lead to the same node. But in // the special case where two links directly connect the same pair of nodes, // we can safely remove the ambiguity ahead of time without changing behavior. if (existing_link->node == link.node) { if ( ts_subtree_dynamic_precedence(link.subtree) > ts_subtree_dynamic_precedence(existing_link->subtree) ) { ts_subtree_retain(link.subtree); ts_subtree_release(subtree_pool, existing_link->subtree); existing_link->subtree = link.subtree; self->dynamic_precedence = link.node->dynamic_precedence + ts_subtree_dynamic_precedence(link.subtree); } return; } // If the previous nodes are mergeable, merge them recursively. 
if (existing_link->node->state == link.node->state && existing_link->node->position.bytes == link.node->position.bytes) { for (int j = 0; j < link.node->link_count; j++) { stack_node_add_link(existing_link->node, link.node->links[j], subtree_pool); } int32_t dynamic_precedence = link.node->dynamic_precedence; if (link.subtree.ptr) { dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); } if (dynamic_precedence > self->dynamic_precedence) { self->dynamic_precedence = dynamic_precedence; } return; } } } if (self->link_count == MAX_LINK_COUNT) return; stack_node_retain(link.node); unsigned node_count = link.node->node_count; int dynamic_precedence = link.node->dynamic_precedence; self->links[self->link_count++] = link; if (link.subtree.ptr) { ts_subtree_retain(link.subtree); node_count += ts_subtree_node_count(link.subtree); dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); } if (node_count > self->node_count) self->node_count = node_count; if (dynamic_precedence > self->dynamic_precedence) self->dynamic_precedence = dynamic_precedence; } static void stack_head_delete(StackHead *self, StackNodeArray *pool, SubtreePool *subtree_pool) { if (self->node) { if (self->last_external_token.ptr) { ts_subtree_release(subtree_pool, self->last_external_token); } if (self->summary) { array_delete(self->summary); ts_free(self->summary); } stack_node_release(self->node, pool, subtree_pool); } } static StackVersion ts_stack__add_version(Stack *self, StackVersion original_version, StackNode *node) { StackHead head = { .node = node, .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error, .last_external_token = self->heads.contents[original_version].last_external_token, .status = StackStatusActive, .lookahead_when_paused = 0, }; array_push(&self->heads, head); stack_node_retain(node); if (head.last_external_token.ptr) ts_subtree_retain(head.last_external_token); return (StackVersion)(self->heads.size - 1); } static void 
ts_stack__add_slice(Stack *self, StackVersion original_version, StackNode *node, SubtreeArray *subtrees) { for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { StackVersion version = self->slices.contents[i].version; if (self->heads.contents[version].node == node) { StackSlice slice = {*subtrees, version}; array_insert(&self->slices, i + 1, slice); return; } } StackVersion version = ts_stack__add_version(self, original_version, node); StackSlice slice = { *subtrees, version }; array_push(&self->slices, slice); } inline StackSliceArray stack__iter(Stack *self, StackVersion version, StackCallback callback, void *payload, int goal_subtree_count) { array_clear(&self->slices); array_clear(&self->iterators); StackHead *head = array_get(&self->heads, version); StackIterator iterator = { .node = head->node, .subtrees = array_new(), .subtree_count = 0, .is_pending = true, }; bool include_subtrees = false; if (goal_subtree_count >= 0) { include_subtrees = true; array_reserve(&iterator.subtrees, ts_subtree_alloc_size(goal_subtree_count) / sizeof(Subtree)); } array_push(&self->iterators, iterator); while (self->iterators.size > 0) { for (uint32_t i = 0, size = self->iterators.size; i < size; i++) { StackIterator *iterator = &self->iterators.contents[i]; StackNode *node = iterator->node; StackAction action = callback(payload, iterator); bool should_pop = action & StackActionPop; bool should_stop = action & StackActionStop || node->link_count == 0; if (should_pop) { SubtreeArray subtrees = iterator->subtrees; if (!should_stop) { ts_subtree_array_copy(subtrees, &subtrees); } ts_subtree_array_reverse(&subtrees); ts_stack__add_slice( self, version, node, &subtrees ); } if (should_stop) { if (!should_pop) ts_subtree_array_delete(self->subtree_pool, &iterator->subtrees); array_erase(&self->iterators, i); i--, size--; continue; } for (uint32_t j = 1; j <= node->link_count; j++) { StackIterator *next_iterator; StackLink link; if (j == node->link_count) { link = node->links[0]; 
next_iterator = &self->iterators.contents[i]; } else { if (self->iterators.size >= MAX_ITERATOR_COUNT) continue; link = node->links[j]; StackIterator current_iterator = self->iterators.contents[i]; array_push(&self->iterators, current_iterator); next_iterator = array_back(&self->iterators); ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees); } next_iterator->node = link.node; if (link.subtree.ptr) { if (include_subtrees) { array_push(&next_iterator->subtrees, link.subtree); ts_subtree_retain(link.subtree); } if (!ts_subtree_extra(link.subtree)) { next_iterator->subtree_count++; if (!link.is_pending) { next_iterator->is_pending = false; } } } else { next_iterator->subtree_count++; next_iterator->is_pending = false; } } } } return self->slices; } Stack *ts_stack_new(SubtreePool *subtree_pool) { Stack *self = ts_calloc(1, sizeof(Stack)); array_init(&self->heads); array_init(&self->slices); array_init(&self->iterators); array_init(&self->node_pool); array_reserve(&self->heads, 4); array_reserve(&self->slices, 4); array_reserve(&self->iterators, 4); array_reserve(&self->node_pool, MAX_NODE_POOL_SIZE); self->subtree_pool = subtree_pool; self->base_node = stack_node_new(NULL, NULL_SUBTREE, false, 1, &self->node_pool); ts_stack_clear(self); return self; } void ts_stack_delete(Stack *self) { if (self->slices.contents) array_delete(&self->slices); if (self->iterators.contents) array_delete(&self->iterators); stack_node_release(self->base_node, &self->node_pool, self->subtree_pool); for (uint32_t i = 0; i < self->heads.size; i++) { stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); } array_clear(&self->heads); if (self->node_pool.contents) { for (uint32_t i = 0; i < self->node_pool.size; i++) ts_free(self->node_pool.contents[i]); array_delete(&self->node_pool); } array_delete(&self->heads); ts_free(self); } uint32_t ts_stack_version_count(const Stack *self) { return self->heads.size; } TSStateId ts_stack_state(const Stack 
*self, StackVersion version) { return array_get(&self->heads, version)->node->state; } Length ts_stack_position(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->node->position; } Subtree ts_stack_last_external_token(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->last_external_token; } void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token) { StackHead *head = array_get(&self->heads, version); if (token.ptr) ts_subtree_retain(token); if (head->last_external_token.ptr) ts_subtree_release(self->subtree_pool, head->last_external_token); head->last_external_token = token; } unsigned ts_stack_error_cost(const Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); unsigned result = head->node->error_cost; if ( head->status == StackStatusPaused || (head->node->state == ERROR_STATE && !head->node->links[0].subtree.ptr)) { result += ERROR_COST_PER_RECOVERY; } return result; } unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); if (head->node->node_count < head->node_count_at_last_error) { head->node_count_at_last_error = head->node->node_count; } return head->node->node_count - head->node_count_at_last_error; } void ts_stack_push(Stack *self, StackVersion version, Subtree subtree, bool pending, TSStateId state) { StackHead *head = array_get(&self->heads, version); StackNode *new_node = stack_node_new(head->node, subtree, pending, state, &self->node_pool); if (!subtree.ptr) head->node_count_at_last_error = new_node->node_count; head->node = new_node; } inline StackAction iterate_callback(void *payload, const StackIterator *iterator) { StackIterateSession *session = payload; session->callback( session->payload, iterator->node->state, iterator->subtree_count ); return StackActionNone; } void ts_stack_iterate(Stack *self, StackVersion version, 
StackIterateCallback callback, void *payload) { StackIterateSession session = {payload, callback}; stack__iter(self, version, iterate_callback, &session, -1); } inline StackAction pop_count_callback(void *payload, const StackIterator *iterator) { unsigned *goal_subtree_count = payload; if (iterator->subtree_count == *goal_subtree_count) { return StackActionPop | StackActionStop; } else { return StackActionNone; } } StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) { return stack__iter(self, version, pop_count_callback, &count, count); } inline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { (void)payload; if (iterator->subtree_count >= 1) { if (iterator->is_pending) { return StackActionPop | StackActionStop; } else { return StackActionStop; } } else { return StackActionNone; } } StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) { StackSliceArray pop = stack__iter(self, version, pop_pending_callback, NULL, 0); if (pop.size > 0) { ts_stack_renumber_version(self, pop.contents[0].version, version); pop.contents[0].version = version; } return pop; } inline StackAction pop_error_callback(void *payload, const StackIterator *iterator) { if (iterator->subtrees.size > 0) { bool *found_error = payload; if (!*found_error && ts_subtree_is_error(iterator->subtrees.contents[0])) { *found_error = true; return StackActionPop | StackActionStop; } else { return StackActionStop; } } else { return StackActionNone; } } SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) { StackNode *node = array_get(&self->heads, version)->node; for (unsigned i = 0; i < node->link_count; i++) { if (node->links[i].subtree.ptr && ts_subtree_is_error(node->links[i].subtree)) { bool found_error = false; StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1); if (pop.size > 0) { assert(pop.size == 1); ts_stack_renumber_version(self, pop.contents[0].version, version); return 
pop.contents[0].subtrees; } break; } } return (SubtreeArray){.size = 0}; } inline StackAction pop_all_callback(void *payload, const StackIterator *iterator) { (void)payload; return iterator->node->link_count == 0 ? StackActionPop : StackActionNone; } StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version) { return stack__iter(self, version, pop_all_callback, NULL, 0); } typedef struct { StackSummary *summary; unsigned max_depth; } SummarizeStackSession; inline StackAction summarize_stack_callback(void *payload, const StackIterator *iterator) { SummarizeStackSession *session = payload; TSStateId state = iterator->node->state; unsigned depth = iterator->subtree_count; if (depth > session->max_depth) return StackActionStop; for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) { StackSummaryEntry entry = session->summary->contents[i]; if (entry.depth < depth) break; if (entry.depth == depth && entry.state == state) return StackActionNone; } array_push(session->summary, ((StackSummaryEntry){ .position = iterator->node->position, .depth = depth, .state = state, })); return StackActionNone; } void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) { SummarizeStackSession session = { .summary = ts_malloc(sizeof(StackSummary)), .max_depth = max_depth }; array_init(session.summary); stack__iter(self, version, summarize_stack_callback, &session, -1); StackHead *head = &self->heads.contents[version]; if (head->summary) { array_delete(head->summary); ts_free(head->summary); } head->summary = session.summary; } StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { return array_get(&self->heads, version)->summary; } int ts_stack_dynamic_precedence(Stack *self, StackVersion version) { return array_get(&self->heads, version)->node->dynamic_precedence; } bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version) { const StackHead *head = array_get(&self->heads, version); const StackNode *node = 
head->node; if (node->error_cost == 0) return true; while (node) { if (node->link_count > 0) { Subtree subtree = node->links[0].subtree; if (subtree.ptr) { if (ts_subtree_total_bytes(subtree) > 0) { return true; } else if ( node->node_count > head->node_count_at_last_error && ts_subtree_error_cost(subtree) == 0 ) { node = node->links[0].node; continue; } } } break; } return false; } void ts_stack_remove_version(Stack *self, StackVersion version) { stack_head_delete(array_get(&self->heads, version), &self->node_pool, self->subtree_pool); array_erase(&self->heads, version); } void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) { if (v1 == v2) return; assert(v2 < v1); assert((uint32_t)v1 < self->heads.size); StackHead *source_head = &self->heads.contents[v1]; StackHead *target_head = &self->heads.contents[v2]; if (target_head->summary && !source_head->summary) { source_head->summary = target_head->summary; target_head->summary = NULL; } stack_head_delete(target_head, &self->node_pool, self->subtree_pool); *target_head = *source_head; array_erase(&self->heads, v1); } void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) { StackHead temporary_head = self->heads.contents[v1]; self->heads.contents[v1] = self->heads.contents[v2]; self->heads.contents[v2] = temporary_head; } StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { assert(version < self->heads.size); array_push(&self->heads, self->heads.contents[version]); StackHead *head = array_back(&self->heads); stack_node_retain(head->node); if (head->last_external_token.ptr) ts_subtree_retain(head->last_external_token); head->summary = NULL; return self->heads.size - 1; } bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) { if (!ts_stack_can_merge(self, version1, version2)) return false; StackHead *head1 = &self->heads.contents[version1]; StackHead *head2 = &self->heads.contents[version2]; for (uint32_t i = 0; i < 
head2->node->link_count; i++) { stack_node_add_link(head1->node, head2->node->links[i], self->subtree_pool); } if (head1->node->state == ERROR_STATE) { head1->node_count_at_last_error = head1->node->node_count; } ts_stack_remove_version(self, version2); return true; } bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) { StackHead *head1 = &self->heads.contents[version1]; StackHead *head2 = &self->heads.contents[version2]; return head1->status == StackStatusActive && head2->status == StackStatusActive && head1->node->state == head2->node->state && head1->node->position.bytes == head2->node->position.bytes && head1->node->error_cost == head2->node->error_cost && ts_subtree_external_scanner_state_eq(head1->last_external_token, head2->last_external_token); } void ts_stack_halt(Stack *self, StackVersion version) { array_get(&self->heads, version)->status = StackStatusHalted; } void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) { StackHead *head = array_get(&self->heads, version); head->status = StackStatusPaused; head->lookahead_when_paused = lookahead; head->node_count_at_last_error = head->node->node_count; } bool ts_stack_is_active(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->status == StackStatusActive; } bool ts_stack_is_halted(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->status == StackStatusHalted; } bool ts_stack_is_paused(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->status == StackStatusPaused; } TSSymbol ts_stack_resume(Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); assert(head->status == StackStatusPaused); TSSymbol result = head->lookahead_when_paused; head->status = StackStatusActive; head->lookahead_when_paused = 0; return result; } void ts_stack_clear(Stack *self) { stack_node_retain(self->base_node); for (uint32_t i = 0; i < self->heads.size; 
i++) { stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); } array_clear(&self->heads); array_push(&self->heads, ((StackHead){ .node = self->base_node, .last_external_token = NULL_SUBTREE, .status = StackStatusActive, .lookahead_when_paused = 0, })); } bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) { array_reserve(&self->iterators, 32); bool was_recording_allocations = ts_toggle_allocation_recording(false); if (!f) f = stderr; fprintf(f, "digraph stack {\n"); fprintf(f, "rankdir=\"RL\";\n"); fprintf(f, "edge [arrowhead=none]\n"); Array(StackNode *) visited_nodes = array_new(); array_clear(&self->iterators); for (uint32_t i = 0; i < self->heads.size; i++) { StackHead *head = &self->heads.contents[i]; if (head->status == StackStatusHalted) continue; fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i); fprintf(f, "node_head_%u -> node_%p [", i, (void *)head->node); if (head->status == StackStatusPaused) { fprintf(f, "color=red "); } fprintf(f, "label=%u, fontcolor=blue, weight=10000, labeltooltip=\"node_count: %u\nerror_cost: %u", i, ts_stack_node_count_since_error(self, i), ts_stack_error_cost(self, i) ); if (head->summary) { fprintf(f, "\nsummary_size: %u", head->summary->size); } if (head->last_external_token.ptr) { const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state; const char *data = ts_external_scanner_state_data(state); fprintf(f, "\nexternal_scanner_state:"); for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]); } fprintf(f, "\"]\n"); array_push(&self->iterators, ((StackIterator){.node = head->node })); } bool all_iterators_done = false; while (!all_iterators_done) { all_iterators_done = true; for (uint32_t i = 0; i < self->iterators.size; i++) { StackIterator iterator = self->iterators.contents[i]; StackNode *node = iterator.node; for (uint32_t j = 0; j < visited_nodes.size; j++) { if (visited_nodes.contents[j] == node) { node = 
NULL; break; } } if (!node) continue; all_iterators_done = false; fprintf(f, "node_%p [", (void *)node); if (node->state == ERROR_STATE) { fprintf(f, "label=\"?\""); } else if ( node->link_count == 1 && node->links[0].subtree.ptr && ts_subtree_extra(node->links[0].subtree) ) { fprintf(f, "shape=point margin=0 label=\"\""); } else { fprintf(f, "label=\"%d\"", node->state); } fprintf( f, " tooltip=\"position: %u,%u\nnode_count:%u\nerror_cost: %u\ndynamic_precedence: %d\"];\n", node->position.extent.row + 1, node->position.extent.column, node->node_count, node->error_cost, node->dynamic_precedence ); for (int j = 0; j < node->link_count; j++) { StackLink link = node->links[j]; fprintf(f, "node_%p -> node_%p [", (void *)node, (void *)link.node); if (link.is_pending) fprintf(f, "style=dashed "); if (link.subtree.ptr && ts_subtree_extra(link.subtree)) fprintf(f, "fontcolor=gray "); if (!link.subtree.ptr) { fprintf(f, "color=red"); } else { fprintf(f, "label=\""); bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree); if (quoted) fprintf(f, "'"); const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree)); for (const char *c = name; *c; c++) { if (*c == '\"' || *c == '\\') fprintf(f, "\\"); fprintf(f, "%c", *c); } if (quoted) fprintf(f, "'"); fprintf(f, "\""); fprintf( f, "labeltooltip=\"error_cost: %u\ndynamic_precedence: %u\"", ts_subtree_error_cost(link.subtree), ts_subtree_dynamic_precedence(link.subtree) ); } fprintf(f, "];\n"); StackIterator *next_iterator; if (j == 0) { next_iterator = &self->iterators.contents[i]; } else { array_push(&self->iterators, iterator); next_iterator = array_back(&self->iterators); } next_iterator->node = link.node; } array_push(&visited_nodes, node); } } fprintf(f, "}\n"); array_delete(&visited_nodes); ts_toggle_allocation_recording(was_recording_allocations); return true; } #undef inline tree-sitter-0.20.1/src/stack.h000064400000000000000000000107110072674642500142500ustar 
00000000000000#ifndef TREE_SITTER_PARSE_STACK_H_ #define TREE_SITTER_PARSE_STACK_H_ #ifdef __cplusplus extern "C" { #endif #include "./array.h" #include "./subtree.h" #include "./error_costs.h" #include typedef struct Stack Stack; typedef unsigned StackVersion; #define STACK_VERSION_NONE ((StackVersion)-1) typedef struct { SubtreeArray subtrees; StackVersion version; } StackSlice; typedef Array(StackSlice) StackSliceArray; typedef struct { Length position; unsigned depth; TSStateId state; } StackSummaryEntry; typedef Array(StackSummaryEntry) StackSummary; // Create a stack. Stack *ts_stack_new(SubtreePool *); // Release the memory reserved for a given stack. void ts_stack_delete(Stack *); // Get the stack's current number of versions. uint32_t ts_stack_version_count(const Stack *); // Get the state at the top of the given version of the stack. If the stack is // empty, this returns the initial state, 0. TSStateId ts_stack_state(const Stack *, StackVersion); // Get the last external token associated with a given version of the stack. Subtree ts_stack_last_external_token(const Stack *, StackVersion); // Set the last external token associated with a given version of the stack. void ts_stack_set_last_external_token(Stack *, StackVersion, Subtree ); // Get the position of the given version of the stack within the document. Length ts_stack_position(const Stack *, StackVersion); // Push a tree and state onto the given version of the stack. // // This transfers ownership of the tree to the Stack. Callers that // need to retain ownership of the tree for their own purposes should // first retain the tree. void ts_stack_push(Stack *, StackVersion, Subtree , bool, TSStateId); // Pop the given number of entries from the given version of the stack. This // operation can increase the number of stack versions by revealing multiple // versions which had previously been merged. 
It returns an array that // specifies the index of each revealed version and the trees that were // removed from that version. StackSliceArray ts_stack_pop_count(Stack *, StackVersion, uint32_t count); // Remove an error at the top of the given version of the stack. SubtreeArray ts_stack_pop_error(Stack *, StackVersion); // Remove any pending trees from the top of the given version of the stack. StackSliceArray ts_stack_pop_pending(Stack *, StackVersion); // Remove any all trees from the given version of the stack. StackSliceArray ts_stack_pop_all(Stack *, StackVersion); // Get the maximum number of tree nodes reachable from this version of the stack // since the last error was detected. unsigned ts_stack_node_count_since_error(const Stack *, StackVersion); int ts_stack_dynamic_precedence(Stack *, StackVersion); bool ts_stack_has_advanced_since_error(const Stack *, StackVersion); // Compute a summary of all the parse states near the top of the given // version of the stack and store the summary for later retrieval. void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth); // Retrieve a summary of all the parse states near the top of the // given version of the stack. StackSummary *ts_stack_get_summary(Stack *, StackVersion); // Get the total cost of all errors on the given version of the stack. unsigned ts_stack_error_cost(const Stack *, StackVersion version); // Merge the given two stack versions if possible, returning true // if they were successfully merged and false otherwise. bool ts_stack_merge(Stack *, StackVersion, StackVersion); // Determine whether the given two stack versions can be merged. 
bool ts_stack_can_merge(Stack *, StackVersion, StackVersion); TSSymbol ts_stack_resume(Stack *, StackVersion); void ts_stack_pause(Stack *, StackVersion, TSSymbol); void ts_stack_halt(Stack *, StackVersion); bool ts_stack_is_active(const Stack *, StackVersion); bool ts_stack_is_paused(const Stack *, StackVersion); bool ts_stack_is_halted(const Stack *, StackVersion); void ts_stack_renumber_version(Stack *, StackVersion, StackVersion); void ts_stack_swap_versions(Stack *, StackVersion, StackVersion); StackVersion ts_stack_copy_version(Stack *, StackVersion); // Remove the given version from the stack. void ts_stack_remove_version(Stack *, StackVersion); void ts_stack_clear(Stack *); bool ts_stack_print_dot_graph(Stack *, const TSLanguage *, FILE *); typedef void (*StackIterateCallback)(void *, TSStateId, uint32_t); void ts_stack_iterate(Stack *, StackVersion, StackIterateCallback, void *); #ifdef __cplusplus } #endif #endif // TREE_SITTER_PARSE_STACK_H_ tree-sitter-0.20.1/src/subtree.c000064400000000000000000001052640072674642500146170ustar 00000000000000#include #include #include #include #include #include #include "./alloc.h" #include "./atomic.h" #include "./subtree.h" #include "./length.h" #include "./language.h" #include "./error_costs.h" #include typedef struct { Length start; Length old_end; Length new_end; } Edit; #define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX #define TS_MAX_TREE_POOL_SIZE 32 static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0}; // ExternalScannerState void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) { self->length = length; if (length > sizeof(self->short_data)) { self->long_data = ts_malloc(length); memcpy(self->long_data, data, length); } else { memcpy(self->short_data, data, length); } } ExternalScannerState ts_external_scanner_state_copy(const ExternalScannerState *self) { ExternalScannerState result = *self; if (self->length > sizeof(self->short_data)) { 
result.long_data = ts_malloc(self->length); memcpy(result.long_data, self->long_data, self->length); } return result; } void ts_external_scanner_state_delete(ExternalScannerState *self) { if (self->length > sizeof(self->short_data)) { ts_free(self->long_data); } } const char *ts_external_scanner_state_data(const ExternalScannerState *self) { if (self->length > sizeof(self->short_data)) { return self->long_data; } else { return self->short_data; } } bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) { return a == b || ( a->length == b->length && !memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length) ); } // SubtreeArray void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) { dest->size = self.size; dest->capacity = self.capacity; dest->contents = self.contents; if (self.capacity > 0) { dest->contents = ts_calloc(self.capacity, sizeof(Subtree)); memcpy(dest->contents, self.contents, self.size * sizeof(Subtree)); for (uint32_t i = 0; i < self.size; i++) { ts_subtree_retain(dest->contents[i]); } } } void ts_subtree_array_clear(SubtreePool *pool, SubtreeArray *self) { for (uint32_t i = 0; i < self->size; i++) { ts_subtree_release(pool, self->contents[i]); } array_clear(self); } void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) { ts_subtree_array_clear(pool, self); array_delete(self); } void ts_subtree_array_remove_trailing_extras( SubtreeArray *self, SubtreeArray *destination ) { array_clear(destination); while (self->size > 0) { Subtree last = self->contents[self->size - 1]; if (ts_subtree_extra(last)) { self->size--; array_push(destination, last); } else { break; } } ts_subtree_array_reverse(destination); } void ts_subtree_array_reverse(SubtreeArray *self) { for (uint32_t i = 0, limit = self->size / 2; i < limit; i++) { size_t reverse_index = self->size - 1 - i; Subtree swap = self->contents[i]; self->contents[i] = self->contents[reverse_index]; 
self->contents[reverse_index] = swap; } } // SubtreePool SubtreePool ts_subtree_pool_new(uint32_t capacity) { SubtreePool self = {array_new(), array_new()}; array_reserve(&self.free_trees, capacity); return self; } void ts_subtree_pool_delete(SubtreePool *self) { if (self->free_trees.contents) { for (unsigned i = 0; i < self->free_trees.size; i++) { ts_free(self->free_trees.contents[i].ptr); } array_delete(&self->free_trees); } if (self->tree_stack.contents) array_delete(&self->tree_stack); } static SubtreeHeapData *ts_subtree_pool_allocate(SubtreePool *self) { if (self->free_trees.size > 0) { return array_pop(&self->free_trees).ptr; } else { return ts_malloc(sizeof(SubtreeHeapData)); } } static void ts_subtree_pool_free(SubtreePool *self, SubtreeHeapData *tree) { if (self->free_trees.capacity > 0 && self->free_trees.size + 1 <= TS_MAX_TREE_POOL_SIZE) { array_push(&self->free_trees, (MutableSubtree) {.ptr = tree}); } else { ts_free(tree); } } // Subtree static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t lookahead_bytes) { return padding.bytes < TS_MAX_INLINE_TREE_LENGTH && padding.extent.row < 16 && padding.extent.column < TS_MAX_INLINE_TREE_LENGTH && size.extent.row == 0 && size.extent.column < TS_MAX_INLINE_TREE_LENGTH && lookahead_bytes < 16; } Subtree ts_subtree_new_leaf( SubtreePool *pool, TSSymbol symbol, Length padding, Length size, uint32_t lookahead_bytes, TSStateId parse_state, bool has_external_tokens, bool depends_on_column, bool is_keyword, const TSLanguage *language ) { TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); bool extra = symbol == ts_builtin_sym_end; bool is_inline = ( symbol <= UINT8_MAX && !has_external_tokens && ts_subtree_can_inline(padding, size, lookahead_bytes) ); if (is_inline) { return (Subtree) {{ .parse_state = parse_state, .symbol = symbol, .padding_bytes = padding.bytes, .padding_rows = padding.extent.row, .padding_columns = padding.extent.column, .size_bytes = size.bytes, 
.lookahead_bytes = lookahead_bytes, .visible = metadata.visible, .named = metadata.named, .extra = extra, .has_changes = false, .is_missing = false, .is_keyword = is_keyword, .is_inline = true, }}; } else { SubtreeHeapData *data = ts_subtree_pool_allocate(pool); *data = (SubtreeHeapData) { .ref_count = 1, .padding = padding, .size = size, .lookahead_bytes = lookahead_bytes, .error_cost = 0, .child_count = 0, .symbol = symbol, .parse_state = parse_state, .visible = metadata.visible, .named = metadata.named, .extra = extra, .fragile_left = false, .fragile_right = false, .has_changes = false, .has_external_tokens = has_external_tokens, .depends_on_column = depends_on_column, .is_missing = false, .is_keyword = is_keyword, {{.first_leaf = {.symbol = 0, .parse_state = 0}}} }; return (Subtree) {.ptr = data}; } } void ts_subtree_set_symbol( MutableSubtree *self, TSSymbol symbol, const TSLanguage *language ) { TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); if (self->data.is_inline) { assert(symbol < UINT8_MAX); self->data.symbol = symbol; self->data.named = metadata.named; self->data.visible = metadata.visible; } else { self->ptr->symbol = symbol; self->ptr->named = metadata.named; self->ptr->visible = metadata.visible; } } Subtree ts_subtree_new_error( SubtreePool *pool, int32_t lookahead_char, Length padding, Length size, uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language ) { Subtree result = ts_subtree_new_leaf( pool, ts_builtin_sym_error, padding, size, bytes_scanned, parse_state, false, false, false, language ); SubtreeHeapData *data = (SubtreeHeapData *)result.ptr; data->fragile_left = true; data->fragile_right = true; data->lookahead_char = lookahead_char; return result; } // Clone a subtree. 
//
// The children and the node's own data share a single allocation, with the
// node data placed directly after the last child. The whole allocation is
// duplicated, every child is retained for the new copy, and the copy starts
// with a reference count of 1. The subtree must use the heap representation,
// since `self.ptr` is dereferenced unconditionally.
MutableSubtree ts_subtree_clone(Subtree self) {
  size_t alloc_size = ts_subtree_alloc_size(self.ptr->child_count);
  Subtree *new_children = ts_malloc(alloc_size);
  Subtree *old_children = ts_subtree_children(self);
  memcpy(new_children, old_children, alloc_size);
  SubtreeHeapData *result = (SubtreeHeapData *)&new_children[self.ptr->child_count];
  if (self.ptr->child_count > 0) {
    for (uint32_t i = 0; i < self.ptr->child_count; i++) {
      ts_subtree_retain(new_children[i]);
    }
  } else if (self.ptr->has_external_tokens) {
    // The memcpy above only duplicated the state's pointer; give the clone
    // its own copy of the external scanner state.
    result->external_scanner_state = ts_external_scanner_state_copy(
      &self.ptr->external_scanner_state
    );
  }
  result->ref_count = 1;
  return (MutableSubtree) {.ptr = result};
}

// Get mutable version of a subtree.
//
// This takes ownership of the subtree. If the subtree has only one owner,
// this will directly convert it into a mutable version. Otherwise, it will
// perform a copy.
MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) {
  // Inline subtrees are plain values, so they can always be mutated freely.
  if (self.data.is_inline) return (MutableSubtree) {self.data};
  if (self.ptr->ref_count == 1) return ts_subtree_to_mut_unsafe(self);
  MutableSubtree result = ts_subtree_clone(self);
  // Give up the caller's reference to the shared original.
  ts_subtree_release(pool, self);
  return result;
}

// Perform up to `count` rotations along the chain of first children of
// `self`, then recompute the summaries of every node that was touched.
//
// A rotation is only applied while the tree, its first child, and that
// child's first child are all uniquely owned heap nodes with at least two
// children, and the child and grandchild have the same symbol as `self`.
// `stack` is used as scratch space and is restored to its initial size
// before returning.
static void ts_subtree__compress(
  MutableSubtree self,
  unsigned count,
  const TSLanguage *language,
  MutableSubtreeArray *stack
) {
  unsigned initial_stack_size = stack->size;

  MutableSubtree tree = self;
  TSSymbol symbol = tree.ptr->symbol;
  for (unsigned i = 0; i < count; i++) {
    if (tree.ptr->ref_count > 1 || tree.ptr->child_count < 2) break;

    MutableSubtree child = ts_subtree_to_mut_unsafe(ts_subtree_children(tree)[0]);
    if (
      child.data.is_inline ||
      child.ptr->child_count < 2 ||
      child.ptr->ref_count > 1 ||
      child.ptr->symbol != symbol
    ) break;

    MutableSubtree grandchild = ts_subtree_to_mut_unsafe(ts_subtree_children(child)[0]);
    if (
      grandchild.data.is_inline ||
      grandchild.ptr->child_count < 2 ||
      grandchild.ptr->ref_count > 1 ||
      grandchild.ptr->symbol != symbol
    ) break;

    // Rotate: the grandchild becomes the tree's first child, the child takes
    // over the grandchild's last child as its first child, and the child
    // itself becomes the grandchild's last child.
    ts_subtree_children(tree)[0] = ts_subtree_from_mut(grandchild);
    ts_subtree_children(child)[0] = ts_subtree_children(grandchild)[grandchild.ptr->child_count - 1];
    ts_subtree_children(grandchild)[grandchild.ptr->child_count - 1] = ts_subtree_from_mut(child);
    array_push(stack, tree);
    tree = grandchild;
  }

  // Revisit the rotated trees in reverse order. After a rotation, `child`
  // below is the former grandchild and `grandchild` is the former child, so
  // the three summaries are recomputed from the deepest node upwards.
  while (stack->size > initial_stack_size) {
    tree = array_pop(stack);
    MutableSubtree child = ts_subtree_to_mut_unsafe(ts_subtree_children(tree)[0]);
    MutableSubtree grandchild = ts_subtree_to_mut_unsafe(ts_subtree_children(child)[child.ptr->child_count - 1]);
    ts_subtree_summarize_children(grandchild, language);
    ts_subtree_summarize_children(child, language);
    ts_subtree_summarize_children(tree, language);
  }
}

// Walk the tree rooted at `self` and compress nodes whose first child has a
// greater repeat depth than their last child.
//
// The traversal is iterative and uses `pool->tree_stack` as its work list.
// Only nodes that have children and a reference count of exactly 1 are
// visited, because they are modified in place.
void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *language) {
  array_clear(&pool->tree_stack);

  if (ts_subtree_child_count(self) > 0 && self.ptr->ref_count == 1) {
    array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self));
  }

  while (pool->tree_stack.size > 0) {
    MutableSubtree tree = array_pop(&pool->tree_stack);

    if (tree.ptr->repeat_depth > 0) {
      Subtree child1 = ts_subtree_children(tree)[0];
      Subtree child2 = ts_subtree_children(tree)[tree.ptr->child_count - 1];
      long repeat_delta = (long)ts_subtree_repeat_depth(child1) - (long)ts_subtree_repeat_depth(child2);
      if (repeat_delta > 0) {
        // Compress in passes whose rotation counts halve each time.
        unsigned n = repeat_delta;
        for (unsigned i = n / 2; i > 0; i /= 2) {
          ts_subtree__compress(tree, i, language, &pool->tree_stack);
          n -= i;
        }
      }
    }

    for (uint32_t i = 0; i < tree.ptr->child_count; i++) {
      Subtree child = ts_subtree_children(tree)[i];
      if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) {
        array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child));
      }
    }
  }
}

// Assign all of the node's properties that depend on its children.
//
// This recomputes the node's padding, size, lookahead bytes, error cost,
// visible/named child counts, node count, dynamic precedence, repeat depth,
// first-leaf information, and the `has_external_tokens`, `depends_on_column`
// and fragility flags. The node must use the heap representation.
void ts_subtree_summarize_children(
  MutableSubtree self,
  const TSLanguage *language
) {
  assert(!self.data.is_inline);

  self.ptr->named_child_count = 0;
  self.ptr->visible_child_count = 0;
  self.ptr->error_cost = 0;
  self.ptr->repeat_depth = 0;
  self.ptr->node_count = 1;
  self.ptr->has_external_tokens = false;
  self.ptr->depends_on_column = false;
  self.ptr->dynamic_precedence = 0;

  // Index of the current child among the non-extra children; this is the
  // index used to look up aliases.
  uint32_t structural_index = 0;
  const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id);
  uint32_t lookahead_end_byte = 0;

  const Subtree *children = ts_subtree_children(self);
  for (uint32_t i = 0; i < self.ptr->child_count; i++) {
    Subtree child = children[i];

    // This check runs before `size` is updated for the current child, so it
    // looks at the size accumulated from the preceding children.
    if (
      self.ptr->size.extent.row == 0 &&
      ts_subtree_depends_on_column(child)
    ) {
      self.ptr->depends_on_column = true;
    }

    // The node takes its padding from its first child; its size grows by the
    // total size (padding included) of every later child.
    if (i == 0) {
      self.ptr->padding = ts_subtree_padding(child);
      self.ptr->size = ts_subtree_size(child);
    } else {
      self.ptr->size = length_add(self.ptr->size, ts_subtree_total_size(child));
    }

    // Track the furthest byte that any child looked ahead to.
    uint32_t child_lookahead_end_byte =
      self.ptr->padding.bytes +
      self.ptr->size.bytes +
      ts_subtree_lookahead_bytes(child);
    if (child_lookahead_end_byte > lookahead_end_byte) lookahead_end_byte = child_lookahead_end_byte;

    if (ts_subtree_symbol(child) != ts_builtin_sym_error_repeat) {
      self.ptr->error_cost += ts_subtree_error_cost(child);
    }

    // Inside an error node, each skipped visible tree adds a fixed cost.
    // Invisible children contribute that cost once per visible child of
    // their own. Extra children and childless error leaves add nothing.
    uint32_t grandchild_count = ts_subtree_child_count(child);
    if (self.ptr->symbol == ts_builtin_sym_error || self.ptr->symbol == ts_builtin_sym_error_repeat) {
      if (!ts_subtree_extra(child) && !(ts_subtree_is_error(child) && grandchild_count == 0)) {
        if (ts_subtree_visible(child)) {
          self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE;
        } else if (grandchild_count > 0) {
          self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * child.ptr->visible_child_count;
        }
      }
    }

    self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child);
    self.ptr->node_count += ts_subtree_node_count(child);

    // An aliased non-extra child counts as visible, and as named if the alias
    // symbol is named. Otherwise a visible child counts itself, and an
    // invisible child with children contributes its own counts.
    if (alias_sequence && alias_sequence[structural_index] != 0 && !ts_subtree_extra(child)) {
      self.ptr->visible_child_count++;
      if (ts_language_symbol_metadata(language, alias_sequence[structural_index]).named) {
        self.ptr->named_child_count++;
      }
    } else if (ts_subtree_visible(child)) {
      self.ptr->visible_child_count++;
      if (ts_subtree_named(child)) self.ptr->named_child_count++;
    } else if (grandchild_count > 0) {
      self.ptr->visible_child_count += child.ptr->visible_child_count;
      self.ptr->named_child_count += child.ptr->named_child_count;
    }

    if (ts_subtree_has_external_tokens(child)) self.ptr->has_external_tokens = true;

    // A node that contains an error is fragile on both sides and has no
    // usable parse state.
    if (ts_subtree_is_error(child)) {
      self.ptr->fragile_left = self.ptr->fragile_right = true;
      self.ptr->parse_state = TS_TREE_STATE_NONE;
    }

    if (!ts_subtree_extra(child)) structural_index++;
  }

  // Store the lookahead relative to the end of the node.
  self.ptr->lookahead_bytes = lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes;

  // Error nodes additionally pay a recovery cost plus a cost per skipped
  // byte and per skipped line.
  if (self.ptr->symbol == ts_builtin_sym_error || self.ptr->symbol == ts_builtin_sym_error_repeat) {
    self.ptr->error_cost +=
      ERROR_COST_PER_RECOVERY +
      ERROR_COST_PER_SKIPPED_CHAR * self.ptr->size.bytes +
      ERROR_COST_PER_SKIPPED_LINE * self.ptr->size.extent.row;
  }

  if (self.ptr->child_count > 0) {
    Subtree first_child = children[0];
    Subtree last_child = children[self.ptr->child_count - 1];

    self.ptr->first_leaf.symbol = ts_subtree_leaf_symbol(first_child);
    self.ptr->first_leaf.parse_state = ts_subtree_leaf_parse_state(first_child);

    // Fragility propagates upwards from the outermost children.
    if (ts_subtree_fragile_left(first_child)) self.ptr->fragile_left = true;
    if (ts_subtree_fragile_right(last_child)) self.ptr->fragile_right = true;

    // An invisible, unnamed node whose first child has the same symbol is
    // part of a repetition chain: its depth is one more than the deeper of
    // its first and last children.
    if (
      self.ptr->child_count >= 2 &&
      !self.ptr->visible &&
      !self.ptr->named &&
      ts_subtree_symbol(first_child) == self.ptr->symbol
    ) {
      if (ts_subtree_repeat_depth(first_child) > ts_subtree_repeat_depth(last_child)) {
        self.ptr->repeat_depth = ts_subtree_repeat_depth(first_child) + 1;
      } else {
        self.ptr->repeat_depth = ts_subtree_repeat_depth(last_child) + 1;
      }
    }
  }
}

// Create a new parent node with the given children.
//
// This takes ownership of the children array.
MutableSubtree ts_subtree_new_node( TSSymbol symbol, SubtreeArray *children, unsigned production_id, const TSLanguage *language ) { TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); bool fragile = symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat; // Allocate the node's data at the end of the array of children. size_t new_byte_size = ts_subtree_alloc_size(children->size); if (children->capacity * sizeof(Subtree) < new_byte_size) { children->contents = ts_realloc(children->contents, new_byte_size); children->capacity = new_byte_size / sizeof(Subtree); } SubtreeHeapData *data = (SubtreeHeapData *)&children->contents[children->size]; *data = (SubtreeHeapData) { .ref_count = 1, .symbol = symbol, .child_count = children->size, .visible = metadata.visible, .named = metadata.named, .has_changes = false, .fragile_left = fragile, .fragile_right = fragile, .is_keyword = false, {{ .node_count = 0, .production_id = production_id, .first_leaf = {.symbol = 0, .parse_state = 0}, }} }; MutableSubtree result = {.ptr = data}; ts_subtree_summarize_children(result, language); return result; } // Create a new error node contaning the given children. // // This node is treated as 'extra'. Its children are prevented from having // having any effect on the parse state. Subtree ts_subtree_new_error_node( SubtreeArray *children, bool extra, const TSLanguage *language ) { MutableSubtree result = ts_subtree_new_node( ts_builtin_sym_error, children, 0, language ); result.ptr->extra = extra; return ts_subtree_from_mut(result); } // Create a new 'missing leaf' node. // // This node is treated as 'extra'. Its children are prevented from having // having any effect on the parse state. 
Subtree ts_subtree_new_missing_leaf(
  SubtreePool *pool,
  TSSymbol symbol,
  Length padding,
  const TSLanguage *language
) {
  // Build an ordinary zero-size leaf, then flag it as missing in whichever
  // representation it ended up with.
  Subtree result = ts_subtree_new_leaf(
    pool, symbol, padding, length_zero(), 0,
    0, false, false, false, language
  );
  if (result.data.is_inline) {
    result.data.is_missing = true;
  } else {
    ((SubtreeHeapData *)result.ptr)->is_missing = true;
  }
  return result;
}

// Add a reference to a subtree. Inline subtrees are not reference counted,
// so this is a no-op for them.
void ts_subtree_retain(Subtree self) {
  if (self.data.is_inline) return;
  assert(self.ptr->ref_count > 0);
  atomic_inc((volatile uint32_t *)&self.ptr->ref_count);
  // Guards against the counter wrapping around to zero.
  assert(self.ptr->ref_count != 0);
}

// Drop a reference to a subtree, freeing it and any descendants whose
// reference count reaches zero.
//
// The traversal is iterative and uses `pool->tree_stack` as its work list,
// which is cleared on entry.
void ts_subtree_release(SubtreePool *pool, Subtree self) {
  if (self.data.is_inline) return;
  array_clear(&pool->tree_stack);

  assert(self.ptr->ref_count > 0);
  if (atomic_dec((volatile uint32_t *)&self.ptr->ref_count) == 0) {
    array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self));
  }

  while (pool->tree_stack.size > 0) {
    MutableSubtree tree = array_pop(&pool->tree_stack);
    if (tree.ptr->child_count > 0) {
      Subtree *children = ts_subtree_children(tree);
      for (uint32_t i = 0; i < tree.ptr->child_count; i++) {
        Subtree child = children[i];
        if (child.data.is_inline) continue;
        assert(child.ptr->ref_count > 0);
        if (atomic_dec((volatile uint32_t *)&child.ptr->ref_count) == 0) {
          array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child));
        }
      }
      // A parent's data is stored at the end of its children array (see
      // ts_subtree_new_node), so freeing the array frees the node as well.
      ts_free(children);
    } else {
      if (tree.ptr->has_external_tokens) {
        ts_external_scanner_state_delete(&tree.ptr->external_scanner_state);
      }
      ts_subtree_pool_free(pool, tree.ptr);
    }
  }
}

// Structural equality of two subtrees.
//
// If either subtree is inline, the two values are compared bytewise. Heap
// subtrees are compared by symbol, visibility, namedness, padding and size
// in bytes, and then recursively by their children. Subtrees whose symbol is
// the built-in error symbol are compared by `lookahead_char` instead of by
// their children.
bool ts_subtree_eq(Subtree self, Subtree other) {
  if (self.data.is_inline || other.data.is_inline) {
    return memcmp(&self, &other, sizeof(SubtreeInlineData)) == 0;
  }

  // Two null subtrees are equal; a null and a non-null subtree are not.
  if (self.ptr) {
    if (!other.ptr) return false;
  } else {
    return !other.ptr;
  }

  if (self.ptr->symbol != other.ptr->symbol) return false;
  if (self.ptr->visible != other.ptr->visible) return false;
  if (self.ptr->named != other.ptr->named) return false;
  if (self.ptr->padding.bytes != other.ptr->padding.bytes) return false;
  if (self.ptr->size.bytes != other.ptr->size.bytes) return false;
  if (self.ptr->symbol == ts_builtin_sym_error) return self.ptr->lookahead_char == other.ptr->lookahead_char;
  if (self.ptr->child_count != other.ptr->child_count) return false;
  if (self.ptr->child_count > 0) {
    if (self.ptr->visible_child_count != other.ptr->visible_child_count) return false;
    if (self.ptr->named_child_count != other.ptr->named_child_count) return false;

    for (uint32_t i = 0; i < self.ptr->child_count; i++) {
      if (!ts_subtree_eq(ts_subtree_children(self)[i], ts_subtree_children(other)[i])) {
        return false;
      }
    }
  }
  return true;
}

// Three-way comparison of two subtrees: first by symbol, then by child
// count, then recursively child by child. Returns -1, 0 or 1.
int ts_subtree_compare(Subtree left, Subtree right) {
  if (ts_subtree_symbol(left) < ts_subtree_symbol(right)) return -1;
  if (ts_subtree_symbol(right) < ts_subtree_symbol(left)) return 1;
  if (ts_subtree_child_count(left) < ts_subtree_child_count(right)) return -1;
  if (ts_subtree_child_count(right) < ts_subtree_child_count(left)) return 1;
  for (uint32_t i = 0, n = ts_subtree_child_count(left); i < n; i++) {
    Subtree left_child = ts_subtree_children(left)[i];
    Subtree right_child = ts_subtree_children(right)[i];
    switch (ts_subtree_compare(left_child, right_child)) {
      case -1: return -1;
      case 1: return 1;
      default: break;
    }
  }
  return 0;
}

// Set the `has_changes` flag in whichever representation the subtree uses.
static inline void ts_subtree_set_has_changes(MutableSubtree *self) {
  if (self->data.is_inline) {
    self->data.has_changes = true;
  } else {
    self->ptr->has_changes = true;
  }
}

// Apply an edit to a subtree, adjusting the padding and size of every node
// the edit touches and marking those nodes as changed.
//
// The traversal is iterative. Each stack entry pairs a pointer to a subtree
// slot with the edit translated into that subtree's coordinate space. Nodes
// are made mutable via ts_subtree_make_mut before they are modified, and the
// slot is overwritten with the result. Returns the edited root.
Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool) {
  typedef struct {
    Subtree *tree;
    Edit edit;
  } StackEntry;

  Array(StackEntry) stack = array_new();
  array_push(&stack, ((StackEntry) {
    .tree = &self,
    .edit = (Edit) {
      .start = {edit->start_byte, edit->start_point},
      .old_end = {edit->old_end_byte, edit->old_end_point},
      .new_end = {edit->new_end_byte, edit->new_end_point},
    },
  }));

  while (stack.size) {
    StackEntry entry = array_pop(&stack);
    Edit edit = entry.edit;
    bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes;
    bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes;
    bool invalidate_first_row = ts_subtree_depends_on_column(*entry.tree);

    Length size = ts_subtree_size(*entry.tree);
    Length padding = ts_subtree_padding(*entry.tree);
    uint32_t lookahead_bytes = ts_subtree_lookahead_bytes(*entry.tree);
    uint32_t end_byte = padding.bytes + size.bytes + lookahead_bytes;

    // Skip subtrees that end (lookahead included) before the edit starts.
    if (edit.start.bytes > end_byte || (is_noop && edit.start.bytes == end_byte)) continue;

    // If the edit is entirely within the space before this subtree, then shift this
    // subtree over according to the edit without changing its size.
    if (edit.old_end.bytes <= padding.bytes) {
      padding = length_add(edit.new_end, length_sub(padding, edit.old_end));
    }

    // If the edit starts in the space before this subtree and extends into this subtree,
    // shrink the subtree's content to compensate for the change in the space before it.
    else if (edit.start.bytes < padding.bytes) {
      size = length_sub(size, length_sub(edit.old_end, padding));
      padding = edit.new_end;
    }

    // If the edit is a pure insertion right at the start of the subtree,
    // shift the subtree over according to the insertion.
    else if (edit.start.bytes == padding.bytes && is_pure_insertion) {
      padding = edit.new_end;
    }

    // If the edit is within this subtree, resize the subtree to reflect the edit.
    else {
      uint32_t total_bytes = padding.bytes + size.bytes;
      if (edit.start.bytes < total_bytes ||
         (edit.start.bytes == total_bytes && is_pure_insertion)) {
        size = length_add(
          length_sub(edit.new_end, padding),
          length_sub(size, length_sub(edit.old_end, padding))
        );
      }
    }

    MutableSubtree result = ts_subtree_make_mut(pool, *entry.tree);

    if (result.data.is_inline) {
      if (ts_subtree_can_inline(padding, size, lookahead_bytes)) {
        result.data.padding_bytes = padding.bytes;
        result.data.padding_rows = padding.extent.row;
        result.data.padding_columns = padding.extent.column;
        result.data.size_bytes = size.bytes;
      } else {
        // The new dimensions no longer fit into the inline representation,
        // so convert the leaf into a heap node carrying the same properties.
        SubtreeHeapData *data = ts_subtree_pool_allocate(pool);
        data->ref_count = 1;
        data->padding = padding;
        data->size = size;
        data->lookahead_bytes = lookahead_bytes;
        data->error_cost = 0;
        data->child_count = 0;
        data->symbol = result.data.symbol;
        data->parse_state = result.data.parse_state;
        data->visible = result.data.visible;
        data->named = result.data.named;
        data->extra = result.data.extra;
        data->fragile_left = false;
        data->fragile_right = false;
        data->has_changes = false;
        data->has_external_tokens = false;
        data->depends_on_column = false;
        data->is_missing = result.data.is_missing;
        data->is_keyword = result.data.is_keyword;
        result.ptr = data;
      }
    } else {
      result.ptr->padding = padding;
      result.ptr->size = size;
    }

    ts_subtree_set_has_changes(&result);
    *entry.tree = ts_subtree_from_mut(result);

    // `child_left` and `child_right` are the child's start and end positions
    // relative to the start of this subtree, padding included.
    Length child_left, child_right = length_zero();
    for (uint32_t i = 0, n = ts_subtree_child_count(*entry.tree); i < n; i++) {
      Subtree *child = &ts_subtree_children(*entry.tree)[i];
      Length child_size = ts_subtree_total_size(*child);
      child_left = child_right;
      child_right = length_add(child_left, child_size);

      // If this child ends before the edit, it is not affected.
      if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < edit.start.bytes) continue;

      // Keep editing child nodes until a node is reached that starts after the edit.
      // Also, if this node's validity depends on its column position, then continue
      // invalidating child nodes until reaching a line break.
      if ((
        (child_left.bytes > edit.old_end.bytes) ||
        (child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)
      ) && (
        !invalidate_first_row ||
        child_left.extent.row > entry.tree->ptr->padding.extent.row
      )) {
        break;
      }

      // Transform edit into the child's coordinate space.
      Edit child_edit = {
        .start = length_sub(edit.start, child_left),
        .old_end = length_sub(edit.old_end, child_left),
        .new_end = length_sub(edit.new_end, child_left),
      };

      // Clamp child_edit to the child's bounds.
      if (edit.start.bytes < child_left.bytes) child_edit.start = length_zero();
      if (edit.old_end.bytes < child_left.bytes) child_edit.old_end = length_zero();
      if (edit.new_end.bytes < child_left.bytes) child_edit.new_end = length_zero();
      if (edit.old_end.bytes > child_right.bytes) child_edit.old_end = child_size;

      // Interpret all inserted text as applying to the *first* child that touches the edit.
      // Subsequent children never have any text inserted into them; they are only
      // shrunk to compensate for the edit.
      if (
        child_right.bytes > edit.start.bytes ||
        (child_right.bytes == edit.start.bytes && is_pure_insertion)
      ) {
        edit.new_end = edit.start;
      }

      // Children that occur before the edit are not reshaped by the edit.
      else {
        child_edit.old_end = child_edit.start;
        child_edit.new_end = child_edit.start;
      }

      // Queue processing of this child's subtree.
      array_push(&stack, ((StackEntry) {
        .tree = child,
        .edit = child_edit,
      }));
    }
  }

  array_delete(&stack);
  return self;
}

// Find the last leaf in the tree that has external tokens, by repeatedly
// descending into the last child that has them. Returns NULL_SUBTREE when
// the tree has no external tokens at all.
Subtree ts_subtree_last_external_token(Subtree tree) {
  if (!ts_subtree_has_external_tokens(tree)) return NULL_SUBTREE;
  while (tree.ptr->child_count > 0) {
    // Iterate from the last child down to index 0; `i + 1 > 0` stops the
    // loop once the unsigned index wraps around.
    for (uint32_t i = tree.ptr->child_count - 1; i + 1 > 0; i--) {
      Subtree child = ts_subtree_children(tree)[i];
      if (ts_subtree_has_external_tokens(child)) {
        tree = child;
        break;
      }
    }
  }
  return tree;
}

// Write a readable rendering of the character `c` into `s` (at most `n`
// bytes) and return the value reported by snprintf. -1 is written as
// INVALID, a few control characters as quoted escapes, printable ASCII as a
// quoted character, and everything else as a decimal number.
static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
  if (c == -1)
    return snprintf(s, n, "INVALID");
  else if (c == '\0')
    return snprintf(s, n, "'\\0'");
  else if (c == '\n')
    return snprintf(s, n, "'\\n'");
  else if (c == '\t')
    return snprintf(s, n, "'\\t'");
  else if (c == '\r')
    return snprintf(s, n, "'\\r'");
  else if (0 < c && c < 128 && isprint(c))
    return snprintf(s, n, "'%c'", c);
  else
    return snprintf(s, n, "%d", c);
}

// Write `string` to `f`, escaping double quotes and newlines so that it can
// be embedded in a quoted DOT attribute.
static void ts_subtree__write_dot_string(FILE *f, const char *string) {
  for (const char *c = string; *c; c++) {
    if (*c == '"') {
      fputs("\\\"", f);
    } else if (*c == '\n') {
      fputs("\\n", f);
    } else {
      fputc(*c, f);
    }
  }
}

// Sentinel field name that marks the root call of
// ts_subtree__write_to_string. It is compared by pointer, not by content.
static const char *ROOT_FIELD = "__ROOT__";

// Write the S-expression for `self` into `string` and return its length.
//
// When `limit` is 0, every snprintf call targets `string` with a size of 0,
// so nothing is stored and only the length is accumulated. When `limit` is
// non-zero, output is written at an advancing cursor. `limit` itself is
// passed on unchanged to every call, so the caller must supply a buffer that
// is large enough for the whole output (see ts_subtree_string).
static size_t ts_subtree__write_to_string(
  Subtree self, char *string, size_t limit,
  const TSLanguage *language, bool include_all,
  TSSymbol alias_symbol, bool alias_is_named, const char *field_name
) {
  if (!self.ptr) return snprintf(string, limit, "(NULL)");

  char *cursor = string;
  char **writer = (limit > 0) ? &cursor : &string;
  bool is_root = field_name == ROOT_FIELD;

  // A node is printed if everything is requested, if it is a missing node,
  // or if it is a named alias / a visible named node.
  bool is_visible =
    include_all ||
    ts_subtree_missing(self) ||
    (
      alias_symbol
        ? alias_is_named
        : ts_subtree_visible(self) && ts_subtree_named(self)
    );

  if (is_visible) {
    if (!is_root) {
      cursor += snprintf(*writer, limit, " ");
      if (field_name) {
        cursor += snprintf(*writer, limit, "%s: ", field_name);
      }
    }

    if (ts_subtree_is_error(self) && ts_subtree_child_count(self) == 0 && self.ptr->size.bytes > 0) {
      cursor += snprintf(*writer, limit, "(UNEXPECTED ");
      cursor += ts_subtree__write_char_to_string(*writer, limit, self.ptr->lookahead_char);
    } else {
      TSSymbol symbol = alias_symbol ? alias_symbol : ts_subtree_symbol(self);
      const char *symbol_name = ts_language_symbol_name(language, symbol);
      if (ts_subtree_missing(self)) {
        cursor += snprintf(*writer, limit, "(MISSING ");
        if (alias_is_named || ts_subtree_named(self)) {
          cursor += snprintf(*writer, limit, "%s", symbol_name);
        } else {
          cursor += snprintf(*writer, limit, "\"%s\"", symbol_name);
        }
      } else {
        cursor += snprintf(*writer, limit, "(%s", symbol_name);
      }
    }
  } else if (is_root) {
    // An invisible root is still printed, with its symbol name in quotes.
    TSSymbol symbol = ts_subtree_symbol(self);
    const char *symbol_name = ts_language_symbol_name(language, symbol);
    cursor += snprintf(*writer, limit, "(\"%s\")", symbol_name);
  }

  if (ts_subtree_child_count(self)) {
    const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id);
    const TSFieldMapEntry *field_map, *field_map_end;
    ts_language_field_map(
      language,
      self.ptr->production_id,
      &field_map,
      &field_map_end
    );

    // Extra children are not counted in the index used for aliases and fields.
    uint32_t structural_child_index = 0;
    for (uint32_t i = 0; i < self.ptr->child_count; i++) {
      Subtree child = ts_subtree_children(self)[i];
      if (ts_subtree_extra(child)) {
        cursor += ts_subtree__write_to_string(
          child, *writer, limit,
          language, include_all,
          0, false, NULL
        );
      } else {
        TSSymbol alias_symbol = alias_sequence
          ? alias_sequence[structural_child_index]
          : 0;
        bool alias_is_named = alias_symbol
          ? ts_language_symbol_metadata(language, alias_symbol).named
          : false;

        // An invisible node passes its own field name down to its children,
        // unless a non-inherited field entry names the child directly.
        const char *child_field_name = is_visible ? NULL : field_name;
        for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
          if (!i->inherited && i->child_index == structural_child_index) {
            child_field_name = language->field_names[i->field_id];
            break;
          }
        }

        cursor += ts_subtree__write_to_string(
          child, *writer, limit,
          language, include_all,
          alias_symbol, alias_is_named, child_field_name
        );
        structural_child_index++;
      }
    }
  }

  if (is_visible) cursor += snprintf(*writer, limit, ")");

  return cursor - string;
}

// Render a subtree as an S-expression in a newly allocated string.
//
// The tree is written twice: once with a limit of 0 to measure the output,
// and once into a buffer of exactly that size plus the terminator. The
// buffer comes from ts_malloc and is returned to the caller.
char *ts_subtree_string(
  Subtree self,
  const TSLanguage *language,
  bool include_all
) {
  char scratch_string[1];
  size_t size = ts_subtree__write_to_string(
    self, scratch_string, 0,
    language, include_all,
    0, false, ROOT_FIELD
  ) + 1;
  char *result = ts_malloc(size * sizeof(char));
  ts_subtree__write_to_string(
    self, result, size,
    language, include_all,
    0, false, ROOT_FIELD
  );
  return result;
}

// Recursively write the DOT nodes and edges for a subtree to `f`.
//
// Nodes are identified by the address of their Subtree slot. `start_offset`
// is the byte offset at which this subtree starts, and `alias_symbol`, when
// non-zero, replaces the subtree's own symbol in the label.
void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset,
                                 const TSLanguage *language, TSSymbol alias_symbol,
                                 FILE *f) {
  TSSymbol subtree_symbol = ts_subtree_symbol(*self);
  TSSymbol symbol = alias_symbol ? alias_symbol : subtree_symbol;
  uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self);
  fprintf(f, "tree_%p [label=\"", (void *)self);
  ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol));
  fprintf(f, "\"");

  if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext");
  if (ts_subtree_extra(*self)) fprintf(f, ", fontcolor=gray");

  fprintf(f, ", tooltip=\""
    "range: %u - %u\n"
    "state: %d\n"
    "error-cost: %u\n"
    "has-changes: %u\n"
    "depends-on-column: %u\n"
    "repeat-depth: %u\n"
    "lookahead-bytes: %u",
    start_offset, end_offset,
    ts_subtree_parse_state(*self),
    ts_subtree_error_cost(*self),
    ts_subtree_has_changes(*self),
    ts_subtree_depends_on_column(*self),
    ts_subtree_repeat_depth(*self),
    ts_subtree_lookahead_bytes(*self)
  );

  if (ts_subtree_is_error(*self) && ts_subtree_child_count(*self) == 0) {
    fprintf(f, "\ncharacter: '%c'", self->ptr->lookahead_char);
  }

  fprintf(f, "\"]\n");

  uint32_t child_start_offset = start_offset;
  // Index of this production's first entry in the language's alias table.
  // An offset of 0 results in no aliases being looked up.
  uint32_t child_info_offset =
    language->max_alias_sequence_length *
    ts_subtree_production_id(*self);
  for (uint32_t i = 0, n = ts_subtree_child_count(*self); i < n; i++) {
    const Subtree *child = &ts_subtree_children(*self)[i];
    TSSymbol alias_symbol = 0;
    if (!ts_subtree_extra(*child) && child_info_offset) {
      alias_symbol = language->alias_sequences[child_info_offset];
      child_info_offset++;
    }
    ts_subtree__print_dot_graph(child, child_start_offset, language, alias_symbol, f);
    fprintf(f, "tree_%p -> tree_%p [tooltip=%u]\n", (void *)self, (void *)child, i);
    child_start_offset += ts_subtree_total_bytes(*child);
  }
}

// Write a complete DOT digraph for the given subtree to `f`.
void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *f) {
  fprintf(f, "digraph tree {\n");
  fprintf(f, "edge [arrowhead=none]\n");
  ts_subtree__print_dot_graph(&self, 0, language, 0, f);
  fprintf(f, "}\n");
}

// Compare the external scanner states of two subtrees. A subtree that is
// null, has no external tokens, or has children is treated as having the
// empty state.
bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) {
  const ExternalScannerState *state1 = &empty_state;
  const ExternalScannerState *state2 = &empty_state;
  if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) {
    state1 = &self.ptr->external_scanner_state;
  }
  if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) {
    state2 = &other.ptr->external_scanner_state;
  }
  return ts_external_scanner_state_eq(state1, state2);
}
tree-sitter-0.20.1/src/subtree.h000064400000000000000000000241420072674642500146170ustar 00000000000000#ifndef TREE_SITTER_SUBTREE_H_
#define TREE_SITTER_SUBTREE_H_

#ifdef __cplusplus
extern "C" {
#endif

#include
#include
#include
#include "./length.h"
#include "./array.h"
#include "./error_costs.h"
#include "tree_sitter/api.h"
#include "tree_sitter/parser.h"

// Parse state value stored on nodes that have no usable parse state.
#define TS_TREE_STATE_NONE USHRT_MAX

// A heap-representation subtree whose pointer is NULL.
#define NULL_SUBTREE ((Subtree) {.ptr = NULL})

// The serialized state of an external scanner.
//
// Every time an external token subtree is created after a call to an
// external scanner, the scanner's `serialize` function is called to
// retrieve a serialized copy of its state. The bytes are then copied
// onto the subtree itself so that the scanner's state can later be
// restored using its `deserialize` function.
//
// Small byte arrays are stored inline, and long ones are allocated
// separately on the heap.
typedef struct {
  union {
    char *long_data;      // used when `length` exceeds sizeof(short_data)
    char short_data[24];  // used otherwise
  };
  uint32_t length;        // number of bytes of state
} ExternalScannerState;

// A compact representation of a subtree.
//
// This representation is used for small leaf nodes that are not
// errors, and were not created by an external scanner.
typedef struct {
  bool is_inline : 1;
  bool visible : 1;
  bool named : 1;
  bool extra : 1;
  bool has_changes : 1;
  bool is_missing : 1;
  bool is_keyword : 1;
  uint8_t symbol;
  uint8_t padding_bytes;
  uint8_t size_bytes;
  uint8_t padding_columns;
  uint8_t padding_rows : 4;     // 4 bits: values below 16 only
  uint8_t lookahead_bytes : 4;  // 4 bits: values below 16 only
  uint16_t parse_state;
} SubtreeInlineData;

// A heap-allocated representation of a subtree.
// // This representation is used for parent nodes, external tokens, // errors, and other leaf nodes whose data is too large to fit into // the inlinen representation. typedef struct { volatile uint32_t ref_count; Length padding; Length size; uint32_t lookahead_bytes; uint32_t error_cost; uint32_t child_count; TSSymbol symbol; TSStateId parse_state; bool visible : 1; bool named : 1; bool extra : 1; bool fragile_left : 1; bool fragile_right : 1; bool has_changes : 1; bool has_external_tokens : 1; bool depends_on_column: 1; bool is_missing : 1; bool is_keyword : 1; union { // Non-terminal subtrees (`child_count > 0`) struct { uint32_t visible_child_count; uint32_t named_child_count; uint32_t node_count; uint32_t repeat_depth; int32_t dynamic_precedence; uint16_t production_id; struct { TSSymbol symbol; TSStateId parse_state; } first_leaf; }; // External terminal subtrees (`child_count == 0 && has_external_tokens`) ExternalScannerState external_scanner_state; // Error terminal subtrees (`child_count == 0 && symbol == ts_builtin_sym_error`) int32_t lookahead_char; }; } SubtreeHeapData; // The fundamental building block of a syntax tree. typedef union { SubtreeInlineData data; const SubtreeHeapData *ptr; } Subtree; // Like Subtree, but mutable. 
typedef union {
  SubtreeInlineData data;
  SubtreeHeapData *ptr;
} MutableSubtree;

// Growable arrays of subtrees, built with the `Array` macro.
typedef Array(Subtree) SubtreeArray;
typedef Array(MutableSubtree) MutableSubtreeArray;

// Scratch storage passed to the functions below that create, release or
// edit subtrees.
// NOTE(review): `free_trees` presumably caches released heap nodes for reuse
// and `tree_stack` is presumably a work list — confirm against subtree.c.
typedef struct {
  MutableSubtreeArray free_trees;
  MutableSubtreeArray tree_stack;
} SubtreePool;

// External scanner state helpers.
void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned);
const char *ts_external_scanner_state_data(const ExternalScannerState *);

// Subtree array helpers.
void ts_subtree_array_copy(SubtreeArray, SubtreeArray *);
void ts_subtree_array_clear(SubtreePool *, SubtreeArray *);
void ts_subtree_array_delete(SubtreePool *, SubtreeArray *);
void ts_subtree_array_remove_trailing_extras(SubtreeArray *, SubtreeArray *);
void ts_subtree_array_reverse(SubtreeArray *);

// Pool lifecycle.
SubtreePool ts_subtree_pool_new(uint32_t capacity);
void ts_subtree_pool_delete(SubtreePool *);

// Subtree constructors.
Subtree ts_subtree_new_leaf(
  SubtreePool *, TSSymbol, Length, Length, uint32_t,
  TSStateId, bool, bool, bool, const TSLanguage *
);
Subtree ts_subtree_new_error(
  SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage *
);
MutableSubtree ts_subtree_new_node(TSSymbol, SubtreeArray *, unsigned, const TSLanguage *);
Subtree ts_subtree_new_error_node(SubtreeArray *, bool, const TSLanguage *);
Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, const TSLanguage *);

// Ownership, comparison and mutation.
MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree);
void ts_subtree_retain(Subtree);
void ts_subtree_release(SubtreePool *, Subtree);
bool ts_subtree_eq(Subtree, Subtree);
int ts_subtree_compare(Subtree, Subtree);
void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *);
void ts_subtree_summarize(MutableSubtree, const Subtree *, uint32_t, const TSLanguage *);
void ts_subtree_summarize_children(MutableSubtree, const TSLanguage *);
void ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *);
Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *);

// Debug output.
char *ts_subtree_string(Subtree, const TSLanguage *, bool
include_all);
void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *);
Subtree ts_subtree_last_external_token(Subtree);
bool ts_subtree_external_scanner_state_eq(Subtree, Subtree);

// Read field `name` from whichever representation `self` currently uses.
// Only valid for fields that exist in both SubtreeInlineData and
// SubtreeHeapData. Note that `self` is not parenthesized in the expansion.
#define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name)

// Accessors for the fields shared by the inline and heap representations.
static inline TSSymbol ts_subtree_symbol(Subtree self) { return SUBTREE_GET(self, symbol); }
static inline bool ts_subtree_visible(Subtree self) { return SUBTREE_GET(self, visible); }
static inline bool ts_subtree_named(Subtree self) { return SUBTREE_GET(self, named); }
static inline bool ts_subtree_extra(Subtree self) { return SUBTREE_GET(self, extra); }
static inline bool ts_subtree_has_changes(Subtree self) { return SUBTREE_GET(self, has_changes); }
static inline bool ts_subtree_missing(Subtree self) { return SUBTREE_GET(self, is_missing); }
static inline bool ts_subtree_is_keyword(Subtree self) { return SUBTREE_GET(self, is_keyword); }
static inline TSStateId ts_subtree_parse_state(Subtree self) { return SUBTREE_GET(self, parse_state); }
static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE_GET(self, lookahead_bytes); }

#undef SUBTREE_GET

// Get the size needed to store a heap-allocated subtree with the given
// number of children.
static inline size_t ts_subtree_alloc_size(uint32_t child_count) {
  return child_count * sizeof(Subtree) + sizeof(SubtreeHeapData);
}

// Get a subtree's children, which are allocated immediately before the
// tree's own heap data. Evaluates to NULL for inline subtrees.
#define ts_subtree_children(self) \
  ((self).data.is_inline ? NULL : (Subtree *)((self).ptr) - (self).ptr->child_count)

// Set the `extra` flag on whichever representation `self` uses.
static inline void ts_subtree_set_extra(MutableSubtree *self, bool is_extra) {
  if (self->data.is_inline) {
    self->data.extra = is_extra;
  } else {
    self->ptr->extra = is_extra;
  }
}

// Symbol of the subtree's first leaf: the subtree's own symbol when it is a
// leaf (inline, or heap with no children), otherwise the stored
// `first_leaf.symbol`.
static inline TSSymbol ts_subtree_leaf_symbol(Subtree self) {
  if (self.data.is_inline) return self.data.symbol;
  if (self.ptr->child_count == 0) return self.ptr->symbol;
  return self.ptr->first_leaf.symbol;
}

// Parse state of the subtree's first leaf; same selection rule as
// ts_subtree_leaf_symbol.
static inline TSStateId ts_subtree_leaf_parse_state(Subtree self) {
  if (self.data.is_inline) return self.data.parse_state;
  if (self.ptr->child_count == 0) return self.ptr->parse_state;
  return self.ptr->first_leaf.parse_state;
}

// Padding preceding the subtree. For inline subtrees the Length is rebuilt
// from the packed `padding_bytes`, `padding_rows` and `padding_columns`.
static inline Length ts_subtree_padding(Subtree self) {
  if (self.data.is_inline) {
    Length result = {self.data.padding_bytes, {self.data.padding_rows, self.data.padding_columns}};
    return result;
  } else {
    return self.ptr->padding;
  }
}

// Size of the subtree, excluding padding. For inline subtrees the extent is
// built as {0, size_bytes}, i.e. `size_bytes` is reused for the second
// extent component.
static inline Length ts_subtree_size(Subtree self) {
  if (self.data.is_inline) {
    Length result = {self.data.size_bytes, {0, self.data.size_bytes}};
    return result;
  } else {
    return self.ptr->size;
  }
}

// Padding plus size.
static inline Length ts_subtree_total_size(Subtree self) {
  return length_add(ts_subtree_padding(self), ts_subtree_size(self));
}

// Byte component of the total size.
static inline uint32_t ts_subtree_total_bytes(Subtree self) {
  return ts_subtree_total_size(self).bytes;
}

// Number of children; inline subtrees never have any.
static inline uint32_t ts_subtree_child_count(Subtree self) {
  return self.data.is_inline ? 0 : self.ptr->child_count;
}

// Stored `repeat_depth`; 0 for inline subtrees. This reads a union member
// that is documented for subtrees with children only.
static inline uint32_t ts_subtree_repeat_depth(Subtree self) {
  return self.data.is_inline ? 0 : self.ptr->repeat_depth;
}

// Node count; every leaf (inline, or heap with no children) counts as 1.
static inline uint32_t ts_subtree_node_count(Subtree self) {
  return (self.data.is_inline || self.ptr->child_count == 0) ?
1 : self.ptr->node_count;
}

// Number of visible children; 0 for any subtree without children.
static inline uint32_t ts_subtree_visible_child_count(Subtree self) {
  if (ts_subtree_child_count(self) > 0) {
    return self.ptr->visible_child_count;
  } else {
    return 0;
  }
}

// Error cost of the subtree. Missing subtrees always report the fixed sum
// ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY; otherwise inline
// subtrees cost 0 and heap subtrees report their stored `error_cost`.
static inline uint32_t ts_subtree_error_cost(Subtree self) {
  if (ts_subtree_missing(self)) {
    return ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY;
  } else {
    return self.data.is_inline ? 0 : self.ptr->error_cost;
  }
}

// Dynamic precedence; 0 for every leaf (inline, or heap with no children).
static inline int32_t ts_subtree_dynamic_precedence(Subtree self) {
  return (self.data.is_inline || self.ptr->child_count == 0) ? 0 : self.ptr->dynamic_precedence;
}

// Production id; 0 for any subtree without children.
static inline uint16_t ts_subtree_production_id(Subtree self) {
  if (ts_subtree_child_count(self) > 0) {
    return self.ptr->production_id;
  } else {
    return 0;
  }
}

// Heap-only flags. Inline subtrees do not store them and report false.
static inline bool ts_subtree_fragile_left(Subtree self) {
  return self.data.is_inline ? false : self.ptr->fragile_left;
}

static inline bool ts_subtree_fragile_right(Subtree self) {
  return self.data.is_inline ? false : self.ptr->fragile_right;
}

static inline bool ts_subtree_has_external_tokens(Subtree self) {
  return self.data.is_inline ? false : self.ptr->has_external_tokens;
}

static inline bool ts_subtree_depends_on_column(Subtree self) {
  return self.data.is_inline ? false : self.ptr->depends_on_column;
}

// True when either fragile flag is set; always false for inline subtrees.
static inline bool ts_subtree_is_fragile(Subtree self) {
  return self.data.is_inline ?
false : (self.ptr->fragile_left || self.ptr->fragile_right);
}

// True when the subtree's symbol is the builtin error symbol.
static inline bool ts_subtree_is_error(Subtree self) {
  return ts_subtree_symbol(self) == ts_builtin_sym_error;
}

// True when the subtree's symbol is the builtin end symbol.
static inline bool ts_subtree_is_eof(Subtree self) {
  return ts_subtree_symbol(self) == ts_builtin_sym_end;
}

// Reinterpret a MutableSubtree as a Subtree by copying the `data` member.
static inline Subtree ts_subtree_from_mut(MutableSubtree self) {
  Subtree result;
  result.data = self.data;
  return result;
}

// Reinterpret a Subtree as a MutableSubtree by copying the `data` member.
// No copy of the heap data is made and no ownership check is performed here.
static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) {
  MutableSubtree result;
  result.data = self.data;
  return result;
}

#ifdef __cplusplus
}
#endif

#endif  // TREE_SITTER_SUBTREE_H_
tree-sitter-0.20.1/src/tree.c000064400000000000000000000066070072674642500141060ustar 00000000000000
#include "tree_sitter/api.h"
#include "./array.h"
#include "./get_changed_ranges.h"
#include "./subtree.h"
#include "./tree_cursor.h"
#include "./tree.h"

// Allocate a new tree that takes `root` as given (no retain is performed
// here) and stores its own copy of the `included_range_count` entries of
// `included_ranges`.
TSTree *ts_tree_new(
  Subtree root, const TSLanguage *language,
  const TSRange *included_ranges, unsigned included_range_count
) {
  TSTree *result = ts_malloc(sizeof(TSTree));
  result->root = root;
  result->language = language;
  result->included_ranges = ts_calloc(included_range_count, sizeof(TSRange));
  memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange));
  result->included_range_count = included_range_count;
  return result;
}

// Create a second tree that shares `self`'s root subtree. The root is
// retained first because ts_tree_new does not retain it.
TSTree *ts_tree_copy(const TSTree *self) {
  ts_subtree_retain(self->root);
  return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count);
}

// Release the root subtree through a temporary pool, then free the range
// array and the tree itself. Passing NULL is a no-op.
void ts_tree_delete(TSTree *self) {
  if (!self) return;

  SubtreePool pool = ts_subtree_pool_new(0);
  ts_subtree_release(&pool, self->root);
  ts_subtree_pool_delete(&pool);
  ts_free(self->included_ranges);
  ts_free(self);
}

// Build the node for the root subtree, positioned at the root's padding and
// carrying no alias.
TSNode ts_tree_root_node(const TSTree *self) {
  return ts_node_new(self, &self->root, ts_subtree_padding(self->root), 0);
}

// Return the language stored on the tree.
const TSLanguage *ts_tree_language(const TSTree *self) {
  return self->language;
}

// Apply `edit` to the tree: first shift the stored included ranges, then
// edit the root subtree.
void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
  for (unsigned i = 0; i <
self->included_range_count; i++) {
    TSRange *range = &self->included_ranges[i];
    // Only ranges ending at or after the end of the edited region move.
    if (range->end_byte >= edit->old_end_byte) {
      // An end of UINT32_MAX is left untouched.
      if (range->end_byte != UINT32_MAX) {
        range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte);
        range->end_point = point_add(
          edit->new_end_point,
          point_sub(range->end_point, edit->old_end_point)
        );
        // A result smaller than `new_end_byte` means the unsigned addition
        // wrapped around; clamp to the maximum position.
        if (range->end_byte < edit->new_end_byte) {
          range->end_byte = UINT32_MAX;
          range->end_point = POINT_MAX;
        }
      }
      // The start moves only when it is also at or after the old end.
      if (range->start_byte >= edit->old_end_byte) {
        range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte);
        range->start_point = point_add(
          edit->new_end_point,
          point_sub(range->start_point, edit->old_end_point)
        );
        // Same wrap-around clamp as for the end position.
        if (range->start_byte < edit->new_end_byte) {
          range->start_byte = UINT32_MAX;
          range->start_point = POINT_MAX;
        }
      }
    }
  }

  SubtreePool pool = ts_subtree_pool_new(0);
  self->root = ts_subtree_edit(self->root, edit, &pool);
  ts_subtree_pool_delete(&pool);
}

// Compute the ranges that differ between `self` and `other`. The number of
// ranges is written to `*count` and the array produced by
// ts_subtree_get_changed_ranges is returned. Both temporary cursors and the
// intermediate included-range differences are freed before returning.
TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) {
  TreeCursor cursor1 = {NULL, array_new()};
  TreeCursor cursor2 = {NULL, array_new()};
  ts_tree_cursor_init(&cursor1, ts_tree_root_node(self));
  ts_tree_cursor_init(&cursor2, ts_tree_root_node(other));

  TSRangeArray included_range_differences = array_new();
  ts_range_array_get_changed_ranges(
    self->included_ranges, self->included_range_count,
    other->included_ranges, other->included_range_count,
    &included_range_differences
  );

  TSRange *result;
  *count = ts_subtree_get_changed_ranges(
    &self->root, &other->root, &cursor1, &cursor2,
    self->language, &included_range_differences, &result
  );

  array_delete(&included_range_differences);
  array_delete(&cursor1.stack);
  array_delete(&cursor2.stack);
  return result;
}

// Write the tree's root subtree to `file` as a DOT graph.
void ts_tree_print_dot_graph(const TSTree *self, FILE *file) {
  ts_subtree_print_dot_graph(self->root, self->language, file);
}
tree-sitter-0.20.1/src/tree.h000064400000000000000000000011010072674642500140730ustar 00000000000000
#ifndef TREE_SITTER_TREE_H_
#define TREE_SITTER_TREE_H_

#ifdef __cplusplus
extern "C" {
#endif

// Pairs a child subtree with its parent, a position and an alias symbol.
typedef struct {
  const Subtree *child;
  const Subtree *parent;
  Length position;
  TSSymbol alias_symbol;
} ParentCacheEntry;

// A syntax tree: the root subtree, its language, and an owned array of
// `included_range_count` included ranges.
struct TSTree {
  Subtree root;
  const TSLanguage *language;
  TSRange *included_ranges;
  unsigned included_range_count;
};

TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned);
TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol);

#ifdef __cplusplus
}
#endif

#endif  // TREE_SITTER_TREE_H_
tree-sitter-0.20.1/src/tree_cursor.c000064400000000000000000000362010072674642500154740ustar 00000000000000
#include "tree_sitter/api.h"
#include "./alloc.h"
#include "./tree_cursor.h"
#include "./language.h"
#include "./tree.h"

// Iteration state for walking the direct children of one subtree.
// `child_index` counts every child, while `structural_child_index` counts
// only non-extra children and is used to index `alias_sequence`.
typedef struct {
  Subtree parent;
  const TSTree *tree;
  Length position;
  uint32_t child_index;
  uint32_t structural_child_index;
  const TSSymbol *alias_sequence;
} CursorChildIterator;

// CursorChildIterator

// Start iterating over the children of the entry on top of the cursor's
// stack. When that entry has no children the iterator gets a NULL parent,
// so the first call to ts_tree_cursor_child_iterator_next returns false.
static inline CursorChildIterator ts_tree_cursor_iterate_children(const TreeCursor *self) {
  TreeCursorEntry *last_entry = array_back(&self->stack);
  if (ts_subtree_child_count(*last_entry->subtree) == 0) {
    return (CursorChildIterator) {NULL_SUBTREE, self->tree, length_zero(), 0, 0, NULL};
  }
  const TSSymbol *alias_sequence = ts_language_alias_sequence(
    self->tree->language,
    last_entry->subtree->ptr->production_id
  );
  return (CursorChildIterator) {
    .tree = self->tree,
    .parent = *last_entry->subtree,
    .position = last_entry->position,
    .child_index = 0,
    .structural_child_index = 0,
    .alias_sequence = alias_sequence,
  };
}

// Produce the next child as a TreeCursorEntry in `*result` and report in
// `*visible` whether it is visible (by itself or through an alias).
// Returns false when the iterator has no parent or is exhausted.
static inline bool ts_tree_cursor_child_iterator_next(CursorChildIterator *self,
                                                      TreeCursorEntry *result,
                                                      bool *visible) {
  if (!self->parent.ptr || self->child_index == self->parent.ptr->child_count) return false;
  const Subtree *child = &ts_subtree_children(self->parent)[self->child_index];
  *result = (TreeCursorEntry) {
    .subtree = child,
    .position = self->position,
    .child_index = self->child_index,
    .structural_child_index =
self->structural_child_index,
  };
  *visible = ts_subtree_visible(*child);
  bool extra = ts_subtree_extra(*child);
  // Non-extra children consume one alias slot; a non-zero alias makes the
  // child visible even when the subtree itself is not.
  if (!extra && self->alias_sequence) {
    *visible |= self->alias_sequence[self->structural_child_index];
    self->structural_child_index++;
  }

  // Advance past this child's size, and past the next child's padding so
  // that `position` is the start of the next child's content.
  self->position = length_add(self->position, ts_subtree_size(*child));
  self->child_index++;

  if (self->child_index < self->parent.ptr->child_count) {
    Subtree next_child = ts_subtree_children(self->parent)[self->child_index];
    self->position = length_add(self->position, ts_subtree_padding(next_child));
  }

  return true;
}

// TSTreeCursor - lifecycle

// Create a cursor positioned on `node`.
TSTreeCursor ts_tree_cursor_new(TSNode node) {
  TSTreeCursor self = {NULL, NULL, {0, 0}};
  ts_tree_cursor_init((TreeCursor *)&self, node);
  return self;
}

// Re-position an existing cursor on `node`.
void ts_tree_cursor_reset(TSTreeCursor *_self, TSNode node) {
  ts_tree_cursor_init((TreeCursor *)_self, node);
}

// Reset the cursor's stack to a single entry describing `node`. The stack
// array is cleared rather than freed.
void ts_tree_cursor_init(TreeCursor *self, TSNode node) {
  self->tree = node.tree;
  array_clear(&self->stack);
  array_push(&self->stack, ((TreeCursorEntry) {
    .subtree = (const Subtree *)node.id,
    .position = {
      ts_node_start_byte(node),
      ts_node_start_point(node)
    },
    .child_index = 0,
    .structural_child_index = 0,
  }));
}

// Free the cursor's stack.
void ts_tree_cursor_delete(TSTreeCursor *_self) {
  TreeCursor *self = (TreeCursor *)_self;
  array_delete(&self->stack);
}

// TSTreeCursor - walking the tree

// Move to the first visible child of the current node. Invisible children
// that contain visible children are descended into. Returns false when no
// visible child is found; entries pushed for invisible children while
// descending are not popped again in that case.
bool ts_tree_cursor_goto_first_child(TSTreeCursor *_self) {
  TreeCursor *self = (TreeCursor *)_self;

  bool did_descend;
  do {
    did_descend = false;

    bool visible;
    TreeCursorEntry entry;
    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
      if (visible) {
        array_push(&self->stack, entry);
        return true;
      }

      if (ts_subtree_visible_child_count(*entry.subtree) > 0) {
        array_push(&self->stack, entry);
        did_descend = true;
        break;
      }
    }
  } while (did_descend);

  return false;
}

// Move to the first visible child whose end byte is greater than
// `goal_byte`. Returns the index of that child among the visible children
// that were passed over, or -1 (with the stack restored) when none is found.
int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t goal_byte) {
  TreeCursor *self = (TreeCursor
*)_self;
  uint32_t initial_size = self->stack.size;
  uint32_t visible_child_index = 0;

  bool did_descend;
  do {
    did_descend = false;

    bool visible;
    TreeCursorEntry entry;
    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
      uint32_t end_byte = entry.position.bytes + ts_subtree_size(*entry.subtree).bytes;
      bool at_goal = end_byte > goal_byte;
      uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree);

      if (at_goal) {
        if (visible) {
          array_push(&self->stack, entry);
          return visible_child_index;
        }

        // Invisible child spanning the goal: search inside it.
        if (visible_child_count > 0) {
          array_push(&self->stack, entry);
          did_descend = true;
          break;
        }
      } else if (visible) {
        // Skipped visible child counts as one.
        visible_child_index++;
      } else {
        // Skipped invisible child contributes its visible children.
        visible_child_index += visible_child_count;
      }
    }
  } while (did_descend);

  // Nothing found: undo any descent.
  self->stack.size = initial_size;
  return -1;
}

// Same search as ts_tree_cursor_goto_first_child_for_byte, but the goal is a
// point: moves to the first visible child whose end point is greater than
// `goal_point`. Returns the visible child index, or -1 with the stack
// restored when no such child exists.
int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *_self, TSPoint goal_point) {
  TreeCursor *self = (TreeCursor *)_self;
  uint32_t initial_size = self->stack.size;
  uint32_t visible_child_index = 0;

  bool did_descend;
  do {
    did_descend = false;

    bool visible;
    TreeCursorEntry entry;
    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
      TSPoint end_point = point_add(entry.position.extent, ts_subtree_size(*entry.subtree).extent);
      bool at_goal = point_gt(end_point, goal_point);
      uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree);

      if (at_goal) {
        if (visible) {
          array_push(&self->stack, entry);
          return visible_child_index;
        }

        // Invisible child spanning the goal: search inside it.
        if (visible_child_count > 0) {
          array_push(&self->stack, entry);
          did_descend = true;
          break;
        }
      } else if (visible) {
        visible_child_index++;
      } else {
        visible_child_index += visible_child_count;
      }
    }
  } while (did_descend);

  // Nothing found: undo any descent.
  self->stack.size = initial_size;
  return -1;
}

// Move to the next visible sibling of the current node. Returns false and
// restores the stack when there is none.
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) {
  TreeCursor *self = (TreeCursor *)_self;
  uint32_t initial_size
= self->stack.size;

  while (self->stack.size > 1) {
    // Pop the current entry and rebuild an iterator over its parent's
    // children that is positioned exactly at the popped entry.
    TreeCursorEntry entry = array_pop(&self->stack);
    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self);
    iterator.child_index = entry.child_index;
    iterator.structural_child_index = entry.structural_child_index;
    iterator.position = entry.position;

    // Step over the popped entry itself. If it is visible and it was not
    // the entry the cursor started on, it is a visible ancestor: siblings
    // beyond it do not belong to the starting node's parent, so stop.
    bool visible = false;
    ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible);
    if (visible && self->stack.size + 1 < initial_size) break;

    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) {
      if (visible) {
        array_push(&self->stack, entry);
        return true;
      }

      // Invisible sibling with visible children: enter it and stop on its
      // first visible child.
      if (ts_subtree_visible_child_count(*entry.subtree)) {
        array_push(&self->stack, entry);
        ts_tree_cursor_goto_first_child(_self);
        return true;
      }
    }
  }

  self->stack.size = initial_size;
  return false;
}

// Move to the nearest visible ancestor (visible by itself or through an
// alias). Returns false, leaving the stack unchanged, when there is none.
bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) {
  TreeCursor *self = (TreeCursor *)_self;
  // `i` is unsigned, so `i + 1 > 0` ends the countdown after index 0. With a
  // single stack entry `size - 2` wraps around and the loop body never runs.
  for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) {
    TreeCursorEntry *entry = &self->stack.contents[i];
    if (ts_subtree_visible(*entry->subtree)) {
      self->stack.size = i + 1;
      return true;
    }
    // An invisible, non-extra entry still counts when its parent's
    // production gives it an alias.
    if (i > 0 && !ts_subtree_extra(*entry->subtree)) {
      TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
      if (ts_language_alias_at(
        self->tree->language,
        parent_entry->subtree->ptr->production_id,
        entry->structural_child_index
      )) {
        self->stack.size = i + 1;
        return true;
      }
    }
  }
  return false;
}

// Build the TSNode for the entry on top of the stack. The alias is looked up
// in the parent entry's production unless the entry is the only one on the
// stack or is an extra.
TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
  const TreeCursor *self = (const TreeCursor *)_self;
  TreeCursorEntry *last_entry = array_back(&self->stack);
  TSSymbol alias_symbol = 0;
  if (self->stack.size > 1 && !ts_subtree_extra(*last_entry->subtree)) {
    TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2];
    alias_symbol = ts_language_alias_at(
      self->tree->language,
      parent_entry->subtree->ptr->production_id,
      last_entry->structural_child_index
    );
  }
  return ts_node_new(
    self->tree,
    last_entry->subtree,
    last_entry->position,
    alias_symbol
  );
}

// Private - Get various facts about
// the current node that are needed
// when executing tree queries.
//
// On entry `*supertype_count` is the capacity of `supertypes`; on return it
// is the number of supertype symbols written. All other outputs are reset
// to 0 / false before the walk begins.
void ts_tree_cursor_current_status(
  const TSTreeCursor *_self,
  TSFieldId *field_id,
  bool *has_later_siblings,
  bool *has_later_named_siblings,
  bool *can_have_later_siblings_with_this_field,
  TSSymbol *supertypes,
  unsigned *supertype_count
) {
  const TreeCursor *self = (const TreeCursor *)_self;
  unsigned max_supertypes = *supertype_count;
  *field_id = 0;
  *supertype_count = 0;
  *has_later_siblings = false;
  *has_later_named_siblings = false;
  *can_have_later_siblings_with_this_field = false;

  // Walk up the tree, visiting the current node and its invisible ancestors,
  // because fields can refer to nodes through invisible *wrapper* nodes.
  for (unsigned i = self->stack.size - 1; i > 0; i--) {
    TreeCursorEntry *entry = &self->stack.contents[i];
    TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];

    const TSSymbol *alias_sequence = ts_language_alias_sequence(
      self->tree->language,
      parent_entry->subtree->ptr->production_id
    );

    // Effective symbol of a child: its alias in the parent's production when
    // the child is not an extra and a non-zero alias exists, otherwise the
    // subtree's own symbol. Uses the local `alias_sequence`.
    #define subtree_symbol(subtree, structural_child_index) \
      (( \
        !ts_subtree_extra(subtree) && \
        alias_sequence && \
        alias_sequence[structural_child_index] \
      ) ? \
        alias_sequence[structural_child_index] : \
        ts_subtree_symbol(subtree))

    // Stop walking up when a visible ancestor is found.
    TSSymbol entry_symbol = subtree_symbol(
      *entry->subtree,
      entry->structural_child_index
    );
    TSSymbolMetadata entry_metadata = ts_language_symbol_metadata(
      self->tree->language,
      entry_symbol
    );
    if (i != self->stack.size - 1 && entry_metadata.visible) break;

    // Record any supertypes, up to the caller-provided capacity.
    if (entry_metadata.supertype && *supertype_count < max_supertypes) {
      supertypes[*supertype_count] = entry_symbol;
      (*supertype_count)++;
    }

    // Determine if the current node has later siblings.
    if (!*has_later_siblings) {
      unsigned sibling_count = parent_entry->subtree->ptr->child_count;
      // Structural index of the first later sibling: one past this entry's
      // own index unless this entry is an extra.
      unsigned structural_child_index = entry->structural_child_index;
      if (!ts_subtree_extra(*entry->subtree)) structural_child_index++;
      for (unsigned j = entry->child_index + 1; j < sibling_count; j++) {
        Subtree sibling = ts_subtree_children(*parent_entry->subtree)[j];
        TSSymbolMetadata sibling_metadata = ts_language_symbol_metadata(
          self->tree->language,
          subtree_symbol(sibling, structural_child_index)
        );
        if (sibling_metadata.visible) {
          *has_later_siblings = true;
          if (*has_later_named_siblings) break;
          if (sibling_metadata.named) {
            *has_later_named_siblings = true;
            break;
          }
        } else if (ts_subtree_visible_child_count(sibling) > 0) {
          // An invisible sibling counts through its visible children.
          *has_later_siblings = true;
          if (*has_later_named_siblings) break;
          if (sibling.ptr->named_child_count > 0) {
            *has_later_named_siblings = true;
            break;
          }
        }
        if (!ts_subtree_extra(sibling)) structural_child_index++;
      }
    }

    #undef subtree_symbol

    // Extras are not part of the production, so they have no field mapping.
    if (!ts_subtree_extra(*entry->subtree)) {
      const TSFieldMapEntry *field_map, *field_map_end;
      ts_language_field_map(
        self->tree->language,
        parent_entry->subtree->ptr->production_id,
        &field_map, &field_map_end
      );

      // Look for a field name associated with the current node.
      if (!*field_id) {
        for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
          if (!i->inherited && i->child_index == entry->structural_child_index) {
            *field_id = i->field_id;
            break;
          }
        }
      }

      // Determine if the current node can have later siblings with the same field name.
      if (*field_id) {
        for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
          if (
            i->field_id == *field_id &&
            i->child_index > entry->structural_child_index
          ) {
            *can_have_later_siblings_with_this_field = true;
            break;
          }
        }
      }
    }
  }
}

// Return the nearest visible ancestor of the current node without moving the
// cursor. The bottom stack entry is always treated as visible. When the
// stack holds a single entry, a node built from NULL pointers is returned.
TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self) {
  const TreeCursor *self = (const TreeCursor *)_self;
  for (int i = (int)self->stack.size - 2; i >= 0; i--) {
    TreeCursorEntry *entry = &self->stack.contents[i];
    bool is_visible = true;
    TSSymbol alias_symbol = 0;
    if (i > 0) {
      TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
      alias_symbol = ts_language_alias_at(
        self->tree->language,
        parent_entry->subtree->ptr->production_id,
        entry->structural_child_index
      );
      is_visible = (alias_symbol != 0) || ts_subtree_visible(*entry->subtree);
    }
    if (is_visible) {
      return ts_node_new(
        self->tree,
        entry->subtree,
        entry->position,
        alias_symbol
      );
    }
  }
  return ts_node_new(NULL, NULL, length_zero(), 0);
}

// Return the field id of the current node within its nearest visible
// ancestor, or 0 when it has none.
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
  const TreeCursor *self = (const TreeCursor *)_self;

  // Walk up the tree, visiting the current node and its invisible ancestors.
  for (unsigned i = self->stack.size - 1; i > 0; i--) {
    TreeCursorEntry *entry = &self->stack.contents[i];
    TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];

    // Stop walking up when another visible node is found.
if (i != self->stack.size - 1) { if (ts_subtree_visible(*entry->subtree)) break; if ( !ts_subtree_extra(*entry->subtree) && ts_language_alias_at( self->tree->language, parent_entry->subtree->ptr->production_id, entry->structural_child_index ) ) break; } if (ts_subtree_extra(*entry->subtree)) break; const TSFieldMapEntry *field_map, *field_map_end; ts_language_field_map( self->tree->language, parent_entry->subtree->ptr->production_id, &field_map, &field_map_end ); for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { if (!i->inherited && i->child_index == entry->structural_child_index) { return i->field_id; } } } return 0; } const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) { TSFieldId id = ts_tree_cursor_current_field_id(_self); if (id) { const TreeCursor *self = (const TreeCursor *)_self; return self->tree->language->field_names[id]; } else { return NULL; } } TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *_cursor) { const TreeCursor *cursor = (const TreeCursor *)_cursor; TSTreeCursor res = {NULL, NULL, {0, 0}}; TreeCursor *copy = (TreeCursor *)&res; copy->tree = cursor->tree; array_init(©->stack); array_push_all(©->stack, &cursor->stack); return res; } tree-sitter-0.20.1/src/tree_cursor.h000064400000000000000000000011320072674642500154740ustar 00000000000000#ifndef TREE_SITTER_TREE_CURSOR_H_ #define TREE_SITTER_TREE_CURSOR_H_ #include "./subtree.h" typedef struct { const Subtree *subtree; Length position; uint32_t child_index; uint32_t structural_child_index; } TreeCursorEntry; typedef struct { const TSTree *tree; Array(TreeCursorEntry) stack; } TreeCursor; void ts_tree_cursor_init(TreeCursor *, TSNode); void ts_tree_cursor_current_status( const TSTreeCursor *, TSFieldId *, bool *, bool *, bool *, TSSymbol *, unsigned * ); TSNode ts_tree_cursor_parent_node(const TSTreeCursor *); #endif // TREE_SITTER_TREE_CURSOR_H_ tree-sitter-0.20.1/src/unicode/ICU_SHA000064400000000000000000000000510072674642500154520ustar 
00000000000000552b01f61127d30d6589aa4bf99468224979b661 tree-sitter-0.20.1/src/unicode/LICENSE000064400000000000000000000510110072674642500154230ustar 00000000000000COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) Copyright © 1991-2019 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in https://www.unicode.org/copyright.html. Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that either (a) this copyright and permission notice appear with all copies of the Data Files or Software, or (b) this copyright and permission notice appear in associated Documentation. THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE. Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder. 
--------------------- Third-Party Software Licenses This section contains third-party software notices and/or additional terms for licensed third-party software components included within ICU libraries. 1. ICU License - ICU 1.8.1 to ICU 57.1 COPYRIGHT AND PERMISSION NOTICE Copyright (c) 1995-2016 International Business Machines Corporation and others All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, provided that the above copyright notice(s) and this permission notice appear in all copies of the Software and that both the above copyright notice(s) and this permission notice appear in supporting documentation. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization of the copyright holder. All trademarks and registered trademarks mentioned herein are the property of their respective owners. 2. 
Chinese/Japanese Word Break Dictionary Data (cjdict.txt) # The Google Chrome software developed by Google is licensed under # the BSD license. Other software included in this distribution is # provided under other licenses, as set forth below. # # The BSD License # http://opensource.org/licenses/bsd-license.php # Copyright (C) 2006-2008, Google Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided with # the distribution. # Neither the name of Google Inc. nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # The word list in cjdict.txt are generated by combining three word lists # listed below with further processing for compound word breaking. 
The # frequency is generated with an iterative training against Google web # corpora. # # * Libtabe (Chinese) # - https://sourceforge.net/project/?group_id=1519 # - Its license terms and conditions are shown below. # # * IPADIC (Japanese) # - http://chasen.aist-nara.ac.jp/chasen/distribution.html # - Its license terms and conditions are shown below. # # ---------COPYING.libtabe ---- BEGIN-------------------- # # /* # * Copyright (c) 1999 TaBE Project. # * Copyright (c) 1999 Pai-Hsiang Hsiao. # * All rights reserved. # * # * Redistribution and use in source and binary forms, with or without # * modification, are permitted provided that the following conditions # * are met: # * # * . Redistributions of source code must retain the above copyright # * notice, this list of conditions and the following disclaimer. # * . Redistributions in binary form must reproduce the above copyright # * notice, this list of conditions and the following disclaimer in # * the documentation and/or other materials provided with the # * distribution. # * . Neither the name of the TaBE Project nor the names of its # * contributors may be used to endorse or promote products derived # * from this software without specific prior written permission. # * # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # * OF THE POSSIBILITY OF SUCH DAMAGE. # */ # # /* # * Copyright (c) 1999 Computer Systems and Communication Lab, # * Institute of Information Science, Academia # * Sinica. All rights reserved. # * # * Redistribution and use in source and binary forms, with or without # * modification, are permitted provided that the following conditions # * are met: # * # * . Redistributions of source code must retain the above copyright # * notice, this list of conditions and the following disclaimer. # * . Redistributions in binary form must reproduce the above copyright # * notice, this list of conditions and the following disclaimer in # * the documentation and/or other materials provided with the # * distribution. # * . Neither the name of the Computer Systems and Communication Lab # * nor the names of its contributors may be used to endorse or # * promote products derived from this software without specific # * prior written permission. # * # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # * OF THE POSSIBILITY OF SUCH DAMAGE. # */ # # Copyright 1996 Chih-Hao Tsai @ Beckman Institute, # University of Illinois # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4 # # ---------------COPYING.libtabe-----END-------------------------------- # # # ---------------COPYING.ipadic-----BEGIN------------------------------- # # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science # and Technology. All Rights Reserved. # # Use, reproduction, and distribution of this software is permitted. # Any copy of this software, whether in its original form or modified, # must include both the above copyright notice and the following # paragraphs. # # Nara Institute of Science and Technology (NAIST), # the copyright holders, disclaims all warranties with regard to this # software, including all implied warranties of merchantability and # fitness, in no event shall NAIST be liable for # any special, indirect or consequential damages or any damages # whatsoever resulting from loss of use, data or profits, whether in an # action of contract, negligence or other tortuous action, arising out # of or in connection with the use or performance of this software. # # A large portion of the dictionary entries # originate from ICOT Free Software. The following conditions for ICOT # Free Software applies to the current dictionary as well. 
# # Each User may also freely distribute the Program, whether in its # original form or modified, to any third party or parties, PROVIDED # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear # on, or be attached to, the Program, which is distributed substantially # in the same form as set out herein and that such intended # distribution, if actually made, will neither violate or otherwise # contravene any of the laws and regulations of the countries having # jurisdiction over the User or the intended distribution itself. # # NO WARRANTY # # The program was produced on an experimental basis in the course of the # research and development conducted during the project and is provided # to users as so produced on an experimental basis. Accordingly, the # program is provided without any warranty whatsoever, whether express, # implied, statutory or otherwise. The term "warranty" used herein # includes, but is not limited to, any warranty of the quality, # performance, merchantability and fitness for a particular purpose of # the program and the nonexistence of any infringement or violation of # any right of any third party. # # Each user of the program will agree and understand, and be deemed to # have agreed and understood, that there is no warranty whatsoever for # the program and, accordingly, the entire risk arising from or # otherwise connected with the program is assumed by the user. 
# # Therefore, neither ICOT, the copyright holder, or any other # organization that participated in or was otherwise related to the # development of the program and their respective officials, directors, # officers and other employees shall be held liable for any and all # damages, including, without limitation, general, special, incidental # and consequential damages, arising out of or otherwise in connection # with the use or inability to use the program or any product, material # or result produced or otherwise obtained by using the program, # regardless of whether they have been advised of, or otherwise had # knowledge of, the possibility of such damages at any time during the # project or thereafter. Each user will be deemed to have agreed to the # foregoing by his or her commencement of use of the program. The term # "use" as used herein includes, but is not limited to, the use, # modification, copying and distribution of the program and the # production of secondary products from the program. # # In the case where the program, whether in its original form or # modified, was distributed or delivered to or received by a user from # any person, organization or entity other than ICOT, unless it makes or # grants independently of ICOT any specific warranty to the user in # writing, such person, organization or entity, will also be exempted # from and not be held liable to the user for any such damages as noted # above as far as the program is concerned. # # ---------------COPYING.ipadic-----END---------------------------------- 3. Lao Word Break Dictionary Data (laodict.txt) # Copyright (c) 2013 International Business Machines Corporation # and others. All Rights Reserved. 
# # Project: http://code.google.com/p/lao-dictionary/ # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt # (copied below) # # This file is derived from the above dictionary, with slight # modifications. # ---------------------------------------------------------------------- # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, # are permitted provided that the following conditions are met: # # # Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. Redistributions in # binary form must reproduce the above copyright notice, this list of # conditions and the following disclaimer in the documentation and/or # other materials provided with the distribution. # # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # OF THE POSSIBILITY OF SUCH DAMAGE. # -------------------------------------------------------------------------- 4. Burmese Word Break Dictionary Data (burmesedict.txt) # Copyright (c) 2014 International Business Machines Corporation # and others. All Rights Reserved. 
# # This list is part of a project hosted at: # github.com/kanyawtech/myanmar-karen-word-lists # # -------------------------------------------------------------------------- # Copyright (c) 2013, LeRoy Benjamin Sharon # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: Redistributions of source code must retain the above # copyright notice, this list of conditions and the following # disclaimer. Redistributions in binary form must reproduce the # above copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # Neither the name Myanmar Karen Word Lists, nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # -------------------------------------------------------------------------- 5. Time Zone Database ICU uses the public domain data and code derived from Time Zone Database for its time zone support. 
The ownership of the TZ database is explained in BCP 175: Procedure for Maintaining the Time Zone Database section 7. # 7. Database Ownership # # The TZ database itself is not an IETF Contribution or an IETF # document. Rather it is a pre-existing and regularly updated work # that is in the public domain, and is intended to remain in the # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do # not apply to the TZ Database or contributions that individuals make # to it. Should any claims be made and substantiated against the TZ # Database, the organization that is providing the IANA # Considerations defined in this RFC, under the memorandum of # understanding with the IETF, currently ICANN, may act in accordance # with all competent court orders. No ownership claims will be made # by ICANN or the IETF Trust on the database or the code. Any person # making a contribution to the database or code waives all rights to # future claims in that contribution or in the TZ Database. 6. Google double-conversion Copyright 2006-2011, the V8 project authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. tree-sitter-0.20.1/src/unicode/README.md000064400000000000000000000023660072674642500157060ustar 00000000000000# ICU Parts This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu). ### License The license for these files is contained in the `LICENSE` file within this directory. ### Contents * Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory: * `utf8.h` * `utf16.h` * `umachine.h` * Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed: * `ptypes.h` * `urename.h` * `utf.h` * `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained. * `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository. * `README.md` - This text file. ### Updating ICU To incorporate changes from the upstream `icu` repository: * Update `ICU_SHA` with the new Git SHA. * Update `LICENSE` with the license text from the directory mentioned above. * Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository. 
tree-sitter-0.20.1/src/unicode/ptypes.h000064400000000000000000000001100072674642500161050ustar 00000000000000// This file must exist in order for `utf8.h` and `utf16.h` to be used. tree-sitter-0.20.1/src/unicode/umachine.h000064400000000000000000000350210072674642500163630ustar 00000000000000// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: umachine.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999sep13 * created by: Markus W. Scherer * * This file defines basic types and constants for ICU to be * platform-independent. umachine.h and utf.h are included into * utypes.h to provide all the general definitions for ICU. * All of these definitions used to be in utypes.h before * the UTF-handling macros made this unmaintainable. */ #ifndef __UMACHINE_H__ #define __UMACHINE_H__ /** * \file * \brief Basic types and constants for UTF * *

<h2> Basic types and constants for UTF </h2>

* This file defines basic types and constants for utf.h to be * platform-independent. umachine.h and utf.h are included into * utypes.h to provide all the general definitions for ICU. * All of these definitions used to be in utypes.h before * the UTF-handling macros made this unmaintainable. * */ /*==========================================================================*/ /* Include platform-dependent definitions */ /* which are contained in the platform-specific file platform.h */ /*==========================================================================*/ #include "unicode/ptypes.h" /* platform.h is included in ptypes.h */ /* * ANSI C headers: * stddef.h defines wchar_t */ #include /*==========================================================================*/ /* For C wrappers, we use the symbol U_STABLE. */ /* This works properly if the includer is C or C++. */ /* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */ /*==========================================================================*/ /** * \def U_CFUNC * This is used in a declaration of a library private ICU C function. * @stable ICU 2.4 */ /** * \def U_CDECL_BEGIN * This is used to begin a declaration of a library private ICU C API. 
* @stable ICU 2.4
 */

/**
 * \def U_CDECL_END
 * This is used to end a declaration of a library private ICU C API.
 * @stable ICU 2.4
 */

/* In C++ the three macros force C linkage; in C they reduce to plain
   `extern` / nothing, so the same declarations work for both includers. */
#ifdef __cplusplus
# define U_CFUNC extern "C"
# define U_CDECL_BEGIN extern "C" {
# define U_CDECL_END }
#else
# define U_CFUNC extern
# define U_CDECL_BEGIN
# define U_CDECL_END
#endif

#ifndef U_ATTRIBUTE_DEPRECATED
/**
 * \def U_ATTRIBUTE_DEPRECATED
 * This is used for GCC specific attributes
 * @internal
 */
/* NOTE(review): U_GCC_MAJOR_MINOR is not defined anywhere in this header;
   presumably it comes from the platform headers. If it is undefined, the
   preprocessor treats it as 0 and this branch is skipped -- confirm. */
#if U_GCC_MAJOR_MINOR >= 302
# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
/**
 * \def U_ATTRIBUTE_DEPRECATED
 * This is used for Visual C++ specific attributes
 * @internal
 */
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
#else
/* No known deprecation attribute: expand to nothing. */
# define U_ATTRIBUTE_DEPRECATED
#endif
#endif

/* NOTE(review): U_EXPORT is not defined in this header; presumably it is
   supplied by the platform headers -- confirm before using U_CAPI and the
   macros derived from it. */
/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/
#define U_CAPI U_CFUNC U_EXPORT
/** This is used to declare a function as a stable public ICU C API*/
#define U_STABLE U_CAPI
/** This is used to declare a function as a draft public ICU C API */
#define U_DRAFT U_CAPI
/** This is used to declare a function as a deprecated public ICU C API */
#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
/** This is used to declare a function as an obsolete public ICU C API */
#define U_OBSOLETE U_CAPI
/** This is used to declare a function as an internal ICU C API */
#define U_INTERNAL U_CAPI

/**
 * \def U_OVERRIDE
 * Defined to the C++11 "override" keyword unless the includer has already
 * defined U_OVERRIDE.
 * Denotes a class or member which is an override of the base class.
 * May result in an error if it is applied to something not an override.
 * @internal
 */
#ifndef U_OVERRIDE
#define U_OVERRIDE override
#endif

/**
 * \def U_FINAL
 * Defined to the C++11 "final" keyword if available.
 * Denotes a class or member which may not be overridden in subclasses.
 * May result in an error if subclasses attempt to override.
* @internal
 */
/* Defined when the includer has not provided U_FINAL, and also whenever
   U_IN_DOXYGEN is defined. */
#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
#define U_FINAL final
#endif

// Before ICU 65, function-like, multi-statement ICU macros were just defined as
// series of statements wrapped in { } blocks and the caller could choose to
// either treat them as if they were actual functions and end the invocation
// with a trailing ; creating an empty statement after the block or else omit
// this trailing ; using the knowledge that the macro would expand to { }.
//
// But doing so doesn't work well with macros that look like functions and
// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
// switches to the standard solution of wrapping such macros in do { } while.
//
// This will however break existing code that depends on being able to invoke
// these macros without a trailing ; so to be able to remain compatible with
// such code the wrapper is itself defined as macros so that it's possible to
// build ICU 65 and later with the old macro behaviour, like this:
//
// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
// runConfigureICU ...

/**
 * \def UPRV_BLOCK_MACRO_BEGIN
 * Defined as the "do" keyword by default.
 * Opens the do { } while wrapper placed around multi-statement macros.
 * @internal
 */
#ifndef UPRV_BLOCK_MACRO_BEGIN
#define UPRV_BLOCK_MACRO_BEGIN do
#endif

/**
 * \def UPRV_BLOCK_MACRO_END
 * Defined as "while (FALSE)" by default.
* @internal
 */
#ifndef UPRV_BLOCK_MACRO_END
/* FALSE is #defined further down in this header; that is fine because a
   macro body is only expanded at the point where the macro is used. */
#define UPRV_BLOCK_MACRO_END while (FALSE)
#endif

/*==========================================================================*/
/* limits for int32_t etc., like in POSIX inttypes.h                        */
/*==========================================================================*/

/* Each limit is only defined here when the includer's headers have not
   already provided it. The 16- and 32-bit minima are spelled as (-MAX-1). */

#ifndef INT8_MIN
/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
# define INT8_MIN ((int8_t)(-128))
#endif
#ifndef INT16_MIN
/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
# define INT16_MIN ((int16_t)(-32767-1))
#endif
#ifndef INT32_MIN
/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
# define INT32_MIN ((int32_t)(-2147483647-1))
#endif

#ifndef INT8_MAX
/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
# define INT8_MAX ((int8_t)(127))
#endif
#ifndef INT16_MAX
/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
# define INT16_MAX ((int16_t)(32767))
#endif
#ifndef INT32_MAX
/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
# define INT32_MAX ((int32_t)(2147483647))
#endif

#ifndef UINT8_MAX
/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT8_MAX ((uint8_t)(255U))
#endif
#ifndef UINT16_MAX
/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT16_MAX ((uint16_t)(65535U))
#endif
#ifndef UINT32_MAX
/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT32_MAX ((uint32_t)(4294967295U))
#endif

/* A 64-bit integer type is mandatory: the build stops if the platform
   configuration declares it unavailable. */
#if defined(U_INT64_T_UNAVAILABLE)
# error int64_t is required for decimal format and rule-based number format.
#else
# ifndef INT64_C
/**
 * Provides a platform independent way to specify a signed 64-bit integer constant.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
 * @stable ICU 2.8
 */
/* Fallback: pastes the LL suffix onto the literal. */
# define INT64_C(c) c ## LL
# endif
# ifndef UINT64_C
/**
 * Provides a platform independent way to specify an unsigned 64-bit integer constant.
 * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
 * @stable ICU 2.8
 */
/* Fallback: pastes the ULL suffix onto the literal. */
# define UINT64_C(c) c ## ULL
# endif
# ifndef U_INT64_MIN
/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
# endif
# ifndef U_INT64_MAX
/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
# endif
# ifndef U_UINT64_MAX
/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
# endif
#endif

/*==========================================================================*/
/* Boolean data type                                                        */
/*==========================================================================*/

/* NOTE(review): int8_t is not declared by anything this header includes
   directly (only stddef.h and unicode/ptypes.h); presumably the includer
   provides <stdint.h> first -- confirm. */
/** The ICU boolean type @stable ICU 2.0 */
typedef int8_t UBool;

#ifndef TRUE
/** The TRUE value of a UBool @stable ICU 2.0 */
# define TRUE 1
#endif
#ifndef FALSE
/** The FALSE value of a UBool @stable ICU 2.0 */
# define FALSE 0
#endif


/*==========================================================================*/
/* Unicode data types                                                       */
/*==========================================================================*/

/* wchar_t-related definitions -------------------------------------------- */

/*
 * \def U_WCHAR_IS_UTF16
 * Defined if wchar_t uses UTF-16.
 *
 * @stable ICU 2.0
 */
/*
 * \def U_WCHAR_IS_UTF32
 * Defined if wchar_t uses UTF-32.
* * @stable ICU 2.0 */ #if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) # ifdef __STDC_ISO_10646__ # if (U_SIZEOF_WCHAR_T==2) # define U_WCHAR_IS_UTF16 # elif (U_SIZEOF_WCHAR_T==4) # define U_WCHAR_IS_UTF32 # endif # elif defined __UCS2__ # if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2) # define U_WCHAR_IS_UTF16 # endif # elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__)) # if (U_SIZEOF_WCHAR_T==4) # define U_WCHAR_IS_UTF32 # endif # elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED) # define U_WCHAR_IS_UTF32 # elif U_PLATFORM_HAS_WIN32_API # define U_WCHAR_IS_UTF16 # endif #endif /* UChar and UChar32 definitions -------------------------------------------- */ /** Number of bytes in a UChar. @stable ICU 2.0 */ #define U_SIZEOF_UCHAR 2 /** * \def U_CHAR16_IS_TYPEDEF * If 1, then char16_t is a typedef and not a real type (yet) * @internal */ #if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) // for AIX, uchar.h needs to be included # include # define U_CHAR16_IS_TYPEDEF 1 #elif defined(_MSC_VER) && (_MSC_VER < 1900) // Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type, // and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx # define U_CHAR16_IS_TYPEDEF 1 #else # define U_CHAR16_IS_TYPEDEF 0 #endif /** * \var UChar * * The base type for UTF-16 code units and pointers. * Unsigned 16-bit integer. * Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar. * * UChar is configurable by defining the macro UCHAR_TYPE * on the preprocessor or compiler command line: * -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc. * (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.) * This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16. 
*
 * The default is UChar=char16_t.
 *
 * C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
 *
 * In C, char16_t is a simple typedef of uint_least16_t.
 * ICU requires uint_least16_t=uint16_t for data memory mapping.
 * On macOS, char16_t is not available because the uchar.h standard header is missing.
 *
 * @stable ICU 4.4
 */

/* With "#if 1" the #elif branch below is never taken, so UCHAR_TYPE is left
   exactly as the includer set it (or undefined). */
#if 1
// #if 1 is normal. UChar defaults to char16_t in C++.
// For configuration testing of UChar=uint16_t temporarily change this to #if 0.
// The intltest Makefile #defines UCHAR_TYPE=char16_t,
// so we only #define it to uint16_t if it is undefined so far.
#elif !defined(UCHAR_TYPE)
# define UCHAR_TYPE uint16_t
#endif

/* Selection order: ICU-internal builds, then an explicit UCHAR_TYPE, then
   char16_t for C++ includers, and uint16_t for plain C includers. */
#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
        defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
// Inside the ICU library code, never configurable.
typedef char16_t UChar;
#elif defined(UCHAR_TYPE)
typedef UCHAR_TYPE UChar;
#elif defined(__cplusplus)
typedef char16_t UChar;
#else
typedef uint16_t UChar;
#endif

/**
 * \var OldUChar
 * Default ICU 58 definition of UChar.
 * A base type for UTF-16 code units and pointers.
 * Unsigned 16-bit integer.
 *
 * Define OldUChar to be wchar_t if that is 16 bits wide.
 * If wchar_t is not 16 bits wide, then define OldUChar to be the compiler's
 * __CHAR16_TYPE__ when that macro is defined, and uint16_t otherwise.
 *
 * This makes the definition of OldUChar platform-dependent
 * but allows direct string type compatibility with platforms with
 * 16-bit wchar_t types.
 *
 * This is how UChar was defined in ICU 58, for transition convenience.
 * Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
 * The current UChar responds to UCHAR_TYPE but OldUChar does not.
 *
 * @stable ICU 59
 */
#if U_SIZEOF_WCHAR_T==2
typedef wchar_t OldUChar;
#elif defined(__CHAR16_TYPE__)
typedef __CHAR16_TYPE__ OldUChar;
#else
typedef uint16_t OldUChar;
#endif

/**
 * Define UChar32 as a type for single Unicode code points.
 * UChar32 is a signed 32-bit integer (same as int32_t).
* * The Unicode code point range is 0..0x10ffff. * All other values (negative or >=0x110000) are illegal as Unicode code points. * They may be used as sentinel values to indicate "done", "error" * or similar non-code point conditions. * * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) * or else to be uint32_t. * That is, the definition of UChar32 was platform-dependent. * * @see U_SENTINEL * @stable ICU 2.4 */ typedef int32_t UChar32; /** * This value is intended for sentinel values for APIs that * (take or) return single code points (UChar32). * It is outside of the Unicode code point range 0..0x10ffff. * * For example, a "done" or "error" value in a new API * could be indicated with U_SENTINEL. * * ICU APIs designed before ICU 2.4 usually define service-specific "done" * values, mostly 0xffff. * Those may need to be distinguished from * actual U+ffff text contents by calling functions like * CharacterIterator::hasNext() or UnicodeString::length(). * * @return -1 * @see UChar32 * @stable ICU 2.4 */ #define U_SENTINEL (-1) #include "unicode/urename.h" #endif tree-sitter-0.20.1/src/unicode/urename.h000064400000000000000000000001100072674642500162150ustar 00000000000000// This file must exist in order for `utf8.h` and `utf16.h` to be used. tree-sitter-0.20.1/src/unicode/utf.h000064400000000000000000000001100072674642500153570ustar 00000000000000// This file must exist in order for `utf8.h` and `utf16.h` to be used. tree-sitter-0.20.1/src/unicode/utf16.h000064400000000000000000000565060072674642500155520ustar 00000000000000// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. 
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: 16-bit Unicode handling macros
 *
 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (http://userguide.icu-project.org/strings).
 *
 * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies and all macro statements should be terminated with semicolon.
 */

#ifndef __UTF16_H__
#define __UTF16_H__

#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif

/* single-code point definitions -------------------------------------------- */

/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
/* NOTE(review): U_IS_SURROGATE is not defined in this header; it is expected
   from unicode/utf.h. In this vendored copy utf.h is a placeholder, so this
   macro (and U16_IS_SURROGATE below) presumably cannot be expanded unless
   another header supplies U_IS_SURROGATE -- confirm before use. */
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)

/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * The mask clears only the low 10 bits, so every higher bit must match 0xd800.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)

/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * The mask clears only the low 10 bits, so every higher bit must match 0xdc00.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)

/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)

/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a lead surrogate?
 * Only bit 0x400 is tested, so the result is meaningless for non-surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)

/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a trail surrogate?
* @param c 16-bit code unit * @return TRUE or FALSE * @stable ICU 4.2 */ #define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) /** * Helper constant for U16_GET_SUPPLEMENTARY. * @internal */ #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) /** * Get a supplementary code point value (U+10000..U+10ffff) * from its lead and trail surrogates. * The result is undefined if the input values are not * lead and trail surrogates. * * @param lead lead surrogate (U+d800..U+dbff) * @param trail trail surrogate (U+dc00..U+dfff) * @return supplementary code point (U+10000..U+10ffff) * @stable ICU 2.4 */ #define U16_GET_SUPPLEMENTARY(lead, trail) \ (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) /** * Get the lead surrogate (0xd800..0xdbff) for a * supplementary code point (0x10000..0x10ffff). * @param supplementary 32-bit code point (U+10000..U+10ffff) * @return lead surrogate (U+d800..U+dbff) for supplementary * @stable ICU 2.4 */ #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) /** * Get the trail surrogate (0xdc00..0xdfff) for a * supplementary code point (0x10000..0x10ffff). * @param supplementary 32-bit code point (U+10000..U+10ffff) * @return trail surrogate (U+dc00..U+dfff) for supplementary * @stable ICU 2.4 */ #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) /** * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). * @param c 32-bit code point * @return 1 or 2 * @stable ICU 2.4 */ #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) /** * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). * @return 2 * @stable ICU 2.4 */ #define U16_MAX_LENGTH 2 /** * Get a code point from a string at a random-access offset, * without changing the offset. * "Unsafe" macro, assumes well-formed UTF-16. 
* * The offset may point to either the lead or trail surrogate unit * for a supplementary code point, in which case the macro will read * the adjacent matching surrogate as well. * The result is undefined if the offset points to a single, unpaired surrogate. * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. * * @param s const UChar * string * @param i string offset * @param c output UChar32 variable * @see U16_GET * @stable ICU 2.4 */ #define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ (c)=(s)[i]; \ if(U16_IS_SURROGATE(c)) { \ if(U16_IS_SURROGATE_LEAD(c)) { \ (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \ } else { \ (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Get a code point from a string at a random-access offset, * without changing the offset. * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The offset may point to either the lead or trail surrogate unit * for a supplementary code point, in which case the macro will read * the adjacent matching surrogate as well. * * The length can be negative for a NUL-terminated string. * * If the offset points to a single, unpaired surrogate, then * c is set to that unpaired surrogate. * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. * * @param s const UChar * string * @param start starting string offset (usually 0) * @param i string offset, must be start<=i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ } \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Get a code point from a string at a random-access offset, * without changing the offset. * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The offset may point to either the lead or trail surrogate unit * for a supplementary code point, in which case the macro will read * the adjacent matching surrogate as well. * * The length can be negative for a NUL-terminated string. 
* * If the offset points to a single, unpaired surrogate, then * c is set to U+FFFD. * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD. * * @param s const UChar * string * @param start starting string offset (usually 0) * @param i string offset, must be start<=i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ } else { \ (c)=0xfffd; \ } \ } \ } \ } UPRV_BLOCK_MACRO_END /* definitions with forward iteration --------------------------------------- */ /** * Get a code point from a string at a code point boundary offset, * and advance the offset to the next code point boundary. * (Post-incrementing forward iteration.) * "Unsafe" macro, assumes well-formed UTF-16. * * The offset may point to the lead surrogate unit * for a supplementary code point, in which case the macro will read * the following trail surrogate as well. * If the offset points to a trail surrogate, then that itself * will be returned as the code point. * The result is undefined if the offset points to a single, unpaired lead surrogate. * * @param s const UChar * string * @param i string offset * @param c output UChar32 variable * @see U16_NEXT * @stable ICU 2.4 */ #define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ (c)=(s)[(i)++]; \ if(U16_IS_LEAD(c)) { \ (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \ } \ } UPRV_BLOCK_MACRO_END /** * Get a code point from a string at a code point boundary offset, * and advance the offset to the next code point boundary. * (Post-incrementing forward iteration.) * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The length can be negative for a NUL-terminated string. * * The offset may point to the lead surrogate unit * for a supplementary code point, in which case the macro will read * the following trail surrogate as well. * If the offset points to a trail surrogate or * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate. 
* * @param s const UChar * string * @param i string offset, must be i>10)+0xd7c0); \ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ } \ } UPRV_BLOCK_MACRO_END /** * Append a code point to a string, overwriting 1 or 2 code units. * The offset points to the current end of the string contents * and is advanced (post-increment). * "Safe" macro, checks for a valid code point. * If a surrogate pair is written, checks for sufficient space in the string. * If the code point is not valid or a trail surrogate does not fit, * then isError is set to TRUE. * * @param s const UChar * string buffer * @param i string offset, must be i>10)+0xd7c0); \ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ } else /* c>0x10ffff or not enough space */ { \ (isError)=TRUE; \ } \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the next. * (Post-incrementing iteration.) * "Unsafe" macro, assumes well-formed UTF-16. * * @param s const UChar * string * @param i string offset * @see U16_FWD_1 * @stable ICU 2.4 */ #define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ if(U16_IS_LEAD((s)[(i)++])) { \ ++(i); \ } \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the next. * (Post-incrementing iteration.) * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The length can be negative for a NUL-terminated string. * * @param s const UChar * string * @param i string offset, must be i0) { \ U16_FWD_1_UNSAFE(s, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the n-th next one, * i.e., move forward by n code points. * (Post-incrementing iteration.) * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The length can be negative for a NUL-terminated string. 
* * @param s const UChar * string * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ U16_FWD_1(s, i, length); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary * at the start of a code point. * If the offset points to the trail surrogate of a surrogate pair, * then the offset is decremented. * Otherwise, it is not modified. * "Unsafe" macro, assumes well-formed UTF-16. * * @param s const UChar * string * @param i string offset * @see U16_SET_CP_START * @stable ICU 2.4 */ #define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ if(U16_IS_TRAIL((s)[i])) { \ --(i); \ } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary * at the start of a code point. * If the offset points to the trail surrogate of a surrogate pair, * then the offset is decremented. * Otherwise, it is not modified. * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * @param s const UChar * string * @param start starting string offset (usually 0) * @param i string offset, must be start<=i * @see U16_SET_CP_START_UNSAFE * @stable ICU 2.4 */ #define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ --(i); \ } \ } UPRV_BLOCK_MACRO_END /* definitions with backward iteration -------------------------------------- */ /** * Move the string offset from one code point boundary to the previous one * and get the code point between them. * (Pre-decrementing backward iteration.) * "Unsafe" macro, assumes well-formed UTF-16. * * The input offset may be the same as the string length. * If the offset is behind a trail surrogate unit * for a supplementary code point, then the macro will read * the preceding lead surrogate as well. * If the offset is behind a lead surrogate, then that itself * will be returned as the code point. 
* The result is undefined if the offset is behind a single, unpaired trail surrogate. * * @param s const UChar * string * @param i string offset * @param c output UChar32 variable * @see U16_PREV * @stable ICU 2.4 */ #define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ (c)=(s)[--(i)]; \ if(U16_IS_TRAIL(c)) { \ (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the previous one * and get the code point between them. * (Pre-decrementing backward iteration.) * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The input offset may be the same as the string length. * If the offset is behind a trail surrogate unit * for a supplementary code point, then the macro will read * the preceding lead surrogate as well. * If the offset is behind a lead surrogate or behind a single, unpaired * trail surrogate, then c is set to that unpaired surrogate. * * @param s const UChar * string * @param start starting string offset (usually 0) * @param i string offset, must be start(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ --(i); \ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the previous one * and get the code point between them. * (Pre-decrementing backward iteration.) * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The input offset may be the same as the string length. * If the offset is behind a trail surrogate unit * for a supplementary code point, then the macro will read * the preceding lead surrogate as well. * If the offset is behind a lead surrogate or behind a single, unpaired * trail surrogate, then c is set to U+FFFD. 
* * @param s const UChar * string * @param start starting string offset (usually 0) * @param i string offset, must be start(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ --(i); \ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ } else { \ (c)=0xfffd; \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the previous one. * (Pre-decrementing backward iteration.) * The input offset may be the same as the string length. * "Unsafe" macro, assumes well-formed UTF-16. * * @param s const UChar * string * @param i string offset * @see U16_BACK_1 * @stable ICU 2.4 */ #define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ if(U16_IS_TRAIL((s)[--(i)])) { \ --(i); \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the previous one. * (Pre-decrementing backward iteration.) * The input offset may be the same as the string length. * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * @param s const UChar * string * @param start starting string offset (usually 0) * @param i string offset, must be start(start) && U16_IS_LEAD((s)[(i)-1])) { \ --(i); \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the n-th one before it, * i.e., move backward by n code points. * (Pre-decrementing backward iteration.) * The input offset may be the same as the string length. * "Unsafe" macro, assumes well-formed UTF-16. * * @param s const UChar * string * @param i string offset * @param n number of code points to skip * @see U16_BACK_N * @stable ICU 2.4 */ #define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ int32_t __N=(n); \ while(__N>0) { \ U16_BACK_1_UNSAFE(s, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the n-th one before it, * i.e., move backward by n code points. * (Pre-decrementing backward iteration.) * The input offset may be the same as the string length. 
* "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * @param s const UChar * string * @param start start of string * @param i string offset, must be start0 && (i)>(start)) { \ U16_BACK_1(s, start, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary after a code point. * If the offset is behind the lead surrogate of a surrogate pair, * then the offset is incremented. * Otherwise, it is not modified. * The input offset may be the same as the string length. * "Unsafe" macro, assumes well-formed UTF-16. * * @param s const UChar * string * @param i string offset * @see U16_SET_CP_LIMIT * @stable ICU 2.4 */ #define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ if(U16_IS_LEAD((s)[(i)-1])) { \ ++(i); \ } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary after a code point. * If the offset is behind the lead surrogate of a surrogate pair, * then the offset is incremented. * Otherwise, it is not modified. * The input offset may be the same as the string length. * "Safe" macro, handles unpaired surrogates and checks for string boundaries. * * The length can be negative for a NUL-terminated string. * * @param s const UChar * string * @param start int32_t starting string offset (usually 0) * @param i int32_t string offset, start<=i<=length * @param length int32_t string length * @see U16_SET_CP_LIMIT_UNSAFE * @stable ICU 2.4 */ #define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \ ++(i); \ } \ } UPRV_BLOCK_MACRO_END #endif tree-sitter-0.20.1/src/unicode/utf8.h000064400000000000000000000757220072674642500154740ustar 00000000000000// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: utf8.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999sep13 * created by: Markus W. Scherer */ /** * \file * \brief C API: 8-bit Unicode handling macros * * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. * * For more information see utf.h and the ICU User Guide Strings chapter * (http://userguide.icu-project.org/strings). * * Usage: * ICU coding guidelines for if() statements should be followed when using these macros. * Compound statements (curly braces {}) must be used for if-else-while... * bodies and all macro statements should be terminated with semicolon. */ #ifndef __UTF8_H__ #define __UTF8_H__ #include "unicode/umachine.h" #ifndef __UTF_H__ # include "unicode/utf.h" #endif /* internal definitions ----------------------------------------------------- */ /** * Counts the trail bytes for a UTF-8 lead byte. * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. * leadByte might be evaluated multiple times. * * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable. * * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. * @internal */ #define U8_COUNT_TRAIL_BYTES(leadByte) \ (U8_IS_LEAD(leadByte) ? \ ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0) /** * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. * leadByte might be evaluated multiple times. 
* * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable. * * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. * @internal */ #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) /** * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. * * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable. * @internal */ #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) /** * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. * Lead byte E0..EF bits 3..0 are used as byte index, * first trail byte bits 7..5 are used as bit index into that byte. * @see U8_IS_VALID_LEAD3_AND_T1 * @internal */ #define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" /** * Internal 3-byte UTF-8 validity check. * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. * @internal */ #define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) /** * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. * First trail byte bits 7..4 are used as byte index, * lead byte F0..F4 bits 2..0 are used as bit index into that byte. * @see U8_IS_VALID_LEAD4_AND_T1 * @internal */ #define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" /** * Internal 4-byte UTF-8 validity check. 
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. * @internal */ #define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) /** * Function for handling "next code point" with error-checking. * * This is internal since it is not meant to be called directly by external clients; * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this * file and thus must remain stable, and should not be hidden when other internal * functions are hidden (otherwise public macros would fail to compile). * @internal */ U_STABLE UChar32 U_EXPORT2 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); /** * Function for handling "append code point" with error-checking. * * This is internal since it is not meant to be called directly by external clients; * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this * file and thus must remain stable, and should not be hidden when other internal * functions are hidden (otherwise public macros would fail to compile). * @internal */ U_STABLE int32_t U_EXPORT2 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); /** * Function for handling "previous code point" with error-checking. * * This is internal since it is not meant to be called directly by external clients; * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this * file and thus must remain stable, and should not be hidden when other internal * functions are hidden (otherwise public macros would fail to compile). * @internal */ U_STABLE UChar32 U_EXPORT2 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); /** * Function for handling "skip backward one code point" with error-checking. 
* * This is internal since it is not meant to be called directly by external clients; * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this * file and thus must remain stable, and should not be hidden when other internal * functions are hidden (otherwise public macros would fail to compile). * @internal */ U_STABLE int32_t U_EXPORT2 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); /* single-code point definitions -------------------------------------------- */ /** * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? * @param c 8-bit code unit (byte) * @return TRUE or FALSE * @stable ICU 2.4 */ #define U8_IS_SINGLE(c) (((c)&0x80)==0) /** * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) * @param c 8-bit code unit (byte) * @return TRUE or FALSE * @stable ICU 2.4 */ #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) // 0x32=0xf4-0xc2 /** * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) * @param c 8-bit code unit (byte) * @return TRUE or FALSE * @stable ICU 2.4 */ #define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) /** * How many code units (bytes) are used for the UTF-8 encoding * of this Unicode code point? * @param c 32-bit code point * @return 1..4, or 0 if c is a surrogate or not a Unicode code point * @stable ICU 2.4 */ #define U8_LENGTH(c) \ ((uint32_t)(c)<=0x7f ? 1 : \ ((uint32_t)(c)<=0x7ff ? 2 : \ ((uint32_t)(c)<=0xd7ff ? 3 : \ ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ ((uint32_t)(c)<=0xffff ? 3 : 4)\ ) \ ) \ ) \ ) /** * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). * @return 4 * @stable ICU 2.4 */ #define U8_MAX_LENGTH 4 /** * Get a code point from a string at a random-access offset, * without changing the offset. * The offset may point to either the lead byte or one of the trail bytes * for a code point, in which case the macro will read all of the bytes * for the code point. 
* The result is undefined if the offset points to an illegal UTF-8 * byte sequence. * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. * * @param s const uint8_t * string * @param i string offset * @param c output UChar32 variable * @see U8_GET * @stable ICU 2.4 */ #define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ int32_t _u8_get_unsafe_index=(int32_t)(i); \ U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ } UPRV_BLOCK_MACRO_END /** * Get a code point from a string at a random-access offset, * without changing the offset. * The offset may point to either the lead byte or one of the trail bytes * for a code point, in which case the macro will read all of the bytes * for the code point. * * The length can be negative for a NUL-terminated string. * * If the offset points to an illegal UTF-8 byte sequence, then * c is set to a negative value. * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. * * @param s const uint8_t * string * @param start int32_t starting string offset * @param i int32_t string offset, must be start<=i=0xe0 ? \ ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \ U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \ (__t&=0x3f, 1) \ : /* U+10000..U+10FFFF */ \ ((c)-=0xf0)<=4 && \ U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \ ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \ (__t=(s)[i]-0x80)<=0x3f) && \ /* valid second-to-last trail byte */ \ ((c)=((c)<<6)|__t, ++(i)!=(length)) \ : /* U+0080..U+07FF */ \ (c)>=0xc2 && ((c)&=0x1f, 1)) && \ /* last trail byte */ \ (__t=(s)[i]-0x80)<=0x3f && \ ((c)=((c)<<6)|__t, ++(i), 1)) { \ } else { \ (c)=(sub); /* ill-formed*/ \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Append a code point to a string, overwriting 1 to 4 bytes. * The offset points to the current end of the string contents * and is advanced (post-increment). * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 
* Otherwise, the result is undefined. * * @param s const uint8_t * string buffer * @param i string offset * @param c code point to append * @see U8_APPEND * @stable ICU 2.4 */ #define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ uint32_t __uc=(c); \ if(__uc<=0x7f) { \ (s)[(i)++]=(uint8_t)__uc; \ } else { \ if(__uc<=0x7ff) { \ (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ } else { \ if(__uc<=0xffff) { \ (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ } else { \ (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ } \ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ } \ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ } \ } UPRV_BLOCK_MACRO_END /** * Append a code point to a string, overwriting 1 to 4 bytes. * The offset points to the current end of the string contents * and is advanced (post-increment). * "Safe" macro, checks for a valid code point. * If a non-ASCII code point is written, checks for sufficient space in the string. * If the code point is not valid or trail bytes do not fit, * then isError is set to TRUE. * * @param s const uint8_t * string buffer * @param i int32_t string offset, must be i>6)|0xc0); \ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \ (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \ (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ } else { \ (isError)=TRUE; \ } \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the next. * (Post-incrementing iteration.) * "Unsafe" macro, assumes well-formed UTF-8. 
* * @param s const uint8_t * string * @param i string offset * @see U8_FWD_1 * @stable ICU 2.4 */ #define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the next. * (Post-incrementing iteration.) * "Safe" macro, checks for illegal sequences and for string boundaries. * * The length can be negative for a NUL-terminated string. * * @param s const uint8_t * string * @param i int32_t string offset, must be i=0xf0 */ { \ if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \ ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \ ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ ++(i); \ } \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the n-th next one, * i.e., move forward by n code points. * (Post-incrementing iteration.) * "Unsafe" macro, assumes well-formed UTF-8. * * @param s const uint8_t * string * @param i string offset * @param n number of code points to skip * @see U8_FWD_N * @stable ICU 2.4 */ #define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ int32_t __N=(n); \ while(__N>0) { \ U8_FWD_1_UNSAFE(s, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Advance the string offset from one code point boundary to the n-th next one, * i.e., move forward by n code points. * (Post-incrementing iteration.) * "Safe" macro, checks for illegal sequences and for string boundaries. * * The length can be negative for a NUL-terminated string. * * @param s const uint8_t * string * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ U8_FWD_1(s, i, length); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary * at the start of a code point. * If the offset points to a UTF-8 trail byte, * then the offset is moved backward to the corresponding lead byte. * Otherwise, it is not modified. * "Unsafe" macro, assumes well-formed UTF-8. 
* * @param s const uint8_t * string * @param i string offset * @see U8_SET_CP_START * @stable ICU 2.4 */ #define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ while(U8_IS_TRAIL((s)[i])) { --(i); } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary * at the start of a code point. * If the offset points to a UTF-8 trail byte, * then the offset is moved backward to the corresponding lead byte. * Otherwise, it is not modified. * * "Safe" macro, checks for illegal sequences and for string boundaries. * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. * * @param s const uint8_t * string * @param start int32_t starting string offset (usually 0) * @param i int32_t string offset, must be start<=i * @see U8_SET_CP_START_UNSAFE * @see U8_TRUNCATE_IF_INCOMPLETE * @stable ICU 2.4 */ #define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ if(U8_IS_TRAIL((s)[(i)])) { \ (i)=utf8_back1SafeBody(s, start, (i)); \ } \ } UPRV_BLOCK_MACRO_END /** * If the string ends with a UTF-8 byte sequence that is valid so far * but incomplete, then reduce the length of the string to end before * the lead byte of that incomplete sequence. * For example, if the string ends with E1 80, the length is reduced by 2. * * In all other cases (the string ends with a complete sequence, or it is not * possible for any further trail byte to extend the trailing sequence) * the length remains unchanged. * * Useful for processing text split across multiple buffers * (save the incomplete sequence for later) * and for optimizing iteration * (check for string length only once per character). * * "Safe" macro, checks for illegal sequences and for string boundaries. * Unlike U8_SET_CP_START(), this macro never reads s[length]. * * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) 
* * @param s const uint8_t * string * @param start int32_t starting string offset (usually 0) * @param length int32_t string length (usually start<=length) * @see U8_SET_CP_START * @stable ICU 61 */ #define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \ if((length)>(start)) { \ uint8_t __b1=s[(length)-1]; \ if(U8_IS_SINGLE(__b1)) { \ /* common ASCII character */ \ } else if(U8_IS_LEAD(__b1)) { \ --(length); \ } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ uint8_t __b2=s[(length)-2]; \ if(0xe0<=__b2 && __b2<=0xf4) { \ if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ (length)-=2; \ } \ } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ uint8_t __b3=s[(length)-3]; \ if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ (length)-=3; \ } \ } \ } \ } \ } UPRV_BLOCK_MACRO_END /* definitions with backward iteration -------------------------------------- */ /** * Move the string offset from one code point boundary to the previous one * and get the code point between them. * (Pre-decrementing backward iteration.) * "Unsafe" macro, assumes well-formed UTF-8. * * The input offset may be the same as the string length. * If the offset is behind a multi-byte sequence, then the macro will read * the whole sequence. * If the offset is behind a lead byte, then that itself * will be returned as the code point. * The result is undefined if the offset is behind an illegal UTF-8 sequence. 
* * @param s const uint8_t * string * @param i string offset * @param c output UChar32 variable * @see U8_PREV * @stable ICU 2.4 */ #define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ (c)=(uint8_t)(s)[--(i)]; \ if(U8_IS_TRAIL(c)) { \ uint8_t __b, __count=1, __shift=6; \ \ /* c is a trail byte: keep its low 6 bits, then walk backward, OR-ing in 6 more bits per trail byte, until a byte >=0xc0 is found */ \ (c)&=0x3f; \ for(;;) { \ __b=(s)[--(i)]; \ if(__b>=0xc0) { \ U8_MASK_LEAD_BYTE(__b, __count); \ (c)|=(UChar32)__b<<__shift; \ break; \ } else { \ (c)|=(UChar32)(__b&0x3f)<<__shift; \ ++__count; \ __shift+=6; \ } \ } \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the previous one * and get the code point between them. * (Pre-decrementing backward iteration.) * "Safe" macro, checks for illegal sequences and for string boundaries. * * The input offset may be the same as the string length. * If the offset is behind a multi-byte sequence, then the macro will read * the whole sequence. * If the offset is behind a lead byte, then that itself * will be returned as the code point. * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. * * NOTE(review): the text following "must be start" below looks truncated in this copy * (the end of this comment and the definitions it documents appear to be missing) * -- TODO restore from the upstream ICU utf8.h. * * @param s const uint8_t * string * @param start int32_t starting string offset (usually 0) * @param i int32_t string offset, must be start0) { \ U8_BACK_1_UNSAFE(s, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Move the string offset from one code point boundary to the n-th one before it, * i.e., move backward by n code points. * (Pre-decrementing backward iteration.) * The input offset may be the same as the string length. * "Safe" macro, checks for illegal sequences and for string boundaries. * * NOTE(review): the same apparent truncation occurs after "must be start" below * -- TODO restore from the upstream ICU utf8.h. * * @param s const uint8_t * string * @param start int32_t index of the start of the string * @param i int32_t string offset, must be start0 && (i)>(start)) { \ U8_BACK_1(s, start, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary after a code point. 
* If the offset is behind a partial multi-byte sequence, * then the offset is incremented to behind the whole sequence. * Otherwise, it is not modified. * The input offset may be the same as the string length. * "Unsafe" macro, assumes well-formed UTF-8. * * Implementation note: expands to U8_BACK_1_UNSAFE followed by U8_FWD_1_UNSAFE * on the same offset, unconditionally. * * @param s const uint8_t * string * @param i string offset * @see U8_SET_CP_LIMIT * @stable ICU 2.4 */ #define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ U8_BACK_1_UNSAFE(s, i); \ U8_FWD_1_UNSAFE(s, i); \ } UPRV_BLOCK_MACRO_END /** * Adjust a random-access offset to a code point boundary after a code point. * If the offset is behind a partial multi-byte sequence, * then the offset is incremented to behind the whole sequence. * Otherwise, it is not modified. * The input offset may be the same as the string length. * "Safe" macro, checks for illegal sequences and for string boundaries. * * The length can be negative for a NUL-terminated string. * * Implementation note: expands to U8_BACK_1 followed by U8_FWD_1; both are skipped * (i stays unchanged) unless start<i and either i<length or length<0. * * @param s const uint8_t * string * @param start int32_t starting string offset (usually 0) * @param i int32_t string offset, must be start<=i<=length * @param length int32_t string length * @see U8_SET_CP_LIMIT_UNSAFE * @stable ICU 2.4 */ #define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ if((start)<(i) && ((i)<(length) || (length)<0)) { \ U8_BACK_1(s, start, i); \ U8_FWD_1(s, i, length); \ } \ } UPRV_BLOCK_MACRO_END #endif tree-sitter-0.20.1/src/unicode.h000064400000000000000000000017040072674642500145730ustar 00000000000000#ifndef TREE_SITTER_UNICODE_H_ #define TREE_SITTER_UNICODE_H_ #ifdef __cplusplus extern "C" { #endif /* NOTE(review): the next two #include directives carry no header name in this copy -- presumably stripped during extraction; confirm against upstream. */ #include #include #define U_EXPORT #define U_EXPORT2 #include "unicode/utf8.h" #include "unicode/utf16.h" /* Same value as U_SENTINEL; presumably what the decoders store in *code_point on failure -- confirm against callers. */ static const int32_t TS_DECODE_ERROR = U_SENTINEL; // These functions read one unicode code point from the given string, // returning the number of bytes consumed. 
typedef uint32_t (*UnicodeDecodeFunction)( const uint8_t *string, uint32_t length, int32_t *code_point ); static inline uint32_t ts_decode_utf8( const uint8_t *string, uint32_t length, int32_t *code_point ) { uint32_t i = 0; U8_NEXT(string, i, length, *code_point); return i; } static inline uint32_t ts_decode_utf16( const uint8_t *string, uint32_t length, int32_t *code_point ) { uint32_t i = 0; U16_NEXT(((uint16_t *)string), i, length, *code_point); return i * 2; } #ifdef __cplusplus } #endif #endif // TREE_SITTER_UNICODE_H_