pax_global_header00006660000000000000000000000064150072710610014511gustar00rootroot0000000000000052 comment=e4b3efb449ccb994d39230eb6e8440d267471f4a snowball-3.0.1/000077500000000000000000000000001500727106100133335ustar00rootroot00000000000000snowball-3.0.1/.github/000077500000000000000000000000001500727106100146735ustar00rootroot00000000000000snowball-3.0.1/.github/workflows/000077500000000000000000000000001500727106100167305ustar00rootroot00000000000000snowball-3.0.1/.github/workflows/ci.yml000066400000000000000000000160221500727106100200470ustar00rootroot00000000000000name: CI # Use bash by default on all platforms. defaults: run: shell: bash on: push: paths-ignore: - '*.rst' - NEWS pull_request: branches: master paths-ignore: - '*.rst' - NEWS # Allows you to run this workflow manually from the Actions tab workflow_dispatch: jobs: build: strategy: matrix: include: - name: "C distribution build" CFLAGS_DIST_BUILD: '-O2 -Wall -W -std=c90 -Wmissing-prototypes -Wmissing-declarations -Wshadow -Wdeclaration-after-statement -Werror' - name: "C distribution build (clang)" CFLAGS_DIST_BUILD: '-O2 -Wall -W -std=c90 -Wmissing-prototypes -Wmissing-declarations -Wshadow -Wdeclaration-after-statement -Werror' CC: clang - name: "C" c_tests: y WERROR: '-std=c99 -Werror' - name: "C (clang)" c_tests: y WERROR: '-std=c99 -Werror' CC: clang - name: java JAVA: java JAVAC: javac JAVACFLAGS: '-Xlint:all -Werror' - name: go_old os: 'ubuntu-22.04' apt_packages: 'golang-1.13' GO: go - name: go_new os: 'ubuntu-24.04' apt_packages: 'golang-1.22' GO: go - name: javascript_node JSRUN: node apt_packages: 'nodejs' - name: rust RUST: rust apt_packages: 'rustc' - name: csharp MCS: mcs apt_packages: 'mono-devel' - name: Pascal FPC: fpc apt_packages: 'fpc' - name: Python 3.8 PYTHON_VERSION: 3.8 os: 'ubuntu-22.04' # The pure Python versions run slowly - when we used travis for CI # we used to need to thin the testdata for languages such as Arabic # where there's a lot to avoid the build exceeding 
the maximum time # allowed for a CI job. GHA allows jobs to take up to 6 hours so # we should no longer need to do this. THIN_FACTOR: 1 - name: Python 3.10 PYTHON: python3.10 os: 'ubuntu-22.04' apt_packages: 'python3.10' THIN_FACTOR: 1 - name: Python 3.12 PYTHON: python3.12 os: 'ubuntu-24.04' apt_packages: 'python3.12' THIN_FACTOR: 1 - name: Python (pypy3) PYTHON: pypy3 apt_packages: 'pypy3' - name: Ada gprbuild: gprbuild apt_packages: 'gnat gprbuild' - name: Windows (C) os: windows-latest c_tests: y ccache: sccache - name: Windows (Go) os: windows-latest GO: go MAKE: mingw32-make mingw64_packages: 'mingw-w64-ucrt-x86_64-go' ccache: sccache fail-fast: false runs-on: ${{ matrix.os || 'ubuntu-latest' }} env: CC: ${{ matrix.CC || 'gcc' }} MAKE: ${{ matrix.MAKE || 'make' }} STEMMING_DATA: 'snowball-data' steps: - name: Checkout uses: actions/checkout@v4 with: show-progress: false - name: Checkout data run: | # Try to check out a branch of the same name from the snowball-data # repo sibling of this snowball repo, so that PRs requiring changes to # both can be CI tested easily. # # For a PR, GHA will have merged the PR branch into upstream master so # we need to similarly merge the snowball-data branch into upstream # master of the snowball-data repo as there may be changes there # required by snowball master. # # If there's no such branch (or repo) we just use the standard # snowball-data repo's default branch. If there is such a branch but # the merge fails, we treat that as a fatal error. UPSTREAM_REPO_URL=https://github.com/snowballstem/snowball-data.git if [ -n "$GITHUB_HEAD_REF" ] ; then # Pull-request. 
GH_BRANCH=${GITHUB_HEAD_REF} GH_REPO_OWNER=${GITHUB_ACTOR} GH_REPO_URL=https://github.com/$GH_REPO_OWNER/snowball-data.git git clone "$UPSTREAM_REPO_URL" cd snowball-data git remote add pr "$GH_REPO_URL" git config --global user.email "ci@example.org" git config --global user.name "CI" echo "Trying branch $GH_BRANCH from $GH_REPO_URL" if git fetch pr && git branch --track "$GH_BRANCH" pr/"$GH_BRANCH" ; then git merge "$GH_BRANCH" else echo "Falling back to $UPSTREAM_REPO_URL" fi else # Push. GH_BRANCH=${GITHUB_REF_NAME} GH_REPO_OWNER=${GITHUB_REPOSITORY_OWNER} GH_REPO_URL=https://github.com/$GH_REPO_OWNER/snowball-data.git echo "Trying branch $GH_BRANCH from $GH_REPO_URL" if ! git clone -b "$GH_BRANCH" "$GH_REPO_URL" ; then echo "Falling back to $UPSTREAM_REPO_URL" git clone "$UPSTREAM_REPO_URL" fi fi - name: Install CCache uses: hendrikmuhs/ccache-action@v1 with: key: ${{ matrix.name }} variant: ${{ matrix.ccache || 'ccache' }} - name: Install Ubuntu packages if: matrix.apt_packages run: | sudo apt-get update sudo apt-get install -y ${{ matrix.apt_packages }} - name: Install mingw64 packages if: matrix.mingw64_packages uses: msys2/setup-msys2@v2 with: msystem: ucrt64 install: base-devel ${{ matrix.mingw64_packages }} - name: Build run: $MAKE CC="${{ matrix.ccache || 'ccache' }} $CC" - name: Test C dist if: matrix.CFLAGS_DIST_BUILD run: | pip install setuptools build $MAKE dist mkdir tmp cd tmp tar xf ../dist/libstemmer_c-*.tar.gz cd libstemmer_c-* $MAKE CFLAGS="${{ matrix.CFLAGS_DIST_BUILD }}" - name: Test C if: matrix.c_tests run: $MAKE check CC="$CC" - uses: actions/setup-python@v5 with: python-version: ${{ matrix.PYTHON_VERSION }} if: matrix.PYTHON_VERSION - name: Test Python if: matrix.PYTHON || matrix.PYTHON_VERSION run: $MAKE check_python python="${{ matrix.PYTHON || 'python' }}" THIN_FACTOR="${{ matrix.THIN_FACTOR }}" - name: Test Java if: matrix.JAVA && matrix.JAVAC run: $MAKE check_java JAVA="${{ matrix.JAVA }}" JAVAC="${{ matrix.JAVAC }}" JAVACFLAGS="${{ 
matrix.JAVACFLAGS }}" - name: Test C# if: matrix.MCS run: $MAKE check_csharp MCS="${{ matrix.MCS }}" - name: Test Javascript if: matrix.JSRUN run: $MAKE check_js JSRUN="${{ matrix.JSRUN }}" - name: Test Rust if: matrix.RUST run: $MAKE check_rust RUST="${{ matrix.RUST }}" - name: Test Go if: matrix.GO run: | go mod init github.com/snowballstem/snowball $MAKE check_go GO="${{ matrix.GO }}" - name: Test Pascal if: matrix.FPC run: $MAKE check_pascal FPC="${{ matrix.FPC }}" - name: Test Ada if: matrix.gprbuild run: $MAKE check_ada gprbuild="${{ matrix.gprbuild }}" snowball-3.0.1/.gitignore000066400000000000000000000006721500727106100153300ustar00rootroot00000000000000*.o /ada/bin/ /ada/obj/ /algorithms.mk /libstemmer/libstemmer.c /libstemmer/libstemmer_utf8.c /libstemmer/mkinc.mak /libstemmer/mkinc_utf8.mak /libstemmer/modules.h /libstemmer/modules_utf8.h /libstemmer.a /snowball /src_c /stemtest /stemwords /dist /java/org/tartarus/snowball/ext/ /js_out /python_check /python_out *.generated.cs /rust/Cargo.lock /rust/src/snowball/algorithms/*.rs /rust/target/ /go/algorithms/ /go/stemwords/algorithms.go snowball-3.0.1/AUTHORS000066400000000000000000000013171500727106100144050ustar00rootroot00000000000000Authors ======= Martin Porter ------------- - Designed the snowball language. - Implemented the snowball to C compiler. - Implemented the stemming algorithms in C. - Wrote the documentation. Richard Boulton --------------- - Implemented Java backend of the snowball compiler. - Developed build system. - Assisted with website maintenance. Assistance from --------------- Olivier Bornet - fixes to java packaging and build system. Andreas Jung - useful bug reports on the libstemmer library. Olly Betts - several patches, bug reports, and performance improvements. Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms. Ralf Junker - fix a potential memory leak in sb_stemmer_new(). 
snowball-3.0.1/CONTRIBUTING.rst000066400000000000000000000272131500727106100160010ustar00rootroot00000000000000General contribution guidelines =============================== We don't have a formally defined coding style guide, but please strive to make new/changed code look like the code around it. Use spaces-only for indentation except where there's a syntax reason (e.g. ``GNUmakefile``) or a strong convention (e.g. Go's standard seems to be tabs, and ``gofmt`` reindents code using tabs). Avoid adding trailing whitespace on lines. Make sure there's a newline character at the end of new text files. Avoid mixing code reformatting changes with functional changes - doing so makes it harder to review patches. Adding a new stemming algorithm =============================== To add a new stemming algorithm you need to submit PRs against three repositories. See below for details of what's needed in each of these. Name the branch the same for at least `snowball` and `snowball-data` and push to `snowball-data` first, then the CI should use your new vocabulary list when running the testsuite. snowball repo ------------- This is where the implementation of the new algorithm goes. Add the `.sbl` source implementing it to the `algorithms/` subdirectory. Add entry to `libstemmer/modules.txt`, maintaining the current sorted order by the first column. The columns are: * Algorithm name (needs to match the `.sbl` source without extension) * Encodings to support. Wide-character Unicode is always supported and doesn't need to be listed here. You should always include `UTF_8`, and also any of `ISO_8859_1`, `ISO_8859_2` and `KOI8_R` which the language can usefully be written using only characters from (in particular they need to contain all the characters the stemmer explicitly uses). Support for other single-byte character sets is easy to add if they're useful. * Names and ISO-639 codes for the language. 
Wikipedia has a handy list of `all the ISO-639 codes `_ - find the row for your new language and include the codes from the "639-1", "639-2/T" and (if different) "639-2/B" columns. For example, for the `Afar` language you'd put `afar,aa,aar` here. Some points to note about algorithm implementations: * Avoid literal non-ASCII characters in snowball string literals - they will work OK for languages that use UTF-8, but not wide-character Unicode or other encodings. Instead use ``stringdef`` like the existing stemmers do, and please use the newer `U+` notation rather than the older ``hex`` or ``decimal`` as this allows us to support different encodings without having to modify the source files - for example:: stringdef o" {U+00F6} define foo 'o{o"}' not:: stringdef o" hex F6 define foo 'o{o"}' and definitely not:: define foo 'oö' It's OK to use UTF-8 in comments. * It's helpful to consistently use the same ``stringdef`` codes across the different stemmers - for languages using the latin alphabet our website has `guidance on what to use `_ and a `list of stringdef lines for common characters to cut and paste from `_. snowball-data repo ------------------ Add subdirectory named after new stemmer containing: * voc.txt - word list * output.txt - stemmed equivalents * COPYING - licensing details (word lists need to be under an OSI-approved licence) If you don't have access to a suitably licensed word list of a suitable size, you may be able to use the `wikipedia-most-common-words` script to generate one by extracting the most frequent words from a Wikipedia dump in the language the stemmer is for. You need to specify the Unicode "script" (that's "script" in the sense of alphabet) to use - you can find the appropriate one by looking in the Unicode `Scripts.txt `_. The script name is the second column, between `;` and `#`. The first entries are all "Common" which isn't what you want - scroll down to get to the entries that are useful here. 
You also need to specify the minimum frequency to select. Picking this value will probably need some experimentation as the appropriate threshold depends on how much data there is in the wikipedia dump for a particular language, as well as the size of the vocabulary for the language, and how inflected the language is. Try counting the number of unique words extracted (`wc -l voc.txt` on Unix) and also looking through the list - some proper nouns, words from other languages, typos, etc are OK (since the stemmer will encounter all these in practice too), but at some point "more" stops being "better". snowball-website repo --------------------- This is where a description of the new algorithm goes. Experience from maintaining Snowball for many years has shown us that the most important points to cover are **WHY** particular things are done or are not done. For example, if a particular ending isn't removed because doing so causes problems in other cases it's really helpful to have that recorded. Then if years later we get a bug report because this ending isn't removed we can easily answer, and don't have to try to contact you and hope you can remember, or try to work out why for ourselves. The original set of Snowball stemmers each have an English prose description of the algorithm which focuses on **WHAT** the algorithm does. These might be helpful if you want to implement the algorithm from scratch in a separate language, but they've not proved very useful for maintaining the Snowball implementations - if the prose and Snowball code disagree we know something is wrong, but it's hard to know which is right! Therefore we recommend to let the Snowball implementation describe what the algorithm does, and only comment on "**WHAT**" in cases where the implementation needs explanation to help the reader understand it. If your algorithm is based on an academic paper, cite the paper and describe any differences between your implementation and that described in the paper. 
For example, sometimes papers have ambiguities that need resolving to re-implement the algorithm described - see the `Hindi `_ and `Indonesian `_ stemming algorithms descriptions for examples. The mechanics of adding the algorithm description are: * Create subdirectory of `algorithms/` named after the language. * Create `stemmer.tt` which describes the stemming algorithm. This is a "template toolkit" template which is essentially a mix of HTML and some macros for adding the navigation, sample vocabulary, etc. See the existing `stemmer.tt` files for other algorithms for how to use these macros. * If you have a stopword list, add that as `stop.txt` in your new subdirectory. The `generate` script checks if such a file exists and if it does a link to it is automatically added. * Link to your new `stemmer.tt` from `algorithms/index.tt`. * Add a news entry to `index.tt`. * Add the new stemmer to the online demo. Assuming you have checkouts of the `snowball`, `snowball-data` and `snowball-website` repos in sibling directories: * run `make check_js` in the `snowball` repo * run `./update-js` * add the new stemmer to git with: `git add js/*-stemmer.js` * if the new language is written right-to-left (RTL) then add it to the check in `demo.tt` (search for `rtl` to find the place to change.) * `git commit`. Adding a new programming language generator =========================================== This is a short guide to adding support for generating code for another programming language. Is a new generator the right solution? -------------------------------------- Adding a new code generator is probably not your only option if you want to use Snowball from another language - most languages have support for writing bindings to a C library, so this is probably another option. Generating code can have advantages. For example, it can be simpler to deploy without C bindings which need to be built for a specific platform. 
However, it's likely to be significantly more work to implement a new generator than to write bindings to the generated C code, especially as the libstemmer C API is a very small and simple one. Generated code can also be slower - currently the Snowball compiler often generates code that assumes an optimising compiler will clean up redundant constructs, which is not a problem for C, and probably not for most compiled languages, but for a language like Python C bindings are much faster than the generated Python code (using pypy helps a lot, but is still slower). See doc/libstemmer_python_README for some timings. That said, the unoptimised generated code has improved over time, and is likely to improve further in the future. Key problems to solve --------------------- * You need to work out how to map the required flow of control in response to Snowball signals. In the generated C code this is mostly done using `goto`. If your language doesn't provide an equivalent to `goto` then you'll need an alternative solution. In Java and JavaScript we use labelled `break` from blocks and loops instead. If your language has an equivalent to this feature, that will probably work. For Python, we currently generate a `try:` ... `raise lab123` ... `except lab123: pass` construct. This works, but doesn't seem ideal. If one of the mechanisms above sounds suitable then take a look at the generator for the respective generated output and generator code. If not, come and talk to us on the snowball-discuss mailing list. * Snowball's division is specified as integer division with semantics matching C - i.e. the result should be truncated (rounded towards zero). Some languages lack a built-in integer division operation, or have one which instead implements rounding towards negative infinity. Existing backends with special handling here which may be useful to look at include Javascript, Pascal and Python. 
Don't hardcode algorithm names ------------------------------ We want to avoid hard-coded lists of algorithms in the language-specific code that have to be manually updated each time a new algorithm is added, because that adds some extra tedious work for adding a new algorithm, and mechanical updates done by hand tend to miss places that need updating, or code gets copied and pasted from an existing case but not fully updated. All the existing language backends generate any such code at build time, and adding a new algorithm just requires updating `libstemmer/modules.txt`. You can probably copy the approach used for Pascal (script `pascal/generate.pl` works from template `stemwords-template.dpr` which has marked blocks of code that get expanded for each stemming algorithm with a placeholder replaced by the algorithm name. For an alternative approach, see Rust where this is done by `rust/build.rs`. Mechanics of adding a new generator ----------------------------------- Copy an existing `compiler/generator_*.c` for your new language and modify away (`generator.c` has the generator for C, but also some common functions so if you start from this one you'll need to remove those common functions). Please resist reformatting existing C code - there's currently a lot of code repeated in each generator which ought to be pulled out as common code, and if you reformat that just makes that job harder. Add your new source to `COMPILER_SOURCES` in `GNUmakefile`. Add prototypes for the new functions to `compiler/header.h`. Add support to `compiler/driver.c`. Add targets to `GNUmakefile` to run tests for the new language. Hook up automated testing via CI in `.github/workflows/ci.yml`. Add to the list of languages in `README.rst`. 
snowball-3.0.1/COPYING000066400000000000000000000031671500727106100143750ustar00rootroot00000000000000Copyright (c) 2001, Dr Martin Porter Copyright (c) 2004,2005, Richard Boulton Copyright (c) 2013, Yoshiki Shibukawa Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Snowball project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. snowball-3.0.1/GNUmakefile000066400000000000000000000670011500727106100154110ustar00rootroot00000000000000# -*- makefile -*- # After changing this, run `make update_version` to update various sources # which hard-code it. 
SNOWBALL_VERSION = 3.0.1 ifeq ($(OS),Windows_NT) EXEEXT = .exe endif c_src_dir = src_c JAVACFLAGS ?= JAVAC ?= javac JAVA ?= java -ea java_src_main_dir = java/org/tartarus/snowball java_src_dir = $(java_src_main_dir)/ext MONO ?= mono MCS ?= mcs csharp_src_main_dir = csharp/Snowball csharp_src_dir = $(csharp_src_main_dir)/Algorithms csharp_sample_dir = csharp/Stemwords FPC ?= fpc # Enable warnings, info, notes; select "FILE:LINE:" diagnostic format. FPC_FLAGS ?= -veiwnr pascal_src_dir = pascal python ?= python3 python_output_dir = python_out python_runtime_dir = snowballstemmer python_sample_dir = sample js_output_dir = js_out js_runtime_dir = javascript js_sample_dir = sample JSRUN ?= node JSTYPE ?= global cargo ?= cargo cargoflags ?= --release rust_src_main_dir = rust/src rust_src_dir = $(rust_src_main_dir)/snowball/algorithms go ?= go goflags ?= stemwords/algorithms.go stemwords/main.go gofmt ?= gofmt go_src_main_dir = go go_src_dir = $(go_src_main_dir)/algorithms gprbuild ?= gprbuild ada_src_main_dir = ada ada_src_dir = $(ada_src_main_dir)/algorithms DIFF = diff ifeq ($(OS),Windows_NT) DIFF = diff --strip-trailing-cr endif ICONV = iconv #ICONV = python ./iconv.py # Where the data files are located - assumes their repo is checked out as # a sibling to this one. STEMMING_DATA ?= ../snowball-data STEMMING_DATA_ABS := $(abspath $(STEMMING_DATA)) # Keep one in $(THIN_FACTOR) entries from gzipped vocabularies. THIN_FACTOR ?= 3 ifneq (1,$(THIN_FACTOR)) ifneq (,$(THIN_FACTOR)) # Command to thin out the testdata. Used for Python tests, which otherwise # take a long time (unless you use pypy). 
THIN_TEST_DATA := |awk '(FNR % $(THIN_FACTOR) == 0){print}' endif endif tarball_ext = .tar.gz # algorithms.mk is generated from libstemmer/modules.txt and defines: # * libstemmer_algorithms # * ISO_8859_1_algorithms # * ISO_8859_2_algorithms # * KOI8_R_algorithms include algorithms.mk other_algorithms = lovins all_algorithms = $(libstemmer_algorithms) $(other_algorithms) COMPILER_SOURCES = compiler/space.c \ compiler/tokeniser.c \ compiler/analyser.c \ compiler/generator.c \ compiler/driver.c \ compiler/generator_csharp.c \ compiler/generator_java.c \ compiler/generator_js.c \ compiler/generator_pascal.c \ compiler/generator_python.c \ compiler/generator_rust.c \ compiler/generator_go.c \ compiler/generator_ada.c COMPILER_HEADERS = compiler/header.h \ compiler/syswords.h RUNTIME_SOURCES = runtime/api.c \ runtime/utilities.c RUNTIME_HEADERS = runtime/api.h \ runtime/header.h JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ java/org/tartarus/snowball/SnowballProgram.java \ java/org/tartarus/snowball/SnowballStemmer.java \ java/org/tartarus/snowball/TestApp.java CSHARP_RUNTIME_SOURCES = csharp/Snowball/Among.cs \ csharp/Snowball/Stemmer.cs \ csharp/Snowball/AssemblyInfo.cs CSHARP_STEMWORDS_SOURCES = csharp/Stemwords/Program.cs JS_RUNTIME_SOURCES = javascript/base-stemmer.js JS_SAMPLE_SOURCES = javascript/stemwords.js PASCAL_RUNTIME_SOURCES = pascal/SnowballProgram.pas PASCAL_STEMWORDS_SOURCES = pascal/stemwords.dpr PYTHON_RUNTIME_SOURCES = python/snowballstemmer/basestemmer.py \ python/snowballstemmer/among.py PYTHON_SAMPLE_SOURCES = python/testapp.py \ python/stemwords.py PYTHON_PACKAGE_FILES = python/MANIFEST.in \ python/setup.py \ python/setup.cfg LIBSTEMMER_SOURCES = libstemmer/libstemmer.c LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/libstemmer_c.in STEMWORDS_SOURCES = examples/stemwords.c 
STEMTEST_SOURCES = tests/stemtest.c PYTHON_STEMWORDS_SOURCE = python/stemwords.py COMMON_FILES = COPYING \ NEWS ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%.sbl) C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c) C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h) C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) CSHARP_SOURCES = $(libstemmer_algorithms:%=$(csharp_src_dir)/%Stemmer.generated.cs) PASCAL_SOURCES = $(ISO_8859_1_algorithms:%=$(pascal_src_dir)/%Stemmer.pas) PYTHON_SOURCES = $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \ $(python_output_dir)/__init__.py JS_SOURCES = $(libstemmer_algorithms:%=$(js_output_dir)/%-stemmer.js) \ $(js_output_dir)/base-stemmer.js RUST_SOURCES = $(libstemmer_algorithms:%=$(rust_src_dir)/%_stemmer.rs) GO_SOURCES = $(libstemmer_algorithms:%=$(go_src_dir)/%_stemmer.go) \ $(go_src_main_dir)/stemwords/algorithms.go ADA_SOURCES = $(libstemmer_algorithms:%=$(ada_src_dir)/stemmer-%.ads) \ $(libstemmer_algorithms:%=$(ada_src_dir)/stemmer-%.adb) \ $(ada_src_dir)/stemmer-factory.ads $(ada_src_dir)/stemmer-factory.adb COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o) LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o) STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o) STEMTEST_OBJECTS=$(STEMTEST_SOURCES:.c=.o) C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o) C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o) 
JAVA_CLASSES = $(JAVA_SOURCES:.java=.class) JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class) CFLAGS=-g -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations -Wshadow $(WERROR) CPPFLAGS= INCLUDES=-Iinclude all: snowball$(EXEEXT) libstemmer.a stemwords$(EXEEXT) $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) algorithms.mk: libstemmer/mkalgorithms.pl libstemmer/modules.txt libstemmer/mkalgorithms.pl algorithms.mk libstemmer/modules.txt clean: rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \ $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball$(EXEEXT) \ libstemmer.a stemwords$(EXEEXT) \ libstemmer/modules.h \ libstemmer/modules_utf8.h \ $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \ $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \ $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \ $(CSHARP_SOURCES) \ $(PASCAL_SOURCES) pascal/stemwords.dpr pascal/stemwords pascal/*.o pascal/*.ppu \ $(PYTHON_SOURCES) \ $(JS_SOURCES) \ $(RUST_SOURCES) \ $(ADA_SOURCES) ada/bin/generate ada/bin/stemwords \ stemtest$(EXEEXT) $(STEMTEST_OBJECTS) \ libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \ libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c \ algorithms.mk rm -rf ada/obj dist -rmdir $(c_src_dir) -rmdir $(python_output_dir) -rmdir $(js_output_dir) update_version: perl -pi -e 's/(SNOWBALL_VERSION.*?)\d+\.\d+\.\d+/$${1}$(SNOWBALL_VERSION)/' \ compiler/header.h \ csharp/Snowball/AssemblyInfo.cs \ python/setup.py .PHONY: all clean update_version $(STEMMING_DATA)/% $(STEMMING_DATA_ABS)/%: @[ -f '$@' ] || { echo '$@: Test data not found'; echo 'Checkout the snowball-data repo as "$(STEMMING_DATA_ABS)"'; exit 1; } snowball$(EXEEXT): $(COMPILER_OBJECTS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(COMPILER_OBJECTS): $(COMPILER_HEADERS) libstemmer/libstemmer.c: libstemmer/libstemmer_c.in sed 's/@MODULES_H@/modules.h/' $^ >$@ libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@ 
libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules.txt libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc_utf8.mak utf8 libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS) libstemmer.a: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS) $(AR) -cru $@ $^ examples/%.o: examples/%.c $(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $< stemwords$(EXEEXT): $(STEMWORDS_OBJECTS) libstemmer.a $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ tests/%.o: tests/%.c $(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $< stemtest$(EXEEXT): $(STEMTEST_OBJECTS) libstemmer.a $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ csharp_stemwords$(EXEEXT): $(CSHARP_STEMWORDS_SOURCES) $(CSHARP_RUNTIME_SOURCES) $(CSHARP_SOURCES) $(MCS) -unsafe -target:exe -out:$@ $(CSHARP_STEMWORDS_SOURCES) $(CSHARP_RUNTIME_SOURCES) $(CSHARP_SOURCES) pascal/stemwords.dpr: pascal/stemwords-template.dpr libstemmer/modules.txt pascal/generate.pl $(ISO_8859_1_algorithms) < pascal/stemwords-template.dpr > $@ pascal/stemwords: $(PASCAL_STEMWORDS_SOURCES) $(PASCAL_RUNTIME_SOURCES) $(PASCAL_SOURCES) $(FPC) $(FPC_FLAGS) -o$@ -Mdelphi $(PASCAL_STEMWORDS_SOURCES) $(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) ./snowball $< -o "$(c_src_dir)/stem_UTF_8_$*" -eprefix $*_UTF_8_ -r ../runtime -u $(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) ./snowball charsets/KOI8-R.sbl $< -o "$(c_src_dir)/stem_KOI8_R_$*" -eprefix $*_KOI8_R_ -r ../runtime $(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) ./snowball $< -o "$(c_src_dir)/stem_ISO_8859_1_$*" -eprefix $*_ISO_8859_1_ -r ../runtime 
# Code generation rules.  Each pattern rule runs the snowball compiler on
# algorithms/<name>.sbl to produce the stemmer source for one target language;
# $* is the algorithm name matched by the % in the target.  Every rule depends
# on snowball$(EXEEXT) so the compiler is built first.

# ISO-8859-2 C stemmer source and header for an algorithm.
$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(c_src_dir)
	./snowball charsets/ISO-8859-2.sbl $< -o "$(c_src_dir)/stem_ISO_8859_2_$*" -eprefix $*_ISO_8859_2_ -r ../runtime

# Compile a generated C stemmer.
$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
	$(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $<

# Java stemmer class.
$(java_src_dir)/%Stemmer.java: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(java_src_dir)
	./snowball $< -j -o "$(java_src_dir)/$*Stemmer" -p org.tartarus.snowball.SnowballStemmer

# C# stemmer class.
$(csharp_src_dir)/%Stemmer.generated.cs: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(csharp_src_dir)
	./snowball $< -cs -o "$(csharp_src_dir)/$*Stemmer.generated"

# Pascal stemmer unit.
$(pascal_src_dir)/%Stemmer.pas: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(pascal_src_dir)
	./snowball $< -pascal -o "$(pascal_src_dir)/$*Stemmer"

# Python stemmer module.
$(python_output_dir)/%_stemmer.py: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(python_output_dir)
	./snowball $< -py -o "$(python_output_dir)/$*_stemmer"

# Python package __init__.py; regenerated whenever any generated stemmer
# module changes.
$(python_output_dir)/__init__.py: $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py)
	$(python) python/create_init.py $(python_output_dir)

# Rust stemmer module.
$(rust_src_dir)/%_stemmer.rs: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(rust_src_dir)
	./snowball $< -rust -o "$(rust_src_dir)/$*_stemmer"

# Go algorithm table, produced by "go generate" in go/stemwords.
$(go_src_main_dir)/stemwords/algorithms.go: go/stemwords/generate.go libstemmer/modules.txt
	@echo "Generating algorithms.go"
	@cd go/stemwords && go generate

# Go stemmer; each algorithm is written to its own directory, named after
# the algorithm, and then formatted with gofmt.
$(go_src_dir)/%_stemmer.go: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(go_src_dir)/$*
	./snowball $< -go -o "$(go_src_dir)/$*/$*_stemmer" -gop $*
	$(gofmt) -s -w $(go_src_dir)/$*/$*_stemmer.go

# Javascript stemmer.
$(js_output_dir)/%-stemmer.js: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(js_output_dir)
	./snowball $< -js -o "$(js_output_dir)/$*-stemmer"

# The Javascript base class is copied unchanged from the runtime directory.
$(js_output_dir)/base-stemmer.js: $(js_runtime_dir)/base-stemmer.js
	@mkdir -p $(js_output_dir)
	cp $< $@

# Ada stemmer package.  Depends on snowball$(EXEEXT), like the other
# generation rules, so the prerequisite matches the rule that builds the
# compiler when EXEEXT is non-empty.
$(ada_src_dir)/stemmer-%.ads: algorithms/%.sbl snowball$(EXEEXT)
	@mkdir -p $(ada_src_dir)
	./snowball $< -ada -P $* -o "$(ada_src_dir)/stemmer-$*"

.PHONY: dist dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python

# Make a full source distribution
dist: dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python

# Make a distribution of all the sources involved in snowball
#
# Every prerequisite is copied into dist/snowball-<version>/ at the same
# relative path, the tree is packed into a tarball, and the staging
# directory is then removed.
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
	    $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
	    $(LIBSTEMMER_SOURCES) \
	    $(LIBSTEMMER_UTF8_SOURCES) \
	    $(LIBSTEMMER_HEADERS) \
	    $(LIBSTEMMER_EXTRA) \
	    $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) $(STEMTEST_SOURCES) \
	    $(COMMON_FILES) \
	    GNUmakefile README.rst doc/TODO libstemmer/mkmodules.pl
	destname=snowball-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	for file in $^; do \
	  dir=`dirname $$file` && \
	  mkdir -p $${dest}/$${dir} && \
	  cp -a $${file} $${dest}/$${dir} || exit 1 ; \
	done && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the C library.
# Stages the C library sources under dist/libstemmer_c-<version>/, writes a
# MANIFEST and a small standalone Makefile into the staged tree, packs it
# into a tarball and removes the staging directory.
#
# The readme is installed as "README", so that is the name recorded in
# MANIFEST.  The recipe lines echoed into the generated Makefile start with
# a literal tab, as make requires.
dist_libstemmer_c: \
	    $(RUNTIME_SOURCES) \
	    $(RUNTIME_HEADERS) \
	    $(LIBSTEMMER_SOURCES) \
	    $(LIBSTEMMER_UTF8_SOURCES) \
	    $(LIBSTEMMER_HEADERS) \
	    $(LIBSTEMMER_EXTRA) \
	    $(C_LIB_SOURCES) \
	    $(C_LIB_HEADERS) \
	    $(COMMON_FILES) \
	    libstemmer/mkinc.mak \
	    libstemmer/mkinc_utf8.mak
	destname=libstemmer_c-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_c_README $${dest}/README && \
	mkdir -p $${dest}/examples && \
	cp -a examples/stemwords.c $${dest}/examples && \
	mkdir -p $${dest}/$(c_src_dir) && \
	cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
	mkdir -p $${dest}/runtime && \
	cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
	mkdir -p $${dest}/libstemmer && \
	cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
	mkdir -p $${dest}/include && \
	mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
	 ls runtime/*.c runtime/*.h >> MANIFEST && \
	 ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
	 ls include/*.h >> MANIFEST) && \
	cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
	cp -a $(COMMON_FILES) $${dest} && \
	echo 'include mkinc.mak' >> $${dest}/Makefile && \
	echo 'ifeq ($$(OS),Windows_NT)' >> $${dest}/Makefile && \
	echo 'EXEEXT=.exe' >> $${dest}/Makefile && \
	echo 'endif' >> $${dest}/Makefile && \
	echo 'CFLAGS=-O2' >> $${dest}/Makefile && \
	echo 'CPPFLAGS=-Iinclude' >> $${dest}/Makefile && \
	echo 'all: libstemmer.a stemwords$$(EXEEXT)' >> $${dest}/Makefile && \
	echo 'libstemmer.a: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
	echo '	$$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
	echo 'stemwords$$(EXEEXT): examples/stemwords.o libstemmer.a' >> $${dest}/Makefile && \
	echo '	$$(CC) $$(CFLAGS) -o $$@ $$^' >> $${dest}/Makefile && \
	echo 'clean:' >> $${dest}/Makefile && \
	echo '	rm -f stemwords$$(EXEEXT) libstemmer.a *.o $(c_src_dir)/*.o examples/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the Java library.
#
# Stages the generated stemmers and the Java runtime sources under
# dist/libstemmer_java-<version>/, writes a MANIFEST, packs the tree into a
# tarball and removes the staging directory.
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
	    $(COMMON_FILES) \
	    $(LIBSTEMMER_EXTRA) \
	    $(JAVA_SOURCES)
	destname=libstemmer_java-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_java_README $${dest}/README && \
	mkdir -p $${dest}/$(java_src_dir) && \
	cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
	mkdir -p $${dest}/$(java_src_main_dir) && \
	cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
	cp -a $(COMMON_FILES) $${dest} && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(java_src_dir)/*.java >> MANIFEST && \
	 ls $(java_src_main_dir)/*.java >> MANIFEST) && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the C# library.
dist_libstemmer_csharp: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ $(COMMON_FILES) \ $(LIBSTEMMER_EXTRA) \ $(CSHARP_SOURCES) destname=libstemmer_csharp-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ cp -a doc/libstemmer_csharp_README $${dest}/README && \ mkdir -p $${dest}/$(csharp_src_dir) && \ cp -a $(CSHARP_SOURCES) $${dest}/$(csharp_src_dir) && \ mkdir -p $${dest}/$(csharp_src_main_dir) && \ cp -a $(CSHARP_RUNTIME_SOURCES) $${dest}/$(csharp_src_main_dir) && \ mkdir -p $${dest}/$(csharp_sample_dir) && \ cp -a $(CSHARP_STEMWORDS_SOURCES) $${dest}/$(csharp_sample_dir) && \ cp -a $(COMMON_FILES) $${dest} && \ (cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \ rm -rf $${dest} dist_libstemmer_python: $(PYTHON_SOURCES) $(COMMON_FILES) destname=snowballstemmer-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ mkdir -p $${dest}/src/$(python_runtime_dir) && \ mkdir -p $${dest}/src/$(python_sample_dir) && \ cp libstemmer/modules.txt $${dest} && \ cp doc/libstemmer_python_README $${dest}/README.rst && \ cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \ cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \ cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \ cp -a $(COMMON_FILES) $(PYTHON_PACKAGE_FILES) $${dest} && \ (cd $${dest} && $(python) -m build && cp dist/*.tar.gz dist/*.whl ..) 
&& \ rm -rf $${dest} dist_libstemmer_js: $(JS_SOURCES) $(COMMON_FILES) destname=jsstemmer-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ mkdir -p $${dest}/$(js_runtime_dir) && \ mkdir -p $${dest}/$(js_sample_dir) && \ cp -a doc/libstemmer_js_README $${dest}/README.rst && \ cp -a $(COMMON_FILES) $${dest} && \ cp -a $(JS_RUNTIME_SOURCES) $${dest}/$(js_runtime_dir) && \ cp -a $(JS_SAMPLE_SOURCES) $${dest}/$(js_sample_dir) && \ cp -a $(JS_SOURCES) $${dest}/$(js_runtime_dir) && \ (cd $${dest} && \ ls README.rst $(COMMON_FILES) $(js_runtime_dir)/*.js $(js_sample_dir)/*.js > MANIFEST) && \ (cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \ rm -rf $${dest} ############################################################################### # C ############################################################################### .PHONY: check check_stemtest check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r check: check_stemtest check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r check_stemtest: stemtest$(EXEEXT) ./stemtest check_utf8: $(libstemmer_algorithms:%=check_utf8_%) check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%) check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%) check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%) check_utf8_%: $(STEMMING_DATA)/% stemwords$(EXEEXT) @echo "Checking output of $* stemmer with UTF-8" @if test -f '$ tmp.txt @$(ICONV) -f UTF-8 -t ISO-8859-1 '$ tmp.in; \ $(JSRUN) javascript/stemwords.js -l $* -i tmp.in -o tmp.txt; \ rm tmp.in; \ else \ $(JSRUN) javascript/stemwords.js -l $* -i $ tmp.in; \ $(cargo) run $(cargoflags) -- -l $* -i tmp.in -o $(PWD)/tmp.txt; \ rm tmp.in; \ else \ $(cargo) run $(cargoflags) -- -l $* -i $ tmp.in; \ $(go) run $(goflags) -l $* -i tmp.in -o $(PWD)/tmp.txt; \ rm tmp.in; \ else \ $(go) run $(goflags) -l $* -i $ tmp.in; \ $(python) stemwords.py -c utf8 -l $* -i tmp.in -o $(PWD)/tmp.txt; \ rm tmp.in; 
\ else \ $(python) stemwords.py -c utf8 -l $* -i $ tmp.in; \ ./bin/stemwords $* tmp.in $(PWD)/tmp.txt; \ rm tmp.in; \ else \ ./bin/stemwords $* $= 7. Java 7 was released in 2011, and Java 6's EOL was 2013 so we don't expect this to be a problematic requirement. See #195. * Optimisations: + We now store the current string in a `char[]` rather than using a `StringBuilder` to reduce overheads. The `getCurrent()` method continues to return a Java `String`, but the `char[]` can be accessed using the new `getCurrentBuffer()` and `getCurrentBufferLength()` methods. Patch from Robert Muir (#195). + Use a more efficient mechanism for calling `among` functions. Patch from Robert Muir (#195). * Code quality: + Consistently put `[]` right after element type for array types, which seems the most used style. + Fix javac warnings in SnowballProgram.java. + Improve formatting of generated code. Javascript ---------- * Bug fixes: + Use base class specified by `-p` in string `$` rather than hard-coding `BaseStemmer` (which is the default if you don't specify `-p`). None of the shipped stemmers use string `$`, though the Schinke Latin stemmer algorithm on the website does. * Code quality: + Modernise the generated code a bit. Loosely based on changes proposed in #123 by Emily Marigold Klassen. * Other changes: + The Javascript runner is now specified by make variable `JSRUN` instead of `NODE` (since node is just one JS implementation). The default value is now `node` instead of `nodejs` (older Debian and Ubuntu packages used `/usr/bin/nodejs` because `/usr/bin/node` was already in use by a completely different package, but that has since changed). Pascal ------ * Bug fixes: + Add missing semicolons to code generated in some cases for a function which always succeeds or always fails. The new dutch.sbl was triggering this bug. + If the end of a routine was unreachable code the Snowball compiler would think the start of the next routine was also unreachable and would not generate it. 
This didn't affect code generation for any algorithms we currently ship. * Code quality: + Eliminate commented out code generated for string `$`. None of the shipped stemmers use string `$`, though the Schinke Latin stemmer algorithm on the website does. * Other changes: + Enable warnings, etc from fpc. + Select GNU-style diagnostic format. Python ------ * Optimisations: + Use Python set for grouping checks. This speeds up running the Python testsuite by about 4%. + Routines used in `among` are now referenced by name directly in the generated code, rather than using a string containing the name. This avoids a `getattr()` call each time an among wants to call a routine. This doesn't seem to make a measurable speed difference, but it's cleaner and avoids problems with name mangling. Suggested by David Corbett in #217. + Simplify code generated for `loop`. If the iteration count is constant and at most 4 then iterate over a tuple which microbenchmarking shows is faster. The only current uses of loop in the shipped stemmers are `loop 2` so benefit from this. Otherwise we now use `range(AE)` instead of `range (AE, 0, -1)` (the actual value of the loop variable is never used so only the number of iterations matter). * Bug fixes: + Correctly handle stemmer names with an underscore. * Code quality: + Generate Python with UTF-8 source encoding. This makes the generated code easier to follow, which helps during development. It's also a bit smaller. For now codepoints U+0590 and above are still emitted as escape sequences to avoid confusing source code rendering when LTR scripts are involved. * Other changes: + Set python_requires to indicate to install tools that the generated code won't work with Python 3.0.x, 3.1.x and 3.2.x (due to use of `u"foo"` string literals). Closes #192 and #191, opened by Andreas Maier. + Add classifiers to indicate support for Python 3.3 and for 3.8 to 3.13. Fixes #158, reported by Dmitry Shachnev. 
+ Stop marking the wheel as universal, which had started to give a warning message. Patch from Dmitry Shachnev (#210). + Stop calling `setup.py` directly which is deprecated and now produces a warning - use the `build` module instead. Patch from Dmitry Shachnev (#210). Rust ---- * Optimisations: + Shortcut unnecessary calls to find_among, porting an optimization from the C generator. In some stemming benchmarks this improves the performance of the rust english stemmer by about 27%. Patch from jedav (#202). * Code quality: + Suppress unused_parens warning, for example triggered by the code generated for `$x = x*x` (where `x` is an integer). + Dispatch `among` result with `match` instead of an `if` ... `else if` chain (which looks like we did because the Rust generator evolved from the Python generator and Python didn't used to have a switch-like construct. This results in a 3% speed-up for an unoptimised Rust compile but doesn't seem to make a measurable difference when optimising so it seems the Rust compiler is optimising both to equivalent code. However using a `match` here seems clearer, a better match for the intent, and is a bit simpler to generate. + Generate Rust with UTF-8 source encoding. This makes the generated code easier to follow, which helps during development. It's also a bit smaller. For now codepoints U+0590 and above are still emitted as escape sequences to avoid confusing source code rendering when LTR scripts are involved. New stemming algorithms ----------------------- * Add Esperanto stemmer from David Corbett (#185). * Add Estonian algorithm from Linda Freienthal (#108). Behavioural changes to existing algorithms ------------------------------------------ * Dutch: Switch to Kraaij-Pohlmann as the default for Dutch. In case you want Martin Porter's Dutch stemming algorithm for compatibility, this is now available as `dutch_porter`. Fixes #1, reported by gboer. 
* Dutch (Kraaij-Pohlmann): Fix differences between the Snowball implementation and the original C implementation. * Dutch (Kraaij-Pohlmann): Add a small number of exceptions to the Snowball implementation to avoid unwanted conflations. This addresses all cases so far identified which Martin's Dutch stemmer handled better. Fixes #208. * Dutch (Porter): The "at least 3 characters" part of the R1 definition was actually implemented such that when working in UTF-8 it was "at least 3 bytes". We stripped accents normally found in Dutch except for `è` before setting R1, and no Dutch words starting `è` seem to stem differently depending on encoding, but proper nouns and other words of foreign origin may contain other accented characters and it seems better for the stemmer to handle such words the same way regardless of the encoding in use. * English: Replace '-ogist' with '-og' to conflate "geologist" and "geology", etc. Suggested by Marc Schipperheijn on snowball-discuss. * English: Add extra condition to undoubling. We no longer undouble if the double consonant is preceded by exactly "a", "e" or "o" to avoid conflating "add"/"ad", "egg"/"eg", "off"/"of", etc. Fixes #182, reported by Ed Page. * English: Avoid conflating 'emerge' and 'emergency'. Reported by Frederick Ross on snowball-discuss. * English: Avoid conflating 'evening' and 'even'. Reported by Ann B on snowball-discuss. * English: Avoid conflating 'lateral' and 'later'. Reported by Steve Tolkin on snowball-discuss. * English: Avoid conflating 'organ', 'organic' and 'organize'. * English: Avoid conflating 'past' and 'paste'. Reported by Sonny on snowball-discuss. * English: Avoid conflating 'universe', 'universal' and 'university'. Reported by Clem Wang on snowball-discuss. * English: Handle -eed and -ing exceptions in their respective rules. This avoids the overhead of checking for them for the majority of words which don't end -eed or -ing. 
It also allows us to easily handle vying->vie and hying->hie at basically no extra cost. Reduces the time to stem all words in our English word list by nearly 2%. * French: Remove elisions as first step. See #187. Originally reported by Paul Rudin and kelson42. * French: Remove -aise and -aises so for example, "française" and "françaises" are now conflated with "français". Fixes #209. Originally reported by ririsoft and Fred Fung. * French: Avoid incorrect conflation of `mauvais` (bad) with `mauve` (mauve, mallow or seagull); avoid conflating `mal` with `malais`, `pal` with `palais`, etc. * French: Avoid conflating `ni` (neither/nor) with `niais` (inexperienced/silly) and `nie`/`nié`/`nier`/`nierais`/`nierons` (to deny). * French: -oux -> -ou. Fixes #91, reported by merwok. * German: Replace with the "german2" variant. This normalises umlauts ("ä" to "ae", "ö" to "oe", "ü" to "ue") which is presumably much less common in newly created text than it once was as modern computer systems generally don't have the limitations which motivated this, but there will still be large amounts of legacy text which it seems helpful for the stemmer to handle without having to know to select a variant. On our sample German vocabulary which contains 35033 words, 77 words give different stems. A significant proportion of these are foreign words, and some are proper nouns. Some cases definitely seem improved, and quite a few are just different but effectively just change the stem for a word or group of words to a stem that isn't otherwise generated. There don't seem any changes that are clearly worse, though there are some changes that have both good and bad aspects to them. Fixes #92, reported by jrabensc. * German: Don't remove -em if preceded by -syst to avoid overstemming words ending -system. This change means we now conflate e.g. "system" and "systemen". Partly addresses #161, reported by Olga Gusenikova. 
* German: Remove -erin and -erinnen suffixes which conflates singular and plural female versions of nouns with the male versions. Fixes #85 and partly addresses #161, reported by Olga Gusenikova. * German: Replace -ln and -lns with -l. This improves 82 cases in the current sample data without making anything worse. Tests on a larger word list look good too. Partly addresses #161, reported by Olga Gusenikova. * German: Remove -et suffix when we safely can. Fixes #200, reported by Robert Frunzke. * Greek: Fix "faulty slice operation" for input `ισαισα`. The fix changes `ισα` to stem to `ισ` instead of the empty string, which seems better (and to be what the second paper actually says to do if read carefully). Fixes #204, reported by subnix. * Italian: Address overstemming of "divano" (sofa) which previously stemmed to "div", which is the stem for 'diva' (diva). Now it is stemmed to 'divan', which is what its plural form 'divani' already stemmed to. Fixes #49, reported by francesco. * Norwegian: Improve stemming of words ending -ers. Fixes #175, reported by Karianne Berg. * Norwegian: Include more accented vowels - treating "ê", "ò", "ó" and "ô" as vowels improves the stemming of a fairly small number of words, but there's basically no cost to having extra vowels in the grouping, and some of these words are commonly used. Fixes #218, reported by András Jankovics. * Romanian: Fix to work with Romanian text encoded using the correct Unicode characters. Romanian uses a "comma below" diacritic on letters "s" and "t" ("ș" and "ț"). Before Unicode these weren't easily available so Romanian text was written using the visually similar "cedilla" diacritic on these letters instead ("ş" and "ţ"). Previously our stemmer only recognised the latter. Now it maps the cedilla forms to "comma below" as a first step. Patch from Robert Muir. * Spanish: Handle -acion like -ación and -ucion like -ución. 
It's apparently common to miss off accents in Spanish, and there are examples in our test vocabulary which this change helps.
This can only make a difference to the stemming if we removed a suffix that started with the last character of the digraph (or with "zs" in the case of "dzs"), and that doesn't happen for any of the suffixes we remove for any valid Hungarian words. This simplification speeds up stemming by ~2% on the current sample vocabulary list. See #216. Thanks to András Jankovics for confirming no Hungarian words are affected by this change. * Lithuanian: Remove redundant R1 check. * Nepali: Eliminate redundant check_category_2 routine. * Tamil: Optimise by using `among` instead of long `or` chains. The generated C version now takes 43% less time to processes the test vocabulary. * Tamil: Remove many cases which can't be triggered due to being handled by another case. * Tamil: Clean up some uses of `test`. * Tamil: Make `fix_va_start` simpler and faster. * Tamil: Localise use of `found_a_match` flag. * Tamil: Eliminate pointless flag changes. * Turkish: Minor optimisations. Code clarity improvements to existing algorithms ------------------------------------------------ * Stop noting dates changes were made in comments in the code - we now maintain a changelog in each algorithm's description page on the website (and the version control history provides a finer grained view). * Always use `insert` instead of `<+` as the named command seems clearer. * English: Add comments documenting motivating examples for all exceptional cases. * Lithuanian: Change to recommended latin stringdef codes. Using common codes makes it easier to work across algorithms, but they are more mnemonic so also seem clearer when just considering this one algorithm. * Serbian: Change to recommended latin stringdef codes. Using common codes makes it easier to work across algorithms, but they are more mnemonic so also seem clearer when just considering this one algorithm. * Turkish: Use `{sc}` for s-cedilla and `{i}` for dotless-i to match other uses. 
Compiler -------- * Generic code generation improvements: + Show Snowball source leafname in "generated" comment at start of files. + Add generic reachability tracking machinery. This facilitates various new optimisations, so far the following have been implemented: - Tail-calling - Simpler code for calling routines which always give the same signal - Simpler code when a routine ends in a integer test (this also allows eliminating an Ada-specific codegen optimisation which did something similar but only for routines which consisted *entirely* of a single integer test. - Dead code reporting and removal (only in simple cases currently) Currently this overlaps in functionality with the existing reachability tracking which is implemented on a per-language basis, and only for some languages. This reachability tracking was originally added for Java where some unreachable code is invalid and result in a compile time error, but then seems to have been copied for some other newer languages which may or may not actually need it. The approach it uses unfortunately relies on correctly updating the reachability flag anywhere in the generator code where reachability can change which has proved to be a source of bugs, some unfixed. This new approach seems better and with some more work should allow us to eliminate the older code. Fixes #83. + Omit check for `among` failing in generated code when we can tell at compile time that it can't fail. + Optimise `goto`/`gopast` applied to a grouping or inverted grouping (which is by far the most common way to use `goto`/`gopast`) for all target languages (new for Go, Java, Javascript, Pascal and Rust). + We never need to restore the cursor after `not`. If `not` turns signal `f` into `t` then it sets `c` back to its old position; otherwise, `not` signals `f` and `c` will get reset by whatever ultimately handles this `f` (or the program exits and the position of `c` no longer matters). 
This slightly improves the generated code for the `english` and `porter` stemmers. + Don't generate code for undefined or unused routines. + Avoid generating variable names and then not actually using them. This eliminates mysterious gaps in the numbering of variables in the generated code. + Eliminate `!`/`not` from integer test code by generating the inverse comparison operator instead for all languages, e.g. for Python we now generate if self.I_p1 >= self.I_x: instead of if not self.I_p1 < self.I_x: This isn't going to be faster in compiled languages with an optimiser but for scripting languages it may be faster, and even if not, it makes for a little less work when loading the script. + Canonicalise `hop 1` to `next` as the generated code for `next` can be slightly more efficient. This will also apply to `hop` followed by a constant expression which Snowball can reduce to `1`. + Avoid trailing whitespace in generated files. + Fix problems with --comments option: - When generating C code we would segfault for code containing `atleast`, `hop` or integer tests. - Fix missing comments for some commands in some target languages. - Fix inconsistent formatting of comments in some target languages. - Comments in C are now always on their own line - previously some were after at the end of the line and some on their own line which made them harder to follow. - Emit comments before `among` and before routine/external definitions. + Simplify more cases of numeric expressions (e.g. `x * 1` to `x`). * Improve --help output. * Division by zero during constant folding now gives an error. * For `hop` followed by an unexpected token (e.g. `hop hop`) we were already emitting a suitable error but would then segfault. * Emit error for redefinition of a grouping. * Improve errors for `define` of an undeclared name. We already peek at the next token to decide whether to try to parse as a routine or grouping. 
Previously we parsed as a routine if it was `as`, and a grouping otherwise, but routine definitions are more common and a grouping can only start with a literal string or a name, so now we assume a routine definition with a missing `as` if the next token isn't valid for either. * Suppress duplicate (or even triplicate) "unexpected" errors for the same token when the compiler tried to recover from the error by adjusting the parse stare and marking the token to be reparsed, but the same token then failed to parse in the new state. * Fix NULL pointer dereference if an undefined grouping is used in the definition of another grouping. * Fix mangled error for `set` or `unset` on a non-boolean: test.sbl:2: nameInvalid type 98 in name_of_type() * Emit warning if `=>` is used. The documentation of how it works doesn't match the implementation, and it seems it has only ever been used in the Schinke stemmer implementation (which assumes the implemented behaviour). We've updated the Schinke implementation to avoid it. If you're using it in your own Snowball code please let us know. * Improve errors for unterminated string literals. * Fix NULL pointer dereference on invalid code such as `$x = $y`. * If malloc fails while compiling the compiler will now report the failure and exit. Previously the NULL return from malloc wasn't checked for so we'd typically segfault. * `lenof` and `sizeof` applied to a string variable now mark the variable as used, which avoids a bogus error followed by a confusing additional message if this is the only use of that variable: lenofsizeofbug.sbl:3: warning: string 's' is set but never used Unhandled type of dead assignment via sizeof This is situation is unlikely to occur in real world code. * The reported line number for "string not terminated" error was one too high in the case where we were in a stringdef (but correct if we weren't). * Eliminate special handling for among starter. 
We now convert the starter to be a command before the among, adding an explicit substring if there isn't one. * We now warn if the body of a `repeat` or `atleast` loop always signals `t` (meaning it will loop forever which is very undesirable for a stemming algorithm) or always signals `f` (meaning it will never loop, which seems unlikely to be what was intended). * Release memory in compiler before exit. The OS will free all allocated memory when a process exits, so this memory isn't actually leaked, but it can be annoying when using snowball as part of a larger build process with some leak-finding tools.
* Add missing `COMMON_FILES` dependency to dist targets. * GNUmakefile: Tidy up and make more consistent * GNUmakefile: Make use of $* to improve speed and readability. * Use $(patsubst ...) instead of sed in .java.class rule which gives cleaner make output and is a bit more efficient. * Add `WERROR` make variable to provide a way to add `-Werror` to existing CFLAGS. libstemmer ---------- Testsuite --------- * Give a clear error if snowball-data isn't found. Fixes #196, reported by Andrea Maccis. * Handle not thinning testdata better. If THIN_FACTOR is set to 1 we no longer run gzipped test data through awk. We also now handle THIN_FACTOR being set empty as equivalent to 1 for convenience. * csharp_stemwords: Correctly handle a stemmer name containing an underscore. * csharp_stemwords: Make `-i` option optional and read from stdin if omitted, like the C version does. * csharp_stemwords: Process the input line by line which is more helpful for interactive testing, and also a little faster. * Fix Java TestApp to allow a single argument. The documented command line syntax is that you only need to specify the language and there was already code to read from stdin if no input file was specified, but at least two command line options were required. * Fix deprecation warning in TestApp.java. * Optimise TestApp.java by creating fewer objects. Patch from Robert Muir. * stemwords.py: We no longer create an empty output file if we fail to open the input file. * stemwords: Improve error message to say "Out of memory or internal error" rather than just "Out of memory". Documentation ------------- * Include "what is stemming" section in each README. * Include section on threads in each README. Based on patch for Python from dbcerigo. * Document that input should be lowercase with composed accents. See #186, reported by 1993fpale. * Add README section on building, including notes on cross-compiling. Fixes #205, reported by sin-ack. 
* CONTRIBUTING.rst: Clarify which charsets to list * CONTRIBUTING.rst: Add general advice section. In particular, note to use spaces-only for indentation in most cases. Thanks to Dmitry Shachnev for raising this point. * CONTRIBUTING.rst: Note that UTF-8 is OK in comments. Thanks to Dmitry Shachnev for asking. * Fix some typos. Patch from Josh Soref. * Document that our CI now uses github actions. * Update link to Greek stemmer PDF. Patch from Michael Bissett (#33). Snowball 2.2.0 (2021-11-10) =========================== New Code Generators ------------------- * Add Ada generator from Stephane Carrez (#135). Javascript ---------- * Fix generated code to use integer division rather than floating point division. Noted by David Corbett. Pascal ------ * Fix code generated for division. Previously real division was used and the generated code would fail to compile with an "Incompatible types" error. Noted by David Corbett. * Fix code generated for Snowball's `minint` and `maxint` constant. Python ------ * Python 2 is no longer actively supported, as proposed on the mailing list: https://lists.tartarus.org/pipermail/snowball-discuss/2021-August/001721.html * Fix code generated for division. Previously the Python code we generated used integer division but rounded negative fractions towards negative infinity rather than zero under Python 2, and under Python 3 used floating point division. Noted by David Corbett. Code quality Improvements ------------------------- * C/C++: Generate INT_MIN and INT_MAX directly, including from the generated C file if necessary, and remove the MAXINT and MININT macros from runtime/header.h. * C#: An `among` without functions is now generated as `static` and groupings are now generated as constant. Patches from James Turner in #146 and #147. Code generation improvements ---------------------------- * General: + Constant numeric subexpressions and constant numeric tests are now evaluated at Snowball compile time. 
+ Simplify the following degenerate `loop` and `atleast` constructs where N is a compile-time constant: - loop N C where N <= 0 is a no-op. - loop N C where N == 1 is just C. - atleast N C where N <= 0 is just repeat C. If the value of N doesn't depend on the current target language, platform or Unicode settings then we also issue a warning. Behavioural changes to existing algorithms ------------------------------------------ * german2: Fix handling of `qu` to match algorithm description. Previously the implementation erroneously did `skip 2` after `qu`. We suspect this was intended to skip the `qu` but that's already been done by the substring/among matching, so it actually skips an extra two characters. The implementation has always differed in this way, but there's no good reason to skip two extra characters here so overall it seems best to change the code to match the description. This change only affects the stemming of a single word in the sample vocabulary - `quae` which seems to actually be Latin rather than German. Optimisations to existing algorithms ------------------------------------ * arabic: Handle exception cases in the among they're exceptions to. * greek: Remove unused slice setting, handle exception cases in the among they're exceptions to, and turn `substring ... among ... or substring ... among ...` into a single `substring ... among ...` in cases where it is trivial to do so. * hindi: Eliminate the need for variable `p`. * irish: Minor optimisation in setting `pV` and `p1`. * yiddish: Make use of `among` more. Compiler -------- * Fix handling of `len` and `lenof` being declared as names. For compatibility with programs written for older Snowball versions len and lenof stop being tokens if declared as names. However this code didn't work correctly if the tokeniser's name buffer needed to be enlarged to hold the token name (i.e. 3 or 5 elements respectively). * Report a clearer error if `=` is used instead of `==` in an integer test. 
* Replace a single entry command list with its contents in the internal syntax tree. This puts things in a more canonical form, which helps subsequent optimisations. Build system ------------ * Support building on Microsoft Windows (using mingw+msys or a similar Unix-like environment). Patch from Jannick in #129. * Split out INCLUDES from CPPFLAGS so that CPPFLAGS can now be overridden by the user if required. Fixes #148, reported by Dominique Leuenberger. * Regenerate algorithms.mk only when needed rather than on every `make` run. libstemmer ---------- * The libstemmer static library now has a `.a` extension, rather than `.o`. Patch from Michal Vasilek in #150. Testsuite --------- * stemtest: Test that numbers and numeric codes aren't damaged by any of the algorithms. Regression test for #66. Fixes #81. * ada: Fix ada tests to fail if output differs. There was an extra `| head -300` compared to other languages, which meant that the exit code of `diff` was ignored. It seems more helpful (and is more consistent) not to limit how many differences are shown so just drop this addition. * go: Stop thinning testdata. It looks like we only are because the test harness code was based on that for rust, which was based on that for javascript, which was only thinning because it was reading everything into memory and the larger vocabulary lists were resulting in out of memory issues. * javascript: Speed up stemwords.js. Process input line-by-line rather than reading the whole file into memory, splitting, iterating, and creating an array with all the output, joining and writing out a single huge string. This also means we can stop thinning the test data for javascript, which we were only doing because the huge arabic test data file was causing out of memory errors. Also drop the -p option, which isn't useful here and complicates the code. * rust: Turn on optimisation in the makefile rather than the CI config. 
This makes the tests run in about 1/5 of the time and there's really no reason to be thinning the testdata for rust. Documentation ------------- * CONTRIBUTING.rst: Improve documentation for adding a new stemming algorithm. * Improve wording of Python docs. Snowball 2.1.0 (2021-01-21) =========================== C/C++ ----- * Fix decoding of 4-byte UTF-8 sequences in `grouping` checks. This bug affected Unicode codepoints U+40000 to U+7FFFF and U+C0000 to U+FFFFF and doesn't affect any of the stemming algorithms we currently ship (#138, reported by Stephane Carrez). Python ------ * Fix snowballstemmer.algorithms() method (#132, reported by kkaiser). * Update code to generate trove language classifiers for PyPI. All the natural languages we previously had stemmers for have now been added to PyPI's list, but Armenian and Yiddish aren't on it. Patch from Dmitry Shachnev. Code Quality Improvements ------------------------- * Suppress GCC warning in compiler code. * Use `const` pointers more in C runtime. * Only use spaces for indentation in javascript code. Change proposed by Emily Marigold Klassen in #123, and seems to be the modern Javascript norm. New Snowball Language Features ------------------------------ * `lenof` and `sizeof` can now be applied to a literal string, which can be useful if you want to do calculations on cursor values. This change actually simplifies the language a little, since you can now use a literal string in any read-only context which accepts a string variable. Code generation improvements ---------------------------- * General: + Fix bugs in the code generated to handle failure of `goto`, `gopast` or `try` inside `setlimit` or string-`$`. This affected all languages (though the issue with `try` wasn't present for C). These bugs don't affect any of the stemming algorithms we currently ship. Reported by Stefan Petkovic on snowball-discuss. + Change `hop` with a negative argument to work as documented. 
The manual says a negative argument to hop will raise signal f, but the implementation for all languages was actually to move the cursor in the opposite direction to `hop` with a positive argument. The implemented behaviour is problematic as it allows invalidating implicitly saved cursor values by modifying the string outside the current region, so we've decided it's best to fix the implementation to match the documentation. The only Snowball code we're aware of which relies on this was the original version of the new Yiddish stemming algorithm, which has been updated not to rely on this. The compiler now issues a warning for `hop` with a constant negative argument (internally now converted to `false`), and for `hop` with a constant zero argument (internally now converted to `true`). + Canonicalise `among` actions equivalent to `()` such as `(true)` which previously resulted in an extra case in the among, and for Python we'd generate invalid Python code (`if` or `elif` with an empty body). Bug revealed by Assaf Urieli's Yiddish stemmer in #137. + Eliminate variables whose values are never used - they no longer have corresponding member variables, etc, and no code is generated for any assignments to them. + Don't generate anything for an unused `grouping`. + Stop warning "grouping X defined but not used" for a `grouping` which is only used to define another `grouping`. * C/C++: + Store booleans in same array as integers. This means each boolean is stored as an int instead of an unsigned char which means 4 bytes instead of 1, but we save a pointer (4 or 8 bytes) in struct SN_env which is a win for all the current stemmers. For an algorithm which uses both integers and booleans, we also save the overhead of allocating a block on the heap, and potentially improve data locality. + Eliminate duplicate generated C comment for sliceto. * Pascal: + Avoid generating unused variables. 
The Pascal code generated for the stemmers we ship is now warning free (tested with fpc 3.2.0). + Don't emit empty `private` sections. Cosmetic, but makes the generated code a bit easier to follow. * Python: + End `if`-chain with `else` where possible, avoiding a redundant test of the variable being switched on. This optimisation kicks in for an `among` where all cases have commands. This change seems to speed up `make check_python_arabic` by a few percent. New stemming algorithms ----------------------- * Add Serbian stemmer from stef4np (#113). * Add Yiddish stemmer from Assaf Urieli (#137). * Add Armenian stemmer from Astghik Mkrtchyan. It's been on the website for over a decade, and included in Xapian for over 9 years without any negative feedback. Optimisations to existing algorithms ------------------------------------ * kraaij_pohlmann: Use `$v = limit` instead of `do (tolimit setmark v)` since this generates simpler code, and also matches the code other algorithm implementations use. Probably for languages like C with optimising compilers the compiler will generate equivalent code anyway, but e.g. for Python this should be an improvement. Code clarity improvements to existing algorithms ------------------------------------------------ * hindi.sbl: Fix comment typo. Compiler -------- * Don't count `$x = x + 1` as initialising or using `x`, so it's now handled like `$x += 1` already is. * Comments are now only included in the generated code if command line option -comments is specified. The comments in the generated code are useful if you're trying to debug the compiler, and perhaps also if you are trying to debug your Snowball code, but for everyone else they just bloat the code which as the number of languages we support grows becomes more of an issue. * `-parentclassname` is not only for java and csharp so don't disable it if those backends are disabled. * `-syntax` now reports the value for each numeric literal. 
* Report location for excessive get nesting error. * Internally the compiler now represents negated literal numbers as a simple `c_number` rather than `c_neg` applied to a `c_number` with a positive value. This simplifies optimisations that want to check for a constant numeric expression. Build system ------------ * Link binaries with LDFLAGS if it's set, which is needed for some platforms (e.g. OpenEmbedded). Patch from Andreas Müller (#120). * Add missing dependencies of algorithms.go rule. Testsuite --------- * C: Add stemtest for low-level regression tests. Documentation ------------- * Document a C99 compiler as a requirement for building the snowball compiler (but the C code it generates should still work with any ISO C compiler). A few declarations mixed with code crept in some time ago (which nobody's complained about), so this is really just formally documenting a requirement which already existed. * README: Explain what Snowball is and what Stemming is (#131, reported by Sean Kelly). * CONTRIBUTING.rst: Expand section on adding a new generator. * For Python snowballstemmer module include global NEWS instead of Python-specific CHANGES.rst and use README.rst as the long description. Patch from Dmitry Shachnev (#119). * COPYING: Update and incorporate Python backend licensing information which was previously in a separate file. Snowball 2.0.0 (2019-10-02) =========================== C/C++ ----- * Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled sequences of any length, but commands which look at the character value only handled sequences up to length 3. Fixes #89. * Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`. Java ---- * TestApp.java: - Always use UTF-8 for I/O. Patch from David Corbett (#80). - Allow reading input from stdin. - Remove rather pointless "stem n times" feature. - Only lower case ASCII to match stemwords.c. - Stem empty lines too to match stemwords.c. 
Code Quality Improvements ------------------------- * Fix various warnings from newer compilers. * Improve use of `const`. * Share common functions between compiler backends rather than having multiple copies of the same code. * Assorted code clean-up. * Initialise line_labelled member of struct generator to 0. Previously we were invoking undefined behaviour, though in practice it'll be zero initialised on most platforms. New Code Generators ------------------- * Add Python generator (#24). Originally written by Yoshiki Shibukawa, with additional updates by Dmitry Shachnev. * Add Javascript generator. Based on JSX generator (#26) written by Yoshiki Shibukawa. * Add Rust generator from Jakob Demler (#51). * Add Go generator from Marty Schoch (#57). * Add C# generator. Based on patch from Cesar Souza (#16, #17). * Add Pascal generator. Based on Delphi backend from stemming.zip file on old website (#75). New Snowball Language Features ------------------------------ * Add `len` and `lenof` to measure Unicode length. These are similar to `size` and `sizeof` (respectively), but `size` and `sizeof` return the length in bytes under `-utf8`, whereas these new commands give the same result whether using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in the length of the string). For compatibility with existing code which might use these as variable or function names, they stop being treated as tokens if declared to be a variable or function. * New `{U+1234}` stringdef notation for Unicode codepoints. * More versatile integer tests. 
Now you can compare any two arithmetic expressions with a relational operator in parentheses after the `$`, so for example `$(len > 3)` can now be used when previously a temporary variable was required: `$tmp = len $tmp > 3` Code generation improvements ---------------------------- * General: + Avoid unnecessary saving and restoring of the cursor for more commands - `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always restore its value, and for C `booltest` (which other languages already handled). + Special case handling for `setlimit tomark AE`. All uses of setlimit in the current stemmers we ship follow this pattern, and by special-casing we can avoid having to save and restore the cursor (#74). + Merge duplicate actions in the same `among`. This reduces the size of the switch/if-chain in the generated code which dispatches the among for many of the stemmers. + Generate simpler code for `among`. We always check for a zero return value when we call the among, so there's no point also checking for that in the switch/if-chain. We can also avoid the switch/if-chain entirely when there's only one possible outcome (besides the zero return). + Optimise code generated for `do <function call>`. This speeds up "make check_python" by about 2%, and should speed up other interpreted languages too (#110). + Generate more and better comments referencing snowball source. + Add homepage URL and compiler version as comments in generated files. * C/C++: + Fix `size` and `sizeof` to not report one too high (reported by Assem Chelli in #32). + If signal `f` from a function call would lead to return from the current function then handle this and bailing out on an error together with a simple `if (ret <= 0) return ret;` + Inline testing for single character literals. + Avoid generating `|| 0` in a corner case - this can result in a compiler warning when building the generated code. + Implement `insert_v()` in terms of `insert_s()`. 
+ Add conditional `extern "C"` so `runtime/api.h` can be included from C++ code. Closes #90, reported by vvarma. * Java: + Fix functions in `among` to work in Java. We seem to need to make the methods called from among `public` instead of `private`, and to call them on `this` instead of the `methodObject` (which is cleaner anyway). No revision in version control seems to generate working code for this case, but Richard says it definitely used to work - possibly older JVMs failed to correctly enforce the access controls when methods were invoked by reflection. + Code after handling `f` by returning from the current function is unreachable too. + Previously we incorrectly decided that code after an `or` was unreachable in certain cases. None of the current stemmers in the distribution triggered this, but Martin Porter's snowball version of the Schinke Latin stemmer does. Fixes #58, reported by Alexander Myltsev. + The reachability logic was failing to consider reachability from the final command in an `or`. Fixes #82, reported by David Corbett. + Fix `maxint` and `minint`. Patch from David Corbett in #31. + Fix `$` on strings. The previous generated code was just wrong. This doesn't affect any of the included algorithms, but for example breaks Martin Porter's snowball implementation of Schinke's Latin Stemmer. Issue noted by Jakob Demler while working on the Rust backend in #51, and reported in the Schinke's Latin Stemmer by Alexander Myltsev in #58. + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43. + Eliminate range-check implementation for groupings. This was removed from the C generator 10 years earlier, isn't used for any of the existing algorithms, and it doesn't seem likely it would be - the grouping would have to consist entirely of a contiguous block of Unicode code-points. + Simplify code generated for `repeat` and `atleast`. + Eliminate unused return values and variables from runtime functions. 
+ Only import the `among` and `SnowballProgram` classes if they're actually used. + Only generate `copy_from()` method if it's used. + Merge runtime functions `eq_s` and `eq_v` functions. + Java arrays know their own length so stop storing it separately. + Escape char 127 (DEL) in generated Java code. It's unlikely that this character would actually be used in a real stemmer, so this was more of a theoretical bug. + Drop unused import of InvocationTargetException from SnowballStemmer. Reported by GerritDeMeulder in #72. + Fix lint check issues in generated Java code. The stemmer classes are only referenced in the example app via reflection, so add @SuppressWarnings("unused") for them. The stemmer classes override equals() and hashCode() methods from the standard java Object class, so mark these with @Override. Both suggested by GerritDeMeulder in #72. + Declare Java variables at point of use in generated code. Putting all declarations at the top of the function was adding unnecessary complexity to the Java generator code for no benefit. + Improve formatting of generated code. New stemming algorithms ----------------------- * Add Tamil stemmer from Damodharan Rajalingam (#2, #3). * Add Arabic stemmer from Assem Chelli (#32, #50). * Add Irish stemmer from Jim O'Regan (#48). * Add Nepali stemmer from Arthur Zakirov (#70). * Add Indonesian stemmer from Olly Betts (#71). * Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review. * Add Lithuanian stemmer from Dainius Jocas (#22, #76). * Add Greek stemmer from Oleg Smirnov (#44). * Add Catalan and Basque stemmers from Israel Olalla (#104). Behavioural changes to existing algorithms ------------------------------------------ * Portuguese: + Replace incorrect Spanish suffixes by Portuguese suffixes (#1). * French: + The MSDOS CP850 version of the French algorithm was missing changes present in the ISO8859-1 and Unicode versions. 
There's now a single version of each algorithm which was based on the Unicode version. + Recognize French suffixes even when they begin with diaereses. Patch from David Corbett in #78. * Russian: + We now normalise 'ё' to 'е' before stemming. The documentation has long said "we assume ['ё'] is mapped into ['е']" but it's more convenient for the stemmer to actually perform this normalisation. This change has no effect if the caller is already normalising as we recommend. It's a change in behaviour if they aren't, but 'ё' occurs rarely (there are currently no instances in our test vocabulary) and this improves behaviour when it does occur. Patch from Eugene Mirotin (#65, #68). * Finnish: + Adjust the Finnish algorithm not to mangle numbers. This change also means it tends to leave foreign words alone. Fixes #66. * Danish: + Adjust Danish algorithm not to mangle alphanumeric codes. In particular alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000, space1999) are no longer mangled. See #81. Optimisations to existing algorithms ------------------------------------ * Turkish: + Simplify uses of `test` in stemmer code. + Check for 'ad' or 'soyad' more efficiently, and without needing the strlen variable. This speeds up "make check_utf8_turkish" by 11% on x86 Linux. * Kraaij-Pohlmann: + Eliminate variable x - `$p1 <= cursor` is simpler and a little more efficient than `setmark x $x >= p1`. Code clarity improvements to existing algorithms ------------------------------------------------ * Turkish: + Use , for cedilla to match the conventions used in other stemmers. * Kraaij-Pohlmann: + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same `[substring] among (` ... `)` construct we do in other stemmers. Compiler -------- * Support conventional --help and --version options. * Warn if -r or -ep used with backend other than C/C++. * Warn if encoding command line options are specified when generating code in a language with a fixed encoding. 
* The default classname is now set based on the output filename, so `-n` is now often no longer needed. Fixes #64. * Avoid potential one byte buffer over-read when parsing snowball code. * Avoid comparing with uninitialised array element during compilation. * Improve `-syntax` output for `setlimit L for C`. * Optimise away double negation so generators don't have to worry about generating `--` (decrement operator in many languages). Fixes #52, reported by David Corbett. * Improved compiler error and warning messages: - We now report FILE:LINE: before each diagnostic message. - Improve warnings for unused declarations/definitions. - Warn for variables which are used, but either never initialised or never read. - Flag non-ASCII literal strings. This is an error for wide Unicode, but only a warning for single-byte and UTF-8 which work so long as the source encoding matches the encoding used in the generated stemmer code. - Improve error recovery after an undeclared `define`. We now sniff the token after the identifier and if it is `as` we parse as a routine, otherwise we parse as a grouping. Previously we always just assumed it was a routine, which gave a confusing second error if it was a grouping. - Improve error recovery after an unexpected token in `among`. Previously we acted as if the unexpected token closed the `among` (this probably wasn't intended but just a missing `break;` in a switch statement). Now we issue an error and try the next token. * Report error instead of silently truncating character values (e.g. `hex 123` previously silently became byte 0x23 which is `#` rather than a g-with-cedilla). * Enlarge the initial input buffer size to 8192 bytes and double each time we hit the end. 
Snowball programs are typically a few KB in size (with the current largest we ship being the Greek stemmer at 27KB) so the previous approach of starting with a 10 byte input buffer and increasing its size by 50% plus 40 bytes each time it filled was inefficient, needing up to 15 reallocations to load greek.sbl. * Identify variables only used by one `routine`/`external`. This information isn't yet used, but such variables which are also always written to before being read can be emitted as local variables in most target languages. * We now allow multiple source files on command line, and allow them to be after (or even interspersed) with options to better match modern Unix conventions. Support for multiple source files allows specifying a single byte character set mapping via a source file of `stringdef`. * Avoid infinite recursion in compiler when optimising a recursive snowball function. Recursive functions aren't typical in snowball programs, but the compiler shouldn't crash for any input, especially not a valid one. We now simply limit on how deep the compiler will recurse and make the pessimistic assumption in the unlikely event we hit this limit. Build system ------------ * `make clean` in C libstemmer_c distribution now removes `examples/*.o`. (#59) * Fix all the places which previously had to have a list of stemmers to work dynamically or be generated, so now only modules.txt needs updating to add a new stemmer. * Add check_java make target which runs tests for java. * Support gzipped test data (the uncompressed arabic test data is too big for github). * GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball invocations for Java - these are only meaningful when generating C code. * Pass CFLAGS when linking which matches convention (e.g. automake does it) and facilitates use of tools such as ASan. Fixes #84, reported by Thomas Pointhuber. 
* Add CI builds with -std=c90 to check compiler and generated code are C90 (#54) libstemmer ---------- * Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords. * Add -O2 to CFLAGS. * Make generated tables of encodings and modules const. * Fix clang static analyzer memory leak warning (in practice this code path can never actually be taken). Patch from Patrick O. Perry (#56) Documentation ------------- * Added copyright and licensing details (#10). * Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian and romanian are available in ISO_8859_2. * Remove documentation falsely claiming that libstemmer supports CP850 encoding. * CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and new language backends. * Overhaul libstemmer_python_README. Most notably, replace the benchmark data which was very out of date. snowball-3.0.1/README.rst000066400000000000000000000067371500727106100150370ustar00rootroot00000000000000Snowball is a small string processing language for creating stemming algorithms for use in Information Retrieval, plus a collection of stemming algorithms implemented using it. Snowball was originally designed and built by Martin Porter. Martin retired from development in 2014 and Snowball is now maintained as a community project. Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent string handling language from the 1960s. It now also serves as a metaphor for how the project grows by gathering contributions over time. The Snowball compiler translates a Snowball program into source code in another language - currently Ada, ISO C, C#, Go, Java, Javascript, Object Pascal, Python and Rust are supported. This repository contains the source code for the snowball compiler and the stemming algorithms. The snowball compiler is written in ISO C - you'll need a C compiler which support C99 to build it (but the C code it generates should work with any ISO C compiler). 
See https://snowballstem.org/ for more information about Snowball. What is Stemming? ================= Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Building Snowball ================= GNU make is required to build Snowball. The build system is currently structured as two separate stages for many of the target languages. The first stage builds the Snowball compiler and runs it to create target language code (and it can also run tests on each stemmer). The expectation is that you then create "distribution" tarballs of this code with ``make dist`` (or to create one for a specific target language, e.g. ``make dist_libstemmer_c`` for C). These tarballs are created in the ``dist/`` subdirectory. To actually build the libstemmer library you then unpack and build the distribution tarball, e.g. for C:: tar xf dist/libstemmer_c-3.0.0.tar.gz cd libstemmer_c-3.0.0 make Cross-compiling --------------- If cross-compiling starting from the git repo, the Snowball compiler needs to be built with a native compiler then libstemmer with the cross-compiler. 
For example:: make CC=cc dist_libstemmer_c tar xf dist/libstemmer_c-3.0.0.tar.gz cd libstemmer_c-3.0.0 make CC=riscv64-unknown-linux-gnu-gcc If you are cross-compiling to or from Microsoft Windows, you'll need to also work around an assumption in libstemmer's ``Makefile`` which sets ``EXEEXT`` based on the OS you are building on:: ifeq ($(OS),Windows_NT) EXEEXT=.exe endif For example, if cross-compiling from Linux to Microsoft Windows, use something like this for the libstemmer build:: make CC=x86_64-w64-mingw32-gcc EXEEXT=.exe When going the other way, you'll need to use ``EXEEXT=``. snowball-3.0.1/ada/000077500000000000000000000000001500727106100140605ustar00rootroot00000000000000snowball-3.0.1/ada/README.md000066400000000000000000000041711500727106100153420ustar00rootroot00000000000000# Ada Target for Snowball The Ada Snowball generator generates an Ada child package for each Snowball algorithm. The parent package is named `Stemmer` and it provides various operations used by the generated code. The `Stemmer` package contains the Ada Snowball runtime available either in `ada/src` directory or from https://github.com/stcarrez/ada-stemmer. The generated child package declares the `Context_Type` tagged type and the `Stem` procedure: ```Ada package Stemmer. is type Context_Type is new Stemmer.Context_Type with private; procedure Stem (Z : in out Context_Type; Result : out Boolean); private type Context_Type is new Stemmer.Context_Type with record ... end record; end Stemmer.; ``` It is possible to use directly the generated operation or use it through the `Stemmer.Factory` package. ## Usage To generate Ada source for a Snowball algorithm: ``` $ snowball path/to/algorithm.sbl -ada -P -o src/stemmer- ``` ### Ada specific options `-P ` the child package name used in the generated Ada file (defaults to `snowball`). It must be a valid Ada identifier. 
## Code Organization `compiler/generator_ada.c` has the Ada code generation logic `ada/src` contains the default Ada Snowball runtime support which is also available at https://github.com/stcarrez/ada-stemmer `ada/algorithms` location where the makefile generated code will end up ## Using the Generated Stemmers To use the generated stemmer, import the Ada generated package, declare an instance of the generated `Context_Type` and call the `Stem_Word` procedure. ``` with Stemmer.English; Ctx : Stemmer.English.Context_Type; Result : Boolean; Ctx.Stem_Word ("zealously", Result); if Result then Ada.Text_IO.Put_Line (Ctx.Get_Result); end if; ``` You can use the context as many times as you want. ## Testing To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool. Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language. Run: ``` $ make check_ada ``` snowball-3.0.1/ada/generate.gpr000066400000000000000000000007441500727106100163710ustar00rootroot00000000000000with "stemmer_config"; project Generate is Mains := ("generate.adb"); for Main use Mains; for Source_Dirs use ("generate"); for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj"; for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin"; package Binder renames Stemmer_Config.Binder; package Builder renames Stemmer_Config.Builder; package Compiler renames Stemmer_Config.Compiler; package Linker renames Stemmer_Config.Linker; end Generate; snowball-3.0.1/ada/generate/000077500000000000000000000000001500727106100156525ustar00rootroot00000000000000snowball-3.0.1/ada/generate/generate.adb000066400000000000000000000055051500727106100201210ustar00rootroot00000000000000with Ada.Characters.Handling; with Ada.Text_IO; with Ada.Command_Line; with Ada.Containers.Indefinite_Vectors; procedure Generate is use Ada.Characters.Handling; use Ada.Text_IO; package String_Vectors is new Ada.Containers.Indefinite_Vectors 
(Element_Type => String, Index_Type => Positive); Languages : String_Vectors.Vector; function Capitalize (S : in String) return String is (To_Upper (S (S'First)) & S (S'First + 1 .. S'Last)); procedure Write_Spec is File : File_Type; I : Natural := 0; begin Create (File, Out_File, "stemmer-factory.ads"); Put_Line (File, "package Stemmer.Factory with SPARK_Mode is"); New_Line (File); Put (File, " type Language_Type is ("); for Lang of Languages loop Put (File, "L_" & To_Upper (Lang)); I := I + 1; if I < Natural (Languages.Length) then Put_Line (File, ","); Put (File, " "); end if; end loop; Put_Line (File, ");"); New_Line (File); Put_Line (File, " function Stem (Language : in Language_Type;"); Put_Line (File, " Word : in String) return String;"); New_Line (File); Put_Line (File, "end Stemmer.Factory;"); Close (File); end Write_Spec; procedure Write_Body is File : File_Type; begin Create (File, Out_File, "stemmer-factory.adb"); for Lang of Languages loop Put_Line (File, "with Stemmer." & Capitalize (Lang) & ";"); end loop; Put_Line (File, "package body Stemmer.Factory with SPARK_Mode is"); New_Line (File); Put_Line (File, " function Stem (Language : in Language_Type;"); Put_Line (File, " Word : in String) return String is"); Put_Line (File, " Result : Boolean := False;"); Put_Line (File, " begin"); Put_Line (File, " case Language is"); for Lang of Languages loop Put_Line (File, " when L_" & To_Upper (Lang) & " =>"); Put_Line (File, " declare"); Put_Line (File, " C : Stemmer." & Capitalize (Lang) & ".Context_Type;"); Put_Line (File, " begin"); Put_Line (File, " C.Stem_Word (Word, Result);"); Put_Line (File, " return Get_Result (C);"); Put_Line (File, " end;"); New_Line (File); end loop; Put_Line (File, " end case;"); Put_Line (File, " end Stem;"); New_Line (File); Put_Line (File, "end Stemmer.Factory;"); Close (File); end Write_Body; Count : constant Natural := Ada.Command_Line.Argument_Count; begin for I in 1 .. 
Count loop Languages.Append (To_Lower (Ada.Command_Line.Argument (I))); end loop; Write_Spec; Write_Body; end Generate; snowball-3.0.1/ada/src/000077500000000000000000000000001500727106100146475ustar00rootroot00000000000000snowball-3.0.1/ada/src/stemmer.adb000066400000000000000000000513451500727106100170030ustar00rootroot00000000000000----------------------------------------------------------------------- -- stemmer -- Multi-language stemmer with Snowball generator -- Written by Stephane Carrez (Stephane.Carrez@gmail.com) -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- -- 1. Redistributions of source code must retain the above copyright notice, -- this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright notice, -- this list of conditions and the following disclaimer in the documentation -- and/or other materials provided with the distribution. -- 3. Neither the name of the Snowball project nor the names of its contributors -- may be used to endorse or promote products derived from this software -- without specific prior written permission. -- -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -- DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------- with Interfaces; package body Stemmer with SPARK_Mode is subtype Byte is Interfaces.Unsigned_8; use type Interfaces.Unsigned_8; procedure Stem_Word (Context : in out Context_Type'Class; Word : in String; Result : out Boolean) is begin Context.P (1 .. Word'Length) := Word; Context.Len := Word'Length; Context.C := 0; Context.L := Word'Length; Context.Lb := 0; Stemmer.Stem (Context, Result); end Stem_Word; function Get_Result (Context : in Context_Type'Class) return String is begin return Context.P (1 .. Context.Len); end Get_Result; function Eq_S (Context : in Context_Type'Class; S : in String) return Char_Index is begin if Context.L - Context.C < S'Length then return 0; end if; if Context.P (Context.C + 1 .. Context.C + S'Length) /= S then return 0; end if; return S'Length; end Eq_S; function Eq_S_Backward (Context : in Context_Type'Class; S : in String) return Char_Index is begin if Context.C - Context.Lb < S'Length then return 0; end if; if Context.P (Context.C + 1 - S'Length .. 
Context.C) /= S then
         return 0;
      end if;
      return S'Length;
   end Eq_S_Backward;

   --  Count the UTF-8 characters stored in the context buffer.
   --  Scans P (1 .. Len) and counts every byte that is not a UTF-8
   --  continuation byte, i.e. every byte below 16#80# or at least 16#C0#.
   function Length_Utf8 (Context : in Context_Type'Class) return Natural is
      Count : Natural := 0;
      Pos   : Positive := 1;
      Val   : Byte;
   begin
      while Pos <= Context.Len loop
         Val := Character'Pos (Context.P (Pos));
         Pos := Pos + 1;
         if Val >= 16#C0# or Val < 16#80# then
            Count := Count + 1;
         end if;
      end loop;
      return Count;
   end Length_Utf8;

   --  Count the UTF-8 characters in S.
   --  Counts every byte that is not a UTF-8 continuation byte (below 16#80#
   --  or at least 16#C0#).  The loop runs over S'Range rather than
   --  1 .. S'Length so that strings whose lower bound is not 1 (for example
   --  slices) are scanned correctly instead of reading outside their bounds.
   function Length_Utf8 (S : in String) return Natural is
      Count : Natural := 0;
      Val   : Byte;
   begin
      for Pos in S'Range loop
         Val := Character'Pos (S (Pos));
         if Val >= 16#C0# or Val < 16#80# then
            Count := Count + 1;
         end if;
      end loop;
      return Count;
   end Length_Utf8;

   --  Test the byte that follows cursor position Pos against a bit mask.
   --  Returns True when the top three bits of that byte differ from Shift;
   --  otherwise returns True only when the bit of Mask selected by the low
   --  five bits of the byte is clear.
   function Check_Among (Context : in Context_Type'Class;
                         Pos     : in Char_Index;
                         Shift   : in Natural;
                         Mask    : in Mask_Type) return Boolean is
      use Interfaces;

      --  Pos is a 0-based cursor while P is indexed from 1, hence Pos + 1.
      Val : constant Byte := Character'Pos (Context.P (Pos + 1));
   begin
      if Natural (Shift_Right (Val, 5)) /= Shift then
         return True;
      end if;
      return (Shift_Right (Unsigned_64 (Mask), Natural (Val and 16#1f#)) and 1) = 0;
   end Check_Among;

   procedure Find_Among (Context : in out Context_Type'Class;
                         Amongs  : in Among_Array_Type;
                         Pattern : in String;
                         Execute : access procedure
                           (Ctx       : in out Context_Type'Class;
                            Operation : in Operation_Index;
                            Status    : out Boolean);
                         Result  : out Integer) is
      I        : Natural := Amongs'First;
      J        : Natural := Amongs'Last + 1;
      Common_I : Natural := 0;
      Common_J : Natural := 0;
      First_Key_Inspected : Boolean := False;
      C        : constant Natural := Context.C;
      L        : constant Integer := Context.L;
   begin
      loop
         declare
            K      : constant Natural := I + (J - I) / 2;
            W      : constant Among_Type := Amongs (K);
            Common : Natural := (if Common_I < Common_J then Common_I else Common_J);
            Diff   : Integer := 0;
         begin
            for I2 in W.First + Common ..
W.Last loop if C + Common = L then Diff := -1; exit; end if; Diff := Character'Pos (Context.P (C + Common + 1)) - Character'Pos (Pattern (I2)); exit when Diff /= 0; Common := Common + 1; end loop; if Diff < 0 then J := K; Common_J := Common; else I := K; Common_I := Common; end if; end; if J - I <= 1 then exit when I > 0 or J = I or First_Key_Inspected; First_Key_Inspected := True; end if; end loop; loop declare W : constant Among_Type := Amongs (I); Len : constant Natural := W.Last - W.First + 1; Status : Boolean; begin if Common_I >= Len then Context.C := C + Len; if W.Operation = 0 then Result := W.Result; return; end if; Execute (Context, W.Operation, Status); Context.C := C + Len; if Status then Result := W.Result; return; end if; end if; exit when W.Substring_I < 0; I := W.Substring_I; end; end loop; Result := 0; end Find_Among; procedure Find_Among_Backward (Context : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute : access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) is I : Natural := Amongs'First; J : Natural := Amongs'Last + 1; Common_I : Natural := 0; Common_J : Natural := 0; First_Key_Inspected : Boolean := False; C : constant Integer := Context.C; Lb : constant Integer := Context.Lb; begin loop declare K : constant Natural := I + (J - I) / 2; W : constant Among_Type := Amongs (K); Common : Natural := (if Common_I < Common_J then Common_I else Common_J); Diff : Integer := 0; begin for I2 in reverse W.First .. 
W.Last - Common loop if C - Common = Lb then Diff := -1; exit; end if; Diff := Character'Pos (Context.P (C - Common)) - Character'Pos (Pattern (I2)); exit when Diff /= 0; Common := Common + 1; end loop; if Diff < 0 then J := K; Common_J := Common; else I := K; Common_I := Common; end if; end; if J - I <= 1 then exit when I > 0 or J = I or First_Key_Inspected; First_Key_Inspected := True; end if; end loop; loop declare W : constant Among_Type := Amongs (I); Len : constant Natural := W.Last - W.First + 1; Status : Boolean; begin if Common_I >= Len then Context.C := C - Len; if W.Operation = 0 then Result := W.Result; return; end if; Execute (Context, W.Operation, Status); Context.C := C - Len; if Status then Result := W.Result; return; end if; end if; exit when W.Substring_I < 0; I := W.Substring_I; end; end loop; Result := 0; end Find_Among_Backward; function Skip_Utf8 (Context : in Context_Type'Class) return Result_Index is Pos : Char_Index := Context.C; Val : Byte; begin if Pos >= Context.L then return -1; end if; Pos := Pos + 1; Val := Character'Pos (Context.P (Pos)); if Val >= 16#C0# then while Pos < Context.L loop Val := Character'Pos (Context.P (Pos + 1)); exit when Val >= 16#C0# or Val < 16#80#; Pos := Pos + 1; end loop; end if; return Pos; end Skip_Utf8; function Skip_Utf8 (Context : in Context_Type'Class; N : in Integer) return Result_Index is Pos : Char_Index := Context.C; Val : Byte; begin if N < 0 then return -1; end if; for I in 1 .. 
N loop if Pos >= Context.L then return -1; end if; Pos := Pos + 1; Val := Character'Pos (Context.P (Pos)); if Val >= 16#C0# then while Pos < Context.L loop Val := Character'Pos (Context.P (Pos + 1)); exit when Val >= 16#C0# or Val < 16#80#; Pos := Pos + 1; end loop; end if; end loop; return Pos; end Skip_Utf8; function Skip_Utf8_Backward (Context : in Context_Type'Class) return Result_Index is Pos : Char_Index := Context.C; Val : Byte; begin if Pos <= Context.Lb then return -1; end if; Val := Character'Pos (Context.P (Pos)); Pos := Pos - 1; if Val >= 16#80# then while Pos > Context.Lb loop Val := Character'Pos (Context.P (Pos + 1)); exit when Val >= 16#C0#; Pos := Pos - 1; end loop; end if; return Pos; end Skip_Utf8_Backward; function Skip_Utf8_Backward (Context : in Context_Type'Class; N : in Integer) return Result_Index is Pos : Char_Index := Context.C; Val : Byte; begin if N < 0 then return -1; end if; for I in 1 .. N loop if Pos <= Context.Lb then return -1; end if; Val := Character'Pos (Context.P (Pos)); Pos := Pos - 1; if Val >= 16#80# then while Pos > Context.Lb loop Val := Character'Pos (Context.P (Pos + 1)); exit when Val >= 16#C0#; Pos := Pos - 1; end loop; end if; end loop; return Pos; end Skip_Utf8_Backward; function Shift_Left (Value : in Utf8_Type; Shift : in Natural) return Utf8_Type is (Utf8_Type (Interfaces.Shift_Left (Interfaces.Unsigned_32 (Value), Shift))); procedure Get_Utf8 (Context : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural) is B0, B1, B2, B3 : Byte; begin if Context.C >= Context.L then Value := 0; Count := 0; return; end if; B0 := Character'Pos (Context.P (Context.C + 1)); if B0 < 16#C0# or Context.C + 1 >= Context.L then Value := Utf8_Type (B0); Count := 1; return; end if; B1 := Character'Pos (Context.P (Context.C + 2)) and 16#3F#; if B0 < 16#E0# or Context.C + 2 >= Context.L then Value := Shift_Left (Utf8_Type (B0 and 16#1F#), 6) or Utf8_Type (B1); Count := 2; return; end if; B2 := Character'Pos (Context.P 
(Context.C + 3)) and 16#3F#;
      if B0 < 16#F0# or Context.C + 3 >= Context.L then
         Value := Shift_Left (Utf8_Type (B0 and 16#0F#), 12)
           or Shift_Left (Utf8_Type (B1), 6) or Utf8_Type (B2);
         Count := 3;
         return;
      end if;
      B3 := Character'Pos (Context.P (Context.C + 4)) and 16#3F#;
      Value := Shift_Left (Utf8_Type (B0 and 16#07#), 18)
        or Shift_Left (Utf8_Type (B1), 12)
        or Shift_Left (Utf8_Type (B2), 6) or Utf8_Type (B3);
      Count := 4;
   end Get_Utf8;

   --  Decode the UTF-8 character that ends just before the cursor.
   --  Value receives the decoded code point and Count the number of bytes
   --  it occupies (1 to 4).  When the cursor is already at the backward
   --  limit Lb, Value and Count are both set to 0.  The cursor itself is
   --  not moved.  Decoding never reads at or below Lb: a sequence cut short
   --  by the limit is decoded from the bytes that are available.
   procedure Get_Utf8_Backward (Context : in Context_Type'Class;
                                Value   : out Utf8_Type;
                                Count   : out Natural) is
      --  B3 is the last byte of the sequence, B0 the lead byte of a
      --  four byte sequence.
      B0, B1, B2, B3 : Byte;
   begin
      if Context.C <= Context.Lb then
         Value := 0;
         Count := 0;
         return;
      end if;
      B3 := Character'Pos (Context.P (Context.C));
      if B3 < 16#80# or Context.C - 1 <= Context.Lb then
         Value := Utf8_Type (B3);
         Count := 1;
         return;
      end if;
      B2 := Character'Pos (Context.P (Context.C - 1));
      if B2 >= 16#C0# or Context.C - 2 <= Context.Lb then
         --  Two byte sequence: B2 is the lead byte (5 payload bits).
         B3 := B3 and 16#3F#;
         Value := Shift_Left (Utf8_Type (B2 and 16#1F#), 6) or Utf8_Type (B3);
         Count := 2;
         return;
      end if;
      B1 := Character'Pos (Context.P (Context.C - 2));
      if B1 >= 16#E0# or Context.C - 3 <= Context.Lb then
         --  Three byte sequence: B1 is the lead byte (4 payload bits).
         B3 := B3 and 16#3F#;
         B2 := B2 and 16#3F#;
         Value := Shift_Left (Utf8_Type (B1 and 16#0F#), 12)
           or Shift_Left (Utf8_Type (B2), 6) or Utf8_Type (B3);
         Count := 3;
         return;
      end if;
      --  Four byte sequence: B0 is the lead byte (3 payload bits) and
      --  B1, B2, B3 are all continuation bytes carrying 6 payload bits each,
      --  so B1 must be masked with 16#3F# exactly as Get_Utf8 does.
      B0 := Character'Pos (Context.P (Context.C - 3));
      B1 := B1 and 16#3F#;
      B2 := B2 and 16#3F#;
      B3 := B3 and 16#3F#;
      Value := Shift_Left (Utf8_Type (B0 and 16#07#), 18)
        or Shift_Left (Utf8_Type (B1), 12)
        or Shift_Left (Utf8_Type (B2), 6) or Utf8_Type (B3);
      Count := 4;
   end Get_Utf8_Backward;

   procedure Out_Grouping (Context : in out Context_Type'Class;
                           S       : in Grouping_Array;
                           Min     : in Utf8_Type;
                           Max     : in Utf8_Type;
                           Repeat  : in Boolean;
                           Result  : out Result_Index) is
      Ch    : Utf8_Type;
      Count : Natural;
   begin
      if Context.C >= Context.L then
         Result := -1;
         return;
      end if;
      loop
         Get_Utf8 (Context, Ch, Count);
         if Count = 0 then
            Result := -1;
            return;
         end if;
         if Ch <= Max and Ch >= Min then
            Ch := Ch - Min;
            if S (Ch)
then Result := Count; return; end if; end if; Context.C := Context.C + Count; exit when not Repeat; end loop; Result := 0; end Out_Grouping; procedure Out_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C <= Context.Lb then Result := -1; return; end if; loop Get_Utf8_Backward (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch <= Max and Ch >= Min then Ch := Ch - Min; if S (Ch) then Result := Count; return; end if; end if; Context.C := Context.C - Count; exit when not Repeat; end loop; Result := 0; end Out_Grouping_Backward; procedure In_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C >= Context.L then Result := -1; return; end if; loop Get_Utf8 (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch > Max or Ch < Min then Result := Count; return; end if; Ch := Ch - Min; if not S (Ch) then Result := Count; return; end if; Context.C := Context.C + Count; exit when not Repeat; end loop; Result := 0; end In_Grouping; procedure In_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C <= Context.Lb then Result := -1; return; end if; loop Get_Utf8_Backward (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch > Max or Ch < Min then Result := Count; return; end if; Ch := Ch - Min; if not S (Ch) then Result := Count; return; end if; Context.C := Context.C - Count; exit when not Repeat; end loop; Result := 0; end In_Grouping_Backward; procedure Replace (Context : in out Context_Type'Class; C_Bra : in Char_Index; 
C_Ket : in Char_Index; S : in String; Adjustment : out Integer) is begin Adjustment := S'Length - (C_Ket - C_Bra); if Adjustment > 0 then Context.P (C_Bra + S'Length + 1 .. Context.Len + Adjustment + 1) := Context.P (C_Ket + 1 .. Context.Len + 1); end if; if S'Length > 0 then Context.P (C_Bra + 1 .. C_Bra + S'Length) := S; end if; if Adjustment < 0 then Context.P (C_Bra + S'Length + 1 .. Context.Len + Adjustment + 1) := Context.P (C_Ket + 1 .. Context.Len + 1); end if; Context.Len := Context.Len + Adjustment; Context.L := Context.L + Adjustment; if Context.C >= C_Ket then Context.C := Context.C + Adjustment; elsif Context.C > C_Bra then Context.C := C_Bra; end if; end Replace; procedure Slice_Del (Context : in out Context_Type'Class) is Result : Integer; begin Replace (Context, Context.Bra, Context.Ket, "", Result); end Slice_Del; procedure Slice_From (Context : in out Context_Type'Class; Text : in String) is Result : Integer; begin Replace (Context, Context.Bra, Context.Ket, Text, Result); end Slice_From; function Slice_To (Context : in Context_Type'Class) return String is begin return Context.P (Context.Bra + 1 .. Context.Ket); end Slice_To; procedure Insert (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String) is Result : Integer; begin Replace (Context, C_Bra, C_Ket, S, Result); if C_Bra <= Context.Bra then Context.Bra := Context.Bra + Result; end if; if C_Bra <= Context.Ket then Context.Ket := Context.Ket + Result; end if; end Insert; end Stemmer; snowball-3.0.1/ada/src/stemmer.ads000066400000000000000000000226621500727106100170240ustar00rootroot00000000000000----------------------------------------------------------------------- -- stemmer -- Multi-language stemmer with Snowball generator -- Written by Stephane Carrez (Stephane.Carrez@gmail.com) -- All rights reserved. 
-- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- -- 1. Redistributions of source code must retain the above copyright notice, -- this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright notice, -- this list of conditions and the following disclaimer in the documentation -- and/or other materials provided with the distribution. -- 3. Neither the name of the Snowball project nor the names of its contributors -- may be used to endorse or promote products derived from this software -- without specific prior written permission. -- -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------- package Stemmer with SPARK_Mode is pragma Preelaborate; WORD_MAX_LENGTH : constant := 1024; type Context_Type is abstract tagged private; -- Apply the stemming algorithm on the word initialized in the context. procedure Stem (Context : in out Context_Type; Result : out Boolean) is abstract; -- Stem the word and return True if it was reduced. 
procedure Stem_Word (Context : in out Context_Type'Class; Word : in String; Result : out Boolean) with Global => null, Pre => Word'Length < WORD_MAX_LENGTH; -- Get the stem or the input word unmodified. function Get_Result (Context : in Context_Type'Class) return String with Global => null, Post => Get_Result'Result'Length < WORD_MAX_LENGTH; private type Mask_Type is mod 2**32; -- A 32-bit character value that was read from UTF-8 sequence. -- A modular value is used because shift and logical arithmetic is necessary. type Utf8_Type is mod 2**32; -- Index of the Grouping_Array. The index comes from the 32-bit character value -- minus a starting offset. We don't expect large tables and we check against -- a maximum value. subtype Grouping_Index is Utf8_Type range 0 .. 16384; type Grouping_Array is array (Grouping_Index range <>) of Boolean with Pack; subtype Among_Index is Natural range 0 .. 65535; subtype Among_Start_Index is Among_Index range 1 .. Among_Index'Last; subtype Operation_Index is Natural range 0 .. 65535; subtype Result_Index is Integer range -1 .. WORD_MAX_LENGTH - 1; subtype Char_Index is Result_Index range 0 .. 
Result_Index'Last; type Among_Type is record First : Among_Start_Index; Last : Among_Index; Substring_I : Integer; Result : Integer; Operation : Operation_Index; end record; type Among_Array_Type is array (Natural range <>) of Among_Type; function Eq_S (Context : in Context_Type'Class; S : in String) return Char_Index with Global => null, Pre => S'Length > 0, Post => Eq_S'Result = 0 or Eq_S'Result = S'Length; function Eq_S_Backward (Context : in Context_Type'Class; S : in String) return Char_Index with Global => null, Pre => S'Length > 0, Post => Eq_S_Backward'Result = 0 or Eq_S_Backward'Result = S'Length; procedure Find_Among (Context : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute : access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) with Global => null, Pre => Pattern'Length > 0 and Amongs'Length > 0; procedure Find_Among_Backward (Context : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute : access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) with Global => null, Pre => Pattern'Length > 0 and Amongs'Length > 0; function Skip_Utf8 (Context : in Context_Type'Class) return Result_Index with Global => null; function Skip_Utf8 (Context : in Context_Type'Class; N : in Integer) return Result_Index with Global => null; function Skip_Utf8_Backward (Context : in Context_Type'Class) return Result_Index with Global => null; function Skip_Utf8_Backward (Context : in Context_Type'Class; N : in Integer) return Result_Index with Global => null; procedure Get_Utf8 (Context : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural); procedure Get_Utf8_Backward (Context : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural); function Length_Utf8 (Context : in Context_Type'Class) return Natural; function Length_Utf8 (S : in String) 
return Natural; function Check_Among (Context : in Context_Type'Class; Pos : in Char_Index; Shift : in Natural; Mask : in Mask_Type) return Boolean; procedure Out_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure Out_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure In_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure In_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure Replace (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String; Adjustment : out Integer) with Global => null, Pre => C_Ket >= C_Bra; procedure Slice_Del (Context : in out Context_Type'Class) with Global => null, Pre => Context.Ket >= Context.Bra; procedure Slice_From (Context : in out Context_Type'Class; Text : in String) with Global => null, Pre => Context.Ket >= Context.Bra and Context.Len - (Context.Ket - Context.Bra) + Text'Length < Context.P'Length; function Slice_To (Context : in Context_Type'Class) return String; procedure Insert (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String) with Global => null, Pre => C_Ket >= C_Bra and Context.Len - (C_Ket - C_Bra) + S'Length < Context.P'Length; -- The context indexes follow the C paradigm: they start at 0 for the first character. -- This is necessary because several algorithms rely on this when they compare the -- cursor position ('C') or setup some markers from the cursor. 
type Context_Type is abstract tagged record C : Char_Index := 0; Len : Char_Index := 0; L : Char_Index := 0; Lb : Char_Index := 0; Bra : Char_Index := 0; Ket : Char_Index := 0; P : String (1 .. WORD_MAX_LENGTH); end record; end Stemmer; snowball-3.0.1/ada/src/stemwords.adb000066400000000000000000000045651500727106100173600ustar00rootroot00000000000000with Ada.Text_IO; with Ada.Command_Line; with Stemmer.Factory; procedure Stemwords is use Stemmer.Factory; function Get_Language (Name : in String) return Language_Type; function Is_Space (C : in Character) return Boolean; function Is_Space (C : in Character) return Boolean is begin return C = ' ' or C = ASCII.HT; end Is_Space; function Get_Language (Name : in String) return Language_Type is begin return Language_Type'Value ("L_" & Name); exception when Constraint_Error => Ada.Text_IO.Put_Line ("Unsupported language: " & Name); return L_ENGLISH; end Get_Language; Count : constant Natural := Ada.Command_Line.Argument_Count; begin if Count /= 3 then Ada.Text_IO.Put_Line ("Usage: stemwords "); return; end if; declare Lang : constant Language_Type := Get_Language (Ada.Command_Line.Argument (1)); Input : constant String := Ada.Command_Line.Argument (2); Output : constant String := Ada.Command_Line.Argument (3); Src_File : Ada.Text_IO.File_Type; Dst_File : Ada.Text_IO.File_Type; begin Ada.Text_IO.Open (Src_File, Ada.Text_IO.In_File, Input); Ada.Text_IO.Create (Dst_File, Ada.Text_IO.Out_File, Output); while not Ada.Text_IO.End_Of_File (Src_File) loop declare Line : constant String := Ada.Text_IO.Get_Line (Src_File); Pos : Positive := Line'First; Last_Pos : Positive; Start_Pos : Positive; begin while Pos <= Line'Last loop Last_Pos := Pos; while Pos <= Line'Last and then Is_Space (Line (Pos)) loop Pos := Pos + 1; end loop; if Last_Pos < Pos then Ada.Text_IO.Put (Dst_File, Line (Last_Pos .. 
Pos - 1)); end if; exit when Pos > Line'Last; Start_Pos := Pos; while Pos <= Line'Last and then not Is_Space (Line (Pos)) loop Pos := Pos + 1; end loop; Ada.Text_IO.Put (Dst_File, Stemmer.Factory.Stem (Lang, Line (Start_Pos .. Pos - 1))); end loop; Ada.Text_IO.New_Line (Dst_File); end; end loop; Ada.Text_IO.Close (Src_File); Ada.Text_IO.Close (Dst_File); end; end Stemwords; snowball-3.0.1/ada/stemmer_config.gpr000066400000000000000000000044371500727106100176030ustar00rootroot00000000000000abstract project Stemmer_Config is for Source_Dirs use (); type Yes_No is ("yes", "no"); type Library_Type_Type is ("relocatable", "static", "static-pic"); type Build_Type is ("distrib", "debug", "optimize", "profile", "coverage"); Mode : Build_Type := external ("BUILD", "distrib"); Processors := External ("PROCESSORS", "1"); package Builder is case Mode is when "debug" => for Default_Switches ("Ada") use ("-g", "-j" & Processors); when others => for Default_Switches ("Ada") use ("-g", "-O3", "-j" & Processors); end case; end Builder; package compiler is warnings := ("-gnatwua"); defaults := ("-gnat2012"); case Mode is when "distrib" => for Default_Switches ("Ada") use defaults & ("-gnatafno", "-gnatVa", "-gnatwa"); when "debug" => for Default_Switches ("Ada") use defaults & warnings & ("-gnata", "-gnatVaMI", "-gnaty3abcefhiklmnprstxM99"); when "coverage" => for Default_Switches ("Ada") use defaults & warnings & ("-gnata", "-gnatVaMI", "-gnaty3abcefhiklmnprstxM99", "-fprofile-arcs", "-ftest-coverage"); when "optimize" => for Default_Switches ("Ada") use defaults & warnings & ("-gnatn", "-gnatp", "-fdata-sections", "-ffunction-sections"); when "profile" => for Default_Switches ("Ada") use defaults & warnings & ("-pg"); end case; end compiler; package binder is case Mode is when "debug" => for Default_Switches ("Ada") use ("-E"); when others => for Default_Switches ("Ada") use ("-E"); end case; end binder; package linker is case Mode is when "profile" => for Default_Switches ("Ada") 
use ("-pg"); when "distrib" => for Default_Switches ("Ada") use ("-s"); when "optimize" => for Default_Switches ("Ada") use ("-Wl,--gc-sections"); when "coverage" => for Default_Switches ("ada") use ("-fprofile-arcs"); when others => null; end case; end linker; package Ide is for VCS_Kind use "git"; end Ide; end Stemmer_Config; snowball-3.0.1/ada/stemwords.gpr000066400000000000000000000007601500727106100166240ustar00rootroot00000000000000with "stemmer_config"; project Stemwords is Mains := ("stemwords.adb"); for Main use Mains; for Source_Dirs use ("src", "algorithms"); for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj"; for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin"; package Binder renames Stemmer_Config.Binder; package Builder renames Stemmer_Config.Builder; package Compiler renames Stemmer_Config.Compiler; package Linker renames Stemmer_Config.Linker; end Stemwords; snowball-3.0.1/algorithms/000077500000000000000000000000001500727106100155045ustar00rootroot00000000000000snowball-3.0.1/algorithms/arabic.sbl000066400000000000000000000422551500727106100174370ustar00rootroot00000000000000/* * Authors: * - Assem Chelli, < assem [dot] ch [at] gmail > * - Abdelkrim Aries * */ stringescapes { } /* the Arabic letters in Unicode */ // Hamza stringdef o '{U+0621}' // Hamza stringdef ao '{U+0623}' // Hamza above Alef stringdef ao_ '{U+0625}' // Hamza below Alef stringdef a~ '{U+0622}' // Alef madda stringdef wo '{U+0624}' // Hamza above waw stringdef yo '{U+0626}' // Hamza above yeh // Letters stringdef a '{U+0627}' // Alef stringdef a_ '{U+0649}' // Alef Maksura stringdef b '{U+0628}' // Beh stringdef t_ '{U+0629}' // Teh_Marbuta stringdef t '{U+062A}' // Teh stringdef th '{U+062B}' // Theh stringdef j '{U+062C}' // Jeem stringdef h '{U+062D}' // Hah stringdef x '{U+062E}' // Khah stringdef d '{U+062F}' // Dal stringdef dz '{U+0630}' // Thal stringdef r '{U+0631}' // Reh stringdef z '{U+0632}' // Zain stringdef s '{U+0633}' // Seen stringdef sh 
'{U+0634}' // Sheen stringdef c '{U+0635}' // Sad stringdef dh '{U+0636}' // Dad stringdef tt '{U+0637}' // Tah stringdef zh '{U+0638}' // Zah stringdef i '{U+0639}' // Ain stringdef gh '{U+063A}' // Ghain stringdef f '{U+0641}' // Feh stringdef q '{U+0642}' // Qaf stringdef k '{U+0643}' // Kaf stringdef l '{U+0644}' // Lam stringdef m '{U+0645}' // Meem stringdef n '{U+0646}' // Noon stringdef e '{U+0647}' // Heh stringdef w '{U+0648}' // Waw stringdef y '{U+064A}' // Yeh // Diacritics stringdef aan '{U+064B}' // FatHatan stringdef uun '{U+064C}' // Dammatan stringdef iin '{U+064D}' // Kasratan stringdef aa '{U+064E}' // FatHa stringdef uu '{U+064F}' // Damma stringdef ii '{U+0650}' // Kasra stringdef oo '{U+0652}' // Sukun stringdef ~ '{U+0651}' // Shadda // Hindu–Arabic numerals stringdef 0 '{U+0660}' stringdef 1 '{U+0661}' stringdef 2 '{U+0662}' stringdef 3 '{U+0663}' stringdef 4 '{U+0664}' stringdef 5 '{U+0665}' stringdef 6 '{U+0666}' stringdef 7 '{U+0667}' stringdef 8 '{U+0668}' stringdef 9 '{U+0669}' // Kasheeda stringdef _ '{U+0640}' // Kasheeda, Tatweel // Shaped forms stringdef o1 '{U+FE80}' // HAMZA stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW stringdef yo1 '{U+FE8B}' // YEH_HAMZA stringdef yo2 '{U+FE8C}' // YEH_HAMZA stringdef yo3 '{U+FE89}' // YEH_HAMZA stringdef yo4 '{U+FE8A}' // YEH_HAMZA stringdef a~1 '{U+FE81}' // ALEF_MADDA stringdef a~2 '{U+FE82}' // ALEF_MADDA stringdef wo1 '{U+FE85}' // WAW_HAMZA stringdef wo2 '{U+FE86}' // WAW_HAMZA stringdef a1 '{U+FE8D}' // ALEF stringdef a2 '{U+FE8E}' // ALEF stringdef b1 '{U+FE8F}' // BEH stringdef b2 '{U+FE90}' // BEH stringdef b3 '{U+FE91}' // BEH stringdef b4 '{U+FE92}' // BEH stringdef t_1 '{U+FE93}' // TEH_MARBUTA stringdef t_2 '{U+FE94}' // TEH_MARBUTA stringdef t1 '{U+FE97}' // TEH stringdef t2 '{U+FE98}' // TEH stringdef t3 '{U+FE95}' // TEH stringdef t4 
'{U+FE96}' // TEH stringdef th1 '{U+FE9B}' // THEH stringdef th2 '{U+FE9C}' // THEH stringdef th3 '{U+FE9A}' // THEH stringdef th4 '{U+FE99}' // THEH stringdef j1 '{U+FE9F}' // JEEM stringdef j2 '{U+FEA0}' // JEEM stringdef j3 '{U+FE9D}' // JEEM stringdef j4 '{U+FE9E}' // JEEM stringdef h1 '{U+FEA3}' // HAH stringdef h2 '{U+FEA4}' // HAH stringdef h3 '{U+FEA1}' // HAH stringdef h4 '{U+FEA2}' // HAH stringdef x1 '{U+FEA7}' // KHAH stringdef x2 '{U+FEA8}' // KHAH stringdef x3 '{U+FEA5}' // KHAH stringdef x4 '{U+FEA6}' // KHAH stringdef d1 '{U+FEA9}' // DAL stringdef d2 '{U+FEAA}' // DAL stringdef dz1 '{U+FEAB}' // THAL stringdef dz2 '{U+FEAC}' // THAL stringdef r1 '{U+FEAD}' // REH stringdef r2 '{U+FEAE}' // REH stringdef z1 '{U+FEAF}' // ZAIN stringdef z2 '{U+FEB0}' // ZAIN stringdef s1 '{U+FEB3}' // SEEN stringdef s2 '{U+FEB4}' // SEEN stringdef s3 '{U+FEB1}' // SEEN stringdef s4 '{U+FEB2}' // SEEN stringdef sh1 '{U+FEB7}' // SHEEN stringdef sh2 '{U+FEB8}' // SHEEN stringdef sh3 '{U+FEB5}' // SHEEN stringdef sh4 '{U+FEB6}' // SHEEN stringdef c1 '{U+FEBB}' // SAD stringdef c2 '{U+FEBC}' // SAD stringdef c3 '{U+FEB9}' // SAD stringdef c4 '{U+FEBA}' // SAD stringdef dh1 '{U+FEBF}' // DAD stringdef dh2 '{U+FEC0}' // DAD stringdef dh3 '{U+FEBD}' // DAD stringdef dh4 '{U+FEBE}' // DAD stringdef tt1 '{U+FEC3}' // TAH stringdef tt2 '{U+FEC4}' // TAH stringdef tt3 '{U+FEC1}' // TAH stringdef tt4 '{U+FEC2}' // TAH stringdef zh1 '{U+FEC7}' // ZAH stringdef zh2 '{U+FEC8}' // ZAH stringdef zh3 '{U+FEC5}' // ZAH stringdef zh4 '{U+FEC6}' // ZAH stringdef i1 '{U+FECB}' // AIN stringdef i2 '{U+FECC}' // AIN stringdef i3 '{U+FEC9}' // AIN stringdef i4 '{U+FECA}' // AIN stringdef gh1 '{U+FECF}' // GHAIN stringdef gh2 '{U+FED0}' // GHAIN stringdef gh3 '{U+FECD}' // GHAIN stringdef gh4 '{U+FECE}' // GHAIN stringdef f1 '{U+FED3}' // FEH stringdef f2 '{U+FED4}' // FEH stringdef f3 '{U+FED1}' // FEH stringdef f4 '{U+FED2}' // FEH stringdef q1 '{U+FED7}' // QAF stringdef q2 '{U+FED8}' // 
QAF stringdef q3 '{U+FED5}' // QAF stringdef q4 '{U+FED6}' // QAF stringdef k1 '{U+FEDB}' // KAF stringdef k2 '{U+FEDC}' // KAF stringdef k3 '{U+FED9}' // KAF stringdef k4 '{U+FEDA}' // KAF stringdef l1 '{U+FEDF}' // LAM stringdef l2 '{U+FEE0}' // LAM stringdef l3 '{U+FEDD}' // LAM stringdef l4 '{U+FEDE}' // LAM stringdef m1 '{U+FEE3}' // MEEM stringdef m2 '{U+FEE4}' // MEEM stringdef m3 '{U+FEE1}' // MEEM stringdef m4 '{U+FEE2}' // MEEM stringdef n1 '{U+FEE7}' // NOON stringdef n2 '{U+FEE8}' // NOON stringdef n3 '{U+FEE5}' // NOON stringdef n4 '{U+FEE6}' // NOON stringdef e1 '{U+FEEB}' // HEH stringdef e2 '{U+FEEC}' // HEH stringdef e3 '{U+FEE9}' // HEH stringdef e4 '{U+FEEA}' // HEH stringdef w1 '{U+FEED}' // WAW stringdef w2 '{U+FEEE}' // WAW stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA stringdef y1 '{U+FEF3}' // YEH stringdef y2 '{U+FEF4}' // YEH stringdef y3 '{U+FEF1}' // YEH stringdef y4 '{U+FEF2}' // YEH // Ligatures Lam-Alef stringdef la '{U+FEFB}' // LAM_ALEF stringdef la2 '{U+FEFC}' // LAM_ALEF stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE booleans ( is_noun is_verb is_defined ) routines ( Prefix_Step1 Prefix_Step2 Prefix_Step3a_Noun Prefix_Step3b_Noun Prefix_Step3_Verb Prefix_Step4_Verb Suffix_All_alef_maqsura Suffix_Noun_Step1a Suffix_Noun_Step1b Suffix_Noun_Step2a Suffix_Noun_Step2b Suffix_Noun_Step2c1 Suffix_Noun_Step2c2 Suffix_Noun_Step3 Suffix_Verb_Step1 Suffix_Verb_Step2a Suffix_Verb_Step2b Suffix_Verb_Step2c Normalize_post Normalize_pre Checks1 ) externals ( stem ) groupings ( ) // Normalizations define Normalize_pre as ( do repeat ( ( [substring] among ( '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization '{_}' ( delete ) // 
strip kasheeda // Hindu–Arabic numerals '{0}' ( <- '0') '{1}' ( <- '1') '{2}' ( <- '2') '{3}' ( <- '3') '{4}' ( <- '4') '{5}' ( <- '5') '{6}' ( <- '6') '{7}' ( <- '7') '{8}' ( <- '8') '{9}' ( <- '9') // Shaped forms '{o1}' ( <- '{o}' ) // HAMZA '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA '{a1}' '{a2}' ( <- '{a}' ) // ALEF '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH '{d1}' '{d2}' ( <- '{d}' ) // DAL '{dz1}''{dz2}' ( <- '{dz}' ) // THAL '{r1}' '{r2}'( <- '{r}' ) // REH '{z1}' '{z2}' ( <- '{z}' ) // ZAIN '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH '{w1}' '{w2}' ( <- '{w}' ) // WAW '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH // Ligatures Lam-Alef '{la}' '{la2}' (<- '{l}{a}') '{lao}' '{lao2}' (<- '{l}{ao}') '{lao_}' '{lao_2}' (<- '{l}{ao_}') 
'{la~}' '{la~2}' (<- '{l}{a~}') ) ) or next ) ) define Normalize_post as ( do ( // normalize last hamza backwards ( [substring] among ( '{ao}''{ao_}' '{a~}' ( <- '{o}') '{wo}' ( <- '{o}') '{yo}' ( <- '{o}') ) ) ) do repeat ( ( // normalize other hamza's [substring] among ( '{ao}''{ao_}' '{a~}' ( <- '{a}') '{wo}' ( <- '{w}') '{yo}' ( <- '{y}') ) ) or next ) ) // Checks define Checks1 as ( [substring] among ( '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) ) ) //prefixes define Prefix_Step1 as ( [substring] among ( '{ao}{ao}' ($(len > 3) <- '{ao}' ) '{ao}{a~}' ($(len > 3) <- '{a~}' ) '{ao}{wo}' ($(len > 3) <- '{ao}' ) '{ao}{a}' ($(len > 3) <- '{a}' ) '{ao}{ao_}' ($(len > 3) <- '{ao_}' ) // '{ao}' ($(len > 3) delete) //rare case ) ) define Prefix_Step2 as ( [substring] among ( '{f}' '{w}' ($(len > 3) not '{a}' delete) ) ) define Prefix_Step3a_Noun as ( // it is noun and defined [substring] among ( '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) '{l}{l}' '{a}{l}' ($(len > 4) delete) ) ) define Prefix_Step3b_Noun as ( // probably noun and defined [substring] among ( '{b}{a}' ( ) // exception - not a valid verb prefix so can just succeed here '{b}' ($(len > 3) delete) // '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion '{b}{b}' ($(len > 3) <- '{b}' ) '{k}{k}' ($(len > 3) <- '{k}' ) ) ) define Prefix_Step3_Verb as ( [substring] among ( //'{s}' ($(len > 4) delete)// BUG: cause confusion '{s}{y}' ($(len > 4) <- '{y}' ) '{s}{t}' ($(len > 4) <- '{t}') '{s}{n}' ($(len > 4) <- '{n}') '{s}{ao}' ($(len > 4) <- '{ao}') ) ) define Prefix_Step4_Verb as ( [substring] among ( '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) ) ) // suffixes backwardmode ( define Suffix_Noun_Step1a as ( [substring] among ( '{y}' '{k}' '{e}' ($(len >= 4) delete) '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) '{k}{m}{a}' '{e}{m}{a}' ($(len >= 
6) delete) ) ) define Suffix_Noun_Step1b as ( [substring] among ( '{n}' ($(len > 5) delete) ) ) define Suffix_Noun_Step2a as ( [substring] among ( '{a}' '{y}' '{w}' ($(len > 4) delete) ) ) define Suffix_Noun_Step2b as ( [substring] among ( '{a}{t}' ($(len >= 5) delete) ) ) define Suffix_Noun_Step2c1 as ( [substring] among ( '{t}' ($(len >= 4) delete) ) ) define Suffix_Noun_Step2c2 as ( // feminine t_ [substring] among ( '{t_}' ($(len >= 4) delete) ) ) define Suffix_Noun_Step3 as ( // ya' nisbiya [substring] among ( '{y}' ($(len >= 3) delete) ) ) define Suffix_Verb_Step1 as ( [substring] among ( '{e}' '{k}' ($(len >= 4) delete) '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) ) ) define Suffix_Verb_Step2a as ( [substring] among ( '{t}' ($(len >= 4) delete) '{a}' '{n}' '{y}' ($(len >= 4) delete) '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present '{t}{m}{a}' ($(len >= 6) delete) ) ) define Suffix_Verb_Step2b as ( [substring] among ( '{w}{a}' '{t}{m}' ($(len >= 5) delete) ) ) define Suffix_Verb_Step2c as ( [substring] among ( '{w}' ($(len >= 4) delete) '{t}{m}{w}' ($(len >= 6) delete) ) ) define Suffix_All_alef_maqsura as ( [substring] among ( '{a_}' ( <- '{y}' ) // spell error // '{a_}' ( delete ) // if noun > 3 // '{a_}' ( <- '{a}') // if verb ) ) ) define stem as ( // set initial values set is_noun set is_verb unset is_defined // guess type and properties do Checks1 // normalization pre-stemming do Normalize_pre backwards ( do ( //Suffixes for verbs ( is_verb ( ( (atleast 1 Suffix_Verb_Step1) ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) ) or Suffix_Verb_Step2b or Suffix_Verb_Step2a ) ) //Suffixes for nouns or ( is_noun ( try ( Suffix_Noun_Step2c2 or (not is_defined Suffix_Noun_Step1a ( Suffix_Noun_Step2a or Suffix_Noun_Step2b or Suffix_Noun_Step2c1 or next)) or (Suffix_Noun_Step1b ( Suffix_Noun_Step2a or 
Suffix_Noun_Step2b or Suffix_Noun_Step2c1)) or (not is_defined Suffix_Noun_Step2a) or (Suffix_Noun_Step2b) ) Suffix_Noun_Step3 ) ) // Suffixes for alef maqsura or Suffix_All_alef_maqsura ) ) //Prefixes do ( try Prefix_Step1 try Prefix_Step2 ( Prefix_Step3a_Noun or (is_noun Prefix_Step3b_Noun) or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) ) ) // normalization post-stemming do Normalize_post ) snowball-3.0.1/algorithms/armenian.sbl000066400000000000000000000162551500727106100200110ustar00rootroot00000000000000stringescapes {} stringdef a '{U+0561}' // 531 stringdef b '{U+0562}' // 532 stringdef g '{U+0563}' // 533 stringdef d '{U+0564}' // 534 stringdef ye '{U+0565}' // 535 stringdef z '{U+0566}' // 536 stringdef e '{U+0567}' // 537 stringdef y '{U+0568}' // 538 stringdef dt '{U+0569}' // 539 stringdef zh '{U+056A}' // 53A stringdef i '{U+056B}' // 53B stringdef l '{U+056C}' // 53C stringdef kh '{U+056D}' // 53D stringdef ts '{U+056E}' // 53E stringdef k '{U+056F}' // 53F stringdef h '{U+0570}' // 540 stringdef dz '{U+0571}' // 541 stringdef gh '{U+0572}' // 542 stringdef djch '{U+0573}' // 543 stringdef m '{U+0574}' // 544 stringdef j '{U+0575}' // 545 stringdef n '{U+0576}' // 546 stringdef sh '{U+0577}' // 547 stringdef vo '{U+0578}' // 548 stringdef ch '{U+0579}' // 549 stringdef p '{U+057A}' // 54A stringdef dj '{U+057B}' // 54B stringdef r '{U+057C}' // 54C stringdef s '{U+057D}' // 54D stringdef v '{U+057E}' // 54E stringdef t '{U+057F}' // 54F stringdef r' '{U+0580}' // 550 stringdef c '{U+0581}' // 551 stringdef u '{U+0582}' // 552 //vjun stringdef bp '{U+0583}' // 553 stringdef q '{U+0584}' // 554 stringdef ev '{U+0587}' stringdef o '{U+0585}' // 555 stringdef f '{U+0586}' // 556 routines ( mark_regions R2 adjective verb noun ending ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{ye}{vo}{y}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) 
backwardmode ( define R2 as $p2 <= cursor define adjective as ( [substring] among ( '{b}{a}{r'}' '{p}{ye}{s}' '{vo}{r'}{e}{n}' '{vo}{v}{i}{n}' '{a}{k}{i}' '{l}{a}{j}{n}' '{r'}{vo}{r'}{d}' '{ye}{r'}{vo}{r'}{d}' '{a}{k}{a}{n}' '{a}{l}{i}' '{k}{vo}{t}' '{ye}{k}{ye}{n}' '{vo}{r'}{a}{k}' '{ye}{gh}' '{v}{vo}{u}{n}' '{ye}{r'}{ye}{n}' '{a}{r'}{a}{n}' '{ye}{n}' '{a}{v}{ye}{t}' '{g}{i}{n}' '{i}{v}' '{a}{t}' '{i}{n}' (delete) ) ) define verb as ( [substring] among ( '{vo}{u}{m}' '{v}{vo}{u}{m}' '{a}{l}{vo}{u}' '{ye}{l}{vo}{u}' '{v}{ye}{l}' '{a}{n}{a}{l}' '{ye}{l}{vo}{u}{c}' '{a}{l}{vo}{u}{c}' '{y}{a}{l}' '{y}{ye}{l}' '{a}{l}{vo}{v}' '{ye}{l}{vo}{v}' '{a}{l}{i}{s}' '{ye}{l}{i}{s}' '{ye}{n}{a}{l}' '{a}{c}{n}{a}{l}' '{ye}{c}{n}{ye}{l}' '{c}{n}{ye}{l}' '{n}{ye}{l}' '{a}{t}{ye}{l}' '{vo}{t}{ye}{l}' '{k}{vo}{t}{ye}{l}' '{t}{ye}{l}' '{v}{a}{ts}' '{ye}{c}{v}{ye}{l}' '{a}{c}{v}{ye}{l}' '{ye}{c}{i}{r'}' '{a}{c}{i}{r'}' '{ye}{c}{i}{n}{q}' '{a}{c}{i}{n}{q}' '{v}{ye}{c}{i}{r'}' '{v}{ye}{c}{i}{n}{q}' '{v}{ye}{c}{i}{q}' '{v}{ye}{c}{i}{n}' '{a}{c}{r'}{i}{r'}' '{a}{c}{r'}{ye}{c}' '{a}{c}{r'}{i}{n}{q}' '{a}{c}{r'}{i}{q}' '{a}{c}{r'}{i}{n}' '{ye}{c}{i}{q}' '{a}{c}{i}{q}' '{ye}{c}{i}{n}' '{a}{c}{i}{n}' '{a}{c}{a}{r'}' '{a}{c}{a}{v}' '{a}{c}{a}{n}{q}' '{a}{c}{a}{q}' '{a}{c}{a}{n}' '{v}{ye}{c}{i}' '{a}{c}{r'}{i}' '{ye}{c}{a}{r'}' '{ye}{c}{a}{v}' '{c}{a}{n}{q}' '{c}{a}{q}' '{c}{a}{n}' '{a}{c}{a}' '{a}{c}{i}' '{ye}{c}{a}' '{ch}{ye}{l}' '{ye}{c}{i}' '{a}{r'}' '{a}{v}' '{a}{n}{q}' '{a}{q}' '{a}{n}' '{a}{l}' '{ye}{l}' '{ye}{c}' '{a}{c}' '{v}{ye}' '{a}' (delete) ) ) define noun as ( [substring] among ( '{a}{ts}{vo}' '{a}{n}{a}{k}' '{a}{n}{o}{c}' '{a}{r'}{a}{n}' '{a}{r'}{q}' '{p}{a}{n}' '{s}{t}{a}{n}' '{ye}{gh}{e}{n}' '{ye}{n}{q}' '{i}{k}' '{i}{ch}' '{i}{q}' '{m}{vo}{u}{n}{q}' '{j}{a}{k}' '{j}{vo}{u}{n}' '{vo}{n}{q}' '{vo}{r'}{d}' '{vo}{c}' '{ch}{ye}{q}' '{v}{a}{ts}{q}' '{v}{vo}{r'}' '{a}{v}{vo}{r'}' '{vo}{u}{dt}{j}{vo}{u}{n}' '{vo}{u}{k}' '{vo}{u}{h}{i}' '{vo}{u}{j}{dt}' '{vo}{u}{j}{q}' '{vo}{u}{s}{t}' 
'{vo}{u}{s}' '{c}{i}' '{a}{l}{i}{q}' '{a}{n}{i}{q}' '{i}{l}' '{i}{ch}{q}' '{vo}{u}{n}{q}' '{g}{a}{r'}' '{vo}{u}' '{a}{k}' '{a}{n}' '{q}' (delete) ) ) define ending as ( [substring] R2 among ( '{n}{ye}{r'}{y}' '{n}{ye}{r'}{n}' '{n}{ye}{r'}{i}' '{n}{ye}{r'}{d}' '{ye}{r'}{i}{c}' '{n}{ye}{r'}{i}{c}' '{ye}{r'}{i}' '{ye}{r'}{d}' '{ye}{r'}{n}' '{ye}{r'}{y}' '{n}{ye}{r'}{i}{n}' '{vo}{u}{dt}{j}{a}{n}{n}' '{vo}{u}{dt}{j}{a}{n}{y}' '{vo}{u}{dt}{j}{a}{n}{s}' '{vo}{u}{dt}{j}{a}{n}{d}' '{vo}{u}{dt}{j}{a}{n}' '{ye}{r'}{i}{n}' '{i}{n}' '{s}{a}' '{vo}{dj}' '{i}{c}' '{ye}{r'}{vo}{v}' '{n}{ye}{r'}{vo}{v}' '{ye}{r'}{vo}{u}{m}' '{n}{ye}{r'}{vo}{u}{m}' '{vo}{u}{n}' '{vo}{u}{d}' '{v}{a}{n}{s}' '{v}{a}{n}{y}' '{v}{a}{n}{d}' '{a}{n}{y}' '{a}{n}{d}' '{v}{a}{n}' '{vo}{dj}{y}' '{vo}{dj}{s}' '{vo}{dj}{d}' '{vo}{c}' '{vo}{u}{c}' '{vo}{dj}{i}{c}' '{c}{i}{c}' '{v}{i}{c}' '{v}{i}' '{v}{vo}{v}' '{vo}{v}' '{a}{n}{vo}{v}' '{a}{n}{vo}{u}{m}' '{v}{a}{n}{i}{c}' '{a}{m}{b}' '{a}{n}' '{n}{ye}{r'}' '{ye}{r'}' '{v}{a}' '{y}' '{n}' '{d}' '{c}' '{i}' (delete) ) ) ) define stem as ( do mark_regions backwards setlimit tomark pV for ( do ending do verb do adjective do noun ) ) snowball-3.0.1/algorithms/basque.sbl000066400000000000000000000114621500727106100174720ustar00rootroot00000000000000routines ( aditzak izenak adjetiboak mark_regions RV R2 R1 ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef n~ '{U+00F1}' define v 'aeiou' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define RV as $pV <= cursor define R2 as $p2 <= cursor define R1 as $p1 <= cursor define aditzak as ( [substring] among( 'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea' 'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 
'gaitza' 'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza' 'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez' 'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea' 'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena' 'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea' 'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari' 'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu' 'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako' ( RV delete ) 'garri' 'garria' 'tza' (R2 delete) 'atseden' 'arabera' 'baditu' ( ) ) ) define izenak as ( [substring] among( 'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina' 'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea' 'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua' 'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di' 'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa' 'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia' 'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia' 'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua' 'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara' 'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge' 'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua' 'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia' 'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde' 'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea' 'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea' 'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia' 'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa' 'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 
'takoa' 'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila' 'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa' 'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia' 'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena' 'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan' 'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek' 'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara' 'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket' 'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko' 'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera' 'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko' ( RV delete ) 'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza' ( R2 delete ) 'joka' (<- 'jok') 'tzen' 'ten' 'en' 'tatu' (R1 delete) 'trako' (<- 'tra') 'minutuko' (<- 'minutu') 'zehar' 'geldi' 'igaro' 'aurka' ( ) ) ) define adjetiboak as ( [substring] among( 'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria' 'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik' ( RV delete ) 'zlea' (<- 'z') ) ) ) define stem as ( do mark_regions backwards ( repeat aditzak repeat izenak do adjetiboak ) ) snowball-3.0.1/algorithms/catalan.sbl000066400000000000000000000165571500727106100176270ustar00rootroot00000000000000routines ( cleaning mark_regions R1 R2 attached_pronoun standard_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef a` '{U+00E0}' // a-grave stringdef cc '{U+00E7}' // c-cedilla stringdef e' '{U+00E9}' // e-acute stringdef e` '{U+00E8}' // e-grave stringdef i' '{U+00ED}' // i-acute stringdef i` '{U+00EC}' // i-grave stringdef i" '{U+00EF}' // i-diaeresis stringdef o' '{U+00F3}' // o-acute stringdef o` '{U+00F2}' // o-grave stringdef u' '{U+00FA}' // u-acute stringdef u" '{U+00FC}' // u-diaeresis stringdef . 
'{U+00B7}' // - per l aggeminades define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' define mark_regions as ( $p1 = limit $p2 = limit // defaults do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define cleaning as repeat ( [substring] among( '{a'}' (<- 'a') '{a`}' (<- 'a') '{e'}' (<- 'e') '{e`}' (<- 'e') '{i'}' (<- 'i') '{i`}' (<- 'i') '{o'}' (<- 'o') '{o`}' (<- 'o') '{u'}' (<- 'u') '{u"}' (<- 'u') '{i"}' (<- 'i') '{.}' (<- '.') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among ( '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' '-ls' '-la' '-les' '-li' 'vos' 'se' 'nos' '-nos' '-us' 'us' '{'}n' '{'}ns' '-n' '-ns' '{'}m' '-me' '-m' '-te' '{'}t' 'li' 'lo' 'los' 'me' 'sela' 'selo' 'selas' 'selos' 'le' 'la' 'las' 'les' 'ens' 'ho' 'hi' (R1 delete) ) ) define standard_suffix as ( [substring] among( 'ar' 'atge' 'formes' 'icte' 'ictes' 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta' 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls' 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius' 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste' 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis' '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all' 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu' '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar' 'itar' 'ables' 'adors' 'idores' 'idors' 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es' 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris' 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament' 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes' 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies' '{i"}tat' 
'{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles' 'assa' 'asses' 'assos' 'ent' 'ents' '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin' 'ims' 'ima' 'imes' 'isme' 'ista' 'ismes' 'istes' 'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' 'oses' 'osos' 'ient' 'otes' 'ots' (R1 delete) 'acions' 'ada' 'ades' (R2 delete) 'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques' (R2 <- 'log') 'ic' 'ica' 'ics' 'iques' (R2 <- 'ic') 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima' (R1 <- 'c') ) ) define verb_suffix as ( [substring] among( 'ador' 'adora' 'adors' 'adores' 're' 'ie' 'ent' 'ents' 'udes' 'ar{a`}' 'eren' 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'aria' 'arian' 'arien' 'aries' 'ar{a`}s' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara' 'ar{e'}' 'ar{e'}s' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'er' 'erau' 'erass' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu' 'ia' 'ies' '{i'}em' '{i`}eu' 'ien' 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats' 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu' 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu' '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem' '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren' 'ar{i'}em' 'ar{i'}eu' 'areu' 'aren' 'ant' '{i"}m' '{i"}u' '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es' 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da' 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its' 'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as' 'ieu' 'ii' 'io' 'i{a`}' 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' 
'{e`}ssim' '{e`}ssiu' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien' 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu' 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis' 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin' 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen' 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim' '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu' '{i"}ra' '{i"}ren' '{i"}res' '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x' 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis' 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s' (R1 delete) 'ando' (R2 delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu' 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it' (R1 delete) 'iqu' (R1 <- 'ic') ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or verb_suffix ) do residual_suffix ) do cleaning ) snowball-3.0.1/algorithms/danish.sbl000066400000000000000000000034551500727106100174630ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix undouble ) externals ( stem ) strings ( ch ) integers ( p1 x ) groupings ( c v s_ending ) stringescapes {} /* special characters */ stringdef ae '{U+00E6}' stringdef ao '{U+00E5}' stringdef o/ '{U+00F8}' define c 'bcdfghjklmnpqrstvwxz' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'abcdfghjklmnoprtvyz{ao}' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) gopast v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define 
main_suffix as ( setlimit tomark p1 for ([substring]) among( 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' 'erets' 'et' 'eret' (delete) 's' (s_ending delete) ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'gd' // significant in the call from other_suffix 'dt' 'gt' 'kt' ) ) next] delete ) define other_suffix as ( do ( ['st'] 'ig' delete ) setlimit tomark p1 for ([substring]) among( 'ig' 'lig' 'elig' 'els' (delete do consonant_pair) 'l{o/}st' (<-'l{o/}s') ) ) define undouble as ( setlimit tomark p1 for ([c] ->ch) ch delete ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix do undouble ) ) snowball-3.0.1/algorithms/dutch.sbl000066400000000000000000000203311500727106100173140ustar00rootroot00000000000000// Dutch stemming algorithm developed by Wessel Kraaij and Renée Pohlmann strings ( ch ) integers ( p1 p2 ) booleans ( stemmed GE_removed ) routines ( R1 R2 C V VX lengthen_V Step_1 Step_2 Step_3 Step_4 Step_7 Step_6 Step_1c Lose_prefix Lose_infix measure ) externals ( stem ) groupings ( v v_WX A AEIOU AIOU E I O U ) stringescapes {} /* special characters */ stringdef a` '{U+00E0}' stringdef a' '{U+00E1}' stringdef a^ '{U+00E2}' stringdef a" '{U+00E4}' stringdef e` '{U+00E8}' stringdef e' '{U+00E9}' stringdef e^ '{U+00EA}' stringdef e" '{U+00EB}' stringdef i` '{U+00EC}' stringdef i' '{U+00ED}' stringdef i^ '{U+00EE}' stringdef i" '{U+00EF}' stringdef o` '{U+00F2}' stringdef o' '{U+00F3}' stringdef o^ '{U+00F4}' stringdef o" '{U+00F6}' stringdef u` '{U+00F9}' stringdef u' '{U+00FA}' stringdef u^ '{U+00FB}' stringdef u" '{U+00FC}' define A 'a{a"}{a'}{a`}{a^}' define E 'e{e"}{e'}{e`}{e^}' define I 'i{i"}{i'}{i`}{i^}' define O 'o{o"}{o'}{o`}{o^}' define U 'u{u"}{u'}{u`}{u^}' define AIOU A + I + O + U define AEIOU A + E + I + O + U define v AEIOU + 
'y' define v_WX v + 'wx' backwardmode ( define R1 as ($p1 <= cursor) define R2 as ($p2 <= cursor) define V as test (v or 'ij') define VX as test (next v or 'ij') define C as test (not 'ij' non-v) define lengthen_V as do ( non-v_WX [substring] among ( 'a' '{a"}' '{a'}' '{a`}' '{a^}' 'o' '{o"}' '{o'}' '{o`}' '{o^}' 'u' '{u"}' '{u'}' '{u`}' '{u^}' (test (non-AEIOU or atlimit) ->ch insert ch) 'e' '{e'}' '{e`}' '{e^}' (test (non-AEIOU or atlimit not (AIOU or (E atlimit)) not (next AIOU non-AEIOU)) ->ch insert ch) 'e{e"}' (<-'e{e"}e') 'i{e"}' (<-'iee') ) ) define Step_1 as ( [substring] among ( '{'}s' (delete) 's' (R1 not ('t' R1) C delete) 'ies' (R1 <-'ie') 'es' ((test ('ar' R1 C) delete lengthen_V) or (test ('er' R1 C) delete) or (R1 C <-'e')) '{e'}s' (R1 <-'{e'}') 'aus' (R1 V <-'au') 'en' (('hed' R1 ] <-'heid') or ('nd' delete) or ('d' R1 C ] delete) or ('i' or 'j' V delete) or (R1 C delete lengthen_V)) 'nde' (<-'nd') ) ) define Step_2 as ( [substring] among ( 'je' (('{'}t' ] delete) or ('et' ] R1 C delete) or ('rnt' ] <-'rn') or ('t' ] R1 VX delete) or ('ink' ] <-'ing') or ('mp' ] <-'m') or ('{'}' ] R1 delete) or (] R1 C delete)) 'ge' (R1 <-'g') 'lijke'(R1 <-'lijk') 'ische'(R1 <-'isch') 'de' (R1 C delete) 'te' (R1 <-'t') 'se' (R1 <-'s') 're' (R1 <-'r') 'le' (R1 delete attach 'l' lengthen_V) 'ene' (R1 C delete attach 'en' lengthen_V) 'ieve' (R1 C <-'ief') ) ) define Step_3 as ( [substring] among ( 'atie' (R1 <-'eer') 'iteit' (R1 delete lengthen_V) 'heid' 'sel' 'ster' (R1 delete) 'rder' (<-'r') 'ing' 'isme' 'erij' (// Exception added to avoid conflating // `schilderij` (painting) and `schild` (shield). 
('ild' <- 'er') or (R1 delete lengthen_V)) 'arij' (R1 C <-'aar') 'fie' (R2 delete attach 'f' lengthen_V) 'gie' (R2 delete attach 'g' lengthen_V) 'tst' (R1 C <-'t') 'dst' (R1 C <-'d') ) ) define Step_4 as ( ( [substring] among ( 'ioneel' (R1 <-'ie') 'atief' (R1 <-'eer') 'baar' (R1 delete) 'naar' (R1 V <-'n') 'laar' (R1 V <-'l') 'raar' (R1 V <-'r') 'tant' (R1 <-'teer') 'lijker' 'lijkst' (R1 <-'lijk') 'achtig' 'achtiger' 'achtigst'(R1 delete) 'eriger' 'erigst' 'erig' 'end' (R1 C delete lengthen_V) ) ) or ( [substring] among ( 'iger' 'igst' 'ig' (R1 // Exception added to avoid conflating // `innig` (intimate) and `in` (in). not ('inn' atlimit) C delete lengthen_V) ) ) ) define Step_7 as ( [substring] among ( 'kt' (<-'k') 'ft' (<-'f') 'pt' (<-'p') ) ) define Step_6 as ( [substring] among ( 'bb' (<-'b') 'cc' (<-'c') 'dd' (<-'d') 'ff' (<-'f') 'gg' (<-'g') 'hh' (<-'h') 'jj' (<-'j') 'kk' (<-'k') 'll' (<-'l') 'mm' (<-'m') 'nn' (// Exception added to avoid conflating // `innen` (to collect/cash) and `in` (in). not ('i' atlimit) <-'n') 'pp' (<-'p') 'qq' (<-'q') 'rr' (<-'r') 'ss' (<-'s') 'tt' (<-'t') 'vv' (<-'v') 'ww' (<-'w') 'xx' (<-'x') 'zz' (<-'z') 'v' (<-'f') 'z' (<-'s') ) ) define Step_1c as ( [substring] R1 C among ( 'd' (not ('n' R1) // Exception added to avoid conflating // `geïnd` (collected/cashed) and `in` (in). // Instead we conflate `geïnd` with `innen`. ('in' atlimit <-'n') or delete) 't' (not ('h' R1) // Exception added to avoid conflating // `geënt` (grafted) and `en` (and). not ('en' atlimit) delete ) ) ) ) define Lose_prefix as ( ['ge'] test hop 3 test (gopast ('ij' or v) repeat ('ij' or v) not atlimit) // Exceptions added: among ( // Avoid conflating `geeft` and `effen`/`effende`\`geeffende`. 'eft' (false) // Avoid conflating `gevallen`/`geval` and `vallen`. 
'val' (false) 'vali' (true) // Avoid conflating `gevaren`/`gevaar` (danger), `gevaarten` (huge // objects) and `varen` (to sail) 'vaa' 'vare' (false) '' (true) ) set GE_removed delete do ( [substring] among ( '{e"}' (<-'e') '{i"}' (<-'i') ) ) ) define Lose_infix as ( next gopast (['ge']) test hop 3 test (gopast ('ij' or v) repeat ('ij' or v) not atlimit) set GE_removed delete do ( [substring] among ( '{e"}' (<-'e') '{i"}' (<-'i') ) ) ) define measure as ( $p1 = limit $p2 = limit do( repeat non-v atleast 1 ('ij' or v) non-v setmark p1 repeat non-v atleast 1 ('ij' or v) non-v setmark p2 ) ) define stem as ( unset stemmed measure backwards ( do (Step_1 set stemmed ) do (Step_2 set stemmed ) do (Step_3 set stemmed ) do (Step_4 set stemmed ) ) unset GE_removed do (Lose_prefix and measure) backwards ( do (GE_removed set stemmed Step_1c) ) unset GE_removed do (Lose_infix and measure) backwards ( do (GE_removed set stemmed Step_1c) ) backwards ( do (Step_7 set stemmed ) do (stemmed Step_6) ) ) snowball-3.0.1/algorithms/dutch_porter.sbl000066400000000000000000000071131500727106100207120ustar00rootroot00000000000000// Dutch stemming algorithm developed by Martin Porter routines ( prelude postlude e_ending en_ending mark_regions R1 R2 undouble standard_suffix ) externals ( stem ) booleans ( e_found ) integers ( p1 p2 x ) groupings ( v v_I v_j ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef e" '{U+00EB}' stringdef i" '{U+00EF}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef a' '{U+00E1}' stringdef e' '{U+00E9}' stringdef i' '{U+00ED}' stringdef o' '{U+00F3}' stringdef u' '{U+00FA}' stringdef e` '{U+00E8}' define v 'aeiouy{e`}' define v_I v + 'I' define v_j v + 'j' define prelude as ( test repeat ( [substring] among( '{a"}' '{a'}' (<- 'a') '{e"}' '{e'}' (<- 'e') '{i"}' '{i'}' (<- 'i') '{o"}' '{o'}' (<- 'o') '{u"}' '{u'}' (<- 'u') '' (next) ) ) try(['y'] <- 'Y') repeat ( gopast v try ( // If we see `i` not followed by a vowel then we know 
it couldn't // match on the next iteration so we can advance past it. // // However if we replace `i` with `I` we do need to check the vowel // after the `i` in the next iteration to match the documented // behaviour, e.g. consider input `iiiii`. This may well not make // a difference for any actual Dutch words though. [('i'] do(v <- 'I')) or ('y'] <- 'Y') ) ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'I' (<- 'i') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define undouble as ( test among('kk' 'dd' 'tt') [next] delete ) define e_ending as ( unset e_found ['e'] R1 test non-v delete set e_found undouble ) define en_ending as ( R1 non-v and not 'gem' delete undouble ) define standard_suffix as ( do ( [substring] among( 'heden' ( R1 <- 'heid' ) 'en' 'ene' ( en_ending ) 's' 'se' ( R1 non-v_j delete ) ) ) do e_ending do ( ['heid'] R2 not 'c' delete ['en'] en_ending ) do ( [substring] among( 'end' 'ing' ( R2 delete (['ig'] R2 not 'e' delete) or undouble ) 'ig' ( R2 not 'e' delete ) 'lijk' ( R2 delete e_ending ) 'baar' ( R2 delete ) 'bar' ( R2 e_found delete ) ) ) do ( non-v_I test ( among ('aa' 'ee' 'oo' 'uu') non-v ) [next] delete ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-3.0.1/algorithms/english.sbl000066400000000000000000000132211500727106100176360ustar00rootroot00000000000000integers ( p1 p2 ) booleans ( Y_found ) routines ( prelude postlude mark_regions shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 exception1 ) externals ( stem ) groupings ( aeo v v_WXY valid_LI ) stringescapes {} define aeo 'aeo' define v 'aeiouy' define v_WXY v + 'wxY' define valid_LI 'cdeghkmnrt' define prelude as ( unset Y_found do ( ['{'}'] delete) do ( ['y'] <-'Y' set Y_found) do 
repeat(goto (v ['y']) <-'Y' set Y_found) ) define mark_regions as ( $p1 = limit $p2 = limit do( among ( 'gener' // generate/general/generic/generous 'commun' // communication/communism/community 'arsen' // arsenic/arsenal 'past' // past/paste 'univers' // universe/universal/university 'later' // lateral/later 'emerg' // emerge/emergency 'organ' // organ/organic/organize // ... extensions possible here ... ) or (gopast v gopast non-v) setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define shortv as ( ( non-v_WXY v non-v ) or ( non-v v atlimit ) or ( 'past' ) // pasted/pasting ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( try ( [substring] among ( '{'}' '{'}s' '{'}s{'}' (delete) ) ) [substring] among ( 'sses' (<-'ss') 'ied' 'ies' ((hop 2 <-'i') or <-'ie') 's' (next gopast v delete) 'us' 'ss' ) ) define Step_1b as ( [substring] among ( 'eed' 'eedly' ( do ( among ( 'proc' 'exc' 'succ' (atlimit) ) or ( R1 <-'ee' ) ) ) 'ed' 'edly' 'ingly' (false) // Handled below. 'ing' ( // Handle exceptional cases here, rest handled below. among ( // dying->die, lying->die, tying->tie, vying->vie 'y' (test(non-v atlimit) ] <-'ie') // Leave inning, outing, etc along. 
'inn' 'out' 'cann' 'herr' 'earr' 'even' (atlimit) ) ) '' () ) or ( // Handle 'ed' 'edly' 'ing' 'ingly' test gopast v delete [] test ( substring among( 'at' 'bl' 'iz' (fail(<- 'e')) 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x (not (aeo atlimit)) '' (fail(atmark p1 test shortv <- 'e')) ) ) [next] delete ) ) define Step_1c as ( ['y' or 'Y'] non-v not atlimit <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alism' 'aliti' 'alli' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' 'bli' (<-'ble') 'ogist' (<-'og') 'ogi' ('l' <-'og') 'fulli' (<-'ful') 'lessli' (<-'less') 'li' (valid_LI delete) ) ) define Step_3 as ( [substring] R1 among ( 'tional' (<- 'tion') 'ational' (<- 'ate') 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ful' 'ness' (delete) 'ative' (R2 delete) ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5 as ( [substring] among ( 'e' (R2 or (R1 not shortv) delete) 'l' (R2 'l' delete) ) ) ) define exception1 as ( [substring] atlimit among( /* special changes: */ 'skies' (<-'sky') /* special -LY cases */ 'idly' (<-'idl') 'gently' (<-'gentl') 'ugly' (<-'ugli') 'early' (<-'earli') 'only' (<-'onli') 'singly' (<-'singl') // ... extensions possible here ... /* invariant forms: */ 'sky' 'news' 'howe' 'atlas' 'cosmos' 'bias' 'andes' // not plural forms // ... extensions possible here ... 
) ) define postlude as (Y_found repeat(goto (['Y']) <-'y')) define stem as ( exception1 or not hop 3 or ( do prelude do mark_regions backwards ( do Step_1a do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5 ) do postlude ) ) snowball-3.0.1/algorithms/esperanto.sbl000066400000000000000000000070751500727106100202170ustar00rootroot00000000000000booleans ( foreign ) routines ( canonical_form correlative final_apostrophe initial_apostrophe long_word merged_numeral not_after_letter pronoun standard_suffix ujn_suffix uninflected ) externals ( stem ) groupings ( vowel aou digit ) define vowel 'aeiou' define aou 'aou' define digit '0123456789' stringescapes {} stringdef c^ '{U+0109}' stringdef g^ '{U+011D}' stringdef h^ '{U+0125}' stringdef j^ '{U+0135}' stringdef s^ '{U+015D}' stringdef u+ '{U+016D}' stringdef a' '{U+00E1}' stringdef e' '{U+00E9}' stringdef i' '{U+00ED}' stringdef o' '{U+00F3}' stringdef u' '{U+00FA}' define canonical_form as ( unset foreign repeat ( [substring] among( 'cx' (<- '{c^}') 'gx' (<- '{g^}') 'hx' (<- '{h^}') 'jx' (<- '{j^}') 'sx' (<- '{s^}') 'ux' (<- '{u+}') '{a'}' (<- 'a' set foreign) '{e'}' (<- 'e' set foreign) '{i'}' (<- 'i' set foreign) '{o'}' (<- 'o' set foreign) '{u'}' (<- 'u' set foreign) 'q' 'w' 'x' 'y' (set foreign) '-' (unset foreign) '' (next) ) ) not foreign ) define initial_apostrophe as ( ['{'}'] 'st' among('as' 'i' 'is' 'os' 'u' 'us') atlimit <- 'e' ) backwardmode ( define pronoun as ( [try 'n'] among( 'ci' 'gi' '{g^}i' 'hi' 'ili' 'i{s^}i' 'ivi' 'li' 'mal{s^}i' 'mi' 'ni' 'oni' 'ri' 'si' '{s^}i' '{s^}li' 'vi' ) (atlimit or '-') delete ) define final_apostrophe as ( ['{'}'] ('l' atlimit <- 'a') or ('un' atlimit <- 'u') or ( among( 'adi' 'almen' 'amb' 'ank' 'ankor' 'anstat' 'anta{u+}hier' 'apen' 'bald' '{c^}irk' 'hier' 'hodi' 'kontr' 'kvaz' 'malbald' 'malgr' 'morg' 'postmorg' 'presk' 'tut{c^}irk' ) (atlimit or '-') <- 'a{u+}' ) or (<- 'o') ) define ujn_suffix as ( [try 'n' try 'j'] among('aliu' 'unu') (atlimit or '-') 
delete ) define uninflected as ( among( 'aha' 'amen' 'dirlididi' 'disde' 'ehe' 'ekde' 'elde' 'haha' 'haleluja' 'hola' 'hosana' 'hura' '{h^}a{h^}a' 'mal{c^}i' 'malkaj' 'malpli' 'maltra' 'maltre' 'maltro' 'minus' 'muu' 'oho' 'tamen' 'uhu' ) (atlimit or '-') ) define merged_numeral as ( among('du' 'tri' 'unu') among('cent' 'dek') ) define correlative as ( [] // Ignore -al, -am, etc. since they can't be confused with suffixes. test ( ((try 'n'] 'e') or (try 'n' try 'j'] aou)) 'i' try among('{c^}' 'k' 'kelk' 'mult' 'nen' 'samt' 't') (atlimit or '-') ) delete ) define long_word as ( loop 2 gopast vowel or (gopast '-' next) or gopast digit ) define not_after_letter as ('-' or digit) define standard_suffix as ( [substring try '-'] among( 'a' 'aj' 'ajn' 'an' 'e' 'en' 'i' 'as' 'is' 'os' 'u' 'us' 'o' 'oj' 'ojn' 'on' 'j' not_after_letter 'jn' not_after_letter 'n' not_after_letter ) delete ) ) define stem as ( test canonical_form do initial_apostrophe backwards ( not pronoun do final_apostrophe not correlative not uninflected not merged_numeral not ujn_suffix test long_word standard_suffix ) ) snowball-3.0.1/algorithms/estonian.sbl000066400000000000000000000252401500727106100200310ustar00rootroot00000000000000/* Estonian stemmer Made by Linda Freienthal in January 2019. 
*/ routines ( mark_regions LONGV special_noun_endings case_ending emphasis plural_three_first_cases undouble_kpt i_plural degrees substantive verb_exceptions verb nu ) stringescapes {} stringdef a" '{U+00E4}' //a-umlaut ä stringdef o" '{U+00F6}' //o-umlaut ö stringdef o~ '{U+00F5}' //o with tilde õ stringdef u" '{U+00FC}' //u-umlaut ü stringdef sv '{U+0161}' //s-caron š stringdef zv '{U+017E}' //z-caron ž externals ( stem ) integers ( p1 ) groupings ( V1 RV KI GI) define V1 'aeiou{o~}{a"}{o"}{u"}' define RV 'aeiuo' define KI 'kptgbdshf{sv}z{zv}' define GI 'cjlmnqrvwxaeiou{o~}{a"}{o"}{u"}' define mark_regions as ( $p1 = limit gopast V1 gopast non-V1 setmark p1 ) backwardmode ( define emphasis as ( setlimit tomark p1 for ([substring]) test hop 4 //kingi -> kingi among( 'gi' ((GI and not LONGV) delete) //jookse-me-gi, bioloogi -> bioloogi 'ki' (KI delete) //kookki -> kook ) ) // Signals t if a replacement was made; f otherwise. define verb as ( setlimit tomark p1 for ([substring]) among( 'nuksin' 'nuksime' 'nuksid' 'nuksite' (delete) //seleta-nuksite 'ksin' 'ksid' 'ksime' 'ksite' (delete) //personal conditional: rõõmusta-ksin 'mata' (delete) 'takse' 'dakse' (delete) //impersonal: laul-dakse, luba-takse 'taks' 'daks' (delete) //impersonal conditional: laul-daks, saade-taks 'akse' (<-'a') //impersonal: tulla-kse, süüa-kse (-> söö), teha-kse (-> tegi), püüta-kse, leita-kse 'sime' (delete) //pl1pst: saat-sime 'site' (delete) //pl2pst: saat-site 'sin' (delete) //sg1pst: laul-sin, saat-sin 'me' (V1 delete) //pl1prs: laula-me, tule-me 'da' (V1 delete) //da-infinitive: luba-da 'n' (V1 delete) //sg1prs: kirjuta-n 'b' (V1 delete) //sg3prs: laula-b ) ) define LONGV as among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}' '{u"}{u"}' '{o~}{o~}') define i_plural as ( setlimit tomark p1 for ([substring]) among( 'i' (RV) //raama-tu-i, lapsiku-i ) delete ) define special_noun_endings as ( setlimit tomark p1 for ([substring]) among( 'lasse' (<- 'lase') //teadlasse -> teadlase 'last' (<- 
'lase') //teadlast -> teadlase 'lane' (<- 'lase') //teadlane -> teadlase 'lasi'(<- 'lase') //teadlasi -> teadlase 'misse' (<- 'mise') //tegemisse -> tegemise 'mist' (<- 'mise') //kasutamist -> kasutamise 'mine' (<- 'mise') //tegemine -> tegemise 'misi' (<- 'mise') //kasutamisi -> kasutamise 'lisse' (<- 'lise') //rohelisse -> rohelise 'list' (<- 'lise') //tavalist -> tavalise 'line' (<- 'lise') //roheline -> rohelise 'lisi' (<- 'lise') //tavalisi -> tavalise ) ) define case_ending as ( setlimit tomark p1 for ([substring]) among( 'sse' (RV or LONGV) //illative: saapa-sse 'st' (RV or LONGV) //elative: saapa-st and kapsas-t 'le' (RV or LONGV) //allative: raama-tu-le 'lt' (RV or LONGV) //ablative: raama-tu-lt 'ga' (RV or LONGV) //komitatiive: õpetaja-ga 'ks' (RV or LONGV) //translative: õpetaja-ks 'ta' (RV or LONGV) //abessive and da-infinitive: õpetaja-ta and hüpa-ta 't' (test hop 4) //partitiiv, raamatu-t 's' (RV or LONGV) //inessive and sg3pst: raama-tu-s and sõiti-s 'l' (RV or LONGV) //adessive: raama-tu-l and kapsa-l. 
) delete ) define plural_three_first_cases as ( setlimit tomark p1 for ([substring]) among( 'ikkude' (<-'iku') //plural genitive: õnnelikkude -> õnneliku 'ikke' (<-'iku') //plural partitive: rahulikke -> rahuliku 'ike' (<-'iku') //plural genitive: ohtlike -> ohtliku 'sid' (not LONGV delete) //plural partitive and sg2pst and pl3pst: auto-sid and laul-sid (exludes plural nominative with words like gaasid, roosid) // plural genitive and pl2: ministri-te, oluliste -> olulise and saada-te, laula-te; // also torte -> tort (if not in compound word) and kokkuvõtte -> kokkuvõte and roheliste -> rohelise, tegemiste -> tegemise, teadlaste -> teadlase 'te' ( (test hop 4 among ( 'mis' 'las' 'lis' (<- 'e') 't' () '' (delete) ) ) or <- 't' ) 'de' ((RV or LONGV) delete) //plural genitive: lauda-de 'd' ((RV or LONGV) delete) //plural nominative: voodi-d, rattai-d (rata), lapsiku-i-d ) ) define nu as ( setlimit tomark p1 for ([substring]) among( 'nu' //haka-nu(-te-ga) 'tu' //luba-tu(-d) 'du' //laul-du(-te-st) 'va' //laul-va(-te-le) ) delete ) define undouble_kpt as ( // undouble '-C1C1V' where C1 is k, p or t: // mõtte(-le) -> mõte, hakka(-n) -> haka // // We only undouble if the vowel is in R1 to avoid modifying short // non-words (mostly to avoid modifying acronyms/initialisms such // as "PPE"). 
V1 $(p1 <= cursor) [substring] among( 'kk' (<- 'k') 'pp' (<- 'p') 'tt' (<- 't') ) ) define degrees as ( setlimit tomark p1 for ([substring]) among( 'mai' (RV delete) //heleda-mai(-le) 'ma' (delete) //tuge-va-ma(-le) and ma-infinitive: sõit-ma 'm' (RV delete) //kauge-i-m, rõõmsa-m ) ) define substantive as ( do special_noun_endings do case_ending do plural_three_first_cases do degrees do i_plural do nu ) ) define verb_exceptions as ( [substring] atlimit among( 'joon' 'jood' 'joob' 'joote' 'joome' 'joovad' (<-'joo') 'j{o~}in' 'j{o~}id' 'j{o~}i' 'j{o~}ime' 'j{o~}ite' (<-'joo') 'joomata' 'juuakse' 'joodakse' 'juua' 'jooma' (<- 'joo') 'saan' 'saad' 'saab' 'saate' 'saame' 'saavad' (<-'saa') 'saaksin' 'saaksid' 'saaks' 'saaksite' 'saaksime' (<-'saa') 'sain' 'said' 'sai' 'saite' 'saime' (<-'saa') 'saamata' 'saadakse' 'saadi' 'saama' 'saada' (<-'saa') 'viin' 'viid' 'viib' 'viite' 'viime' 'viivad' (<-'viima') 'viiksin' 'viiksid' 'viiks' 'viiksite' 'viiksime' (<-'viima') 'viisin' 'viisite' 'viisime' (<-'viima') 'viimata' 'viiakse' 'viidi' 'viima' 'viia' (<-'viima') 'keen' 'keeb' 'keed' 'kees' 'keeme' 'keete' 'keevad' (<-'keesi') 'keeksin' 'keeks' 'keeksid' 'keeksime' 'keeksite' (<-'keesi') 'keemata' 'keema' 'keeta' 'keedakse' (<-'keesi') 'l{o"}{o"}n' 'l{o"}{o"}d' 'l{o"}{o"}b' 'l{o"}{o"}me' 'l{o"}{o"}te' 'l{o"}{o"}vad' (<-'l{o"}{o"}') 'l{o"}{o"}ksin' 'l{o"}{o"}ksid' 'l{o"}{o"}ks' 'l{o"}{o"}ksime' 'l{o"}{o"}ksite' (<-'l{o"}{o"}') 'l{o"}{o"}mata' 'l{u"}{u"}akse' 'l{o"}{o"}dakse' 'l{o"}{o"}di' 'l{o"}{o"}ma' 'l{u"}{u"}a' (<-'l{o"}{o"}') // Both looma and lööma have these same past tense forms 'l{o~}in' 'l{o~}id' 'l{o~}i' 'l{o~}ime' 'l{o~}ite' (<-'l{o~}i') 'loon' 'lood' 'loob' 'loome' 'loote' 'loovad' (<-'loo') 'looksin' 'looksid' 'looks' 'looksime' 'looksite' (<-'loo') 'loomata' 'luuakse' 'loodi' 'luua' 'looma' (<-'loo') 'k{a"}in' 'k{a"}ib' 'k{a"}id' 'k{a"}is' 'k{a"}ime' 'k{a"}ite' 'k{a"}ivad' (<-'k{a"}isi') 'k{a"}iksin' 'k{a"}iks' 'k{a"}iksid' 'k{a"}iksime' 'k{a"}iksite' 
(<-'k{a"}isi') 'k{a"}imata' 'k{a"}iakse' 'k{a"}idi' 'k{a"}ia' 'k{a"}ima' (<-'k{a"}isi') 's{o"}{o"}n' 's{o"}{o"}b' 's{o"}{o"}d' 's{o"}{o"}me' 's{o"}{o"}te' 's{o"}{o"}vad' (<-'s{o"}{o"}') 's{o"}{o"}ksin' 's{o"}{o"}ks' 's{o"}{o"}ksid' 's{o"}{o"}ksime' 's{o"}{o"}ksite' (<-'s{o"}{o"}') 's{o~}in' 's{o~}i' 's{o~}id' 's{o~}ime' 's{o~}ite' (<-'s{o"}{o"}') 's{o"}{o"}mata' 's{u"}{u"}akse' 's{o"}{o"}dakse' 's{o"}{o"}di' 's{o"}{o"}ma' 's{u"}{u"}a' (<-'s{o"}{o"}') 'toon' 'tood' 'toob' 'toote' 'toome' 'toovad' (<-'too') 'tooksin' 'tooksid' 'tooks' 'tooksite' 'tooksime' (<-'too') 't{o~}in' 't{o~}id' 't{o~}i' 't{o~}ime' 't{o~}ite' (<-'too') 'toomata' 'tuuakse' 'toodi' 'tooma' 'tuua' (<-'too') 'v{o~}in' 'v{o~}id' 'v{o~}ib' 'v{o~}ime' 'v{o~}is' 'v{o~}ite' 'v{o~}ivad' (<-'v{o~}isi') 'v{o~}iksin' 'v{o~}iksid' 'v{o~}iks' 'v{o~}iksime' 'v{o~}iksite' (<-'v{o~}isi') 'v{o~}imata' 'v{o~}idakse' 'v{o~}idi' 'v{o~}ida' 'v{o~}ima' (<-'v{o~}isi') 'j{a"}{a"}n' 'j{a"}{a"}d' 'j{a"}{a"}b' 'j{a"}{a"}me' 'j{a"}{a"}te' 'j{a"}{a"}vad' (<-'j{a"}{a"}ma') 'j{a"}{a"}ksin' 'j{a"}{a"}ksid' 'j{a"}{a"}ks' 'j{a"}{a"}ksime' 'j{a"}{a"}ksite' (<-'j{a"}{a"}ma') 'j{a"}ime' 'j{a"}ite' 'j{a"}in' 'j{a"}id' 'j{a"}i' (<-'j{a"}{a"}ma') 'j{a"}{a"}mata' 'j{a"}{a"}dakse' 'j{a"}{a"}da' 'j{a"}{a"}ma' 'j{a"}{a"}di' (<-'j{a"}{a"}ma') 'm{u"}{u"}n' 'm{u"}{u"}d' 'm{u"}{u"}b' 'm{u"}{u"}s' 'm{u"}{u"}me' 'm{u"}{u"}te' 'm{u"}{u"}vad' (<-'m{u"}{u"}si') 'm{u"}{u"}ksin' 'm{u"}{u"}ksid' 'm{u"}{u"}ks' 'm{u"}{u"}ksime' 'm{u"}{u"}ksite' (<-'m{u"}{u"}si') 'm{u"}{u"}mata' 'm{u"}{u"}akse' 'm{u"}{u"}di' 'm{u"}{u"}a' 'm{u"}{u"}ma' (<-'m{u"}{u"}si') 'loeb' 'loen' 'loed' 'loeme' 'loete' 'loevad' (<- 'luge') 'loeks' 'loeksin' 'loeksid' 'loeksime' 'loeksite' (<- 'luge') 'p{o~}en' 'p{o~}eb' 'p{o~}ed' 'p{o~}eme' 'p{o~}ete' 'p{o~}evad' (<- 'p{o~}de') 'p{o~}eksin' 'p{o~}eks' 'p{o~}eksid' 'p{o~}eksime' 'p{o~}eksite' (<- 'p{o~}de') 'laon' 'laob' 'laod' 'laome' 'laote' 'laovad' (<- 'ladu') 'laoksin' 'laoks' 'laoksid' 'laoksime' 'laoksite' (<- 'ladu') 'teeksin' 
'teeks' 'teeksid' 'teeksime' 'teeksite' (<- 'tegi') 'teen' 'teeb' 'teed' 'teeme' 'teete' 'teevad' (<- 'tegi') 'tegemata' 'tehakse' 'tehti' 'tegema' 'teha' (<-'tegi') 'n{a"}en' 'n{a"}eb' 'n{a"}ed' 'n{a"}eme' 'n{a"}ete' 'n{a"}evad' (<-'n{a"}gi') 'n{a"}eksin' 'n{a"}eks' 'n{a"}eksid' 'n{a"}eksime' 'n{a"}eksite' (<-'n{a"}gi') 'n{a"}gemata' 'n{a"}hakse' 'n{a"}hti' 'n{a"}ha' 'n{a"}gema' (<-'n{a"}gi') ) ) define stem as ( not verb_exceptions // p1 isn't used by verb_exceptions do mark_regions backwards ( do emphasis do ( verb or substantive ) do undouble_kpt ) ) snowball-3.0.1/algorithms/finnish.sbl000066400000000000000000000121471500727106100176510ustar00rootroot00000000000000 /* Finnish stemmer. Numbers in square brackets refer to the sections in Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999 ISBN 0-415-20705-3 */ routines ( mark_regions R2 particle_etc possessive LONG VI case_ending i_plural t_plural other_endings tidy ) externals ( stem ) integers ( p1 p2 ) strings ( x ) booleans ( ending_removed ) groupings ( AEI C V1 V2 particle_end ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' define AEI 'a{a"}ei' define C 'bcdfghjklmnpqrstvwxz' define V1 'aeiouy{a"}{o"}' define V2 'aeiou{a"}{o"}' define particle_end V1 + 'nt' define mark_regions as ( $p1 = limit $p2 = limit gopast V1 gopast non-V1 setmark p1 gopast V1 gopast non-V1 setmark p2 ) backwardmode ( define R2 as $p2 <= cursor define particle_etc as ( setlimit tomark p1 for ([substring]) among( 'kin' 'kaan' 'k{a"}{a"}n' 'ko' 'k{o"}' 'han' 'h{a"}n' 'pa' 'p{a"}' // Particles [91] (particle_end) 'sti' // Adverb [87] (R2) ) delete ) define possessive as ( // [36] setlimit tomark p1 for ([substring]) among( 'si' (not 'k' delete) // take 'ksi' as the Comitative case 'ni' (delete ['kse'] <- 'ksi') // kseni = ksi + ni 'nsa' 'ns{a"}' 'mme' 'nne' (delete) /* Now for Vn possessives after case endings: [36] */ 'an' (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) '{a"}n' 
(among('t{a"}' 'ss{a"}' 'st{a"}' 'll{a"}' 'lt{a"}' 'n{a"}') delete) 'en' (among('lle' 'ine') delete) ) ) define LONG as among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') define VI as ('i' V2) define case_ending as ( setlimit tomark p1 for ([substring]) among( 'han' ('a') //-. 'hen' ('e') // | 'hin' ('i') // | 'hon' ('o') // | 'h{a"}n' ('{a"}') // Illative [43] 'h{o"}n' ('{o"}') // | 'siin' VI // | 'seen' LONG //-' 'den' VI 'tten' VI // Genitive plurals [34] () 'n' // Genitive or Illative ( try ( LONG // Illative or 'ie' // Genitive and next ] ) /* otherwise Genitive */ ) 'a' '{a"}' //-. (V1 C) // | 'tta' 'tt{a"}' // Partitive [32] ('e') // | 'ta' 't{a"}' //-' 'ssa' 'ss{a"}' // Inessive [41] 'sta' 'st{a"}' // Elative [42] 'lla' 'll{a"}' // Adessive [44] 'lta' 'lt{a"}' // Ablative [51] 'lle' // Allative [46] 'na' 'n{a"}' // Essive [49] 'ksi' // Translative[50] 'ine' // Comitative [51] /* Abessive and Instructive are too rare for inclusion [51] */ ) delete set ending_removed ) define other_endings as ( setlimit tomark p2 for ([substring]) among( 'mpi' 'mpa' 'mp{a"}' 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] (not 'po') //-improves things 'impi' 'impa' 'imp{a"}' 'immi' 'imma' 'imm{a"}' // Superlative forms [86] 'eja' 'ej{a"}' // indicates agent [93.1B] ) delete ) define i_plural as ( // [26] setlimit tomark p1 for ([substring]) among( 'i' 'j' ) delete ) define t_plural as ( // [26] setlimit tomark p1 for ( ['t'] test V1 delete ) setlimit tomark p2 for ([substring]) among( 'mma' (not 'po') //-mmat endings 'imma' //-immat endings ) delete ) define tidy as ( setlimit tomark p1 for ( do ( LONG and ([next] delete ) ) // undouble vowel do ( [AEI] C delete ) // remove trailing a, a", e, i do ( ['j'] 'o' or 'u' delete ) do ( ['o'] 'j' delete ) ) goto non-V1 [C] -> x x delete // undouble consonant ) ) define stem as ( do mark_regions unset ending_removed backwards ( do particle_etc do possessive do case_ending do other_endings (ending_removed do i_plural) or do t_plural 
do tidy ) ) snowball-3.0.1/algorithms/french.sbl000066400000000000000000000157101500727106100174570ustar00rootroot00000000000000routines ( elisions prelude postlude mark_regions RV R1 R2 standard_suffix i_verb_suffix verb_suffix residual_suffix un_double un_accent ) externals ( stem ) integers ( pV p1 p2 ) groupings ( elision_char v keep_with_s oux_ending ) stringescapes {} /* special characters */ stringdef a^ '{U+00E2}' // a-circumflex stringdef a` '{U+00E0}' // a-grave stringdef cc '{U+00E7}' // c-cedilla stringdef e" '{U+00EB}' // e-diaeresis (rare) stringdef e' '{U+00E9}' // e-acute stringdef e^ '{U+00EA}' // e-circumflex stringdef e` '{U+00E8}' // e-grave stringdef i" '{U+00EF}' // i-diaeresis stringdef i^ '{U+00EE}' // i-circumflex stringdef o^ '{U+00F4}' // o-circumflex stringdef u^ '{U+00FB}' // u-circumflex stringdef u` '{U+00F9}' // u-grave define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' // Replace -oux with -ou if preceded by one of these letters. define oux_ending 'bhjlnp' // Single character elisions define elision_char 'cdjlmnst' define elisions as ( [ (elision_char or 'qu') '{'}' ] not atlimit delete ) define prelude as repeat goto ( ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') or ('y' ] <- 'Y') ) or ( [ '{e"}' ] <- 'He' ) or ( [ '{i"}' ] <- 'Hi' ) or ( ['y'] v <- 'Y' ) or ( 'q' ['u'] <- 'U' ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v v next ) or among ( // Exception list: 'par' // paris, parie, pari 'col' // colis 'tap' // tapis () 'ni' (v) // niais/nierais/nié/niâmes/nièrent // extensions possible here ) or ( next gopast v ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') 'Y' (<- 'y') 'He' (<- '{e"}') 'Hi' (<- '{i"}') 'H' (delete) '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] 
among( 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' 'ances' 'iqUes' 'ismes' 'ables' 'istes' ( R2 delete ) 'atrice' 'ateur' 'ation' 'atrices' 'ateurs' 'ations' ( R2 delete try ( ['ic'] (R2 delete) or <-'iqU' ) ) 'logie' 'logies' ( R2 <- 'log' ) 'usion' 'ution' 'usions' 'utions' ( R2 <- 'u' ) 'ence' 'ences' ( R2 <- 'ent' ) 'ement' 'ements' ( RV delete try ( [substring] among( 'iv' (R2 delete ['at'] R2 delete) 'eus' ((R2 delete) or (R1<-'eux')) 'abl' 'iqU' (R2 delete) 'i{e`}r' 'I{e`}r' (RV <-'i') ) ) ) 'it{e'}' 'it{e'}s' ( R2 delete try ( [substring] among( 'abil' ((R2 delete) or <-'abl') 'ic' ((R2 delete) or <-'iqU') 'iv' (R2 delete) ) ) ) 'if' 'ive' 'ifs' 'ives' ( R2 delete try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) ) 'eaux' (<- 'eau') 'aux' (R1 <- 'al') 'oux' (oux_ending <- 'ou') 'euse' 'euses'((R2 delete) or (R1<-'eux')) 'issement' 'issements'(R1 non-v delete) // verbal // fail(...) below forces entry to verb_suffix. -ment typically // follows the p.p., e.g 'confus{e'}ment'. 'amment' (RV fail(<- 'ant')) 'emment' (RV fail(<- 'ent')) 'ment' 'ments' (test(v RV) fail(delete)) // v is e,i,u,{e'},I or U ) ) define i_verb_suffix as setlimit tomark pV for ( [substring] among ( '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' 'issez' 'issiez' 'issions' 'issons' 'it' (not 'H' non-v delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among ( 'ions' (R2 delete) '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' 'erons' 'eront' 'ez' 'iez' // 'ons' //-best omitted (delete) '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ait' 'ant' 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' 'assions' ( try('e' RV]) delete ) 'ais' 'aise' 'aises' ( not among ( 'al' // balais, calais, galais, malais, 
palais, valais (next atlimit) 'auv' // mauvais '{e'}pl' // déplais () ) delete ) 'eais' (delete) ) ) define keep_with_s 'aiou{e`}s' define residual_suffix as ( try(['s'] test ('Hi' or non-keep_with_s) delete) setlimit tomark pV for ( [substring] among( 'ion' (R2 's' or 't' delete) 'ier' 'i{e`}re' 'Ier' 'I{e`}re' (<-'i') 'e' (delete) ) ) ) define un_double as ( test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete ) define un_accent as ( atleast 1 non-v [ '{e'}' or '{e`}' ] <-'e' ) ) define stem as ( do elisions do prelude do mark_regions backwards ( do ( ( ( standard_suffix or i_verb_suffix or verb_suffix ) and try( [ ('Y' ] <- 'i' ) or ('{cc}'] <- 'c' ) ) ) or residual_suffix ) // try(['ent'] RV delete) // is best omitted do un_double do un_accent ) do postlude ) snowball-3.0.1/algorithms/german.sbl000066400000000000000000000070031500727106100174570ustar00rootroot00000000000000routines ( prelude postlude mark_regions R1 R2 standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v et_ending s_ending st_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef ss '{U+00DF}' define v 'aeiouy{a"}{o"}{u"}' define et_ending 'dfgklmnrstUz{a"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) repeat ( [substring] among( '{ss}' (<- 'ss') 'ae' (<- '{a"}') 'oe' (<- '{o"}') 'ue' (<- '{u"}') 'qu' () '' (next) ) ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' ( not 'syst' // don't remove -em from words ending 
-system delete ) 'ern' 'er' 'erin' 'erinnen' // conflate female versions of nouns ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) 'ln' 'lns' ( <- 'l' ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) 'et' ( test et_ending not among ( 'geordn' // Still conflate untergeordnet/untergeordnetere, etc. 'intern' // Don't conflate Internet and internes. 'plan' // Don't conflate Plan and Planet. 'tick' // Don't conflate Tick and Ticket. 'tr' // Still conflate Vertreter/Vertretung, etc. ) delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( delete ) ) ) ) ) ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-3.0.1/algorithms/greek.sbl000066400000000000000000000646231500727106100173160ustar00rootroot00000000000000// A stemmer for Modern Greek language, based on: // // Ntais, Georgios. Development of a Stemmer for the Greek // Language. Diss. Royal Institute of Technology, 2006. // https://sais.se/mthprize/2007/ntais2007.pdf // // Saroukos, Spyridon. Enhancing a Greek language stemmer. // University of Tampere, 2008. 
// https://trepo.tuni.fi/bitstream/handle/10024/80480/gradu03463.pdf stringescapes {} stringdef a '{U+03B1}' // alpha stringdef v '{U+03B2}' // beta stringdef g '{U+03B3}' // gamma stringdef d '{U+03B4}' // delta stringdef e '{U+03B5}' // epsilon stringdef z '{U+03B6}' // zeta stringdef i '{U+03B7}' // eta stringdef th '{U+03B8}' // theta stringdef y '{U+03B9}' // iota stringdef k '{U+03BA}' // kappa stringdef l '{U+03BB}' // lambda stringdef m '{U+03BC}' // mu stringdef n '{U+03BD}' // nu stringdef x '{U+03BE}' // xi stringdef o '{U+03BF}' // omicron stringdef p '{U+03C0}' // pi stringdef r '{U+03C1}' // rho stringdef ss '{U+03C2}' // sigma final stringdef s '{U+03C3}' // sigma stringdef t '{U+03C4}' // tau stringdef u '{U+03C5}' // upsilon stringdef f '{U+03C6}' // phi stringdef ch '{U+03C7}' // chi stringdef ps '{U+03C8}' // psi stringdef oo '{U+03C9}' // omega stringdef A '{U+0391}' // Alpha stringdef V '{U+0392}' // Beta stringdef G '{U+0393}' // Gamma stringdef D '{U+0394}' // Delta stringdef E '{U+0395}' // Epsilon stringdef Z '{U+0396}' // Zeta stringdef I '{U+0397}' // Eta stringdef Th '{U+0398}' // Theta stringdef Y '{U+0399}' // Iota stringdef K '{U+039A}' // Kappa stringdef L '{U+039B}' // Lambda stringdef M '{U+039C}' // Mu stringdef N '{U+039D}' // Nu stringdef X '{U+039E}' // Xi stringdef O '{U+039F}' // Omicron stringdef P '{U+03A0}' // Pi stringdef R '{U+03A1}' // Rho stringdef S '{U+03A3}' // Sigma stringdef T '{U+03A4}' // Tau stringdef U '{U+03A5}' // Upsilon stringdef F '{U+03A6}' // Phi stringdef Ch '{U+03A7}' // Chi stringdef Ps '{U+03A8}' // Psi stringdef Oo '{U+03A9}' // Omega stringdef Y: '{U+03AA}' // Iota with dialytika stringdef U: '{U+03AB}' // Upsilon with dialytika stringdef a' '{U+03AC}' // alpha with tonos stringdef e' '{U+03AD}' // epsilon with tonos stringdef i' '{U+03AE}' // eta with tonos stringdef y' '{U+03AF}' // iota with tonos stringdef o' '{U+03CC}' // omicron with tonos stringdef u' '{U+03CD}' // upsilon with tonos 
stringdef oo' '{U+03CE}' // omega with tonos stringdef i:' '{U+0390}' // iota with dialytika and tonos stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos stringdef i: '{U+03CA}' // iota with dialytika stringdef u: '{U+03CB}' // upsilon with dialytika stringdef A' '{U+0386}' // Alpha with tonos stringdef E' '{U+0388}' // Epsilon with tonos stringdef I' '{U+0389}' // Eta with tonos stringdef Y' '{U+038A}' // Iota with tonos stringdef O' '{U+038C}' // Omicron with tonos stringdef U' '{U+038E}' // Upsilon with tonos stringdef OO' '{U+038F}' // Omega with tonos externals ( stem ) booleans ( test1 ) groupings ( v v2 ) routines ( tolower has_min_length step_s1 step_s2 step_s3 step_s4 step_s5 step_s6 step_s7 step_s8 step_s9 step_s10 step_1 step_2a step_2b step_2c step_2d step_3 step_4 step_5a step_5b step_5c step_5d step_5e step_5f step_5g step_5h step_5i step_5j step_5k step_5l step_5m step_6 step_7 ) define v '{a}{e}{i}{y}{o}{u}{oo}' define v2 '{a}{e}{i}{y}{o}{oo}' backwardmode ( define has_min_length as ( $(len >= 3) ) define tolower as ( repeat ( [substring] among ( '{A}' (<- '{a}') '{V}' (<- '{v}') '{G}' (<- '{g}') '{D}' (<- '{d}') '{E}' (<- '{e}') '{Z}' (<- '{z}') '{I}' (<- '{i}') '{Th}' (<- '{th}') '{Y}' (<- '{y}') '{K}' (<- '{k}') '{L}' (<- '{l}') '{M}' (<- '{m}') '{N}' (<- '{n}') '{X}' (<- '{x}') '{O}' (<- '{o}') '{P}' (<- '{p}') '{R}' (<- '{r}') '{S}' (<- '{s}') '{T}' (<- '{t}') '{U}' (<- '{u}') '{F}' (<- '{f}') '{Ch}' (<- '{ch}') '{Ps}' (<- '{ps}') '{Oo}' (<- '{oo}') '{Y:}' (<- '{y}') '{U:}' (<- '{u}') '{a'}' (<- '{a}') '{e'}' (<- '{e}') '{i'}' (<- '{i}') '{y'}' (<- '{y}') '{o'}' (<- '{o}') '{u'}' (<- '{u}') '{oo'}' (<- '{oo}') '{i:'}' (<- '{i}') '{u:'}' (<- '{u}') '{i:}' (<- '{i}') '{u:}' (<- '{u}') '{A'}' (<- '{a}') '{E'}' (<- '{e}') '{I'}' (<- '{i}') '{Y'}' (<- '{y}') '{O'}' (<- '{o}') '{U'}' (<- '{u}') '{OO'}' (<- '{oo}') '{ss}' (<- '{s}') '' (next) ) ) ) define step_1 as ( [substring] among ( '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' 
'{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}') '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}') '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}') '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}') '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}') '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}') '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}') '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}') '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}') '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}') '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}') ) unset test1 ) define step_s1 as ( [substring] among ( '{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}' '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' ( delete unset test1 ([] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' (<- '{y}') '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}' '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}' '{r}' '{p}{y}{p}{e}{r}{o}{r}' (<- '{y}{z}') )) ) ) ) define step_s2 as ( [substring] among ( '{oo}{th}{i}{k}{a}' 
'{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') ) ) ) ) define step_s3 as ( (['{y}{s}{a}'] atlimit <- '{y}{s}') or [substring] among ( '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' ( delete unset test1 ([] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' (<- '{y}') '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}' '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}' (<- '{y}{s}') )) ) ) ) define step_s4 as ( [substring] among ( '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' (<- '{y}') ) ) ) ) define step_s5 as ( [substring] among ( '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}' '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' ( delete unset test1 ([] substring atlimit among ( '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' 
'{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}' (<- '{y}') '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}' '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}' '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}' (<- '{y}{s}{t}') )) ) ) ) define step_s6 as ( [substring] among ( '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' ( delete unset test1 ([] substring atlimit among ( '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}' (<- '{y}{s}{m}') '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}' (<- '{y}') )) or ([substring] among ( '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}') '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}') '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}') '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}') '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}') '{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}') '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}') '{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}') '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}') '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}') )) ) ) ) define step_s7 as ( [substring] among ( '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' ( delete unset test1 [] substring atlimit among ( '{s}' '{ch}' (<- '{a}{r}{a}{k}') ) ) ) ) define step_s8 as ( [substring] among ( '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' ( delete unset test1 ([] substring atlimit among ( '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}' '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' 
'{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}' '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}' '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}' (<- '{a}{k}') '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}' '{p}{a}{t}{e}{r}' '{p}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}' // We're implementing the revised algorithm from the Saroukos paper // which also lists '{k}{o}{n}' and '{s}{k}' here, but these are // also listed just above in the `Add {a}{k} in the end` exception. // It seems they're redundant here, so we omit them (otherwise the // Snowball compiler would report an error). (<- '{y}{t}{s}') )) or ([] '{k}{o}{r}' <- '{y}{t}{s}') ) ) ) define step_s9 as ( [substring] among ( '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' ( delete unset test1 ([] substring atlimit among ( '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}') )) or ([] substring among ( '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}') )) ) ) ) define step_s10 as ( [substring] among ( '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' ( delete unset test1 [] substring atlimit among ( '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}' (<- '{y}{s}{k}') ) ) ) ) define step_2a as ( [substring] among ( '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) ) not (substring among ( '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}' )) insert '{a}{d}' ) define step_2b as ( [substring] among ( '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete) ) [] substring among ( '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') ) ) define step_2c as ( [substring] among ( '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete) ) [] substring among ( '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' 
'{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}' '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') ) ) define step_2d as ( [substring] among ( '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) ) [] substring atlimit among ( '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') ) ) define step_3 as ( [substring] among ( '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) ) ([] v <- '{y}') ) define step_4 as ( [substring] among ( '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) ) ([] v <- '{y}{k}') or [] substring atlimit among ( '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}' '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}' '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}' '{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}' (<- '{y}{k}') ) ) define step_5a as ( do (['{a}{g}{a}{m}{e}'] atlimit <- '{a}{g}{a}{m}') do ( [substring] among ( '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1) ) ) ['{a}{m}{e}'] delete unset test1 [] substring atlimit among ( '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}' (<- '{a}{m}') ) ) define step_5b as ( do ( [substring] among ( '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}' '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' ( delete unset test1 [] substring atlimit 
among ( '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') ) ) ) ) ['{a}{n}{e}'] delete unset test1 ([] v2 <- '{a}{n}') or [] substring atlimit among ( '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}' '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}' '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}' '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}' '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}' '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}' '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}' '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}' '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}' '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}' '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}' '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}' '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}' '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}' '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}' '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}' (<- '{a}{n}') ) ) define step_5c as ( do ( [substring] among ( '{i}{s}{e}{t}{e}' (delete unset test1) ) ) ['{e}{t}{e}'] delete unset test1 ([] v2 <- '{e}{t}') or ([] substring among ( '{o}{d}' 
'{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}' '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}' '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}' (<- '{e}{t}') )) or [] substring atlimit among ( '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}' '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}' '{th}{a}{r}{r}' '{th}' (<- '{e}{t}') ) ) define step_5d as ( [substring] among ( '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' ( delete unset test1 ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or ([] '{k}{r}{e}' <- '{oo}{n}{t}') ) ) ) define step_5e as ( [substring] among ( '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' ( delete unset test1 ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}') ) ) ) define step_5f as ( do ( ['{y}{e}{s}{t}{e}'] delete unset test1 [] substring atlimit among ( '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}') ) ) ['{e}{s}{t}{e}'] delete unset test1 [] substring atlimit among ( '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}' (<- '{y}{e}{s}{t}') ) ) define step_5g as ( do ( [substring] among ( '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1) ) ) [substring] among ( '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' ( delete unset test1 ([] substring among ( '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}') )) or ([] substring atlimit among ( '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}') )) ) ) ) define step_5h as ( 
[substring] among ( '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' ( delete unset test1 ([] substring among ( '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}' '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}') )) or ([] substring atlimit among ( '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}' '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}' '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}' (<- '{o}{u}{s}') )) ) ) ) define step_5i as ( [substring] among ( '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' ( delete unset test1 ([] '{k}{o}{l}{l}' <- '{a}{g}') or ( ([] substring among ( '{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}' () '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}' (<- '{a}{g}') )) or ([] substring atlimit among ( '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}' '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}' '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}' '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}' '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}' (<- '{a}{g}') )) ) ) ) ) define step_5j as ( [substring] among ( '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1) ) [] substring atlimit among ( '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' 
'{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}') ) ) define step_5k as ( [substring] among ( '{i}{s}{t}{e}' (delete unset test1) ) [] substring atlimit among ( '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}' (<- '{i}{s}{t}') ) ) define step_5l as ( [substring] among ( '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1) ) [] substring atlimit among ( '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}') ) ) define step_5m as ( [substring] among ( '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1) ) [] substring atlimit among ( '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}' (<- '{o}{u}{m}') ) ) define step_6 as ( do ( [substring] among ( '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') ) ) test1 [substring] among ( '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}' '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}' '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}' '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}' '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}' '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' 
'{i}{s}{o}{u}{n}' '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}' '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}' '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}' '{oo}{n}' (delete) ) ) define step_7 as ( [substring] among ( '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete) ) ) ) define stem as ( backwards ( do tolower has_min_length set test1 do step_1 do step_s1 do step_s2 do step_s3 do step_s4 do step_s5 do step_s6 do step_s7 do step_s8 do step_s9 do step_s10 do step_2a do step_2b do step_2c do step_2d do step_3 do step_4 do step_5a do step_5b do step_5c do step_5d do step_5e do step_5f do step_5g do step_5h do step_5j do step_5i do step_5k do step_5l do step_5m do step_6 do step_7 ) ) snowball-3.0.1/algorithms/hindi.sbl000066400000000000000000000226621500727106100173110ustar00rootroot00000000000000// An implementation of "A Lightweight Stemmer for Hindi": // http://www.kbcs.in/downloads/papers/StmmerHindi.pdf externals ( stem ) stringescapes {} // The transliteration scheme used for our stringdefs matches that used in the // paper, as documented in the appendix. It appears to match the WX notation // (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently // uses 'z' for Anunasika whereas the paper uses Mh. // // We discriminate dependent vowels by adding a leading "_" to their stringdef // names (mnemonic: the _ signifies removing the implicit a from the preceding // character). 
// Vowels and sonorants: stringdef a '{U+0905}' stringdef A '{U+0906}' stringdef i '{U+0907}' stringdef I '{U+0908}' stringdef u '{U+0909}' stringdef U '{U+090A}' stringdef q '{U+090B}' stringdef e '{U+090F}' stringdef E '{U+0910}' stringdef o '{U+0913}' stringdef O '{U+0914}' // Vowel signs: stringdef _A '{U+093E}' stringdef _i '{U+093F}' stringdef _I '{U+0940}' stringdef _u '{U+0941}' stringdef _U '{U+0942}' stringdef _q '{U+0943}' stringdef _e '{U+0947}' stringdef _E '{U+0948}' stringdef _o '{U+094B}' stringdef _O '{U+094C}' // Diacritics: stringdef M '{U+0902}' stringdef H '{U+0903}' stringdef Mh '{U+0901}' stringdef Z '{U+093C}' // Nukta stringdef virama '{U+094D}' // Velar consonants: stringdef k '{U+0915}' stringdef K '{U+0916}' stringdef g '{U+0917}' stringdef G '{U+0918}' stringdef f '{U+0919}' // Palatal consonants: stringdef c '{U+091A}' stringdef C '{U+091B}' stringdef j '{U+091C}' stringdef J '{U+091D}' stringdef F '{U+091E}' // Retroflex consonants: stringdef t '{U+091F}' stringdef T '{U+0920}' stringdef d '{U+0921}' stringdef D '{U+0922}' stringdef N '{U+0923}' // Dental consonants: stringdef w '{U+0924}' stringdef W '{U+0925}' stringdef x '{U+0926}' stringdef X '{U+0927}' stringdef n '{U+0928}' // Labial consonants: stringdef p '{U+092A}' stringdef P '{U+092B}' stringdef b '{U+092C}' stringdef B '{U+092D}' stringdef m '{U+092E}' // Semi-vowels: stringdef y '{U+092F}' stringdef r '{U+0930}' stringdef l '{U+0932}' stringdef v '{U+0935}' // Fricatives: stringdef S '{U+0936}' stringdef R '{U+0937}' stringdef s '{U+0938}' stringdef h '{U+0939}' stringdef lY '{U+0933}' // Precomposed characters - letters + nukta: stringdef nZ '{U+0929}' // ≡ {n}{Z} stringdef rZ '{U+0931}' // ≡ {r}{Z} stringdef lYZ '{U+0934}' // ≡ {lY}{Z} stringdef kZ '{U+0958}' // ≡ {k}{Z} stringdef KZ '{U+0959}' // ≡ {K}{Z} stringdef gZ '{U+095A}' // ≡ {g}{Z} stringdef jZ '{U+095B}' // ≡ {j}{Z} stringdef dZ '{U+095C}' // ≡ {d}{Z} stringdef DZ '{U+095D}' // ≡ {D}{Z} stringdef PZ 
'{U+095E}' // ≡ {P}{Z} stringdef yZ '{U+095F}' // ≡ {y}{Z} groupings ( consonant ) routines ( CONSONANT ) define consonant '{k}{K}{g}{G}{f}' + '{c}{C}{j}{J}{F}' + '{t}{T}{d}{D}{N}' + '{w}{W}{x}{X}{n}' + '{p}{P}{b}{B}{m}' + '{y}{r}{l}{v}' + '{S}{R}{s}{h}' + '{lY}' + '{Z}' + // Nukta // Precomposed characters - letter and nukta: '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}' backwardmode ( define CONSONANT as ( consonant ) ) define stem as ( // We assume in this implementation that the whole word doesn't count // as a valid suffix to remove, so we remove the longest suffix from // the list which leaves at least one character. This change affects // 47 words out of the 65,140 in the sample vocabulary from Hindi // wikipedia. // // The trick here is we use `next` in forward mode to advance the cursor // to the second character, then `backwards` swaps the cursor and limit. next backwards ( [substring] among ( // The list below is derived from figure 3 in the paper. // // We perform the stemming on the Devanagari characters rather than // transliterating to Latin, so we have adapted the list below to // reflect this by converting suffixes back to Devanagari as // follows: // // * within the suffixes, "a" after a consonant is dropped since // consonants have an implicit "a". // // * within the suffixes, a vowel other than "a" after a consonant // is a dependent vowel (vowel sign); a vowel (including "a") // after a non-consonant is an independent vowel. // // * to allow the vowel at the start of each suffix being dependent // or independent, we include each suffix twice. For the // dependent version, a leading "a" is dropped and we check that // the suffix is preceded by a consonant (which will have an // implicit "a"). 
// // * we add '{a}', which is needed for the example given right at // the end of section 5 to work (conflating BarawIya and // BarawIyawA), and which 3.1 a.v strongly suggests should be in // the list: // // Thus, the following suffix deletions (longest possible // match) are required to reduce inflected forms of masculine // nouns to a common stem: // a A i [...] // // Adding '{a}' only affect 2 words out of the 65,140 in the // sample vocabulary. // // * The transliterations of our stems would end with "a" when our // stems end in a consonant, so we also include {virama} in the // list of suffixes to remove (this affects 222 words from the // sample vocabulary). // // We've also assumed that Mh in the suffix list always means {Mh} // and never {M}{h}{virama}. Only one of the 65,140 words in the // sample vocabulary stems differently due to this (and that word // seems to be a typo). '{virama}' '{a}' '{A}' '{i}' '{I}' '{u}' '{U}' '{e}' '{o}' '{e}{M}' '{o}{M}' '{A}{M}' '{u}{A}{M}' '{u}{e}{M}' '{u}{o}{M}' '{A}{e}{M}' '{A}{o}{M}' '{i}{y}{_A}{M}' '{i}{y}{_o}{M}' '{A}{i}{y}{_A}{M}' '{A}{i}{y}{_o}{M}' '{A}{Mh}' '{i}{y}{_A}{Mh}' '{A}{i}{y}{_A}{Mh}' '{a}{w}{_A}{e}{M}' '{a}{w}{_A}{o}{M}' '{a}{n}{_A}{e}{M}' '{a}{n}{_A}{o}{M}' '{a}{w}{_A}' '{a}{w}{_I}' '{I}{M}' '{a}{w}{_I}{M}' '{a}{w}{_e}' '{A}{w}{_A}' '{A}{w}{_I}' '{A}{w}{_I}{M}' '{A}{w}{_e}' '{a}{n}{_A}' '{a}{n}{_I}' '{a}{n}{_e}' '{A}{n}{_A}' '{A}{n}{_e}' '{U}{M}{g}{_A}' '{U}{M}{g}{_I}' '{A}{U}{M}{g}{_A}' '{A}{U}{M}{g}{_I}' '{e}{M}{g}{_e}' '{e}{M}{g}{_I}' '{A}{e}{M}{g}{_e}' '{A}{e}{M}{g}{_I}' '{o}{g}{_e}' '{o}{g}{_I}' '{A}{o}{g}{_e}' '{A}{o}{g}{_I}' '{e}{g}{_A}' '{e}{g}{_I}' '{A}{e}{g}{_A}' '{A}{e}{g}{_I}' '{A}{y}{_A}' '{A}{e}' '{A}{I}' '{A}{I}{M}' '{i}{e}' '{A}{o}' '{A}{i}{e}' '{a}{k}{r}' '{A}{k}{r}' '{_A}' '{_i}' '{_I}' '{_u}' '{_U}' '{_e}' '{_o}' '{_e}{M}' '{_o}{M}' '{_A}{M}' '{_u}{A}{M}' '{_u}{e}{M}' '{_u}{o}{M}' '{_A}{e}{M}' '{_A}{o}{M}' '{_i}{y}{_A}{M}' '{_i}{y}{_o}{M}' '{_A}{i}{y}{_A}{M}' '{_A}{i}{y}{_o}{M}' 
'{_A}{Mh}' '{_i}{y}{_A}{Mh}' '{_A}{i}{y}{_A}{Mh}' '{_I}{M}' '{_A}{w}{_A}' '{_A}{w}{_I}' '{_A}{w}{_I}{M}' '{_A}{w}{_e}' '{_A}{n}{_A}' '{_A}{n}{_e}' '{_U}{M}{g}{_A}' '{_U}{M}{g}{_I}' '{_A}{U}{M}{g}{_A}' '{_A}{U}{M}{g}{_I}' '{_e}{M}{g}{_e}' '{_e}{M}{g}{_I}' '{_A}{e}{M}{g}{_e}' '{_A}{e}{M}{g}{_I}' '{_o}{g}{_e}' '{_o}{g}{_I}' '{_A}{o}{g}{_e}' '{_A}{o}{g}{_I}' '{_e}{g}{_A}' '{_e}{g}{_I}' '{_A}{e}{g}{_A}' '{_A}{e}{g}{_I}' '{_A}{y}{_A}' '{_A}{e}' '{_A}{I}' '{_A}{I}{M}' '{_i}{e}' '{_A}{o}' '{_A}{i}{e}' '{_A}{k}{r}' /* Suffixes with a leading implicit a: */ '{w}{_A}{e}{M}' CONSONANT '{w}{_A}{o}{M}' CONSONANT '{n}{_A}{e}{M}' CONSONANT '{n}{_A}{o}{M}' CONSONANT '{w}{_A}' CONSONANT '{w}{_I}' CONSONANT '{w}{_I}{M}' CONSONANT '{w}{_e}' CONSONANT '{n}{_A}' CONSONANT '{n}{_I}' CONSONANT '{n}{_e}' CONSONANT '{k}{r}' CONSONANT ) delete ) ) snowball-3.0.1/algorithms/hungarian.sbl000066400000000000000000000124001500727106100201570ustar00rootroot00000000000000/* Hungarian Stemmer Removes noun inflections */ routines ( mark_regions R1 v_ending case case_special case_other plural owned sing_owner plur_owner instrum factive undouble double ) externals ( stem ) integers ( p1 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' //a-acute stringdef e' '{U+00E9}' //e-acute stringdef i' '{U+00ED}' //i-acute stringdef o' '{U+00F3}' //o-acute stringdef o" '{U+00F6}' //o-umlaut stringdef oq '{U+0151}' //o-double acute stringdef u' '{U+00FA}' //u-acute stringdef u" '{U+00FC}' //u-umlaut stringdef uq '{U+0171}' //u-double acute define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' define mark_regions as ( $p1 = limit ( // Word start with a vowel, start R1 after: V...C v do (gopast non-v setmark p1) ) or ( // Word start with a non-vowel, start R1 after: C...V gopast v setmark p1 ) ) backwardmode ( define R1 as $p1 <= cursor define v_ending as ( [substring] R1 among( '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define double as ( test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 
'jj' 'kk' 'll' 'lly' 'mm' 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') ) define undouble as ( next [hop 1] delete ) define instrum as( [substring] R1 among( 'al' (double) 'el' (double) ) delete undouble ) define case as ( [substring] R1 among( 'ban' 'ben' 'ba' 'be' 'ra' 're' 'nak' 'nek' 'val' 'vel' 't{o'}l' 't{oq}l' 'r{o'}l' 'r{oq}l' 'b{o'}l' 'b{oq}l' 'hoz' 'hez' 'h{o"}z' 'n{a'}l' 'n{e'}l' 'ig' 'at' 'et' 'ot' '{o"}t' '{e'}rt' 'k{e'}pp' 'k{e'}ppen' 'kor' 'ul' '{u"}l' 'v{a'}' 'v{e'}' 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' 'k{e'}nt' 'en' 'on' 'an' '{o"}n' 'n' 't' ) delete v_ending ) define case_special as( [substring] R1 among( '{e'}n' (<- 'e') '{a'}n' (<- 'a') '{a'}nk{e'}nt' (<- 'a') ) ) define case_other as( [substring] R1 among( 'astul' 'est{u"}l' (delete) 'stul' 'st{u"}l' (delete) '{a'}stul' (<- 'a') '{e'}st{u"}l' (<- 'e') ) ) define factive as( [substring] R1 among( '{a'}' (double) '{e'}' (double) ) delete undouble ) define plural as ( [substring] R1 among( '{a'}k' (<- 'a') '{e'}k' (<- 'e') '{o"}k' (delete) 'ak' (delete) 'ok' (delete) 'ek' (delete) 'k' (delete) ) ) define owned as ( [substring] R1 among ( 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) '{e'}k{e'}' (<- 'e') '{a'}k{e'}' (<- 'a') 'k{e'}' (delete) '{e'}{e'}i' (<- 'e') '{a'}{e'}i' (<- 'a') '{e'}i' (delete) '{e'}{e'}' (<- 'e') '{e'}' (delete) ) ) define sing_owner as ( [substring] R1 among( '{u"}nk' 'unk' (delete) '{a'}nk' (<- 'a') '{e'}nk' (<- 'e') 'nk' (delete) '{a'}juk' (<- 'a') '{e'}j{u"}k' (<- 'e') 'juk' 'j{u"}k' (delete) 'uk' '{u"}k' (delete) 'em' 'om' 'am' (delete) '{a'}m' (<- 'a') '{e'}m' (<- 'e') 'm' (delete) 'od' 'ed' 'ad' '{o"}d' (delete) '{a'}d' (<- 'a') '{e'}d' (<- 'e') 'd' (delete) 'ja' 'je' (delete) 'a' 'e' 'o' (delete) '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define plur_owner as ( [substring] R1 among( 'jaim' 'jeim' (delete) '{a'}im' (<- 'a') '{e'}im' (<- 'e') 'aim' 'eim' (delete) 'im' (delete) 'jaid' 'jeid' (delete) '{a'}id' (<- 'a') '{e'}id' (<- 'e') 'aid' 'eid' (delete) 'id' 
(delete) 'jai' 'jei' (delete) '{a'}i' (<- 'a') '{e'}i' (<- 'e') 'ai' 'ei' (delete) 'i' (delete) 'jaink' 'jeink' (delete) 'eink' 'aink' (delete) '{a'}ink' (<- 'a') '{e'}ink' (<- 'e') 'ink' 'jaitok' 'jeitek' (delete) 'aitok' 'eitek' (delete) '{a'}itok' (<- 'a') '{e'}itek' (<- 'e') 'itek' (delete) 'jeik' 'jaik' (delete) 'aik' 'eik' (delete) '{a'}ik' (<- 'a') '{e'}ik' (<- 'e') 'ik' (delete) ) ) ) define stem as ( do mark_regions backwards ( do instrum do case do case_special do case_other do factive do owned do sing_owner do plur_owner do plural ) ) snowball-3.0.1/algorithms/indonesian.sbl000066400000000000000000000165151500727106100203450ustar00rootroot00000000000000// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: // http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf integers ( // The paper defines measure as the number of vowels in the word. We // count this initially, then adjust the count each time we remove a // prefix or suffix. measure // Numeric code for the type of prefix removed: // // 0 other/none // 1 'di' or 'meng' or 'ter' // 2 'per' // 3 'ke' or 'peng' // 4 'ber' // // Some of these have variant forms, so e.g. "meng" includes "men", "me", // "meny", "mem". // // Note that the value of prefix is only used in remove_suffix (and // routines it calls) so we don't need to worry about // remove_second_order_prefix overwriting a value of prefix set by // remove_first_order_prefix since remove_suffix gets called between // the two. 
prefix ) groupings ( vowel ) routines ( remove_particle remove_possessive_pronoun remove_first_order_prefix remove_second_order_prefix remove_suffix KER SUFFIX_KAN_OK SUFFIX_AN_OK SUFFIX_I_OK VOWEL ) externals ( stem ) stringescapes {} backwardmode ( define remove_particle as ( [substring] among ( 'kah' 'lah' 'pun' (delete $measure-=1) ) ) define remove_possessive_pronoun as ( [substring] among ( 'ku' 'mu' 'nya' (delete $measure-=1) ) ) // prefix not in {ke, peng, per} define SUFFIX_KAN_OK as ( // On page 29, the example "kompas Q.31" says "Both Nazief and Porter // stemmer converted the word peledakan (blast, explotion [sic]) to // ledak (to blast, to explode)". However, the algorithm as described // doesn't behave in this way - grammatically the prefix pe- occurs as a // variation of both the first-order derivational prefix peng- and the // second-order derivational prefix per-, but table 2.5 doesn't include // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly) // as having prefix "per" not "peng", and so we remove derivational // suffix "kan" rather than "an" to give stem leda. (Porter-style // stemmers remove the longest suffix they can amongst those available, // which this paper notes in the last paragraph on page 15). // // We resolve this by amending the condition on suffix "kan" to // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's // behaviour match all the examples in the paper except for one: // "perbaikan" is shown in table 3.4 as stemming to "bai", but with // this change it now stems to "baik". The table notes that "baik" is // the actual root so this deviation is an improvement. In a sample // vocabulary derived from the most common words in id.wikipedia.org, // this change only affects 0.12% of words (76 out of 64,587, including // "peledakan" and "perbaikan"). 
$prefix != 3 and $prefix != 2 ) // prefix not in {di, meng, ter} define SUFFIX_AN_OK as ( $prefix != 1 ) define SUFFIX_I_OK as ( // prefix not in {ke, peng, ber} $prefix <= 2 // The rest of the condition from the paper is: // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i // // The meaning of this is unclear in several ways, and none of the // examples given of the stemmer's behaviour in the paper help to // resolve these issues. // // Notice that c₂ isn't actually used - the most obvious explanation // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁". // // Elsewhere the paper defines V... as meaning "the stem starts with // a vowel" and K... as meaning "the stem starts with a consonant". // The meaning of | isn't actually defined, but clearly means // alternation. // // However nowhere is the precedence of | vs ... defined, and there // isn't a standard precedence we could reasonably assume. In other // places where the paper says X|Y... it seems the | binds more // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit // odd as the first letter must be either a vowel or a consonant, so // that really just means "ends cᵢcⱼ" (and has at least one letter // before cᵢ but we only call SUFFIX_I_OK if $measure > 2 which // ensures that part). However, nowhere in the paper uses or defines // a notation such as ...X, which may explain this seemingly redundant // way of specifying this. // // The conditions elsewhere on prefix removal (e.g. V...) are clearly // on the stem left after the prefix is removed. None of the other // rules for suffix removal have conditions on the stem, but for // consistency with the prefix rules we might expect that the cᵢcⱼ // test is on what's left *after* removing the "i" suffix. // // Studying Indonesian wordlists and discussion with a native // speaker leads us to conclude that the purpose of this check is to // protect words of foreign origin (e.g. 
"televisi", "organisasi", // "komunikasi") from stemming, and the common feature of these is // that the word ends "-si", so we conclude that the condition here // should be read as "word does not end -si", and this is what we // have implemented. not 's' ) define remove_suffix as ( [substring] among ( 'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK (delete $measure-=1) ) ) ) define vowel 'aeiou' define VOWEL as ( vowel ) define KER as ( non-vowel 'er' ) define remove_first_order_prefix as ( [substring] among ( 'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1) 'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1) 'meny' VOWEL ($prefix=1 <-'s' $measure-=1) 'peny' VOWEL ($prefix=3 <-'s' $measure-=1) 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) ) ) define remove_second_order_prefix as ( // The paper has the condition on removal of prefix "bel" and "pel" as // just "ajar" not "ajar..." but it seems that the latter must be what // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". // This change only affects a very small number of words (11 out of // 64,587) and only for the better. 
[substring] among ( 'per' 'pe' (delete $prefix=2 $measure-=1) 'pelajar' (<-'ajar' $measure-=1) 'ber' (delete $prefix=4 $measure-=1) 'belajar' (<-'ajar' $prefix=4 $measure-=1) 'be' KER (delete $prefix=4 $measure-=1) ) ) define stem as ( $measure = 0 do ( repeat ( gopast vowel $measure+=1 ) ) $measure > 2 $prefix = 0 backwards ( do remove_particle $measure > 2 do remove_possessive_pronoun ) $measure > 2 test ( remove_first_order_prefix do ( test ($measure > 2 backwards remove_suffix) $measure > 2 remove_second_order_prefix ) ) or ( do remove_second_order_prefix do ($measure > 2 backwards remove_suffix) ) ) snowball-3.0.1/algorithms/irish.sbl000066400000000000000000000047471500727106100173400ustar00rootroot00000000000000routines ( R1 R2 RV initial_morph mark_regions noun_sfx deriv verb_sfx ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* Accented characters */ stringdef a' '{U+00E1}' // a-acute stringdef e' '{U+00E9}' // e-acute stringdef i' '{U+00ED}' // i-acute stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute define v 'aeiou{a'}{e'}{i'}{o'}{u'}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( gopast v setmark pV gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define initial_morph as ( [substring] among ( 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic (delete) // verbs 'd{'}' (delete) 'd{'}fh' (<- 'f') // other contractions 'm{'}' 'b{'}' (delete) 'sh' (<- 's') 'mb' (<- 'b') 'gc' (<- 'c') 'nd' (<- 'd') 'bhf' (<- 'f') 'ng' (<- 'g') 'bp' (<- 'p') 'ts' (<- 's') 'dt' (<- 't') // Lenition 'bh' (<- 'b') 'ch' (<- 'c') 'dh' (<- 'd') 'fh' (<- 'f') 'gh' (<- 'g') 'mh' (<- 'm') 'ph' (<- 'p') 'th' (<- 't') ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define noun_sfx as ( [substring] among ( 'amh' 'eamh' 'abh' 'eabh' 'aibh' 'ibh' 'aimh' 'imh' 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta' (R1 delete) 'ire' 'ir{i'}' 
'aire' 'air{i'}' (R2 delete) ) ) define deriv as ( [substring] among ( 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta' (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl 'arcacht' 'arcachta{i'}' 'arcachta' (<- 'arc') // monarcacht -> monarc 'gineach' 'gineas' 'ginis' (<- 'gin') 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}' (<- 'graf') 'paite' 'patach' 'pataigh' 'patacha' (<- 'paite') '{o'}ideach' '{o'}ideacha' '{o'}idigh' (<- '{o'}id') ) ) define verb_sfx as ( [substring] among ( 'imid' 'aimid' '{i'}mid' 'a{i'}mid' 'faidh' 'fidh' (RV delete) 'ain' 'eadh' 'adh' '{a'}il' 'tear' 'tar' (R1 delete) ) ) ) define stem as ( do initial_morph do mark_regions backwards ( do noun_sfx do deriv do verb_sfx ) ) snowball-3.0.1/algorithms/italian.sbl000066400000000000000000000115261500727106100176340ustar00rootroot00000000000000 routines ( prelude postlude mark_regions RV R1 R2 attached_pronoun standard_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v AEIO CG ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' stringdef a` '{U+00E0}' stringdef e' '{U+00E9}' stringdef e` '{U+00E8}' stringdef i' '{U+00ED}' stringdef i` '{U+00EC}' stringdef o' '{U+00F3}' stringdef o` '{U+00F2}' stringdef u' '{U+00FA}' stringdef u` '{U+00F9}' define v 'aeiou{a`}{e`}{i`}{o`}{u`}' define prelude as ( test repeat ( [substring] among( '{a'}' (<- '{a`}') '{e'}' (<- '{e`}') '{i'}' (<- '{i`}') '{o'}' (<- '{o`}') '{u'}' (<- '{u`}') 'qu' (<- 'qU') '' (next) ) ) repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or 'divan' // Otherwise "divano" stems to "div" and collides with "diva". 
or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi' // the compound forms are: 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' 'mela' 'mele' 'meli' 'melo' 'mene' 'tela' 'tele' 'teli' 'telo' 'tene' 'cela' 'cele' 'celi' 'celo' 'cene' 'vela' 'vele' 'veli' 'velo' 'vene' ) among( (RV) 'ando' 'endo' (delete) 'ar' 'er' 'ir' (<- 'e') ) ) define standard_suffix as ( [substring] among( 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' 'atrice' 'atrici' 'ante' 'anti' ( R2 delete ) 'azione' 'azioni' 'atore' 'atori' ( R2 delete try ( ['ic'] R2 delete ) ) 'logia' 'logie' ( R2 <- 'log' ) 'uzione' 'uzioni' 'usione' 'usioni' ( R2 <- 'u' ) 'enza' 'enze' ( R2 <- 'ente' ) 'amento' 'amenti' 'imento' 'imenti' ( RV delete ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' ( ['at'] R2 delete ) 'os' 'ic' 'abil' ) ) ) 'it{a`}' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'ivo' 'ivi' 'iva' 'ive' ( R2 delete try ( ['at'] R2 delete ['ic'] R2 delete ) ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' 'iremmo' 'iremo' 'ireste' 
'iresti' 'irete' 'ir{o`}' 'irono' 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' 'ono' 'uta' 'ute' 'uti' 'uto' 'ar' 'ir' // but 'er' is problematical (delete) ) ) define AEIO 'aeio{a`}{e`}{i`}{o`}' define CG 'cg' define vowel_suffix as ( try ( [AEIO] RV delete ['i'] RV delete ) try ( ['h'] CG RV delete ) ) ) define stem as ( do prelude do mark_regions backwards ( do attached_pronoun do (standard_suffix or verb_suffix) do vowel_suffix ) do postlude ) snowball-3.0.1/algorithms/lithuanian.sbl000066400000000000000000000316601500727106100203500ustar00rootroot00000000000000externals ( stem ) // escape symbols for substituting lithuanian characters stringescapes { } /* Special characters in Unicode Latin Extended-A */ // ' nosine stringdef ak '{U+0105}' // ą a + ogonek stringdef ek '{U+0119}' // ę e + ogonek stringdef ik '{U+012F}' // į i + ogonek stringdef uk '{U+0173}' // ų u + ogonek // . taskas stringdef e. '{U+0117}' // ė e + dot // - ilgoji stringdef u- '{U+016B}' // ū u + macron // v varnele stringdef cv '{U+010D}' // č c + caron (haček) stringdef sv '{U+0161}' // š s + caron (haček) stringdef zv '{U+017E}' // ž z + caron (haček) // [C](VC)^m[V|C] // definitions of variables for // p1 - position of m = 0 integers ( p1 ) // groupings // v - lithuanian vowels groupings ( v ) // v - all lithuanian vowels define v 'aeiyou{ak}{ek}{ik}{uk}{e.}{u-}' // all lithuanian stemmer routines: 4 steps routines ( step2 step1 fix_chdz fix_gd fix_conflicts ) backwardmode ( define step1 as ( setlimit tomark p1 for ([substring]) among ( // Daiktavardžiai (Nouns) // I linksniuotė (declension I) 'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys 'o' 'io' // vyro, kelio 'ui' 'iui' // vyrui, keliui '{ak}' 'i{ak}' '{ik}' // vyrą, kelią, brolį 'u' 'iu' // vyru, keliu 'e' 'yje' // vyre, kelyje 'y' 'au' 'i' // kely, brolau, broli, 'an' // nusižengiman 'ai' 'iai' // vyrai, keliai '{uk}' 'i{uk}' // vyrų, kelių 'ams' 
'am' // vyrams, vyram 'iams' 'iam' // broliams, broliam 'us' 'ius' // vyrus, brolius 'ais' 'iais' // vyrais, keliais 'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos 'uosna' 'iuosna' // vyruosna, keliuosna 'ysna' // žutysna 'asis' 'aisi' // sukimasis, sukimaisi 'osi' '{uk}si' // sukimosi, sukimųsi 'uisi' // sukimuisi '{ak}si' // sukimąsi 'usi' // sukimusi 'esi' // sukimesi 'uo' // mėnuo // II linksniuote (declension II) 'a' 'ia' // galva, vysnios 'os' 'ios' // galvos, vysnios 'oj' 'oje' 'ioje' // galvoje, vysnioje 'osna' 'iosna' // galvosna, vyšniosna 'om' 'oms' 'ioms' // galvoms, vysnioms 'omis' 'iomis' // galvomis, vysniomis 'ose' 'iose' // galvose, vysniose 'on' 'ion' // galvon, vyšnion // III linksniuote (declension III) '{e.}' // gervė '{e.}s' // gervės 'ei' // gervei '{ek}' // gervę '{e.}j' '{e.}je' // gervėj, gervėje '{e.}ms' // gervėms 'es' // gerves '{e.}mis' // gervėmis '{e.}se' // gervėse '{e.}sna' // gervėsna '{e.}n' // žydaitėn // IV linksniuote (declension IV) 'aus' 'iaus' // sūnaus, skaičiaus 'umi' 'iumi' // sūnumi, skaičiumi 'uje' 'iuje' // sūnuje, skaičiuje 'iau' // skaičiau '{u-}s' // sūnūs 'ums' // sūnums 'umis' // sūnumis 'un' 'iun' // sūnun, administratoriun // V linksniuote (declension V) 'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers 'eniui' 'eriai' // vandeniui, eriai 'en{ik}' 'er{ik}' // vandenį, seserį 'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria 'enyje' 'eryje' // vandenyje, seseryje 'ie' 'enie' 'erie' // avie, vandenie, seserie 'enys' 'erys' // vandenys, seserys // 'en{uk}' konfliktas su 'žandenų' 'antenų' 'er{uk}' // seserų 'ims' 'enims' 'erims' // avims, vandemins, seserims 'enis' // vandenis 'imis' // žebenkštimis 'enimis' // vandenimis 'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse // Būdvardžiai (Adjectives) // (i)a linksniuotė 'iem' 'iems' // geriem, geriems 'ame' 'iame' // naujame, mediniame // Veiksmažodžiai (Verbs) // Tiesioginė nuosaka (indicative mood) // esamasis laikas (present 
tense) // (i)a asmenuotė (declension (i)a) 'uosi' 'iuosi' // dirbuosi, traukiuosi 'iesi' // dirbiesi 'asi' 'iasi' // dirbasi, traukiasi 'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės 'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate 'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės // i asmenuotė (declension i) 'isi' // tikisi 'im' // mylim // 'ime' konfliktassu daiktavardžiu vietininku, pvz. 'gėrime' 'im{e.}s' // tikimės 'it' 'ite' // mylit, mylite, tikitės // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. žydaitės // o asmenuotė (declension o) 'ome' // mokome 'ot' 'ote' // mokot, mokote // būtasis laikas // o asmenuotė (declension o) '{e.}jo' '{e.}josi' // tikėjo, tikėjosi 'ot{e.}s' // tikėjotės/bijotės // ė asmenuotė (declension ė) 'eisi' // mokeisi '{e.}si' // mokėsi '{e.}m' '{e.}me' // mokėm, mokėme '{e.}m{e.}s' // mokėmės '{e.}t' '{e.}te' // mokėt, mokėte '{e.}t{e.}s' // mokėtės // būtasis dažninis laikas (frequentative past tense) 'ausi' // mokydavausi 'om{e.}s' // mokydavomės/bijomės // būsimasis laikas (future tense) 'siu' 'siuosi' // dirbsiu, mokysiuosi 'si' 'siesi' // dirbsi, dirbsiesi 's' 'ysis' // dirbs, mokysis 'sim' 'sime' // dirbsim, dirbsime 'sit' 'site' // gersit, gersite // tariamoji nuosaka (subjunctive mood) '{cv}iau' '{cv}iausi' // dirbčiau 'tum' 'tumei' // dirbtum, dirbtumei 'tumeis' 'tumeisi' // mokytumeis, mokytumeisi // 't{uk}' nes blogai batutų -> batų 't{uk}si' // mokytųsi // 'tume' konfliktas su 'šventume' 'tum{e.}m' // dirbtumėm 'tum{e.}me' // dirbtumėme 'tum{e.}m{e.}s' // mokytumėmės 'tute' 'tum{e.}t' // dirbtute, dirbtumėt 'tum{e.}te' // dirbtumėte 'tum{e.}t{e.}s' // mokytumėtės // liepiamoji nuosaka (imperative mood) 'k' 'ki' // dirbk, dirbki, mokykis // 'kis' konfliktas viln-išk-is // 'kime' konfliktas, nes pirkime 'kim{e.}s' // mokykimės // bendratis (infinitive) 'uoti' 'iuoti' // meluoti, dygsniuoti 'auti' 'iauti' // draugauti, girtuokliauti 'oti' 'ioti' // dovanoti, meškerioti '{e.}ti' // auklėti 
'yti' // akyti 'inti' // auginti 'in{e.}ti' // blusinėti 'enti' // gyventi 'tel{e.}ti' // bumbtelėti 'ter{e.}ti' // bumbterėti 'ti' // skalbti // 'tis' konfliktas, nes rytme-tis -> rytme // dalyviai (participles) '{ak}s' 'i{ak}s' '{ik}s' // dirbąs, žaidžiąs, gulįs 't{uk}s' // suktųs -> suk 'sim{e.}s' // suksimės 'sit{e.}s' // suksitės 'kite' // supkite ) delete ) define step2 as repeat ( setlimit tomark p1 for ([substring]) among ( // daiktavardziu priesagos (Noun suffixes) // budvardziu priesagos (Adjective suffixes) // 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is 'ing' // tvark-ing-as 'i{sv}k' // lenk-išk-as '{e.}t' // dem-ėt-as 'ot' // garban-ot-as 'uot' 'iuot' // lang-uot-as, akin-iuot-as // 'tin', nes augintinis // dirb-tin-is // 'ut', nes batutas, degutas etc. // maž-ut-is 'yt' // maž-yt-is 'iuk' // maž-iuk-as 'iul' // maž-ul-is '{e.}l' // maž-ėl-is 'yl' // maž-yl-is 'u{cv}iuk' // maž-učiuk-as 'uliuk' // maž-uliuk-as 'ut{e.}ait' // maž-utėlait-is 'ok' // did-ok-as 'iok' // višč-iok-as 'sv' '{sv}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as 'op' 'iop' // dvej-op-as, viener-iop-as 'ain' // apval-ain-as 'yk{sv}t' 'yk{sv}{cv}' // ten-ykšt-is, vakar-ykšč-ias // laisniai 'esn' // did-esn-is 'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias // ivardziuotiniai budvardziai (Pronominal adjectives) // vyriska gimine (Male gender) 'ias' // žaliasis 'oj' 'ioj' // gerojo, žaliojo 'aj' 'iaj' // gerajam, žaliajam '{ak}j' 'i{ak}j' // garąjį, žaliąjį 'uoj' 'iuoj' // geruoju, žaliuoju 'iej' // gerieji '{uk}j' 'i{uk}j' // gerųjų, žaliųjų 'ies' // geriesiems 'uos' 'iuos' // geruosius, žaliuosius 'ais' 'iais' // geraisiais, žaliaisiais // moteriska gimine (Female gender) 'os' 'ios' // gerosios, žaliosios '{ak}s' 'i{ak}s' // gerąsios, žaliąsias // būtasis dažninis laikas (frequentative past tense) 'dav' // ei-dav-o // dalyvių priesagos (particple suffix) 'ant' 'iant' 'int' // tur-int-is '{e.}j' // tur-ėj-o '{ek}' // '{e.}j{ek}' '{ek}s' // dirb-ęs-is 'siant' // 
dirb-siant // pusdalyviai (participle) 'dam' // bėg-dam-as 'auj' // ūkinink-auj-a 'jam' 'iau' 'am' // baiminim-ams-i ) delete ) define fix_conflicts as ( [substring] among ( // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite' 'aite' (<-'ait{e.}') // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės' 'ait{e.}s' (<-'ait{e.}') // ''ūs-uotės' -> 'ūs-uotė', konfliktas 'mokotės' 'uot{e.}s' (<-'uot{e.}') // ''ūs-uote' -> 'ūs-uotė', konfliktas 'mokote' 'uote' (<-'uot{e.}') // 'žerėjime' -> 'žėrėjimas', konfliktas su 'žais-ime' '{e.}jime' (<-'{e.}jimas') // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu' 'esiu' (<-'esys') // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu' 'asius' (<-'asys') // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime' 'avime' (<-'avimas') 'ojime' (<-'ojimas') // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės' 'okat{e.}s' (<-'okat{e.}') // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate' 'okate' (<-'okat{e.}') ) ) define fix_chdz as ( [substring] among ( '{cv}' (<-'t') 'd{zv}' (<-'d') ) ) define fix_gd as ( [substring] among ( 'gd' (<-'g') // '{e.}k' (<-'{e.}g') ) ) ) define stem as ( $p1 = limit do ( // priešdėlis 'a' ilgeniuose nei 6 raidės žodžiuose, pvz. 'a-liejus'. try (test 'a' $(len > 6) hop 1) gopast v gopast non-v setmark p1 ) backwards ( do fix_conflicts do step1 do fix_chdz do step2 do fix_chdz do fix_gd ) ) snowball-3.0.1/algorithms/lovins.sbl000066400000000000000000000200021500727106100175120ustar00rootroot00000000000000 stringescapes {} routines ( A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC endings undouble respell ) externals ( stem ) backwardmode ( /* Lovins' conditions A, B ... CC, as given in her Appendix B, where a test for a two letter prefix ('test hop 2') is implicitly assumed. Note that 'e' next 'u' corresponds to her u*e because Snowball is scanning backwards. 
*/ define A as ( hop 2 ) define B as ( hop 3 ) define C as ( hop 4 ) define D as ( hop 5 ) define E as ( test hop 2 not 'e' ) define F as ( test hop 3 not 'e' ) define G as ( test hop 3 'f' ) define H as ( test hop 2 't' or 'll' ) define I as ( test hop 2 not 'o' not 'e' ) define J as ( test hop 2 not 'a' not 'e' ) define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) define O as ( test hop 2 'l' or 'i' ) define P as ( test hop 2 not 'c' ) define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) define R as ( test hop 2 'n' or 'r' ) define S as ( test hop 2 'dr' or ('t' not 't') ) define T as ( test hop 2 's' or ('t' not 'o') ) define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) define V as ( test hop 2 'c' ) define W as ( test hop 2 not 's' not 'u' ) define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) define Y as ( test hop 2 'in' ) define Z as ( test hop 2 not 'f' ) define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' 'es' 't' ) ) define BB as ( test hop 3 not 'met' not 'ryst' ) define CC as ( test hop 2 'l' ) /* The system of endings, as given in Appendix A. 
*/ define endings as ( [substring] among( 'alistically' B 'arizability' A 'izationally' B 'antialness' A 'arisations' A 'arizations' A 'entialness' A 'allically' C 'antaneous' A 'antiality' A 'arisation' A 'arization' A 'ationally' B 'ativeness' A 'eableness' E 'entations' A 'entiality' A 'entialize' A 'entiation' A 'ionalness' A 'istically' A 'itousness' A 'izability' A 'izational' A 'ableness' A 'arizable' A 'entation' A 'entially' A 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A 'ionality' A 'ionalize' A 'iousness' A 'izations' A 'lessness' A 'ability' A 'aically' A 'alistic' B 'alities' A 'ariness' E 'aristic' A 'arizing' A 'ateness' A 'atingly' A 'ational' B 'atively' A 'ativism' A 'elihood' E 'encible' A 'entally' A 'entials' A 'entiate' A 'entness' A 'fulness' A 'ibility' A 'icalism' A 'icalist' A 'icality' A 'icalize' A 'ication' G 'icianry' A 'ination' A 'ingness' A 'ionally' A 'isation' A 'ishness' A 'istical' A 'iteness' A 'iveness' A 'ivistic' A 'ivities' A 'ization' F 'izement' A 'oidally' A 'ousness' A 'aceous' A 'acious' B 'action' G 'alness' A 'ancial' A 'ancies' A 'ancing' B 'ariser' A 'arized' A 'arizer' A 'atable' A 'ations' B 'atives' A 'eature' Z 'efully' A 'encies' A 'encing' A 'ential' A 'enting' C 'entist' A 'eously' A 'ialist' A 'iality' A 'ialize' A 'ically' A 'icance' A 'icians' A 'icists' A 'ifully' A 'ionals' A 'ionate' D 'ioning' A 'ionist' A 'iously' A 'istics' A 'izable' E 'lessly' A 'nesses' A 'oidism' A 'acies' A 'acity' A 'aging' B 'aical' A 'alist' A 'alism' B 'ality' A 'alize' A 'allic'BB 'anced' B 'ances' B 'antic' C 'arial' A 'aries' A 'arily' A 'arity' B 'arize' A 'aroid' A 'ately' A 'ating' I 'ation' B 'ative' A 'ators' A 'atory' A 'ature' E 'early' Y 'ehood' A 'eless' A 'elity' A 'ement' A 'enced' A 'ences' A 'eness' E 'ening' E 'ental' A 'ented' C 'ently' A 'fully' A 'ially' A 'icant' A 'ician' A 'icide' A 'icism' A 'icist' A 'icity' A 'idine' I 'iedly' A 'ihood' A 'inate' A 'iness' A 'ingly' B 'inism' J 'inity'CC 
'ional' A 'ioned' A 'ished' A 'istic' A 'ities' A 'itous' A 'ively' A 'ivity' A 'izers' F 'izing' F 'oidal' A 'oides' A 'otide' A 'ously' A 'able' A 'ably' A 'ages' B 'ally' B 'ance' B 'ancy' B 'ants' B 'aric' A 'arly' K 'ated' I 'ates' A 'atic' B 'ator' A 'ealy' Y 'edly' E 'eful' A 'eity' A 'ence' A 'ency' A 'ened' E 'enly' E 'eous' A 'hood' A 'ials' A 'ians' A 'ible' A 'ibly' A 'ical' A 'ides' L 'iers' A 'iful' A 'ines' M 'ings' N 'ions' B 'ious' A 'isms' B 'ists' A 'itic' H 'ized' F 'izer' F 'less' A 'lily' A 'ness' A 'ogen' A 'ward' A 'wise' A 'ying' B 'yish' A 'acy' A 'age' B 'aic' A 'als'BB 'ant' B 'ars' O 'ary' F 'ata' A 'ate' A 'eal' Y 'ear' Y 'ely' E 'ene' E 'ent' C 'ery' E 'ese' A 'ful' A 'ial' A 'ian' A 'ics' A 'ide' L 'ied' A 'ier' A 'ies' P 'ily' A 'ine' M 'ing' N 'ion' Q 'ish' C 'ism' B 'ist' A 'ite'AA 'ity' A 'ium' A 'ive' A 'ize' F 'oid' A 'one' R 'ous' A 'ae' A 'al'BB 'ar' X 'as' B 'ed' E 'en' F 'es' E 'ia' A 'ic' A 'is' A 'ly' B 'on' S 'or' T 'um' U 'us' V 'yl' R '{'}s' A 's{'}' A 'a' A 'e' A 'i' A 'o' A 's' W 'y' B (delete) ) ) /* Undoubling is rule 1 of appendix C. */ define undouble as ( test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' 'tt') [next] delete ) /* The other appendix C rules can be done together. */ define respell as ( [substring] among ( 'iev' (<-'ief') 'uct' (<-'uc') 'umpt' (<-'um') 'rpt' (<-'rb') 'urs' (<-'ur') 'istr' (<-'ister') 'metr' (<-'meter') 'olv' (<-'olut') 'ul' (not 'a' not 'i' not 'o' <-'l') 'bex' (<-'bic') 'dex' (<-'dic') 'pex' (<-'pic') 'tex' (<-'tic') 'ax' (<-'ac') 'ex' (<-'ec') 'ix' (<-'ic') 'lux' (<-'luc') 'uad' (<-'uas') 'vad' (<-'vas') 'cid' (<-'cis') 'lid' (<-'lis') 'erid' (<-'eris') 'pand' (<-'pans') 'end' (not 's' <-'ens') 'ond' (<-'ons') 'lud' (<-'lus') 'rud' (<-'rus') 'her' (not 'p' not 't' <-'hes') 'mit' (<-'mis') 'ent' (not 'm' <-'ens') /* 'ent' was 'end' in the 1968 paper - a typo. 
*/ 'ert' (<-'ers') 'et' (not 'n' <-'es') 'yt' (<-'ys') 'yz' (<-'ys') ) ) ) define stem as ( backwards ( do endings do undouble do respell ) ) snowball-3.0.1/algorithms/nepali.sbl000066400000000000000000000130271500727106100174610ustar00rootroot00000000000000/* * Authors: * - Ingroj Shrestha , Nepali NLP Group * - Oleg Bartunov , Postgres Professional Ltd. * - Shreeya Singh Dhakal, Nepali NLP Group */ routines ( remove_category_1 remove_category_2 remove_category_3 ) stringescapes {} stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E stringdef dvsai '{U+0948}' // 
DEVANAGARI_VOWEL_SIGN_AI stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA externals ( stem ) backwardmode ( define remove_category_1 as( [substring] among ( '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' (delete) '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}' ('{dle}' or '{dvse}' or delete) ) ) define remove_category_2 as ( [substring] among( '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) '{dvsai}' ('{dlta}{dsv}{dlr}' delete) ) ) define remove_category_3 as( [substring] among( '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' 
'{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' (delete) ) ) ) define stem as ( backwards ( do remove_category_1 repeat ( do remove_category_2 remove_category_3 ) ) ) snowball-3.0.1/algorithms/norwegian.sbl000066400000000000000000000037511500727106100202050ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters */ stringdef ae '{U+00E6}' stringdef ao '{U+00E5}' stringdef e^ '{U+00EA}' // e-circumflex stringdef o` '{U+00F2}' // o-grave stringdef o' '{U+00F3}' // o-acute stringdef o^ '{U+00F4}' // o-circumflex stringdef o/ '{U+00F8}' define v 'ae{e^}io{o`}{o'}{o^}uy{ae}{ao}{o/}' define s_ending 'bcdfghjlmnoptvyz' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) gopast v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' 'hetens' 'ets' 'et' 'het' 'ast' (delete) 'ers' ( among ( 'amm' 'ast' 'ind' 'kap' 'kk' 'lt' 'nk' 'omm' 'pp' 'v' '{o/}st' 
() 'giv' 'hav' 'skap' '' (delete) ) ) 's' (s_ending or ('r' not 'e') or ('k' non-v) delete) 'erte' 'ert' (<-'er') ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'dt' 'vt' ) ) next] delete ) define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' 'hetslov' (delete) ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball-3.0.1/algorithms/porter.sbl000066400000000000000000000056541500727106100175330ustar00rootroot00000000000000integers ( p1 p2 ) booleans ( Y_found ) routines ( shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b ) externals ( stem ) groupings ( v v_WXY ) define v 'aeiouy' define v_WXY v + 'wxY' backwardmode ( define shortv as ( non-v_WXY v non-v ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( [substring] among ( 'sses' (<-'ss') 'ies' (<-'i') 'ss' () 's' (delete) ) ) define Step_1b as ( [substring] among ( 'eed' (R1 <-'ee') 'ed' 'ing' ( test gopast v delete test substring among( 'at' 'bl' 'iz' (insert 'e') 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x ([next] delete) '' (atmark p1 test shortv insert 'e') ) ) ) ) define Step_1c as ( ['y' or 'Y'] gopast v <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'eli' (<-'e') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alli' (<-'al') 'alism' 'aliti' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' (<-'ble') ) ) define Step_3 as ( [substring] R1 among ( 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ative' 'ful' 'ness' (delete) ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' 
('s' or 't' delete) ) ) define Step_5a as ( ['e'] R2 or (R1 not shortv) delete ) define Step_5b as ( ['l'] R2 'l' delete ) ) define stem as ( unset Y_found do ( ['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) $p1 = limit $p2 = limit do( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) backwards ( do Step_1a do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5a do Step_5b ) do(Y_found repeat(goto (['Y']) <-'y')) ) snowball-3.0.1/algorithms/portuguese.sbl000066400000000000000000000126261500727106100204170ustar00rootroot00000000000000routines ( prelude postlude mark_regions RV R1 R2 standard_suffix verb_suffix residual_suffix residual_form ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico stringdef e' '{U+00E9}' // e-acute stringdef e^ '{U+00EA}' // e-circumflex stringdef i' '{U+00ED}' // i-acute stringdef o^ '{U+00F4}' // o-circumflex stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute stringdef cc '{U+00E7}' // c-cedilla stringdef a~ '{U+00E3}' // a-tilde stringdef o~ '{U+00F5}' // o-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' define prelude as repeat ( [substring] among( '{a~}' (<- 'a~') '{o~}' (<- 'o~') '' (next) ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'a~' (<- '{a~}') 'o~' (<- '{o~}') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'eza' 'ezas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' '{a'}vel' '{i'}vel' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amento' 'amentos' 'imento' 'imentos' 
'adora' 'ador' 'a{cc}a~o' 'adoras' 'adores' 'a{cc}o~es' // no -ic test 'ante' 'antes' '{a^}ncia' ( R2 delete ) 'logia' 'logias' ( R2 <- 'log' ) 'u{cc}a~o' 'u{cc}o~es' ( R2 <- 'u' ) '{e^}ncia' '{e^}ncias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' 'avel' '{i'}vel' (R2 delete) ) ) ) 'idade' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) 'ira' 'iras' ( RV 'e' // -eira -eiras usually non-verbal <- 'ir' ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' 'ira' 'iras' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) ) ) define residual_form as ( [substring] among( 'e' '{e'}' '{e^}' ( RV delete [('u'] test 'g') or ('i'] test 'c') RV delete ) '{cc}' (<-'c') ) ) ) define stem as ( do prelude do 
mark_regions backwards ( do ( ( ( standard_suffix or verb_suffix ) and do ( ['i'] test 'c' RV delete ) ) or residual_suffix ) do residual_form ) do postlude ) snowball-3.0.1/algorithms/romanian.sbl000066400000000000000000000146041500727106100200170ustar00rootroot00000000000000 routines ( norm prelude postlude mark_regions RV R1 R2 step_0 standard_suffix combo_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) booleans ( standard_suffix_removed ) stringescapes {} /* special characters */ stringdef a^ '{U+00E2}' // a circumflex stringdef i^ '{U+00EE}' // i circumflex stringdef a+ '{U+0103}' // a breve stringdef sc '{U+015F}' // s cedilla stringdef tc '{U+0163}' // t cedilla stringdef s, '{U+0219}' // s comma stringdef t, '{U+021B}' // t comma define v 'aeiou{a^}{i^}{a+}' // Normalize old cedilla forms to correct comma-below forms. define norm as ( do repeat goto ( [substring] among ( '{sc}' (<- '{s,}') '{tc}' (<- '{t,}') ) ) ) define prelude as ( repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define step_0 as ( [substring] R1 among( 'ul' 'ului' ( delete ) 'aua' ( <-'a' ) 'ea' 'ele' 'elor' ( <-'e' ) 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' ( <-'i') 'ile' ( not 'ab' <- 'i' ) 'atei' ( <- 'at' ) 'a{t,}ie' 'a{t,}ia' ( <- 'a{t,}i' ) ) ) define combo_suffix as test ( [substring] R1 ( among( /* 'IST'. 
alternative: include the following 'alism' 'alisme' 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( <- 'al' ) */ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( <- 'abil' ) 'ibilitate' ( <- 'ibil' ) 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( <- 'iv' ) 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' 'icator' 'icatori' 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( <- 'ic' ) 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' 'atoare' 'ator' 'atori' '{a+}toare' '{a+}tor' '{a+}tori' ( <- 'at' ) 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' 'itoare' 'itor' 'itori' ( <- 'it' ) ) set standard_suffix_removed ) ) define standard_suffix as ( unset standard_suffix_removed repeat combo_suffix [substring] R2 ( among( // past participle is treated here, rather than // as a verb ending: 'at' 'ata' 'at{a+}' 'ati' 'ate' 'ut' 'uta' 'ut{a+}' 'uti' 'ute' 'it' 'ita' 'it{a+}' 'iti' 'ite' 'ic' 'ica' 'ice' 'ici' 'ic{a+}' 'abil' 'abila' 'abile' 'abili' 'abil{a+}' 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' 'ant' 'anta' 'ante' 'anti' 'ant{a+}' 'ator' 'atori' 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( delete ) 'iune' 'iuni' ( '{t,}'] <- 't' ) 'ism' 'isme' 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( <- 'ist' /* 'IST'. 
alternative: remove with <- '' */ ) ) set standard_suffix_removed ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( // 'long' infinitive: 'are' 'ere' 'ire' '{a^}re' // gerund: 'ind' '{a^}nd' 'indu' '{a^}ndu' 'eze' 'easc{a+}' // present: 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' 'e{s,}te' '{a+}sc' '{a+}{s,}ti' '{a+}{s,}te' // imperfect: 'am' 'ai' 'au' 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' // past: // (not 'ii') 'ui' 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' // pluperfect: 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' '{a^}ser{a+}' 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' ( non-v or 'u' delete ) // present: '{a+}m' 'a{t,}i' 'em' 'e{t,}i' 'im' 'i{t,}i' '{a^}m' '{a^}{t,}i' // past: 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' 'sei' 'se' // pluperfect: 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' (delete) ) ) define vowel_suffix as ( [substring] RV among ( 'a' 'e' 'i' 'ie' '{a+}' ( delete ) ) ) ) define stem as ( do norm do prelude do mark_regions backwards ( do step_0 do standard_suffix do ( standard_suffix_removed or verb_suffix ) do vowel_suffix ) do postlude ) snowball-3.0.1/algorithms/russian.sbl000066400000000000000000000143451500727106100177010ustar00rootroot00000000000000stringescapes {} /* the 33 Cyrillic letters represented in ASCII characters following the * conventions of the standard Library of Congress transliteration: */ stringdef a '{U+0430}' stringdef b '{U+0431}' stringdef v '{U+0432}' stringdef g '{U+0433}' stringdef d '{U+0434}' stringdef e '{U+0435}' stringdef e" '{U+0451}' stringdef zh '{U+0436}' stringdef z '{U+0437}' stringdef i '{U+0438}' stringdef i` '{U+0439}' 
stringdef k '{U+043A}' stringdef l '{U+043B}' stringdef m '{U+043C}' stringdef n '{U+043D}' stringdef o '{U+043E}' stringdef p '{U+043F}' stringdef r '{U+0440}' stringdef s '{U+0441}' stringdef t '{U+0442}' stringdef u '{U+0443}' stringdef f '{U+0444}' stringdef kh '{U+0445}' stringdef ts '{U+0446}' stringdef ch '{U+0447}' stringdef sh '{U+0448}' stringdef shch '{U+0449}' stringdef " '{U+044A}' stringdef y '{U+044B}' stringdef ' '{U+044C}' stringdef e` '{U+044D}' stringdef iu '{U+044E}' stringdef ia '{U+044F}' routines ( mark_regions R2 perfective_gerund adjective adjectival reflexive verb noun derivational tidy_up ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) backwardmode ( define R2 as $p2 <= cursor define perfective_gerund as ( [substring] among ( '{v}' '{v}{sh}{i}' '{v}{sh}{i}{s}{'}' ('{a}' or '{ia}' delete) '{i}{v}' '{i}{v}{sh}{i}' '{i}{v}{sh}{i}{s}{'}' '{y}{v}' '{y}{v}{sh}{i}' '{y}{v}{sh}{i}{s}{'}' (delete) ) ) define adjective as ( [substring] among ( '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' '{ia}{ia}' // and - '{o}{iu}' // - which is somewhat archaic '{e}{iu}' // - soft form of {o}{iu} (delete) ) ) define adjectival as ( adjective /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of errors. Removing im, uem, enn creates too many errors. 
*/ try ( [substring] among ( '{e}{m}' // present passive participle '{n}{n}' // adjective from past passive participle '{v}{sh}' // past active participle '{iu}{shch}' '{shch}' // present active participle ('{a}' or '{ia}' delete) //but not '{i}{m}' '{u}{e}{m}' // present passive participle //or '{e}{n}{n}' // adjective from past passive participle '{i}{v}{sh}' '{y}{v}{sh}'// past active participle '{u}{iu}{shch}' // present active participle (delete) ) ) ) define reflexive as ( [substring] among ( '{s}{ia}' '{s}{'}' (delete) ) ) define verb as ( [substring] among ( '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' '{n}{y}' '{t}{'}' '{e}{sh}{'}' '{n}{n}{o}' ('{a}' or '{ia}' delete) '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' (delete) /* note the short passive participle tests: '{n}{a}' '{n}' '{n}{o}' '{n}{y}' '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' */ ) ) define noun as ( [substring] among ( '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' (delete) /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' omitted - they only occur on 12 words. 
*/ ) ) define derivational as ( [substring] R2 among ( '{o}{s}{t}' '{o}{s}{t}{'}' (delete) ) ) define tidy_up as ( [substring] among ( '{e}{i`}{sh}' '{e}{i`}{sh}{e}' // superlative forms (delete ['{n}'] '{n}' delete ) '{n}' ('{n}' delete) // e.g. -nno endings '{'}' (delete) // with some slight false conflations ) ) ) define stem as ( // Normalise {e"} to {e}. The documentation has long suggested the user // should do this before calling the stemmer - we now do it for them. do repeat ( goto (['{e"}']) <- '{e}' ) do mark_regions backwards setlimit tomark pV for ( do ( perfective_gerund or ( try reflexive adjectival or verb or noun ) ) try([ '{i}' ] delete) // because noun ending -i{iu} is being treated as verb ending -{iu} do derivational do tidy_up ) ) snowball-3.0.1/algorithms/serbian.sbl000066400000000000000000001530621500727106100176400ustar00rootroot00000000000000/* Stemmer for Serbian language, based on: * * Ljubesic, Nikola. Pandzic, Ivan. Stemmer for Croatian * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/ * * authors: Stefan Petkovic and Dragan Ivanovic * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs */ routines ( cyr_to_lat prelude mark_regions R1 Step_1 Step_2 Step_3 ) externals ( stem ) booleans ( no_diacritics ) integers ( p1 ) groupings ( v ca sa rg ) stringescapes {} /* special characters - Unicode codepoints */ /* serbian cyrillic */ stringdef cyrA '{U+0430}' stringdef cyrB '{U+0431}' stringdef cyrV '{U+0432}' stringdef cyrG '{U+0433}' stringdef cyrD '{U+0434}' stringdef cyrDx '{U+0452}' stringdef cyrE '{U+0435}' stringdef cyrZh '{U+0436}' stringdef cyrZ '{U+0437}' stringdef cyrI '{U+0438}' stringdef cyrJ '{U+0458}' stringdef cyrK '{U+043A}' stringdef cyrL '{U+043B}' stringdef cyrLJ '{U+0459}' stringdef cyrM '{U+043C}' stringdef cyrN '{U+043D}' stringdef cyrNJ '{U+045A}' stringdef cyrO '{U+043E}' stringdef cyrP '{U+043F}' stringdef cyrR '{U+0440}' stringdef cyrS '{U+0441}' stringdef cyrT '{U+0442}' stringdef cyrCy 
'{U+045B}' stringdef cyrU '{U+0443}' stringdef cyrF '{U+0444}' stringdef cyrH '{U+0445}' stringdef cyrC '{U+0446}' stringdef cyrCx '{U+0447}' stringdef cyrDzx '{U+045F}' stringdef cyrSx '{U+0448}' /* serbian latin with diacritics */ stringdef cv '{U+010D}' // small c with caron stringdef c' '{U+0107}' // small c with acute stringdef zv '{U+017E}' // small z with caron stringdef sv '{U+0161}' // small s with caron stringdef d/ '{U+0111}' // small d with stroke define v 'aeiou' define sa '{cv}{c'}{zv}{sv}{d/}' define ca 'bvgdzjklmnprstfhc' + sa define rg 'r' define cyr_to_lat as ( do repeat goto ( [substring] among ( '{cyrA}' (<- 'a') '{cyrB}' (<- 'b') '{cyrV}' (<- 'v') '{cyrG}' (<- 'g') '{cyrD}' (<- 'd') '{cyrDx}' (<- '{d/}') '{cyrE}' (<- 'e') '{cyrZh}' (<- '{zv}') '{cyrZ}' (<- 'z') '{cyrI}' (<- 'i') '{cyrJ}' (<- 'j') '{cyrK}' (<- 'k') '{cyrL}' (<- 'l') '{cyrLJ}' (<- 'lj') '{cyrM}' (<- 'm') '{cyrN}' (<- 'n') '{cyrNJ}' (<- 'nj') '{cyrO}' (<- 'o') '{cyrP}' (<- 'p') '{cyrR}' (<- 'r') '{cyrS}' (<- 's') '{cyrT}' (<- 't') '{cyrCy}' (<- '{c'}') '{cyrU}' (<- 'u') '{cyrF}' (<- 'f') '{cyrH}' (<- 'h') '{cyrC}' (<- 'c') '{cyrCx}' (<- '{cv}') '{cyrDzx}' (<- 'd{zv}') '{cyrSx}' (<- '{sv}') ) ) ) define prelude as ( do repeat goto ( ca ['ije'] ca <- 'e' ) do repeat goto ( ca ['je'] ca <- 'e' ) do repeat goto ( ['dj'] <- '{d/}' ) ) define mark_regions as ( set no_diacritics do ( gopast sa unset no_diacritics ) $p1 = limit do ( gopast v setmark p1 ($p1 < 2) ( gopast non-v setmark p1 ) ) do ( gopast 'r' $(cursor >= 2) or (gopast non-rg) $(p1 - cursor > 1) setmark p1 ) ) backwardmode ( define R1 as $p1 <= cursor define Step_1 as ( [substring] among ( 'lozi' 'lozima' (<-'loga') 'pesi' 'pesima' (<-'peh') 'vojci' (<-'vojka') 'bojci' (<-'bojka') 'jaci' 'jacima' (<-'jak') '{cv}ajan' (<-'{cv}ajni') 'cajan' (no_diacritics <-'cajni') 'eran' (<-'erni') 'laran' (<-'larni') 'esan' (<-'esni') 'anjac' (<-'anjca') 'ajac' 'ajaca' (<-'ajca') 'ljaca' 'ljac' (<-'ljca') 'ejac' 'ejaca' (<-'ejca') 'ojac' 
'ojaca' (<-'ojca') 'ajaka' (<-'ajka') 'ojaka' (<-'ojka') '{sv}aca' '{sv}ac' (<-'{sv}ca') 'inzima' 'inzi' (<-'ing') 'tvenici' (<-'tvenik') 'tetici' 'teticima' (<-'tetika') 'nstava' (<-'nstva') 'nicima' (<-'nik') 'ticima' (<-'tik') 'zicima' (<-'zik') 'snici' (<-'snik') 'kuse' (<-'kusi') 'kusan' (<-'kusni') 'kustava' (<-'kustva') 'du{sv}an' (<-'du{sv}ni') 'dusan' (no_diacritics <-'dusni') 'antan' (<-'antni') 'bilan' (<-'bilni') 'tilan' (<-'tilni') 'avilan' (<-'avilni') 'silan' (<-'silni') 'gilan' (<-'gilni') 'rilan' (<-'rilni') 'nilan' (<-'nilni') 'alan' (<-'alni') 'ozan' (<-'ozni') 'rave' (<-'ravi') 'stavan' (<-'stavni') 'pravan' (<-'pravni') 'tivan' (<-'tivni') 'sivan' (<-'sivni') 'atan' (<-'atni') 'enat' (<-'enta') 'tetan' (<-'tetni') 'pletan' (<-'pletni') '{sv}ave' (<-'{sv}avi') 'save' (no_diacritics <-'savi') 'anata' (<-'anta') 'a{cv}ak' 'a{cv}aka' (<-'a{cv}ka') 'acak' 'acaka' (no_diacritics <-'acka') 'u{sv}ak' (<-'u{sv}ka') 'usak' (no_diacritics <-'uska') 'atak' 'ataka' 'atci' 'atcima' (<-'atka') 'etak' 'etaka' (<-'etka') 'itak' 'itaka' 'itci' (<-'itka') 'otak' 'otaka' (<-'otka') 'utak' 'utaka' 'utci' 'utcima' (<-'utka') 'eskan' (<-'eskna') 'ti{cv}an' (<-'ti{cv}ni') 'tican' (no_diacritics <-'ticni') 'ojsci' (<-'ojska') 'esama' (<-'esma') 'metar' 'metara' (<-'metra') 'centar' 'centara' (<-'centra') 'istar' 'istara' (<-'istra') 'o{sv}{c'}u' (<-'osti') 'oscu' (no_diacritics <-'osti') 'daba' (<-'dba') '{cv}cima' '{cv}ci' (<-'{cv}ka') 'mac' 'maca' (<-'mca') 'naca' 'nac' (<-'nca') 'voljan' (<-'voljni') 'anaka' (<-'anki') 'vac' 'vaca' (<-'vca') 'saca' 'sac' (<-'sca') 'raca' 'rac' (<-'rca') 'aoca' 'alaca' 'alac' (<-'alca') 'elaca' 'elac' (<-'elca') 'olaca' 'olac' 'olce' (<-'olca') 'njac' 'njaca' (<-'njca') 'ekata' 'ekat' (<-'ekta') 'izam' 'izama' (<-'izma') 'jebe' (<-'jebi') 'baci' (<-'baci') 'a{sv}an' (<-'a{sv}ni') 'asan' (no_diacritics <-'asni') ) ) define Step_2 as ( [substring] R1 among ( 'skijima' 'skijega' 'skijemu' 'skijem' 'skega' 'skemu' 'skem' 'skijim' 
'skijih' 'skijoj' 'skijeg' 'skiji' 'skije' 'skija' 'skoga' 'skome' 'skomu' 'skima' 'skog' 'skom' 'skim' 'skih' 'skoj' 'ski' 'ske' 'sko' 'ska' 'sku' (<-'sk') '{sv}kijima' '{sv}kijega' '{sv}kijemu' '{sv}kijem' '{sv}kega' '{sv}kemu' '{sv}kem' '{sv}kijim' '{sv}kijih' '{sv}kijoj' '{sv}kijeg' '{sv}kiji' '{sv}kije' '{sv}kija' '{sv}koga' '{sv}kome' '{sv}komu' '{sv}kima' '{sv}kog' '{sv}kom' '{sv}kim' '{sv}kih' '{sv}koj' '{sv}ki' '{sv}ke' '{sv}ko' '{sv}ka' '{sv}ku' (<-'{sv}k') 'stvima' 'stvom' 'stvo' 'stva' 'stvu' (<-'stv') '{sv}tvima' '{sv}tvom' '{sv}tvo' '{sv}tva' '{sv}tvu' (<-'{sv}tv') 'tanijama' 'tanijima' 'tanijom' 'tanija' 'taniju' 'tanije' 'taniji' (<-'tanij') 'manijama' 'manijima' 'manijom' 'manija' 'maniju' 'manije' 'maniji' (<-'manij') 'panijama' 'panijima' 'panijom' 'panija' 'paniju' 'panije' 'paniji' (<-'panij') 'ranijama' 'ranijima' 'ranijom' 'ranija' 'raniju' 'ranije' 'raniji' (<-'ranij') 'ganijama' 'ganijima' 'ganijom' 'ganija' 'ganiju' 'ganije' 'ganiji' (<-'ganij') 'aninom' 'anina' 'aninu' 'anine' 'anima' 'anin' 'anom' 'anu' 'ani' 'ana' 'ane' (<-'an') 'inima' 'inama' 'inom' 'ina' 'ine' 'ini' 'inu' 'ino' (<-'in') 'onovima' 'onova' 'onove' 'onovi' 'onima' 'onom' 'ona' 'one' 'oni' 'onu' (<-'on') 'nijima' 'nijega' 'nijemu' 'nijeg' 'nijem' 'nega' 'nemu' 'neg' 'nem' 'nijim' 'nijih' 'nijoj' 'niji' 'nije' 'nija' 'niju' 'nima' 'nome' 'nomu' 'noga' 'noj' 'nom' 'nih' 'nim' 'nog' 'no' 'ne' 'na' 'nu' 'ni' (<-'n') 'a{c'}oga' 'a{c'}ome' 'a{c'}omu' 'a{c'}ega' 'a{c'}emu' 'a{c'}ima' 'a{c'}oj' 'a{c'}ih' 'a{c'}om' 'a{c'}eg' 'a{c'}em' 'a{c'}og' 'a{c'}uh' 'a{c'}im' 'a{c'}e' 'a{c'}a' (<-'a{c'}') 'e{c'}oga' 'e{c'}ome' 'e{c'}omu' 'e{c'}ega' 'e{c'}emu' 'e{c'}ima' 'e{c'}oj' 'e{c'}ih' 'e{c'}om' 'e{c'}eg' 'e{c'}em' 'e{c'}og' 'e{c'}uh' 'e{c'}im' 'e{c'}e' 'e{c'}a' (<-'e{c'}') 'u{c'}oga' 'u{c'}ome' 'u{c'}omu' 'u{c'}ega' 'u{c'}emu' 'u{c'}ima' 'u{c'}oj' 'u{c'}ih' 'u{c'}om' 'u{c'}eg' 'u{c'}em' 'u{c'}og' 'u{c'}uh' 'u{c'}im' 'u{c'}e' 'u{c'}a' (<-'u{c'}') 'ugovima' 'ugovi' 'ugove' 'ugova' 
(<-'ugov') 'ugama' 'ugom' 'uga' 'uge' 'ugi' 'ugu' 'ugo' (<-'ug') 'logama' 'logom' 'loga' 'logu' 'loge' (<-'log') 'govima' 'gama' 'govi' 'gove' 'gova' 'gom' 'ga' 'ge' 'gi' 'gu' 'go' (<-'g') 'rarijem' 'rarija' 'rariju' 'rario' (<-'rari') 'otijem' 'otija' 'otiju' 'otio' (<-'oti') 'sijem' 'sija' 'siju' 'sio' (<-'si') 'lijem' 'lija' 'liju' 'lio' (<-'li') 'uju{c'}i' 'ujemo' 'ujete' 'ujmo' 'ujem' 'uje{sv}' 'uje' 'uju' (<-'uj') 'cajevima' 'cajevi' 'cajeva' 'cajeve' 'cajama' 'cajima' 'cajem' 'caja' 'caje' 'caji' 'caju' (<-'caj') '{cv}ajevima' '{cv}ajevi' '{cv}ajeva' '{cv}ajeve' '{cv}ajama' '{cv}ajima' '{cv}ajem' '{cv}aja' '{cv}aje' '{cv}aji' '{cv}aju' (<-'{cv}aj') '{c'}ajevima' '{c'}ajevi' '{c'}ajeva' '{c'}ajeve' '{c'}ajama' '{c'}ajima' '{c'}ajem' '{c'}aja' '{c'}aje' '{c'}aji' '{c'}aju' (<-'{c'}aj') '{d/}ajevima' '{d/}ajevi' '{d/}ajeva' '{d/}ajeve' '{d/}ajama' '{d/}ajima' '{d/}ajem' '{d/}aja' '{d/}aje' '{d/}aji' '{d/}aju' (<-'{d/}aj') 'lajevima' 'lajevi' 'lajeva' 'lajeve' 'lajama' 'lajima' 'lajem' 'laja' 'laje' 'laji' 'laju' (<-'laj') 'rajevima' 'rajevi' 'rajeva' 'rajeve' 'rajama' 'rajima' 'rajem' 'raja' 'raje' 'raji' 'raju' (<-'raj') 'bijima' 'bijama' 'bijom' 'bija' 'bije' 'biji' 'biju' 'bijo' (<-'bij') 'cijima' 'cijama' 'cijom' 'cija' 'cije' 'ciji' 'ciju' 'cijo' (<-'cij') 'dijima' 'dijama' 'dijom' 'dija' 'dije' 'diji' 'diju' 'dijo' (<-'dij') 'lijima' 'lijama' 'lijom' 'lije' 'liji' 'lijo' (<-'lij') 'nijama' 'nijom' 'nijo' (<-'nij') 'mijima' 'mijama' 'mijom' 'mija' 'mije' 'miji' 'miju' 'mijo' (<-'mij') '{zv}ijima' '{zv}ijama' '{zv}ijom' '{zv}ija' '{zv}ije' '{zv}iji' '{zv}iju' '{zv}ijo' (<-'{zv}ij') 'gijima' 'gijama' 'gijom' 'gija' 'gije' 'giji' 'giju' 'gijo' (<-'gij') 'fijima' 'fijama' 'fijom' 'fija' 'fije' 'fiji' 'fiju' 'fijo' (<-'fij') 'pijima' 'pijama' 'pijom' 'pija' 'pije' 'piji' 'piju' 'pijo' (<-'pij') 'rijima' 'rijama' 'rijom' 'rija' 'rije' 'riji' 'riju' 'rijo' (<-'rij') 'sijima' 'sijama' 'sijom' 'sije' 'siji' 'sijo' (<-'sij') 'tijima' 'tijama' 'tijom' 'tija' 'tije' 
'tiji' 'tiju' 'tijo' (<-'tij') 'zijima' 'zijama' 'zijom' 'zija' 'zije' 'ziji' 'ziju' 'zijo' (<-'zij') 'nalima' 'nalama' 'nalom' 'nala' 'nale' 'nali' 'nalu' 'nalo' (<-'nal') 'ijalima' 'ijalama' 'ijalom' 'ijala' 'ijale' 'ijali' 'ijalu' 'ijalo' (<-'ijal') 'ozilima' 'ozilom' 'ozila' 'ozile' 'ozilu' 'ozili' (<-'ozil') 'olovima' 'olovi' 'olova' 'olove' (<-'olov') 'olima' 'olom' 'ola' 'olu' 'ole' 'oli' (<-'ol') 'lemama' 'lemima' 'lemom' 'lema' 'leme' 'lemi' 'lemu' 'lemo' (<-'lem') 'ramama' 'ramom' 'rama' 'rame' 'rami' 'ramu' 'ramo' (<-'ram') 'arama' 'arima' 'arom' 'aru' 'ara' 'are' 'ari' (<-'ar') 'drama' 'drima' 'drom' 'dru' 'dra' 'dre' 'dri' (<-'dr') 'erama' 'erima' 'erom' 'eru' 'era' 'ere' 'eri' (<-'er') 'orama' 'orima' 'orom' 'oru' 'ora' 'ore' 'ori' (<-'or') 'esima' 'esom' 'ese' 'esa' 'esu' (<-'es') 'isima' 'isom' 'ise' 'isa' 'isu' (<-'is') 'ta{sv}ama' 'ta{sv}ima' 'ta{sv}om' 'ta{sv}em' 'ta{sv}a' 'ta{sv}u' 'ta{sv}i' 'ta{sv}e' (<-'ta{sv}') 'na{sv}ama' 'na{sv}ima' 'na{sv}om' 'na{sv}em' 'na{sv}a' 'na{sv}u' 'na{sv}i' 'na{sv}e' (<-'na{sv}') 'ja{sv}ama' 'ja{sv}ima' 'ja{sv}om' 'ja{sv}em' 'ja{sv}a' 'ja{sv}u' 'ja{sv}i' 'ja{sv}e' (<-'ja{sv}') 'ka{sv}ama' 'ka{sv}ima' 'ka{sv}om' 'ka{sv}em' 'ka{sv}a' 'ka{sv}u' 'ka{sv}i' 'ka{sv}e' (<-'ka{sv}') 'ba{sv}ama' 'ba{sv}ima' 'ba{sv}om' 'ba{sv}em' 'ba{sv}a' 'ba{sv}u' 'ba{sv}i' 'ba{sv}e' (<-'ba{sv}') 'ga{sv}ama' 'ga{sv}ima' 'ga{sv}om' 'ga{sv}em' 'ga{sv}a' 'ga{sv}u' 'ga{sv}i' 'ga{sv}e' (<-'ga{sv}') 'va{sv}ama' 'va{sv}ima' 'va{sv}om' 'va{sv}em' 'va{sv}a' 'va{sv}u' 'va{sv}i' 'va{sv}e' (<-'va{sv}') 'e{sv}ima' 'e{sv}ama' 'e{sv}om' 'e{sv}em' 'e{sv}i' 'e{sv}e' 'e{sv}a' 'e{sv}u' (<-'e{sv}') 'i{sv}ima' 'i{sv}ama' 'i{sv}om' 'i{sv}em' 'i{sv}i' 'i{sv}e' 'i{sv}a' 'i{sv}u' (<-'i{sv}') 'ikatima' 'ikatom' 'ikata' 'ikate' 'ikati' 'ikatu' 'ikato' (<-'ikat') 'latima' 'latom' 'lata' 'late' 'lati' 'latu' 'lato' (<-'lat') 'etama' 'etima' 'etom' 'eta' 'ete' 'eti' 'etu' 'eto' (<-'et') 'estima' 'estama' 'estom' 'esta' 'este' 'esti' 'estu' 'esto' (<-'est') 'istima' 
'istama' 'istom' 'ista' 'iste' 'isti' 'istu' 'isto' (<-'ist') 'kstima' 'kstama' 'kstom' 'ksta' 'kste' 'ksti' 'kstu' 'ksto' (<-'kst') 'ostima' 'ostama' 'ostom' 'osta' 'oste' 'osti' 'ostu' 'osto' (<-'ost') 'i{sv}tima' 'i{sv}tem' 'i{sv}ta' 'i{sv}te' 'i{sv}tu' (<-'i{sv}t') 'ovasmo' 'ovaste' 'ovahu' 'ovati' 'ova{sv}e' 'ovali' 'ovala' 'ovale' 'ovalo' 'ovat' 'ovah' 'ovao' (<-'ova') 'avijemu' 'avijima' 'avijega' 'avijeg' 'avijem' 'avemu' 'avega' 'aveg' 'avem' 'avijim' 'avijih' 'avijoj' 'avoga' 'avome' 'avomu' 'avima' 'avama' 'aviji' 'avije' 'avija' 'aviju' 'avim' 'avih' 'avoj' 'avom' 'avog' 'avi' 'ava' 'avu' 'ave' 'avo' (<-'av') 'evijemu' 'evijima' 'evijega' 'evijeg' 'evijem' 'evemu' 'evega' 'eveg' 'evem' 'evijim' 'evijih' 'evijoj' 'evoga' 'evome' 'evomu' 'evima' 'evama' 'eviji' 'evije' 'evija' 'eviju' 'evim' 'evih' 'evoj' 'evom' 'evog' 'evi' 'eva' 'evu' 'eve' 'evo' (<-'ev') 'ivijemu' 'ivijima' 'ivijega' 'ivijeg' 'ivijem' 'ivemu' 'ivega' 'iveg' 'ivem' 'ivijim' 'ivijih' 'ivijoj' 'ivoga' 'ivome' 'ivomu' 'ivima' 'ivama' 'iviji' 'ivije' 'ivija' 'iviju' 'ivim' 'ivih' 'ivoj' 'ivom' 'ivog' 'ivi' 'iva' 'ivu' 'ive' 'ivo' (<-'iv') 'ovijemu' 'ovijima' 'ovijega' 'ovijeg' 'ovijem' 'ovemu' 'ovega' 'oveg' 'ovijim' 'ovijih' 'ovijoj' 'ovoga' 'ovome' 'ovomu' 'ovima' 'oviji' 'ovije' 'ovija' 'oviju' 'ovim' 'ovih' 'ovoj' 'ovom' 'ovog' 'ovi' 'ova' 'ovu' 'ove' 'ovo' (<-'ov') 'movima' 'movom' 'mova' 'movu' 'move' 'movi' (<-'mov') 'lovima' 'lovom' 'lova' 'lovu' 'love' 'lovi' (<-'lov') 'elijemu' 'elijima' 'elijega' 'elijeg' 'elijem' 'elemu' 'elega' 'eleg' 'elem' 'elijim' 'elijih' 'elijoj' 'eloga' 'elome' 'elomu' 'elima' 'eliji' 'elije' 'elija' 'eliju' 'elim' 'elih' 'eloj' 'elom' 'elog' 'eli' 'ela' 'elu' 'ele' 'elo' (<-'el') 'anjijemu' 'anjijima' 'anjijega' 'anjijeg' 'anjijem' 'anjemu' 'anjega' 'anjeg' 'anjem' 'anjijim' 'anjijih' 'anjijoj' 'anjoga' 'anjome' 'anjomu' 'anjima' 'anjiji' 'anjije' 'anjija' 'anjiju' 'anjim' 'anjih' 'anjoj' 'anjom' 'anjog' 'anja' 'anje' 'anji' 'anjo' 'anju' (<-'anj') 
'enjijemu' 'enjijima' 'enjijega' 'enjijeg' 'enjijem' 'enjemu' 'enjega' 'enjeg' 'enjem' 'enjijim' 'enjijih' 'enjijoj' 'enjoga' 'enjome' 'enjomu' 'enjima' 'enjiji' 'enjije' 'enjija' 'enjiju' 'enjim' 'enjih' 'enjoj' 'enjom' 'enjog' 'enja' 'enje' 'enji' 'enjo' 'enju' (<-'enj') '{sv}njijemu' '{sv}njijima' '{sv}njijega' '{sv}njijeg' '{sv}njijem' '{sv}njemu' '{sv}njega' '{sv}njeg' '{sv}njem' '{sv}njijim' '{sv}njijih' '{sv}njijoj' '{sv}njoga' '{sv}njome' '{sv}njomu' '{sv}njima' '{sv}njiji' '{sv}njije' '{sv}njija' '{sv}njiju' '{sv}njim' '{sv}njih' '{sv}njoj' '{sv}njom' '{sv}njog' '{sv}nja' '{sv}nje' '{sv}nji' '{sv}njo' '{sv}nju' (<-'{sv}nj') 'anemu' 'anega' 'aneg' 'anem' (<-'an') 'enemu' 'enega' 'eneg' 'enem' (<-'en') '{sv}nemu' '{sv}nega' '{sv}neg' '{sv}nem' (<-'{sv}n') '{cv}inama' '{cv}inome' '{cv}inomu' '{cv}inoga' '{cv}inima' '{cv}inog' '{cv}inom' '{cv}inim' '{cv}inih' '{cv}inoj' '{cv}ina' '{cv}inu' '{cv}ini' '{cv}ino' '{cv}ine' (<-'{cv}in') 'ro{sv}iv{sv}i' 'ro{sv}ismo' 'ro{sv}iste' 'ro{sv}i{sv}e' 'ro{sv}imo' 'ro{sv}ite' 'ro{sv}iti' 'ro{sv}ili' 'ro{sv}ila' 'ro{sv}ilo' 'ro{sv}ile' 'ro{sv}im' 'ro{sv}i{sv}' 'ro{sv}it' 'ro{sv}ih' 'ro{sv}io' (<-'ro{sv}i') 'o{sv}ijemu' 'o{sv}ijima' 'o{sv}ijega' 'o{sv}ijeg' 'o{sv}ijem' 'o{sv}emu' 'o{sv}ega' 'o{sv}eg' 'o{sv}em' 'o{sv}ijim' 'o{sv}ijih' 'o{sv}ijoj' 'o{sv}oga' 'o{sv}ome' 'o{sv}omu' 'o{sv}ima' 'o{sv}iji' 'o{sv}ije' 'o{sv}ija' 'o{sv}iju' 'o{sv}im' 'o{sv}ih' 'o{sv}oj' 'o{sv}om' 'o{sv}og' 'o{sv}i' 'o{sv}a' 'o{sv}u' 'o{sv}e' (<-'o{sv}') 'evitijima' 'evitijega' 'evitijemu' 'evitijem' 'evitega' 'evitemu' 'evitem' 'evitijim' 'evitijih' 'evitijoj' 'evitijeg' 'evitiji' 'evitije' 'evitija' 'evitoga' 'evitome' 'evitomu' 'evitima' 'evitog' 'evitom' 'evitim' 'evitih' 'evitoj' 'eviti' 'evite' 'evito' 'evita' 'evitu' (<-'evit') 'ovitijima' 'ovitijega' 'ovitijemu' 'ovitijem' 'ovitega' 'ovitemu' 'ovitem' 'ovitijim' 'ovitijih' 'ovitijoj' 'ovitijeg' 'ovitiji' 'ovitije' 'ovitija' 'ovitoga' 'ovitome' 'ovitomu' 'ovitima' 'ovitog' 'ovitom' 'ovitim' 
'ovitih' 'ovitoj' 'oviti' 'ovite' 'ovito' 'ovita' 'ovitu' (<-'ovit') 'astijima' 'astijega' 'astijemu' 'astijem' 'astega' 'astemu' 'astem' 'astijim' 'astijih' 'astijoj' 'astijeg' 'astiji' 'astije' 'astija' 'astoga' 'astome' 'astomu' 'astima' 'astog' 'astom' 'astim' 'astih' 'astoj' 'asti' 'aste' 'asto' 'asta' 'astu' (<-'ast') 'kijemu' 'kijima' 'kijega' 'kijeg' 'kijem' 'kemu' 'kega' 'keg' 'kem' 'kijim' 'kijih' 'kijoj' 'koga' 'kome' 'komu' 'kima' 'kiji' 'kije' 'kija' 'kiju' 'kim' 'kih' 'koj' 'kom' 'kog' 'kov' 'ki' 'ka' 'ku' 'ke' 'ko' (<-'k') 'evaju{c'}i' 'evasmo' 'evaste' 'evajmo' 'evajte' 'evaju' 'evala' 'evale' 'evali' 'evalo' 'evamo' 'evana' 'evane' 'evani' 'evano' 'evate' 'evati' 'eva{sv}e' 'evahu' 'evah' 'evaj' 'evam' 'evan' 'evao' 'evat' 'evav' 'eva{sv}' (<-'eva') 'avaju{c'}i' 'avasmo' 'avaste' 'avajmo' 'avajte' 'avaju' 'avala' 'avale' 'avali' 'avalo' 'avamo' 'avana' 'avane' 'avani' 'avano' 'avate' 'avati' 'ava{sv}e' 'avahu' 'avah' 'avaj' 'avam' 'avan' 'avao' 'avat' 'avav' 'ava{sv}' (<-'ava') 'ivaju{c'}i' 'ivasmo' 'ivaste' 'ivajmo' 'ivajte' 'ivaju' 'ivala' 'ivale' 'ivali' 'ivalo' 'ivamo' 'ivana' 'ivane' 'ivani' 'ivano' 'ivate' 'ivati' 'iva{sv}e' 'ivahu' 'ivah' 'ivaj' 'ivam' 'ivan' 'ivao' 'ivat' 'ivav' 'iva{sv}' (<-'iva') 'uvaju{c'}i' 'uvasmo' 'uvaste' 'uvajmo' 'uvajte' 'uvaju' 'uvala' 'uvale' 'uvali' 'uvalo' 'uvamo' 'uvana' 'uvane' 'uvani' 'uvano' 'uvate' 'uvati' 'uva{sv}e' 'uvahu' 'uvah' 'uvaj' 'uvam' 'uvan' 'uvao' 'uvat' 'uvav' 'uva{sv}' (<-'uva') 'irujemo' 'irujete' 'iruju{c'}i' 'iraju{c'}i' 'irivat' 'irujem' 'iruje{sv}' 'irujmo' 'irujte' 'irav{sv}i' 'irasmo' 'iraste' 'irati' 'iramo' 'irate' 'iraju' 'ira{sv}e' 'irahu' 'irala' 'iralo' 'irali' 'irale' 'iruje' 'iruju' 'iruj' 'iral' 'iran' 'iram' 'ira{sv}' 'irat' 'irah' 'irao' (<-'ir') 'a{cv}ismo' 'a{cv}iste' 'a{cv}iti' 'a{cv}imo' 'a{cv}ite' 'a{cv}i{sv}e' 'a{cv}e{c'}i' 'a{cv}ila' 'a{cv}ilo' 'a{cv}ili' 'a{cv}ile' 'a{cv}ena' 'a{cv}eno' 'a{cv}eni' 'a{cv}ene' 'a{cv}io' 'a{cv}im' 'a{cv}i{sv}' 'a{cv}it' 'a{cv}ih' 
'a{cv}en' 'a{cv}i' 'a{cv}e' (<-'a{cv}') 'a{cv}av{sv}i' 'a{cv}asmo' 'a{cv}aste' 'a{cv}ahu' 'a{cv}ati' 'a{cv}amo' 'a{cv}ate' 'a{cv}a{sv}e' 'a{cv}ala' 'a{cv}alo' 'a{cv}ali' 'a{cv}ale' 'a{cv}aju' 'a{cv}ana' 'a{cv}ano' 'a{cv}ani' 'a{cv}ane' 'a{cv}ao' 'a{cv}am' 'a{cv}a{sv}' 'a{cv}at' 'a{cv}ah' 'a{cv}an' (<-'a{cv}a') 'nuv{sv}i' 'nusmo' 'nuste' 'nu{c'}i' 'nimo' 'nite' 'nemo' 'nete' 'nula' 'nulo' 'nule' 'nuli' 'nuto' 'nuti' 'nuta' 'ne{sv}' 'nuo' 'nut' (<-'n') 'niv{sv}i' 'nismo' 'niste' 'niti' 'nila' 'nilo' 'nile' 'nili' 'ni{sv}' 'nio' (<-'ni') 'aju{c'}i' 'av{sv}i' 'asmo' 'ajmo' 'ajte' 'ajem' 'aloj' 'amo' 'ate' 'aje' 'aju' 'ati' 'a{sv}e' 'ahu' 'ala' 'ali' 'ale' 'alo' 'ano' 'at' 'ah' 'ao' 'aj' 'an' 'am' 'a{sv}' (<-'a') 'uraju{c'}i' 'urasmo' 'uraste' 'urajmo' 'urajte' 'uramo' 'urate' 'uraju' 'urati' 'ura{sv}e' 'urahu' 'urala' 'urali' 'urale' 'uralo' 'urana' 'urano' 'urani' 'urane' 'ural' 'urat' 'urah' 'urao' 'uraj' 'uran' 'uram' 'ura{sv}' (<-'ur') 'astajasmo' 'astajaste' 'astajahu' 'astajati' 'astajemo' 'astajete' 'astaja{sv}e' 'astajali' 'astaju{c'}i' 'astajala' 'astajalo' 'astajale' 'astajmo' 'astajao' 'astajem' 'astaje{sv}' 'astajat' 'astajah' 'astajte' 'astaje' 'astaju' (<-'astaj') 'istajasmo' 'istajaste' 'istajahu' 'istajati' 'istajemo' 'istajete' 'istaja{sv}e' 'istajali' 'istaju{c'}i' 'istajala' 'istajalo' 'istajale' 'istajmo' 'istajao' 'istajem' 'istaje{sv}' 'istajat' 'istajah' 'istajte' 'istaje' 'istaju' (<-'istaj') 'ostajasmo' 'ostajaste' 'ostajahu' 'ostajati' 'ostajemo' 'ostajete' 'ostaja{sv}e' 'ostajali' 'ostaju{c'}i' 'ostajala' 'ostajalo' 'ostajale' 'ostajmo' 'ostajao' 'ostajem' 'ostaje{sv}' 'ostajat' 'ostajah' 'ostajte' 'ostaje' 'ostaju' (<-'ostaj') 'alama' 'alima' 'alom' 'alu' 'al' (<-'a') 'ajevima' 'ajevi' 'ajeva' 'ajeve' 'ajama' 'ajima' 'aja' 'aji' (<-'aj') 'astadosmo' 'astadoste' 'astado{sv}e' 'astanemo' 'astademo' 'astanete' 'astadete' 'astanimo' 'astanite' 'astanila' 'astav{sv}i' 'astanem' 'astadem' 'astane{sv}' 'astade{sv}' 'astadoh' 'astade' 'astati' 
'astane' 'astanu' 'astadu' 'astala' 'astali' 'astalo' 'astale' 'astat' 'astao' (<-'asta') 'istadosmo' 'istadoste' 'istado{sv}e' 'istanemo' 'istademo' 'istanete' 'istadete' 'istanimo' 'istanite' 'istanila' 'istav{sv}i' 'istanem' 'istadem' 'istane{sv}' 'istade{sv}' 'istadoh' 'istade' 'istati' 'istane' 'istanu' 'istadu' 'istala' 'istali' 'istalo' 'istale' 'istat' 'istao' (<-'ista') 'ostadosmo' 'ostadoste' 'ostado{sv}e' 'ostanemo' 'ostademo' 'ostanete' 'ostadete' 'ostanimo' 'ostanite' 'ostanila' 'ostav{sv}i' 'ostanem' 'ostadem' 'ostane{sv}' 'ostade{sv}' 'ostadoh' 'ostade' 'ostati' 'ostane' 'ostanu' 'ostadu' 'ostala' 'ostali' 'ostalo' 'ostale' 'ostat' 'ostao' (<-'osta') 'tasmo' 'taste' 'tajmo' 'tajte' 'tav{sv}i' 'tati' 'tamo' 'tate' 'taju' 'tala' 'talo' 'tale' 'tali' 'tana' 'tano' 'tani' 'tane' 'tan' 'taj' 'tao' 'tam' 'ta{sv}' 'tat' 'tah' (<-'ta') 'injasmo' 'injaste' 'injati' 'injemo' 'injete' 'injali' 'injala' 'injalo' 'injale' 'inja{sv}e' 'injahu' 'injem' 'inje{sv}' 'injat' 'injah' 'injao' (<-'inj') 'astemo' 'astete' 'astimo' 'astite' 'astu{c'}i' 'aste{sv}' 'asli' 'asla' 'aslo' 'asle' (<-'as') 'iv{sv}i' 'ie{c'}i' 'ismo' 'imo' 'ite' 'iti' 'ili' 'ila' 'ilo' 'ile' 'im' 'i{sv}' 'it' 'ih' 'io' (<-'i') 'ijemo' 'ijete' 'ijem' 'ije{sv}' 'ijmo' 'ijte' 'iju' 'ije' 'ij' 'ilu' (<-'i') 'lu{cv}ujete' 'lu{cv}uju{c'}i' 'lu{cv}ujemo' 'lu{cv}ujem' 'lu{cv}uje{sv}' 'lu{cv}ismo' 'lu{cv}iste' 'lu{cv}ujmo' 'lu{cv}ujte' 'lu{cv}uje' 'lu{cv}uju' 'lu{cv}i{sv}e' 'lu{cv}iti' 'lu{cv}imo' 'lu{cv}ite' 'lu{cv}ila' 'lu{cv}ilo' 'lu{cv}ili' 'lu{cv}ile' 'lu{cv}ena' 'lu{cv}eno' 'lu{cv}eni' 'lu{cv}ene' 'lu{cv}uj' 'lu{cv}io' 'lu{cv}en' 'lu{cv}im' 'lu{cv}i{sv}' 'lu{cv}it' 'lu{cv}ih' 'lu{cv}e' 'lu{cv}i' (<-'lu{cv}') 'jetismo' 'jetiste' 'jeti{sv}e' 'jetimo' 'jetite' 'jetiti' 'jetili' 'jetila' 'jetilo' 'jetile' 'jetim' 'jeti{sv}' 'jetit' 'jetih' 'jetio' (<-'jeti') 'emo' 'em' 'e{sv}' 'elama' 'el' (<-'e') 'ilama' 'ilima' 'ilom' 'il' (<-'i') 'atijega' 'atijemu' 'atijima' 'atijeg' 'atijem' 'atega' 'atemu' 'ateg' 
'atem' 'atijih' 'atijim' 'atima' 'atoga' 'atome' 'atomu' 'atiji' 'atije' 'atija' 'atiju' 'atoj' 'atog' 'atom' 'atim' 'atih' 'ata' 'atu' 'ato' (<-'at') 'etav{sv}i' 'etu{c'}i' 'etemo' 'etimo' 'etem' 'ete{sv}' (<-'et') 'lucujuci' 'lucujemo' 'lucujete' 'lucujem' 'lucujes' 'lucujmo' 'lucujte' 'lucismo' 'luciste' 'luciti' 'lucite' 'lucise' 'lucuje' 'lucuju' 'lucila' 'lucile' 'lucili' 'lucilo' 'lucena' 'luceni' 'lucene' 'luceno' 'lucimo' 'lucim' 'lucis' 'lucih' 'lucit' 'lucio' 'lucuj' 'lucen' 'luce' 'luci' (no_diacritics <-'luc') 'snjijima' 'snjijemu' 'snjijega' 'snjijim' 'snjijih' 'snjijeg' 'snjijoj' 'snjiji' 'snjija' 'snjije' 'snjiju' 'snjima' 'snjemu' 'snjomu' 'snjome' 'snjega' 'snjoga' 'snjih' 'snjim' 'snjem' 'snjom' 'snjeg' 'snjog' 'snjoj' 'snja' 'snje' 'snji' 'snjo' 'snju' (no_diacritics <-'snj') 'osijima' 'osijemu' 'osijega' 'snjijem' 'osijih' 'osijim' 'osijem' 'osijeg' 'osijoj' 'osima' 'osemu' 'osomu' 'osome' 'osega' 'osoga' 'osija' 'osije' 'osiji' 'osiju' 'osih' 'osim' 'osem' 'osom' 'oseg' 'osog' 'osoj' 'osa' 'ose' 'osi' 'osu' (no_diacritics <-'os') 'acismo' 'aciste' 'acima' 'acimo' 'acome' 'acomu' 'acite' 'aciti' 'acise' 'acila' 'acile' 'acili' 'acilo' 'acega' 'acene' 'aceci' 'aceni' 'acemu' 'acena' 'aceno' 'acoga' 'acoj' 'acih' 'acem' 'acom' 'acen' 'acog' 'acit' 'acio' 'aceg' 'acim' 'acuh' 'acis' 'ace' 'aca' 'aci' (no_diacritics <-'ac') 'ecome' 'ecoga' 'ecemu' 'ecima' 'ecega' 'ecomu' 'ecoj' 'ecuh' 'ecom' 'ecog' 'eceg' 'ecih' 'ecem' 'ecim' 'eca' 'ece' (no_diacritics <-'ec') 'ucomu' 'ucome' 'ucima' 'ucoga' 'ucega' 'ucemu' 'ucih' 'ucog' 'uceg' 'ucom' 'ucem' 'ucim' 'ucuh' 'ucoj' 'uca' 'uce' (no_diacritics <-'uc') 'rosismo' 'rosivsi' 'rosiste' 'rositi' 'rosili' 'rosise' 'rosite' 'rosilo' 'rosimo' 'rosile' 'rosila' 'rosit' 'rosis' 'rosio' 'rosim' 'rosih' (no_diacritics <-'rosi') 'acavsi' 'acaste' 'acasmo' 'acaju' 'acane' 'acate' 'acali' 'acani' 'acati' 'acale' 'acahu' 'acase' 'acano' 'acamo' 'acalo' 'acana' 'acala' 'acam' 'acan' 'acao' 'acas' 'acat' 'acah' 
(no_diacritics <-'aca') 'jasima' 'jasama' 'jasem' 'jasom' 'jase' 'jasi' 'jasa' 'jasu' (no_diacritics <-'jas') 'tasima' 'tasama' 'tasem' 'tasom' 'tase' 'tasa' 'tasu' 'tasi' (no_diacritics <-'tas') 'gasima' 'gasama' 'gasem' 'gasom' 'gasi' 'gasu' 'gase' 'gasa' (no_diacritics <-'gas') 'nasama' 'nasima' 'nasem' 'nasom' 'nasu' 'nasi' 'nase' 'nasa' (no_diacritics <-'nas') 'kasama' 'kasima' 'kasom' 'kasem' 'kasi' 'kasu' 'kase' 'kasa' (no_diacritics <-'kas') 'vasama' 'vasima' 'vasom' 'vasem' 'vasi' 'vase' 'vasa' 'vasu' (no_diacritics <-'vas') 'basama' 'basima' 'basom' 'basem' 'basi' 'base' 'basu' 'basa' (no_diacritics <-'bas') 'astuci' 'astes' (no_diacritics <-'as') 'cinima' 'cinome' 'cinama' 'cinomu' 'cinoga' 'cinom' 'cinih' 'cinim' 'cinog' 'cinoj' 'cino' 'cini' 'cinu' 'cine' 'cina' (no_diacritics <-'cin') 'astajase' 'astajuci' 'astajes' (no_diacritics <-'astaj') 'istajase' 'istajuci' 'istajes' (no_diacritics <-'istaj') 'ostajase' 'ostajuci' 'ostajes' (no_diacritics <-'ostaj') 'astadose' 'astades' 'astanes' 'astavsi' (no_diacritics <-'asta') 'istadose' 'istades' 'istanes' 'istavsi' (no_diacritics <-'ista') 'ostadose' 'ostades' 'ostanes' 'ostavsi' (no_diacritics <-'osta') 'avajuci' 'avase' 'avas' (no_diacritics <-'ava') 'evajuci' 'evase' 'evas' (no_diacritics <-'eva') 'ivajuci' 'ivase' 'ivas' (no_diacritics <-'iva') 'uvajuci' 'uvase' 'uvas' (no_diacritics <-'uva') 'ovase' (no_diacritics <-'ova') 'jetise' 'jetis' (no_diacritics <-'jeti') 'injase' 'injes' (no_diacritics <-'inj') 'istem' (no_diacritics <-'ist') 'esama' 'esem' 'esi' (no_diacritics <-'es') 'etavsi' 'etuci' 'etes' (no_diacritics <-'et') 'isama' 'isem' 'isi' (no_diacritics <-'is') 'irajuci' 'irujuci' 'irujes' 'iravsi' 'irase' 'iras' (no_diacritics <-'ir') 'urajuci' 'urase' 'uras' (no_diacritics <-'ur') 'ujuci' 'ujes' (no_diacritics <-'uj') 'nivsi' 'nis' (no_diacritics <-'ni') 'snega' 'snemu' 'snem' 'sneg' (no_diacritics <-'sn') 'tavsi' 'tas' (no_diacritics <-'ta') 'ajuci' 'avsi' 'ase' 'as' (no_diacritics <-'a') 
'ijes' 'ivsi' 'ieci' 'is' (no_diacritics <-'i') 'es' (no_diacritics <-'e') 'nuvsi' 'nuci' 'nes' (no_diacritics <-'n') ) ) define Step_3 as ( [substring] R1 among ( 'enom' 'enoj' 'enog' 'enim' 'enih' 'anoj' 'anog' 'anim' 'anih' 'ost' 'eno' 'eni' 'oga' 'ima' 'enu' 'ena' 'ama' 'ano' 'ani' 'om' 'og' 'u' 'o' 'i' 'e' 'a' (<-'') ) ) ) define stem as ( do cyr_to_lat do prelude do mark_regions backwards ( do Step_1 do (Step_2 or Step_3) ) ) snowball-3.0.1/algorithms/spanish.sbl000066400000000000000000000134071500727106100176600ustar00rootroot00000000000000routines ( postlude mark_regions RV R1 R2 attached_pronoun standard_suffix y_verb_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef e' '{U+00E9}' // e-acute stringdef i' '{U+00ED}' // i-acute stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute stringdef u" '{U+00FC}' // u-diaeresis stringdef n~ '{U+00F1}' // n-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( '{a'}' (<- 'a') '{e'}' (<- 'e') '{i'}' (<- 'i') '{o'}' (<- 'o') '{u'}' (<- 'u') // and possibly {u"}->u here, or in prelude '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' 'las' 'les' 'los' 'nos' ) substring RV among( 'i{e'}ndo' (] <- 'iendo') '{a'}ndo' (] <- 'ando') '{a'}r' (] <- 'ar') '{e'}r' (] <- 'er') '{i'}r' (] <- 'ir') 'ando' 'iendo' 'ar' 'er' 'ir' (delete) 'yendo' ('u' delete) ) ) define standard_suffix as ( [substring] among( 'anza' 'anzas' 'ico' 'ica' 'icos' 
'icas' 'ismo' 'ismos' 'able' 'ables' 'ible' 'ibles' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amiento' 'amientos' 'imiento' 'imientos' ( R2 delete ) 'adora' 'ador' 'aci{o'}n' 'adoras' 'adores' 'aciones' 'ante' 'antes' 'ancia' 'ancias' 'acion' // Misspelling of '-ación'. ( R2 delete try ( ['ic'] R2 delete ) ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 'uci{o'}n' 'uciones' 'ucion' // Misspelling of '-ución'. ( R2 <- 'u' ) 'encia' 'encias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' 'able' 'ible' (R2 delete) ) ) ) 'idad' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) ) ) define y_verb_suffix as ( setlimit tomark pV for ([substring]) among( 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' 'yas' 'yes' 'yais' 'yamos' ('u' delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among( 'en' 'es' '{e'}is' 'emos' (try ('u' test 'g') ] delete) 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' 'ar{e'}' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' 
'{i'}' '{o'}' ( RV delete ) 'e' '{e'}' ( RV delete try( ['u'] test 'g' RV delete ) ) ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or y_verb_suffix or verb_suffix ) do residual_suffix ) do postlude ) snowball-3.0.1/algorithms/swedish.sbl000066400000000000000000000057731500727106100176700ustar00rootroot00000000000000routines ( et_condition mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ost_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef ao '{U+00E5}' stringdef o" '{U+00F6}' define v 'aeiouy{a"}{ao}{o"}' define s_ending 'bcdfghjklmnoprtvy' define ost_ending 'iklnprtuv' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) gopast v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define et_condition as ( (non-v v not atlimit) and not among ( // frihet, nyhet, råhet, trohet 'h' // societet 'iet' // annuitet, kontinuitet 'uit' // alfabet 'fab' // autenticitet, elektricitet, kapacitet, metallicitet, publicitet 'cit' // graviditet, likviditet, rigiditet 'dit' // neutralitet, rivalitet, sexualitet 'alit' // flexibilitet, instabilitet, kompatibilitet, mobilitet, variabilitet 'ilit' // anonymitet, intimitet, legitimitet 'mit' // kommunitet, maskulinitet, modernitet, spontanitet, suveränitet 'nit' // epitet, serendipitet 'pit' // auktoritet, integritet, majoritet, popularitet, prioritet 'rit' // densitet, generositet, intensitet, luminositet, viskositet 'sit' // identitet, kvantitet 'tit' // aggressivitet, positivitet 'ivit' // antikvitet, oblikvitet 'kvit' // komplexitet 'xit' // komet 'kom' // raket 'rak' // paket 'pak' // staket 'stak' ) ) define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' 
'hetens' 'erns' 'at' 'andet' 'het' 'ast' (delete) 's' ( ('et' et_condition ]) or s_ending delete ) 'et' ( et_condition delete ) ) ) define consonant_pair as setlimit tomark p1 for ( among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') and ([next] delete) ) define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'lig' 'ig' 'els' (delete) '{o"}st' (ost_ending <-'{o"}s') 'fullt' (<-'full') ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball-3.0.1/algorithms/tamil.sbl000066400000000000000000000247631500727106100173300ustar00rootroot00000000000000/* * Affix stripping stemming algorithm for Tamil * By Damodharan Rajalingam */ stringescapes {} /* Aytham */ stringdef aytham '{U+0B83}' /* Uyir - independent vowels */ stringdef a '{U+0B85}' stringdef aa '{U+0B86}' stringdef i '{U+0B87}' stringdef ii '{U+0B88}' stringdef u '{U+0B89}' stringdef uu '{U+0B8A}' stringdef e '{U+0B8E}' stringdef ee '{U+0B8F}' stringdef ai '{U+0B90}' stringdef o '{U+0B92}' stringdef oo '{U+0B93}' stringdef au '{U+0B94}' /* Consonants */ stringdef ka '{U+0B95}' stringdef nga '{U+0B99}' stringdef ca '{U+0B9A}' stringdef ja '{U+0B9C}' stringdef nya '{U+0B9E}' stringdef tta '{U+0B9F}' stringdef nna '{U+0BA3}' stringdef ta '{U+0BA4}' stringdef tha '{U+0BA4}' stringdef na '{U+0BA8}' stringdef nnna '{U+0BA9}' stringdef pa '{U+0BAA}' stringdef ma '{U+0BAE}' stringdef ya '{U+0BAF}' stringdef ra '{U+0BB0}' stringdef rra '{U+0BB1}' stringdef la '{U+0BB2}' stringdef lla '{U+0BB3}' stringdef llla '{U+0BB4}' stringdef zha '{U+0BB4}' stringdef va '{U+0BB5}' /* Vatamozi - borrowed */ stringdef sha '{U+0BB6}' stringdef ssa '{U+0BB7}' stringdef sa '{U+0BB8}' stringdef ha '{U+0BB9}' /* Dependent vowel signs (kombu etc.) 
*/ stringdef vs_aa '{U+0BBE}' stringdef vs_i '{U+0BBF}' stringdef vs_ii '{U+0BC0}' stringdef vs_u '{U+0BC1}' stringdef vs_uu '{U+0BC2}' stringdef vs_e '{U+0BC6}' stringdef vs_ee '{U+0BC7}' stringdef vs_ai '{U+0BC8}' stringdef vs_o '{U+0BCA}' stringdef vs_oo '{U+0BCB}' stringdef vs_au '{U+0BCC}' /* Pulli */ stringdef pulli '{U+0BCD}' /* AU length mark */ stringdef au_lmark '{U+0BD7}' routines ( remove_plural_suffix remove_question_suffixes remove_question_prefixes remove_pronoun_prefixes remove_command_suffixes remove_um remove_vetrumai_urupukal fix_va_start fix_ending fix_endings remove_tense_suffix remove_tense_suffixes remove_common_word_endings has_min_length ) externals ( stem ) booleans ( found_a_match found_vetrumai_urupu ) define has_min_length as ( $(len > 4) ) define fix_va_start as ( [substring] among ( '{va}{vs_oo}' ( <- '{oo}' ) '{va}{vs_o}' ( <- '{o}' ) '{va}{vs_u}' ( <- '{u}' ) '{va}{vs_uu}' ( <- '{uu}' ) ) ) define fix_endings as ( do repeat fix_ending ) define remove_question_prefixes as ( [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete do fix_va_start ) // Gives signal t if an ending was fixed, signal f otherwise. 
define fix_ending as ( $(len > 3) backwards ( ( [substring] among ( '{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}' ( delete ) '{ya}{pulli}' ( test among('{vs_ai}' '{vs_i}' '{vs_ii}') delete ) '{tta}{pulli}{pa}{pulli}' '{tta}{pulli}{ka}{pulli}' ( <- '{lla}{pulli}' ) '{nnna}{pulli}{rra}{pulli}' ( <- '{la}{pulli}' ) '{rra}{pulli}{ka}{pulli}' // '{nnna}{pulli}{nnna}{pulli}' ( <- '{la}{pulli}' ) '{tta}{pulli}{tta}{pulli}' ( <- '{tta}{vs_u}' ) '{ta}{pulli}{ta}{pulli}' ( found_vetrumai_urupu not '{vs_ai}' <- '{ma}{pulli}' ) '{vs_u}{ka}{pulli}' '{vs_u}{ka}{pulli}{ka}{pulli}' ( <- '{pulli}' ) '{va}' '{ya}' '{va}{pulli}' ( delete ) '{nnna}{vs_u}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') delete ) '{nga}{pulli}' ( among ( '{vs_ai}' ( delete ) '{pulli}' ( delete ) '' ( <- '{ma}{pulli}' ) ) ) ) ) or ( [ '{pulli}' ( ( among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') try ( '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ) ] delete ) or ( among( '{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}') ] '{pulli}' delete ) or ( test among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}' '{pulli}') ] delete ) ) ) ) ) define remove_pronoun_prefixes as ( [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete do fix_va_start ) define remove_plural_suffix as ( backwards ( [substring] among ( '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' ( ( among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') <- '{vs_u}{nga}{pulli}' ) or <- '{pulli}' ) '{rra}{pulli}{ka}{lla}{pulli}' ( <- '{la}{pulli}' ) '{tta}{pulli}{ka}{lla}{pulli}' ( <- '{lla}{pulli}' ) '{ka}{lla}{pulli}' ( delete ) ) ) ) define remove_question_suffixes as ( has_min_length backwards ( do ( [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}' ) ) do fix_endings ) define remove_command_suffixes as ( has_min_length backwards ( [ among('{pa}{vs_i}' 
'{va}{vs_i}') ] delete ) ) define remove_um as ( has_min_length backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' ) do fix_ending ) define remove_common_word_endings as ( // These are not suffixes actually but are // some words that are attached to other words // but can be removed for stemming has_min_length backwards ( [substring] among ( '{vs_u}{tta}{nnna}{pulli}' '{vs_i}{la}{pulli}{la}{vs_ai}' '{vs_i}{tta}{ma}{pulli}' '{vs_i}{nnna}{pulli}{rra}{vs_i}' '{vs_aa}{ka}{vs_i}' '{vs_aa}{ka}{vs_i}{ya}' '{vs_e}{nnna}{pulli}{rra}{vs_u}' '{vs_u}{lla}{pulli}{lla}' '{vs_u}{tta}{vs_ai}{ya}' '{vs_u}{tta}{vs_ai}' '{vs_e}{nnna}{vs_u}{ma}{pulli}' '{vs_e}{nnna}' ( <- '{pulli}' ) '{la}{pulli}{la}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') <- '{pulli}' ) '{pa}{tta}{vs_u}' '{pa}{tta}{pulli}{tta}' '{pa}{tta}{pulli}{tta}{vs_u}' '{pa}{tta}{pulli}{tta}{ta}{vs_u}' '{pa}{tta}{pulli}{tta}{nna}' '{ka}{vs_u}{ra}{vs_i}{ya}' '{pa}{rra}{pulli}{rra}{vs_i}' '{va}{vs_i}{tta}{vs_u}' '{va}{vs_i}{tta}{pulli}{tta}{vs_u}' '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' '{pa}{tta}{vs_i}' '{ta}{vs_aa}{nnna}' '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}' ( delete ) ) ) do fix_endings ) define remove_vetrumai_urupukal as ( unset found_vetrumai_urupu has_min_length backwards ( ( test ( [substring] among ( '{nnna}{vs_ai}' ( delete ) '{vs_o}{tta}{vs_u}' '{vs_oo}{tta}{vs_u}' '{vs_i}{la}{pulli}' '{vs_i}{rra}{pulli}' '{vs_i}{nnna}{pulli}{rra}{vs_u}' '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' '{va}{vs_i}{tta}' '{vs_aa}{la}{pulli}' '{vs_u}{tta}{vs_ai}' '{vs_aa}{ma}{la}{pulli}' '{vs_u}{lla}{pulli}' ( <- '{pulli}' ) '{vs_i}{nnna}{pulli}' ( not '{ma}' <- '{pulli}' ) '{vs_i}{tta}{ma}{pulli}' ( $(len >= 7) <- '{pulli}' ) '{la}{pulli}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') <- '{pulli}' ) '{ka}{nna}{pulli}' '{ma}{vs_u}{nnna}{pulli}' '{ma}{vs_ee}{la}{pulli}' '{ma}{vs_ee}{rra}{pulli}' '{ka}{vs_ii}{llla}{pulli}' (delete) '{ta}{vs_u}' ( not 
among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') delete ) '{vs_ii}' ( <- '{vs_i}' ) ) ) or test ( [ '{vs_ai}' ( (not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) or (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')) ) ] <- '{pulli}' ) ) (set found_vetrumai_urupu) do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) ) do fix_endings ) define remove_tense_suffixes as ( repeat remove_tense_suffix ) // Gives signal t if a tense suffix was removed, signal f otherwise. define remove_tense_suffix as ( unset found_a_match has_min_length backwards ( do ( test ( [substring] among ( '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' '{pa}{tta}{vs_u}' '{ma}{vs_aa}{ra}{pulli}' '{ma}{vs_i}{nnna}{pulli}' '{nnna}{nnna}{pulli}' '{nnna}{vs_aa}{nnna}{pulli}' '{nnna}{vs_aa}{lla}{pulli}' '{nnna}{vs_aa}{ra}{pulli}' '{nnna}{lla}{pulli}' '{va}{lla}{pulli}' '{nnna}{ra}{pulli}' '{va}{ra}{pulli}' '{nnna}' '{pa}' '{ka}' '{ta}' '{ya}' '{pa}{nnna}{pulli}' '{pa}{lla}{pulli}' '{pa}{ra}{pulli}' '{vs_i}{rra}{pulli}{rra}{vs_u}' '{pa}{ma}{pulli}' '{nnna}{ma}{pulli}' '{ta}{vs_u}{ma}{pulli}' '{rra}{vs_u}{ma}{pulli}' '{ka}{vs_u}{ma}{pulli}' '{nnna}{vs_e}{nnna}{pulli}' '{nnna}{vs_ai}' '{va}{vs_ai}' ( delete ) '{va}{nnna}{pulli}' ( not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}') delete ) '{ta}{vs_u}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') delete ) '{vs_aa}{nnna}{pulli}' ( not '{ca}' <- '{pulli}' ) '{vs_aa}{lla}{pulli}' '{vs_aa}{ra}{pulli}' '{vs_ee}{nnna}{pulli}' '{vs_aa}' '{vs_aa}{ma}{pulli}' '{vs_e}{ma}{pulli}' '{vs_ee}{ma}{pulli}' '{vs_oo}{ma}{pulli}' '{tta}{vs_u}{ma}{pulli}' '{vs_aa}{ya}{pulli}' '{nnna}{vs_i}{ra}{pulli}' '{vs_ii}{ra}{pulli}' '{vs_ii}{ya}{ra}{pulli}' ( <- '{pulli}' ) '{ka}{vs_u}' ( test '{pulli}' delete ) ) (set found_a_match) ) ) do ([among( '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' 
'{ka}{vs_i}{nnna}{pulli}{rra}' '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' '{ka}{vs_i}{rra}' '{ka}{vs_i}{rra}{pulli}' )] delete (set found_a_match) ) ) do fix_endings found_a_match ) define stem as ( unset found_vetrumai_urupu do fix_ending has_min_length do remove_question_prefixes do remove_pronoun_prefixes do remove_question_suffixes do remove_um do remove_common_word_endings do remove_vetrumai_urupukal do remove_plural_suffix do remove_command_suffixes do remove_tense_suffixes ) snowball-3.0.1/algorithms/turkish.sbl000066400000000000000000000334501500727106100177040ustar00rootroot00000000000000/* Stemmer for Turkish * author: Evren (Kapusuz) Çilden * email: evren.kapusuz at gmail.com * * stems nominal verb suffixes * stems nominal inflections * more than one syllable word check * (y,n,s,U) context check * vowel harmony check * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) * * The stemming algorithm is based on the paper "An Affix Stripping * Morphological Analyzer for Turkish" by Gülşen Eryiğit and * Eşref Adalı (Proceedings of the IAESTED International Conference * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, * Innsbruck, Austria * * Turkish is an agglutinative language and has a very rich morphological * structure. In Turkish, you can form many different words from a single stem * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means * "You had been the doctor of him". The stem of the word is "doktor" and it * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about * the append order of suffixes can be clearly described as FSMs. * The paper referenced above defines some FSMs for right to left * morphological analysis. I generated a method for constructing snowball * expressions from right to left FSMs for stemming suffixes. 
*/ routines ( append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings check_vowel_harmony // tests vowel harmony for suffixes is_reserved_word // tests whether current string is a reserved word ('ad','soyad') mark_cAsInA // nominal verb suffix mark_DA // noun suffix mark_DAn // noun suffix mark_DUr // nominal verb suffix mark_ki // noun suffix mark_lAr // noun suffix, nominal verb suffix mark_lArI // noun suffix mark_nA // noun suffix mark_ncA // noun suffix mark_ndA // noun suffix mark_ndAn // noun suffix mark_nU // noun suffix mark_nUn // noun suffix mark_nUz // nominal verb suffix mark_sU // noun suffix mark_sUn // nominal verb suffix mark_sUnUz // nominal verb suffix mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, mark_yA // noun suffix mark_ylA // noun suffix mark_yU // noun suffix mark_yUm // nominal verb suffix mark_yUz // nominal verb suffix mark_yDU // nominal verb suffix mark_yken // nominal verb suffix mark_ymUs_ // nominal verb suffix mark_ysA // nominal verb suffix mark_suffix_with_optional_y_consonant mark_suffix_with_optional_U_vowel mark_suffix_with_optional_n_consonant mark_suffix_with_optional_s_consonant more_than_one_syllable_word post_process_last_consonants postlude remove_proper_noun_suffix stem_nominal_verb_suffixes stem_noun_suffixes stem_suffix_chain_before_ki ) stringescapes { } /* Special characters in Unicode Latin-1 and Latin Extended-A */ stringdef cc '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE stringdef i '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS stringdef sc '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS booleans ( continue_stemming_noun_suffixes ) groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6 ) define vowel 'ae{i}io{o"}u{u"}' define U '{i}iu{u"}' // the vowel grouping definitions below are used for 
checking vowel harmony define vowel1 'a{i}ou' // vowels that can end with suffixes containing 'a' define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' define vowel3 'a{i}' // vowels that can end with suffixes containing '{i}' define vowel4 'ei' // vowels that can end with suffixes containing 'i' define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing '{o"}' or '{u"}' externals ( stem ) backwardmode ( // checks vowel harmony for possible suffixes, // helps to detect whether the candidate for suffix applies to vowel harmony // this rule is added to prevent over stemming define check_vowel_harmony as ( test ( (goto vowel) // if there is a vowel ( ('a' goto vowel1) or ('e' goto vowel2) or ('{i}' goto vowel3) or ('i' goto vowel4) or ('o' goto vowel5) or ('{o"}' goto vowel6) or ('u' goto vowel5) or ('{u"}' goto vowel6) ) ) ) // if the last consonant before suffix is vowel and n then advance and delete // if the last consonant before suffix is non vowel and n do nothing // if the last consonant before suffix is not n then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_n_consonant as ( ('n' (test vowel)) or ((not(test 'n')) test(next vowel)) ) // if the last consonant before suffix is vowel and s then advance and delete // if the last consonant before suffix is non vowel and s do nothing // if the last consonant before suffix is not s then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_s_consonant as ( ('s' (test vowel)) or ((not(test 's')) test(next vowel)) ) // if the last consonant before suffix is vowel and y then advance and delete // if the last consonant before suffix is non vowel and y do nothing // if the last consonant before suffix is not y then only delete the suffix // assumption: slice beginning is set correctly define 
mark_suffix_with_optional_y_consonant as ( ('y' (test vowel)) or ((not(test 'y')) test(next vowel)) ) define mark_suffix_with_optional_U_vowel as ( (U (test non-vowel)) or ((not(test U)) test(next non-vowel)) ) define mark_possessives as ( among ('m{i}z' 'miz' 'muz' 'm{u"}z' 'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') (mark_suffix_with_optional_U_vowel) ) define mark_sU as ( check_vowel_harmony U (mark_suffix_with_optional_s_consonant) ) define mark_lArI as ( among ('leri' 'lar{i}') ) define mark_yU as ( check_vowel_harmony U (mark_suffix_with_optional_y_consonant) ) define mark_nU as ( check_vowel_harmony among ('n{i}' 'ni' 'nu' 'n{u"}') ) define mark_nUn as ( check_vowel_harmony among ('{i}n' 'in' 'un' '{u"}n') (mark_suffix_with_optional_n_consonant) ) define mark_yA as ( check_vowel_harmony among('a' 'e') (mark_suffix_with_optional_y_consonant) ) define mark_nA as ( check_vowel_harmony among('na' 'ne') ) define mark_DA as ( check_vowel_harmony among('da' 'de' 'ta' 'te') ) define mark_ndA as ( check_vowel_harmony among('nda' 'nde') ) define mark_DAn as ( check_vowel_harmony among('dan' 'den' 'tan' 'ten') ) define mark_ndAn as ( check_vowel_harmony among('ndan' 'nden') ) define mark_ylA as ( check_vowel_harmony among('la' 'le') (mark_suffix_with_optional_y_consonant) ) define mark_ki as ( 'ki' ) define mark_ncA as ( check_vowel_harmony among('ca' 'ce') (mark_suffix_with_optional_n_consonant) ) define mark_yUm as ( check_vowel_harmony among ('{i}m' 'im' 'um' '{u"}m') (mark_suffix_with_optional_y_consonant) ) define mark_sUn as ( check_vowel_harmony among ('s{i}n' 'sin' 'sun' 's{u"}n' ) ) define mark_yUz as ( check_vowel_harmony among ('{i}z' 'iz' 'uz' '{u"}z') (mark_suffix_with_optional_y_consonant) ) define mark_sUnUz as ( among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z') ) define mark_lAr as ( check_vowel_harmony among ('ler' 'lar') ) define mark_nUz as ( check_vowel_harmony among ('n{i}z' 'niz' 'nuz' 'n{u"}z') ) define mark_DUr as ( check_vowel_harmony among ('t{i}r' 
'tir' 'tur' 't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r') ) define mark_cAsInA as ( among ('cas{i}na' 'cesine') ) define mark_yDU as ( check_vowel_harmony among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m' 't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n' 't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k' 't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}') (mark_suffix_with_optional_y_consonant) ) // does not fully obey vowel harmony define mark_ysA as ( among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') (mark_suffix_with_optional_y_consonant) ) define mark_ymUs_ as ( check_vowel_harmony among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}') (mark_suffix_with_optional_y_consonant) ) define mark_yken as ( 'ken' (mark_suffix_with_optional_y_consonant) ) define stem_nominal_verb_suffixes as ( [ set continue_stemming_noun_suffixes (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) or (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) or ( mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) unset continue_stemming_noun_suffixes ) or (mark_nUz (mark_yDU or mark_ysA)) or ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) or (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) ]delete ) // stems noun suffix chains ending with -ki define stem_suffix_chain_before_ki as ( [ mark_ki ( (mark_DA] delete try([ (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) )) or (mark_nUn] delete try([ (mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) or (mark_ndA ( (mark_lArI] delete) or ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) or (stem_suffix_chain_before_ki) )) ) ) define stem_noun_suffixes as ( ([mark_lAr] delete 
try(stem_suffix_chain_before_ki)) or ([mark_ncA] delete try( ([mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or ([mark_lAr] delete stem_suffix_chain_before_ki) ) ) or ([(mark_ndA or mark_nA) ( (mark_lArI] delete) or (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) ) ) or ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) or ( [mark_DAn] delete try ([ ( (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) ) or ([mark_nUn or mark_ylA] delete try( ([mark_lAr] delete stem_suffix_chain_before_ki) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or stem_suffix_chain_before_ki ) ) or ([mark_lArI] delete) or (stem_suffix_chain_before_ki) or ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) ) define post_process_last_consonants as ( [substring] among ( 'b' (<- 'p') 'c' (<- '{cc}') 'd' (<- 't') '{g~}' (<- 'k') ) ) // after stemming if the word ends with 'd' or 'g' most probably last U is // overstemmed like in 'kedim' -> 'ked' // Turkish words don't usually end with 'd' or 'g' // some very well known words are ignored (like 'ad' 'soyad' // appends U to stems ending with d or g, decides which vowel to add // based on the last vowel in the stem define append_U_to_stems_ending_with_d_or_g as ( [] ('d' or 'g') goto vowel (('a' or '{i}') <- '{i}') or (('e' or 'i') <- 'i') or (('o' or 'u') <- 'u') or (('{o"}' or '{u"}') <- '{u"}') ) define is_reserved_word as ( 'ad' try 'soy' atlimit ) ) define remove_proper_noun_suffix as ( // Remove any leading apostrophes (e.g. 
from tokenisation of single-quoted // text). do ([goto not '{'}'] delete) // https://en.wikipedia.org/wiki/Turkish_language says "In modern // Turkish orthography, an apostrophe is used to separate proper names // from any suffixes" with the example "Türkiye'dir ("it is Turkey")". // Therefore we truncate at the first apostrophe, provided there are at least // two characters before it (which avoids adversely affecting some foreign // names and words such as "o'connor", "l'entrée"). do ( hop 2 goto '{'}' [ tolimit ] delete ) ) // Test if there is more than one syllable. // In Turkish each vowel indicates a distinct syllable. define more_than_one_syllable_word as ( test (loop 2 gopast vowel) ) define postlude as ( backwards ( not is_reserved_word do append_U_to_stems_ending_with_d_or_g do post_process_last_consonants ) ) define stem as ( do remove_proper_noun_suffix more_than_one_syllable_word backwards ( do stem_nominal_verb_suffixes continue_stemming_noun_suffixes do stem_noun_suffixes ) postlude ) snowball-3.0.1/algorithms/yiddish.sbl000066400000000000000000000410421500727106100176440ustar00rootroot00000000000000/* ******************************************* * Stemmer for Yiddish language in YIVO script * * Author: Assaf Urieli * Emails: assaf.urieli at gmail.com ********************************************* */ routines ( prelude mark_regions R1 R1plus3 standard_suffix ) externals ( stem ) integers ( p1 x ) groupings ( vowel niked alefBeys consonant ) stringescapes {} // AlefBeys stringdef Alef '{U+05D0}' stringdef Beys '{U+05D1}' stringdef Giml '{U+05D2}' stringdef Dalet '{U+05D3}' stringdef Hey '{U+05D4}' stringdef Vov '{U+05D5}' stringdef Zayen '{U+05D6}' stringdef Khes '{U+05D7}' stringdef Tes '{U+05D8}' stringdef Yud '{U+05D9}' stringdef LangerKhof '{U+05DA}' stringdef Khof '{U+05DB}' stringdef Lamed '{U+05DC}' stringdef ShlosMem '{U+05DD}' stringdef Mem '{U+05DE}' stringdef LangerNun '{U+05DF}' stringdef Nun '{U+05E0}' stringdef Samekh '{U+05E1}' stringdef 
Ayen '{U+05E2}' stringdef LangerFey '{U+05E3}' stringdef Fey '{U+05E4}' stringdef LangerTsadek '{U+05E5}' stringdef Tsadek '{U+05E6}' stringdef Kuf '{U+05E7}' stringdef Reysh '{U+05E8}' stringdef Shin '{U+05E9}' stringdef Sof '{U+05EA}' stringdef TsveyVovn '{U+05F0}' stringdef VovYud '{U+05F1}' stringdef TsveyYudn '{U+05F2}' // Niked stringdef Shvo '{U+05B0}' stringdef Khirik '{U+05B4}' stringdef Tseyre '{U+05B5}' stringdef Segl '{U+05B6}' stringdef ReducedSegl '{U+05B1}' stringdef Pasekh '{U+05B7}' stringdef ReducedPasekh '{U+05B2}' stringdef Komets '{U+05B8}' stringdef ReducedKomets '{U+05B3}' stringdef Rafe '{U+05BF}' stringdef SinDot '{U+05C2}' stringdef ShinDot '{U+05C1}' stringdef Khoylm '{U+05B9}' stringdef Melupm '{U+05BC}' stringdef Kubuts '{U+05BB}' // Groupings define niked '{Shvo}{Khirik}{Tseyre}{Segl}{ReducedSegl}{Pasekh}{ReducedPasekh}{Komets}{ReducedKomets}{SinDot}{ShinDot}{Khoylm}{Melupm}{Kubuts}{Rafe}' define alefBeys '{Alef}{Beys}{Giml}{Dalet}{Hey}{Vov}{Zayen}{Khes}{Tes}{Yud}{LangerKhof}{Khof}{Lamed}{ShlosMem}{Mem}{LangerNun}{Nun}{Samekh}{Ayen}{LangerFey}{Fey}{LangerTsadek}{Tsadek}{Kuf}{Reysh}{Shin}{Sof}{TsveyVovn}{VovYud}{TsveyYudn}' define vowel '{Alef}{Vov}{Yud}{Ayen}{VovYud}{TsveyYudn}' define consonant alefBeys - vowel define prelude as ( do ( repeat goto ( [substring] among ( '{Vov}{Vov}' ( not '{Melupm}' <- '{TsveyVovn}' ) '{Vov}{Yud}' ( not '{Khirik}' <- '{VovYud}' ) '{Yud}{Yud}' ( not '{Khirik}' <- '{TsveyYudn}' ) '{LangerKhof}' ( <- '{Khof}') '{ShlosMem}' ( <- '{Mem}' ) '{LangerNun}' ( <- '{Nun}' ) '{LangerFey}' ( <- '{Fey}' ) '{LangerTsadek}' ( <- '{Tsadek}' ) ) ) ) do (repeat goto ( [niked] delete )) ) define mark_regions as ( $p1 = limit ( try ( // Replace past participle ge- at start of word // Unless word starts with gelt- or gebn- or the whole word is ge ['{Giml}{Ayen}'] not ('{Lamed}{Tes}' or '{Beys}{Nun}' or atlimit) <- 'GE' ) try ( // skip verbal prefix among( // Free stressed: Adurkh-, Durkh-, Ahin-, Aher-, Avek-, Mit-, 
Antkegn-, Akegn-, Anider-, Arop-, Aroys-, Aroyf-, Arum-, Arayn-, Arunter-, Ariber-, Nokh-, Farbay-, Aheym-, Afir-, Faroys-, Funander-, Tsuzamen-, Tsunoyf-, Tsurik- '{Alef}{Dalet}{Vov}{Reysh}{Khof}' '{Dalet}{Vov}{Reysh}{Khof}' '{Alef}{Hey}{Yud}{Nun}' '{Alef}{Hey}{Ayen}{Reysh}' '{Alef}{TsveyVovn}{Ayen}{Kuf}' '{Mem}{Yud}{Tes}' '{Alef}{Nun}{Tes}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Nun}{Yud}{Dalet}{Ayen}{Reysh}' '{Alef}{Reysh}{Alef}{Fey}' '{Alef}{Reysh}{VovYud}{Samekh}' '{Alef}{Reysh}{VovYud}{Fey}' '{Alef}{Reysh}{Vov}{Mem}' '{Alef}{Reysh}{TsveyYudn}{Nun}' '{Alef}{Reysh}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Reysh}{Yud}{Beys}{Ayen}{Reysh}' '{Nun}{Alef}{Khof}' '{Fey}{Alef}{Reysh}{Beys}{TsveyYudn}' '{Alef}{Hey}{TsveyYudn}{Mem}' '{Alef}{Fey}{Yud}{Reysh}' '{Fey}{Alef}{Reysh}{VovYud}{Samekh}' '{Fey}{Vov}{Nun}{Alef}{Nun}{Dalet}{Ayen}{Reysh}' '{Tsadek}{Vov}{Zayen}{Alef}{Mem}{Ayen}{Nun}' '{Tsadek}{Vov}{Nun}{VovYud}{Fey}' '{Tsadek}{Vov}{Reysh}{Yud}{Kuf}' // Stressed: Oys-, Oyf-, Um-, Unter-, Iber-, Ayn-, On-, Op-, Bay-, For-, Tsu-. '{Alef}{VovYud}{Samekh}' '{Alef}{VovYud}{Fey}' '{Alef}{Vov}{Mem}' '{Alef}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Yud}{Beys}{Ayen}{Reysh}' '{Alef}{TsveyYudn}{Nun}' '{Alef}{Nun}' '{Alef}{Fey}' '{Beys}{TsveyYudn}' '{Fey}{Alef}{Reysh}' '{Tsadek}{Vov}' // Unstressed: Ant-, Ba-, Der-, Tse-. Far- already covered by For-. Ge- comes later. '{Alef}{Nun}{Tes}' '{Beys}{Alef}' '{Dalet}{Ayen}{Reysh}' '{Tsadek}{Ayen}' // If verbal prefix followed by Tsu- or Ge-, replace it ( // Don't mark the TSU- prefix inside verbs like "oys-tsugn" test (('{Tsadek}{Vov}{Giml}{Nun}' or '{Tsadek}{Vov}{Kuf}{Tes}' or '{Tsadek}{Vov}{Kuf}{Nun}') atlimit) or // Don't mark the GE- prefix inside verbs like "avek-gebn" test ('{Giml}{Ayen}{Beys}{Nun}') or ( ['{Giml}{Ayen}'] <- 'GE') or (['{Tsadek}{Vov}'] <- 'TSU') ) ) ) test(hop 3 setmark x) // We want to allow three-consonant Hebrew roots. 
// To this end, we skip three-consonant combinations that exist in non-Hebraic Yiddish. try ( among( '{Shin}{Fey}{Reysh}' '{Shin}{Tes}{Reysh}' '{Shin}{Tes}{Shin}' '{Dalet}{Zayen}{Shin}' ( true ) ) ) // Either 3 consonants or the first non-vowel after a vowel ( not (consonant consonant consonant setmark p1) gopast vowel goto non-vowel setmark p1 ) try($p1 < x $p1 = x) // at least 3 past the prefix ) ) backwardmode ( define R1 as $p1 <= cursor // Like R1, but also allows the cursor to be outside R1 by the width of Giml Yud Samekh define R1plus3 as $p1 <= cursor + sizeof '{Giml}{Yud}{Samekh}' define standard_suffix as ( do ( [substring] among( // Plural/adjective endings: -er, -ers, -e, -n, -s, -en, -ns, -eners, -ens, -es '{Ayen}{Reysh}{Samekh}' '{Ayen}{Nun}' '{Nun}{Samekh}' '{Ayen}{Nun}{Ayen}{Reysh}{Samekh}' '{Ayen}{Samekh}' '{Ayen}' '{Nun}' '{Samekh}' '{Ayen}{Mem}' '{Ayen}{Reysh}' ( R1 delete ) // Exception: don't delete noun endings -ie, like "agitatsie" '{Yud}{Ayen}' ( true ) // -ies => ie '{Yud}{Ayen}{Samekh}' ( R1 <- '{Yud}{Ayen}' ) // Plural/adjective endings: -enem, -ener, -ene, -ens '{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}' (R1 delete [substring] among ( // -gegangen => -gey '{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}') // -genumen => -nem '{Nun}{Vov}{Mem}' (<- '{Nun}{Ayen}{Mem}') // -gemiten => -mayd '{Mem}{Yud}{Tes}' (<- '{Mem}{TsveyYudn}{Dalet}') // -gebiten => -bayt '{Beys}{Yud}{Tes}' (<- '{Beys}{TsveyYudn}{Tes}') // -gebisen => -bays '{Beys}{Yud}{Samekh}' (<- '{Beys}{TsveyYudn}{Samekh}') // -gevizen => -vayz '{TsveyVovn}{Yud}{Zayen}' (<- '{TsveyVovn}{TsveyYudn}{Zayen}') // -getriben => -trayb '{Tes}{Reysh}{Yud}{Beys}' (<- '{Tes}{Reysh}{TsveyYudn}{Beys}') // -geliten => -layt '{Lamed}{Yud}{Tes}' (<- '{Lamed}{TsveyYudn}{Tes}') // -gekliben => -klayb '{Kuf}{Lamed}{Yud}{Beys}' (<- '{Kuf}{Lamed}{TsveyYudn}{Beys}') // -geriben => -rayb '{Reysh}{Yud}{Beys}' (<- '{Reysh}{TsveyYudn}{Beys}') // -gerisen => 
-rays '{Reysh}{Yud}{Samekh}' (<- '{Reysh}{TsveyYudn}{Samekh}') // -geshvigen => -shvayg '{Shin}{TsveyVovn}{Yud}{Giml}' (<- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}') // -geshmisen => -shmays '{Shin}{Mem}{Yud}{Samekh}' (<- '{Shin}{Mem}{TsveyYudn}{Samekh}') // -geshniten => -shnayd '{Shin}{Nun}{Yud}{Tes}' (<- '{Shin}{Nun}{TsveyYudn}{Dalet}') // -geshriben => -shrayb '{Shin}{Reysh}{Yud}{Beys}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}') // -gebunden => -bind '{Beys}{Vov}{Nun}{Dalet}' (<- '{Beys}{Yud}{Nun}{Dalet}') // -gevuntshn => -vintsh '{TsveyVovn}{Vov}{Tes}{Shin}' (<- '{TsveyVovn}{Yud}{Tes}{Shin}') // -gezungen => -zing '{Zayen}{Vov}{Nun}{Giml}' (<- '{Zayen}{Yud}{Nun}{Giml}') // -getrunken => -trink '{Tes}{Reysh}{Vov}{Nun}{Kuf}' (<- '{Tes}{Reysh}{Yud}{Nun}{Kuf}') // -getsvungen => -tsving '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}' (<- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}') // -geshlungen => -shling '{Shin}{Lamed}{Vov}{Nun}{Giml}' (<- '{Shin}{Lamed}{Yud}{Nun}{Giml}') // -geboygen => -beyg '{Beys}{VovYud}{Giml}' (<- '{Beys}{TsveyYudn}{Giml}') // -gehoyben => -heyb '{Hey}{VovYud}{Beys}' (<- '{Hey}{TsveyYudn}{Beys}') // -farloyren => -farlir '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}' (<- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}') // -shtanen => -shtey '{Shin}{Tes}{Alef}{Nun}' (<- '{Shin}{Tes}{TsveyYudn}') // -geshvoyrn => -shver '{Shin}{TsveyVovn}{VovYud}{Reysh}' (<- '{Shin}{TsveyVovn}{Ayen}{Reysh}') ) ) // Verb/past participle ending: -t '{Tes}' ( R1 delete ) // As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate // Similarly for past participles: -tns, -tene, -tenem, -tener // If the Tes was before R1, we try to perform the same action while leaving the Tes in place '{Tes}{Nun}' '{Tes}{Ayen}' '{Tes}{Ayen}{Reysh}' '{Tes}{Samekh}' '{Tes}{Nun}{Samekh}' '{Tes}{Ayen}{Nun}{Ayen}' '{Tes}{Ayen}{Nun}{Ayen}{Mem}' '{Tes}{Ayen}{Nun}{Ayen}{Reysh}' ( ((R1 delete) or ( <- '{Tes}')) // -(ge)brakht => -breng ['{Beys}{Reysh}{Alef}{Khof}' try '{Giml}{Ayen}'] 
<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' ) // Past participles: -et, -etn, -ets, -ete, -eter '{Ayen}{Tes}' '{Ayen}{Tes}{Nun}' '{Ayen}{Tes}{Samekh}' '{Ayen}{Tes}{Ayen}' '{Ayen}{Tes}{Ayen}{Reysh}' ( R1 delete ) // -geyn shorted to -gey '{Giml}{TsveyYudn}{Nun}' ( <- '{Giml}{TsveyYudn}') // ##################### Long list of irregular past participles // -(ge)gangen (shortened to -gangen after prefixes) => -gey '{Giml}{Alef}{Nun}{Giml}{Ayen}{Nun}' ( <- '{Giml}{TsveyYudn}' ) // -(ge)numen (shortened to -numen after prefixes) => -nem '{Nun}{Vov}{Mem}{Ayen}{Nun}' (<- '{Nun}{Ayen}{Mem}' ) // -(ge)shribn (shortened to -shribn after prefixes) => -shrayb '{Shin}{Reysh}{Yud}{Beys}{Nun}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}' ) // -gemiten => -mayd 'GE{Mem}{Yud}{Tes}{Nun}' (<- '{Mem}{TsveyYudn}{Dalet}') // -gebiten => -bayt 'GE{Beys}{Yud}{Tes}{Nun}' (<- '{Beys}{TsveyYudn}{Tes}') // -gebisen => -bays 'GE{Beys}{Yud}{Samekh}{Nun}' ( <- '{Beys}{TsveyYudn}{Samekh}') // -gevizen => -vayz '{TsveyVovn}{Yud}{Zayen}{Nun}' ( <- '{TsveyVovn}{TsveyYudn}{Zayen}') // -getriben => -trayb '{Tes}{Reysh}{Yud}{Beys}{Nun}' ( <- '{Tes}{Reysh}{TsveyYudn}{Beys}') // -geliten => -layt 'GE{Lamed}{Yud}{Tes}{Nun}' ( <- '{Lamed}{TsveyYudn}{Tes}') // -gekliben => -klayb '{Kuf}{Lamed}{Yud}{Beys}{Nun}' ( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}') // -geriben => -rayb '{Reysh}{Yud}{Beys}{Nun}' ( <- '{Reysh}{TsveyYudn}{Beys}') // -gerisen => -rays 'GE{Reysh}{Yud}{Samekh}{Nun}' ( <- '{Reysh}{TsveyYudn}{Samekh}') // -geshvigen => -shvayg '{Shin}{TsveyVovn}{Yud}{Giml}{Nun}' ( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}') // -geshmisen => -shmays '{Shin}{Mem}{Yud}{Samekh}{Nun}' ( <- '{Shin}{Mem}{TsveyYudn}{Samekh}') // -geshniten => -shnayd '{Shin}{Nun}{Yud}{Tes}{Nun}' ( <- '{Shin}{Nun}{TsveyYudn}{Dalet}') // -gebunden => -bind '{Beys}{Vov}{Nun}{Dalet}{Nun}' ( <- '{Beys}{Yud}{Nun}{Dalet}') // -gevuntshn => -vintsh '{TsveyVovn}{Vov}{Tes}{Shin}{Nun}' ( <- '{TsveyVovn}{Yud}{Tes}{Shin}') // -gezungen => -zing 
'{Zayen}{Vov}{Nun}{Giml}{Nun}' ( <- '{Zayen}{Yud}{Nun}{Giml}') // -getrunken => -trink '{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}' ( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}') // -getsvungen => -tsving '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}' ( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}') // -geshlungen => -shling '{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}' ( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}') // -geboygen => -beyg '{Beys}{VovYud}{Giml}{Nun}' ( <- '{Beys}{TsveyYudn}{Giml}') // -gehoyben => -heyb '{Hey}{VovYud}{Beys}{Nun}' ( <- '{Hey}{TsveyYudn}{Beys}') // -farloyren => -farlir '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}' ( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}') // -shtanen => -shtey '{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}' ( <- '{Shin}{Tes}{TsveyYudn}') // -geshvoyrn => -shver '{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}' ( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}') // -(ge)brakht (shortened to -brakht after prefixes) => -breng '{Beys}{Reysh}{Alef}{Khof}{Tes}' (<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' ) // ###### End of irregular past participles // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}' ( R1 delete ) // Noun endings: -izm, izmen '{Yud}{Zayen}{Mem}' '{Yud}{Zayen}{Mem}{Ayen}{Nun}' ( R1 delete ) // Plural ending: -im '{Yud}{Mem}' ( R1 delete ) // Plural ending: -os (Hebraic), replace with -h '{Vov}{Sof}' ( R1 <- '{Hey}' ) // Diminutive endings: -elekh, -ele, -lekh, -eles, -elen '{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}' ( R1 delete ) // Noun ending: -ist '{Yud}{Samekh}{Tes}' ( // Exceptions: -gist, -shist ( ('{Giml}' or '{Shin}') try (R1plus3 <- '{Yud}{Samekh}') ) or ( R1 delete ) ) // Noun ending: -istn '{Yud}{Samekh}{Tes}{Nun}' ( R1 delete ) // Verb ending: -stu '{Samekh}{Tes}{Vov}' ( R1 delete ) // Superlative ending: -ster, -ste, -stn '{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' 
'{Samekh}{Tes}{Nun}' ( R1 delete ) // Ambiguous verb ending: -st '{Samekh}{Tes}' ( R1 delete ) ) ) do ( [substring] among( // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}' ( R1 delete ) // Diminutive endings: -l '{Lamed}' ( R1 consonant delete ) ) ) do ( [substring] among( // Adjective endings: -ig, -ik, -ish, -nik, -dik '{Yud}{Giml}' '{Yud}{Kuf}' '{Yud}{Shin}' '{Nun}{Yud}{Kuf}' '{Dalet}{Yud}{Kuf}' ( R1 delete ) // Exceptions to above: -blik, -glik '{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}' ( true ) // Present participle endings: -ndik '{Nun}{Dalet}{Yud}{Kuf}' ( R1 delete ) // Present participle ending -endik: delete if after a -ng, -nk, -n, -m, consonant+l, or vowel. // Otherwise, delete just the -ndik part. '{Ayen}{Nun}{Dalet}{Yud}{Kuf}' ( R1 delete ) ) ) do (repeat goto ( ['GE' or 'TSU'] delete )) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix ) snowball-3.0.1/charsets/000077500000000000000000000000001500727106100151475ustar00rootroot00000000000000snowball-3.0.1/charsets/ISO-8859-2.sbl000066400000000000000000000051031500727106100170540ustar00rootroot00000000000000// ISO-8859-2 character mappings. 
stringdef U+00A0 hex 'A0' stringdef U+0104 hex 'A1' stringdef U+02D8 hex 'A2' stringdef U+0141 hex 'A3' stringdef U+00A4 hex 'A4' stringdef U+013D hex 'A5' stringdef U+015A hex 'A6' stringdef U+00A7 hex 'A7' stringdef U+00A8 hex 'A8' stringdef U+0160 hex 'A9' stringdef U+015E hex 'AA' stringdef U+0164 hex 'AB' stringdef U+0179 hex 'AC' stringdef U+00AD hex 'AD' stringdef U+017D hex 'AE' stringdef U+017B hex 'AF' stringdef U+00B0 hex 'B0' stringdef U+0105 hex 'B1' stringdef U+02DB hex 'B2' stringdef U+0142 hex 'B3' stringdef U+00B4 hex 'B4' stringdef U+013E hex 'B5' stringdef U+015B hex 'B6' stringdef U+02C7 hex 'B7' stringdef U+00B8 hex 'B8' stringdef U+0161 hex 'B9' stringdef U+015F hex 'BA' stringdef U+0165 hex 'BB' stringdef U+017A hex 'BC' stringdef U+02DD hex 'BD' stringdef U+017E hex 'BE' stringdef U+017C hex 'BF' stringdef U+0154 hex 'C0' stringdef U+00C1 hex 'C1' stringdef U+00C2 hex 'C2' stringdef U+0102 hex 'C3' stringdef U+00C4 hex 'C4' stringdef U+0139 hex 'C5' stringdef U+0106 hex 'C6' stringdef U+00C7 hex 'C7' stringdef U+010C hex 'C8' stringdef U+00C9 hex 'C9' stringdef U+0118 hex 'CA' stringdef U+00CB hex 'CB' stringdef U+011A hex 'CC' stringdef U+00CD hex 'CD' stringdef U+00CE hex 'CE' stringdef U+010E hex 'CF' stringdef U+0110 hex 'D0' stringdef U+0143 hex 'D1' stringdef U+0147 hex 'D2' stringdef U+00D3 hex 'D3' stringdef U+00D4 hex 'D4' stringdef U+0150 hex 'D5' stringdef U+00D6 hex 'D6' stringdef U+00D7 hex 'D7' stringdef U+0158 hex 'D8' stringdef U+016E hex 'D9' stringdef U+00DA hex 'DA' stringdef U+0170 hex 'DB' stringdef U+00DC hex 'DC' stringdef U+00DD hex 'DD' stringdef U+0162 hex 'DE' stringdef U+00DF hex 'DF' stringdef U+0155 hex 'E0' stringdef U+00E1 hex 'E1' stringdef U+00E2 hex 'E2' stringdef U+0103 hex 'E3' stringdef U+00E4 hex 'E4' stringdef U+013A hex 'E5' stringdef U+0107 hex 'E6' stringdef U+00E7 hex 'E7' stringdef U+010D hex 'E8' stringdef U+00E9 hex 'E9' stringdef U+0119 hex 'EA' stringdef U+00EB hex 'EB' stringdef U+011B hex 
'EC' stringdef U+00ED hex 'ED' stringdef U+00EE hex 'EE' stringdef U+010F hex 'EF' stringdef U+0111 hex 'F0' stringdef U+0144 hex 'F1' stringdef U+0148 hex 'F2' stringdef U+00F3 hex 'F3' stringdef U+00F4 hex 'F4' stringdef U+0151 hex 'F5' stringdef U+00F6 hex 'F6' stringdef U+00F7 hex 'F7' stringdef U+0159 hex 'F8' stringdef U+016F hex 'F9' stringdef U+00FA hex 'FA' stringdef U+0171 hex 'FB' stringdef U+00FC hex 'FC' stringdef U+00FD hex 'FD' stringdef U+0163 hex 'FE' stringdef U+02D9 hex 'FF' snowball-3.0.1/charsets/KOI8-R.sbl000066400000000000000000000036671500727106100165760ustar00rootroot00000000000000// KOI8-R character mappings. stringdef U+00A0 hex '9A' stringdef U+00A9 hex 'BF' stringdef U+00B0 hex '9C' stringdef U+00B2 hex '9D' stringdef U+00B7 hex '9E' stringdef U+00F7 hex '9F' stringdef U+0401 hex 'B3' stringdef U+0410 hex 'E1' stringdef U+0411 hex 'E2' stringdef U+0412 hex 'F7' stringdef U+0413 hex 'E7' stringdef U+0414 hex 'E4' stringdef U+0415 hex 'E5' stringdef U+0416 hex 'F6' stringdef U+0417 hex 'FA' stringdef U+0418 hex 'E9' stringdef U+0419 hex 'EA' stringdef U+041A hex 'EB' stringdef U+041B hex 'EC' stringdef U+041C hex 'ED' stringdef U+041D hex 'EE' stringdef U+041E hex 'EF' stringdef U+041F hex 'F0' stringdef U+0420 hex 'F2' stringdef U+0421 hex 'F3' stringdef U+0422 hex 'F4' stringdef U+0423 hex 'F5' stringdef U+0424 hex 'E6' stringdef U+0425 hex 'E8' stringdef U+0426 hex 'E3' stringdef U+0427 hex 'FE' stringdef U+0428 hex 'FB' stringdef U+0429 hex 'FD' stringdef U+042A hex 'FF' stringdef U+042B hex 'F9' stringdef U+042C hex 'F8' stringdef U+042D hex 'FC' stringdef U+042E hex 'E0' stringdef U+042F hex 'F1' stringdef U+0430 hex 'C1' stringdef U+0431 hex 'C2' stringdef U+0432 hex 'D7' stringdef U+0433 hex 'C7' stringdef U+0434 hex 'C4' stringdef U+0435 hex 'C5' stringdef U+0436 hex 'D6' stringdef U+0437 hex 'DA' stringdef U+0438 hex 'C9' stringdef U+0439 hex 'CA' stringdef U+043A hex 'CB' stringdef U+043B hex 'CC' stringdef U+043C hex 'CD' 
stringdef U+043D hex 'CE' stringdef U+043E hex 'CF' stringdef U+043F hex 'D0' stringdef U+0440 hex 'D2' stringdef U+0441 hex 'D3' stringdef U+0442 hex 'D4' stringdef U+0443 hex 'D5' stringdef U+0444 hex 'C6' stringdef U+0445 hex 'C8' stringdef U+0446 hex 'C3' stringdef U+0447 hex 'DE' stringdef U+0448 hex 'DB' stringdef U+0449 hex 'DD' stringdef U+044A hex 'DF' stringdef U+044B hex 'D9' stringdef U+044C hex 'D8' stringdef U+044D hex 'DC' stringdef U+044E hex 'C0' stringdef U+044F hex 'D1' stringdef U+0451 hex 'A3' snowball-3.0.1/charsets/cp850.sbl000066400000000000000000000066661500727106100165260ustar00rootroot00000000000000// Code page 850 (MSDOS Latin 1) character mappings. stringdef U+00A0 hex 'FF' stringdef U+00A1 hex 'AD' stringdef U+00A2 hex 'BD' stringdef U+00A3 hex '9C' stringdef U+00A4 hex 'CF' stringdef U+00A5 hex 'BE' stringdef U+00A6 hex 'DD' stringdef U+00A7 hex 'F5' stringdef U+00A8 hex 'F9' stringdef U+00A9 hex 'B8' stringdef U+00AA hex 'A6' stringdef U+00AB hex 'AE' stringdef U+00AC hex 'AA' stringdef U+00AD hex 'F0' stringdef U+00AE hex 'A9' stringdef U+00AF hex 'EE' stringdef U+00B0 hex 'F8' stringdef U+00B1 hex 'F1' stringdef U+00B2 hex 'FD' stringdef U+00B3 hex 'FC' stringdef U+00B4 hex 'EF' stringdef U+00B5 hex 'E6' stringdef U+00B6 hex 'F4' stringdef U+00B7 hex 'FA' stringdef U+00B8 hex 'F7' stringdef U+00B9 hex 'FB' stringdef U+00BA hex 'A7' stringdef U+00BB hex 'AF' stringdef U+00BC hex 'AC' stringdef U+00BD hex 'AB' stringdef U+00BE hex 'F3' stringdef U+00BF hex 'A8' stringdef U+00C0 hex 'B7' stringdef U+00C1 hex 'B5' stringdef U+00C2 hex 'B6' stringdef U+00C3 hex 'C7' stringdef U+00C4 hex '8E' stringdef U+00C5 hex '8F' stringdef U+00C6 hex '92' stringdef U+00C7 hex '80' stringdef U+00C8 hex 'D4' stringdef U+00C9 hex '90' stringdef U+00CA hex 'D2' stringdef U+00CB hex 'D3' stringdef U+00CC hex 'DE' stringdef U+00CD hex 'D6' stringdef U+00CE hex 'D7' stringdef U+00CF hex 'D8' stringdef U+00D0 hex 'D1' stringdef U+00D1 hex 'A5' stringdef 
U+00D2 hex 'E3' stringdef U+00D3 hex 'E0' stringdef U+00D4 hex 'E2' stringdef U+00D5 hex 'E5' stringdef U+00D6 hex '99' stringdef U+00D7 hex '9E' stringdef U+00D8 hex '9D' stringdef U+00D9 hex 'EB' stringdef U+00DA hex 'E9' stringdef U+00DB hex 'EA' stringdef U+00DC hex '9A' stringdef U+00DD hex 'ED' stringdef U+00DE hex 'E8' stringdef U+00DF hex 'E1' stringdef U+00E0 hex '85' stringdef U+00E1 hex 'A0' stringdef U+00E2 hex '83' stringdef U+00E3 hex 'C6' stringdef U+00E4 hex '84' stringdef U+00E5 hex '86' stringdef U+00E6 hex '91' stringdef U+00E7 hex '87' stringdef U+00E8 hex '8A' stringdef U+00E9 hex '82' stringdef U+00EA hex '88' stringdef U+00EB hex '89' stringdef U+00EC hex '8D' stringdef U+00ED hex 'A1' stringdef U+00EE hex '8C' stringdef U+00EF hex '8B' stringdef U+00F0 hex 'D0' stringdef U+00F1 hex 'A4' stringdef U+00F2 hex '95' stringdef U+00F3 hex 'A2' stringdef U+00F4 hex '93' stringdef U+00F5 hex 'E4' stringdef U+00F6 hex '94' stringdef U+00F7 hex 'F6' stringdef U+00F8 hex '9B' stringdef U+00F9 hex '97' stringdef U+00FA hex 'A3' stringdef U+00FB hex '96' stringdef U+00FC hex '81' stringdef U+00FD hex 'EC' stringdef U+00FE hex 'E7' stringdef U+00FF hex '98' stringdef U+0131 hex 'D5' stringdef U+0192 hex '9F' stringdef U+2017 hex 'F2' stringdef U+2500 hex 'C4' stringdef U+2502 hex 'B3' stringdef U+250C hex 'DA' stringdef U+2510 hex 'BF' stringdef U+2514 hex 'C0' stringdef U+2518 hex 'D9' stringdef U+251C hex 'C3' stringdef U+2524 hex 'B4' stringdef U+252C hex 'C2' stringdef U+2534 hex 'C1' stringdef U+253C hex 'C5' stringdef U+2550 hex 'CD' stringdef U+2551 hex 'BA' stringdef U+2554 hex 'C9' stringdef U+2557 hex 'BB' stringdef U+255A hex 'C8' stringdef U+255D hex 'BC' stringdef U+2560 hex 'CC' stringdef U+2563 hex 'B9' stringdef U+2566 hex 'CB' stringdef U+2569 hex 'CA' stringdef U+256C hex 'CE' stringdef U+2580 hex 'DF' stringdef U+2584 hex 'DC' stringdef U+2588 hex 'DB' stringdef U+2591 hex 'B0' stringdef U+2592 hex 'B1' stringdef U+2593 hex 'B2' 
stringdef U+25A0 hex 'FE' snowball-3.0.1/compiler/000077500000000000000000000000001500727106100151455ustar00rootroot00000000000000snowball-3.0.1/compiler/analyser.c000066400000000000000000001710511500727106100171340ustar00rootroot00000000000000#include #include /* for INT_MAX */ #include /* printf etc */ #include /* exit */ #include /* memmove */ #include "header.h" typedef enum { e_token_omitted = 0, e_unexpected_token = 1, e_string_omitted = 2, e_unexpected_token_in_among = 3, /* For codes above here, report "after " t->previous_token after the error. */ e_unresolved_substring = 14, e_not_allowed_inside_reverse = 15, e_empty_grouping = 16, e_already_backwards = 17, e_empty_among = 18, e_adjacent_bracketed_in_among = 19, e_substring_preceded_by_substring = 20, /* For codes below here, tokeniser->s is printed before the error. */ e_redeclared = 30, e_undeclared = 31, e_declared_as_different_mode = 32, e_not_of_type_x = 33, e_not_of_type_string_or_integer = 34, e_misplaced = 35, e_redefined = 36, e_misused = 37 } error_code; /* recursive usage: */ static void read_program_(struct analyser * a, int terminator); static struct node * read_C(struct analyser * a); static struct node * C_style(struct analyser * a, const char * s, int token); static void print_node_(struct node * p, int n, const char * s) { printf("%*s%s", n * 2, s, name_of_token(p->type)); if (p->name) { putchar(' '); report_s(stdout, p->name->s); } if (p->literalstring) { printf(" '"); report_b(stdout, p->literalstring); printf("'"); } else if (p->type == c_number) { printf(" %d", p->number); } printf("\n"); if (p->AE) print_node_(p->AE, n+1, "# "); if (p->left) print_node_(p->left, n+1, ""); if (p->aux) print_node_(p->aux, n+1, "@ "); if (p->right) print_node_(p->right, n, ""); } extern void print_program(struct analyser * a) { print_node_(a->program, 0, ""); } static struct node * new_node(struct analyser * a, int type) { NEW(node, p); p->next = a->nodes; a->nodes = p; p->left = NULL; p->right = NULL; 
/* Translate a single-character type code into the description used in
 * "not of type ..." error messages.
 *
 * An unrecognised code is treated as an internal error: a diagnostic is
 * written to stderr and the process exits with status 1.
 */
static const char * name_of_type(int n) {
    static const struct {
        int code;
        const char * description;
    } known_types[] = {
        { 'b', "boolean" },
        { 's', "string" },
        { 'i', "integer" },
        { 'r', "routine" },
        { 'R', "routine or grouping" },
        { 'g', "grouping" }
    };
    unsigned int k;
    for (k = 0; k < sizeof(known_types) / sizeof(known_types[0]); k++) {
        if (known_types[k].code == n) {
            return known_types[k].description;
        }
    }
    fprintf(stderr, "Invalid type %d in name_of_type()\n", n);
    exit(1);
}
return; } count_error(a); fprintf(stderr, "%s:%d: ", t->file, t->line_number); if ((int)n >= (int)e_redeclared) report_s(stderr, t->s); switch (n) { case e_token_omitted: fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; case e_unexpected_token_in_among: fprintf(stderr, "in among(...), "); /* fall through */ case e_unexpected_token: t->token_reported_as_unexpected = true; fprintf(stderr, "unexpected %s", name_of_token(t->token)); if (t->token == c_number) fprintf(stderr, " %d", t->number); if (t->token == c_name) { t->s[SIZE(t->s)] = 0; fprintf(stderr, " %s", t->s); } break; case e_string_omitted: fprintf(stderr, "string omitted"); break; case e_unresolved_substring: fprintf(stderr, "unresolved substring on line %d", x); break; case e_not_allowed_inside_reverse: fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break; case e_empty_grouping: fprintf(stderr, "empty grouping"); break; case e_already_backwards: fprintf(stderr, "backwards used when already in this mode"); break; case e_empty_among: fprintf(stderr, "empty among(...)"); break; case e_adjacent_bracketed_in_among: fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break; case e_substring_preceded_by_substring: fprintf(stderr, "substring preceded by another substring on line %d", x); break; case e_redeclared: fprintf(stderr, " re-declared"); break; case e_undeclared: fprintf(stderr, " undeclared"); break; case e_declared_as_different_mode: fprintf(stderr, " declared as %s mode; used as %s mode", name_of_mode(a->mode), name_of_mode(x)); break; case e_not_of_type_x: fprintf(stderr, " not of type %s", name_of_type(x)); break; case e_not_of_type_string_or_integer: fprintf(stderr, " not of type string or integer"); break; case e_misplaced: fprintf(stderr, " misplaced"); break; case e_redefined: fprintf(stderr, " redefined"); break; case e_misused: fprintf(stderr, " mis-used as %s mode", name_of_mode(x)); break; } if ((int)n < 
(int)e_unresolved_substring && t->previous_token > 0) fprintf(stderr, " after %s", name_of_token(t->previous_token)); fprintf(stderr, "\n"); } static void error(struct analyser * a, error_code n) { error2(a, n, 0); } static void error4(struct analyser * a, struct name * q) { count_error(a); q->s[SIZE(q->s)] = 0; fprintf(stderr, "%s:%d: %s undefined\n", a->tokeniser->file, q->used->line_number, q->s); } static void omission_error(struct analyser * a, int n) { a->tokeniser->omission = n; error(a, e_token_omitted); } static int check_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; if (t->token != code) { omission_error(a, code); return false; } return true; } static int get_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; read_token(t); int x = check_token(a, code); if (!x) hold_token(t); return x; } static struct name * look_for_name(struct analyser * a) { const byte * q = a->tokeniser->s; struct name * p; for (p = a->names; p; p = p->next) { byte * b = p->s; int n = SIZE(b); if (n == SIZE(q) && memcmp(q, b, n) == 0) { p->referenced = true; return p; } } return NULL; } static struct name * find_name(struct analyser * a) { struct name * p = look_for_name(a); if (p == NULL) error(a, e_undeclared); return p; } static void check_routine_mode(struct analyser * a, struct name * p, int mode) { if (p->mode == m_unknown) p->mode = mode; else if (p->mode != mode) error2(a, e_misused, mode); } static void check_name_type(struct analyser * a, struct name * p, int type) { switch (type) { case 's': if (p->type == t_string) return; break; case 'i': if (p->type == t_integer) return; break; case 'b': if (p->type == t_boolean) return; break; case 'R': if (p->type == t_grouping) return; /* FALLTHRU */ case 'r': if (p->type == t_routine || p->type == t_external) return; break; case 'g': if (p->type == t_grouping) return; break; } error2(a, e_not_of_type_x, type); } static void read_names(struct analyser * a, int type) { struct 
tokeniser * t = a->tokeniser; if (!get_token(a, c_bra)) return; while (true) { int token = read_token(t); switch (token) { case c_len: { /* Context-sensitive token - once declared as a name, it loses * its special meaning, for compatibility with older versions * of snowball. */ SIZE(t->s) = 0; t->s = add_literal_to_s(t->s, "len"); goto handle_as_name; } case c_lenof: { /* Context-sensitive token - once declared as a name, it loses * its special meaning, for compatibility with older versions * of snowball. */ SIZE(t->s) = 0; t->s = add_literal_to_s(t->s, "lenof"); goto handle_as_name; } case c_name: handle_as_name: if (look_for_name(a) != NULL) error(a, e_redeclared); else { NEW(name, p); p->s = copy_s(t->s); p->type = type; p->mode = m_unknown; /* used for routines, externals */ /* We defer assigning counts until after we've eliminated * variables whose values are never used. */ p->count = -1; p->referenced = false; p->used_in_among = false; p->used = NULL; p->value_used = false; p->initialised = false; p->used_in_definition = false; p->local_to = NULL; p->grouping = NULL; p->definition = NULL; p->declaration_line_number = t->line_number; p->next = a->names; a->names = p; if (token != c_name) { disable_token(t, token); } } break; default: if (!check_token(a, c_ket)) hold_token(t); return; } } } static symbol * new_literalstring(struct analyser * a) { NEW(literalstring, p); p->b = copy_b(a->tokeniser->b); p->next = a->literalstrings; a->literalstrings = p; return p->b; } static int read_AE_test(struct analyser * a) { struct tokeniser * t = a->tokeniser; switch (read_token(t)) { case c_assign: return c_mathassign; case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: return t->token; default: error(a, e_unexpected_token); hold_token(t); return c_eq; } } static int binding(int t) { switch (t) { case c_plus: case c_minus: return 1; case c_multiply: case c_divide: return 2; 
default: return -2; } } static void mark_used_in(struct analyser * a, struct name * q, struct node * p) { if (!q->used) { q->used = p; q->local_to = a->program_end->name; } else if (q->local_to) { if (q->local_to != a->program_end->name) { /* Used in more than one routine/external. */ q->local_to = NULL; } } } static void name_to_node(struct analyser * a, struct node * p, int type) { struct name * q = find_name(a); if (q) { check_name_type(a, q, type); mark_used_in(a, q, p); } p->name = q; } static struct node * read_AE(struct analyser * a, struct name * assigned_to, int B) { struct tokeniser * t = a->tokeniser; struct node * p; struct node * q; switch (read_token(t)) { case c_minus: /* monadic */ q = read_AE(a, assigned_to, 100); if (q->type == c_neg) { /* Optimise away double negation, which avoids generators * having to worry about generating "--" (decrement operator * in many languages). */ p = q->right; /* Don't free q, it's in the linked list a->nodes. */ break; } if (q->type == c_number) { /* Negated constant. */ q->number = -q->number; p = q; break; } p = new_node(a, c_neg); p->right = q; break; case c_bra: p = read_AE(a, assigned_to, 0); get_token(a, c_ket); break; case c_name: p = new_node(a, c_name); name_to_node(a, p, 'i'); if (p->name) { // $x = x + 1 shouldn't count as a use of x. p->name->value_used = (p->name != assigned_to); } break; case c_maxint: case c_minint: a->int_limits_used = true; /* fall through */ case c_cursor: case c_limit: case c_len: case c_size: p = new_node(a, t->token); break; case c_number: p = new_node(a, c_number); p->number = t->number; p->fixed_constant = true; break; case c_lenof: case c_sizeof: { int token = t->token; p = C_style(a, "S", token); if (!p->literalstring) { if (p->name) p->name->value_used = true; break; } /* Replace lenof or sizeof on a literal string with a numeric * constant. */ int result; if (token == c_lenof && t->encoding == ENC_UTF8) { // UTF-8. 
int i = 0; symbol * b = p->literalstring; result = 0; while (i < SIZE(b)) { int dummy; i += get_utf8(b + i, &dummy); ++result; } } else { result = SIZE(p->literalstring); } p->type = c_number; p->literalstring = NULL; p->number = result; p->fixed_constant = (token == c_lenof); break; } default: error(a, e_unexpected_token); hold_token(t); return NULL; } while (true) { int token = read_token(t); int b = binding(token); if (binding(token) <= B) { hold_token(t); return p; } struct node * r = read_AE(a, assigned_to, b); if (p->type == c_number && r->type == c_number) { // Evaluate constant sub-expression. q = new_node(a, c_number); switch (token) { case c_plus: q->number = p->number + r->number; break; case c_minus: q->number = p->number - r->number; break; case c_multiply: q->number = p->number * r->number; break; case c_divide: if (r->number == 0) { fprintf(stderr, "%s:%d: Division by zero\n", t->file, t->line_number); exit(1); } q->number = p->number / r->number; break; default: fprintf(stderr, "Unexpected AE operator %s\n", name_of_token(token)); exit(1); } q->fixed_constant = p->fixed_constant && r->fixed_constant; q->line_number = p->line_number; } else { // Check for specific constant or no-op cases. 
q = NULL; switch (token) { case c_plus: // 0 + r is r if (p->type == c_number && p->number == 0) { q = r; break; } // p + 0 is p if (r->type == c_number && r->number == 0) { q = p; break; } break; case c_minus: // 0 - r is -r if (p->type == c_number && p->number == 0) { q = new_node(a, c_neg); q->right = r; break; } // p - 0 is p if (r->type == c_number && r->number == 0) { q = p; break; } break; case c_multiply: // 0 * r is 0 if (p->type == c_number && p->number == 0) { q = p; break; } // p * 0 is 0 if (r->type == c_number && r->number == 0) { q = r; q->line_number = p->line_number; break; } // -1 * r is -r if (p->type == c_number && p->number == -1) { q = new_node(a, c_neg); q->right = r; q->line_number = p->line_number; break; } // p * -1 is -p if (r->type == c_number && r->number == -1) { q = new_node(a, c_neg); q->right = p; q->line_number = p->line_number; break; } // 1 * r is r if (p->type == c_number && p->number == 1) { q = r; q->line_number = p->line_number; break; } // p * 1 is p if (r->type == c_number && r->number == 1) { q = p; break; } break; case c_divide: // p / 1 is p if (r->type == c_number && r->number == 1) { q = p; break; } // p / -1 is -p if (r->type == c_number && r->number == -1) { q = new_node(a, c_neg); q->right = p; q->line_number = p->line_number; break; } // p / 0 is an error! 
if (r->type == c_number && r->number == 0) { fprintf(stderr, "%s:%d: Division by zero\n", t->file, t->line_number); exit(1); } break; } if (!q) { q = new_node(a, token); q->left = p; q->right = r; } } p = q; } } static struct node * read_C_connection(struct analyser * a, struct node * q, int op) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, op); struct node * p_end = q; p->left = q; do { q = read_C(a); p_end->right = q; p_end = q; } while (read_token(t) == op); hold_token(t); return p; } static struct node * read_C_list(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_bra); struct node * p_end = NULL; while (true) { int token = read_token(t); if (token == c_ket) return p; if (token < 0) { omission_error(a, c_ket); return p; } hold_token(t); { struct node * q = read_C(a); while (true) { token = read_token(t); if (token != c_and && token != c_or) { hold_token(t); break; } q = read_C_connection(a, q, token); } if (p_end == NULL) p->left = q; else p_end->right = q; p_end = q; } } } static struct node * C_style(struct analyser * a, const char * s, int token) { int i; struct node * p = new_node(a, token); for (i = 0; s[i] != 0; i++) switch (s[i]) { case 'C': p->left = read_C(a); continue; case 'D': p->aux = read_C(a); continue; case 'A': p->AE = read_AE(a, NULL, 0); continue; case 'f': get_token(a, c_for); continue; case 'S': { int str_token = read_token(a->tokeniser); if (str_token == c_name) name_to_node(a, p, 's'); else if (str_token == c_literalstring) p->literalstring = new_literalstring(a); else error(a, e_string_omitted); } continue; case 'b': case 's': case 'i': if (get_token(a, c_name)) name_to_node(a, p, s[i]); continue; } return p; } static struct node * read_literalstring(struct analyser * a) { struct node * p = new_node(a, c_literalstring); p->literalstring = new_literalstring(a); return p; } static void reverse_b(symbol * b) { int i = 0; int j = SIZE(b) - 1; while (i < j) { int ch1 = b[i]; 
int ch2 = b[j]; b[i++] = ch2; b[j--] = ch1; } } static int compare_amongvec(const void *pv, const void *qv) { const struct amongvec * p = (const struct amongvec*)pv; const struct amongvec * q = (const struct amongvec*)qv; symbol * b_p = p->b; int p_size = p->size; symbol * b_q = q->b; int q_size = q->size; int smaller_size = p_size < q_size ? p_size : q_size; int i; for (i = 0; i < smaller_size; i++) if (b_p[i] != b_q[i]) return b_p[i] - b_q[i]; if (p_size - q_size) return p_size - q_size; return p->line_number - q->line_number; } #define PTR_NULL_CHECK(P, Q) do {\ if ((Q) == NULL) {\ if ((P) != NULL) return 1;\ } else {\ if ((P) == NULL) return -1;\ }\ } while (0) static int compare_node(const struct node *p, const struct node *q) { PTR_NULL_CHECK(p, q); if (q == NULL) { /* p must be NULL too. */ return 0; } if (p->type != q->type) return p->type > q->type ? 1 : -1; if (p->mode != q->mode) return p->mode > q->mode ? 1 : -1; if (p->type == c_number) { if (p->number != q->number) return p->number > q->number ? 
1 : -1; } PTR_NULL_CHECK(p->left, q->left); if (p->left) { int r = compare_node(p->left, q->left); if (r != 0) return r; } PTR_NULL_CHECK(p->AE, q->AE); if (p->AE) { int r = compare_node(p->AE, q->AE); if (r != 0) return r; } PTR_NULL_CHECK(p->aux, q->aux); if (p->aux) { int r = compare_node(p->aux, q->aux); if (r != 0) return r; } PTR_NULL_CHECK(p->name, q->name); if (p->name) { int r; if (SIZE(p->name->s) != SIZE(q->name->s)) { return SIZE(p->name->s) - SIZE(q->name->s); } r = memcmp(p->name->s, q->name->s, SIZE(p->name->s)); if (r != 0) return r; } PTR_NULL_CHECK(p->literalstring, q->literalstring); if (p->literalstring) { int r; if (SIZE(p->literalstring) != SIZE(q->literalstring)) { return SIZE(p->literalstring) - SIZE(q->literalstring); } r = memcmp(p->literalstring, q->literalstring, SIZE(p->literalstring) * sizeof(symbol)); if (r != 0) return r; } return compare_node(p->right, q->right); } static struct node * make_among(struct analyser * a, struct node * p, struct node * substring) { NEW(among, x); NEWVEC(amongvec, v, p->number); struct node * q = p->left; struct node * starter = NULL; struct amongvec * w0 = v; struct amongvec * w1 = v; int result = 1; int direction = substring != NULL ? 
substring->mode : p->mode; int backward = direction == m_backward; if (a->amongs == NULL) a->amongs = x; else a->amongs_end->next = x; a->amongs_end = x; x->next = NULL; x->node = p; x->b = v; x->number = a->among_count++; x->function_count = 0; x->nocommand_count = 0; x->amongvar_needed = false; x->always_matches = false; x->shortest_size = INT_MAX; if (q->type == c_bra) { starter = q; p->left = q = q->right; } while (q) { if (q->type == c_literalstring) { symbol * b = q->literalstring; w1->b = b; /* pointer to case string */ w1->action = NULL; /* action gets filled in below */ w1->line_number = q->line_number; w1->size = SIZE(b); /* number of characters in string */ w1->i = -1; /* index of longest substring */ w1->result = -1; /* number of corresponding case expression */ if (q->left) { struct name * function = q->left->name; w1->function = function; function->used_in_among = true; check_routine_mode(a, function, direction); x->function_count++; } else { w1->function = NULL; if (w1->size == 0) { // This among contains the empty string without a gating // function so it will always match. x->always_matches = true; } } w1++; } else if (q->left == NULL) { /* empty command: () */ w0 = w1; } else { /* Check for previous action which is the same as this one and use * the same action code if we find one. */ int among_result = -1; struct amongvec * w; for (w = v; w < w0; ++w) { if (w->action && compare_node(w->action->left, q->left) == 0) { if (w->result <= 0) { printf("Among code %d isn't positive\n", w->result); exit(1); } among_result = w->result; break; } } if (among_result < 0) { among_result = result++; } while (w0 != w1) { w0->action = q; w0->result = among_result; w0++; } } q = q->right; } if (w1-v != p->number) { fprintf(stderr, "oh! 
%d %d\n", (int)(w1-v), p->number); exit(1); } x->command_count = result - 1; { NEWVEC(node*, commands, x->command_count); for (int i = 0; i != x->command_count; ++i) commands[i] = NULL; for (w0 = v; w0 < w1; w0++) { if (w0->result > 0) { /* result == -1 when there's no command. */ if (w0->result > x->command_count) { fprintf(stderr, "More among codes than expected\n"); exit(1); } if (!commands[w0->result - 1]) commands[w0->result - 1] = w0->action; } else { ++x->nocommand_count; } if (backward) reverse_b(w0->b); } x->commands = commands; } qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec); /* the following loop is O(n squared) */ for (w0 = w1 - 1; w0 >= v; w0--) { symbol * b = w0->b; int size = w0->size; struct amongvec * w; if (size && size < x->shortest_size) x->shortest_size = size; for (w = w0 - 1; w >= v; w--) { if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) { w0->i = w - v; /* fill in index of longest substring */ break; } } } if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b); for (w0 = v; w0 < w1 - 1; w0++) if (w0->size == (w0 + 1)->size && memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) { count_error(a); fprintf(stderr, "%s:%d: among(...) has repeated string '", a->tokeniser->file, (w0 + 1)->line_number); report_b(stderr, (w0 + 1)->b); fprintf(stderr, "'\n"); count_error(a); fprintf(stderr, "%s:%d: previously seen here\n", a->tokeniser->file, w0->line_number); } x->literalstring_count = p->number; p->among = x; if (x->command_count > 1 || (x->command_count == 1 && x->nocommand_count > 0)) { /* We need to set among_var rather than just checking if find_among*() * returns zero or not. 
*/ x->amongvar_needed = a->amongvar_needed = true; } if (starter) { starter->right = p; if (substring) { p = starter; } else { substring = new_node(a, c_substring); substring->right = starter; p = substring; } } x->substring = substring; if (substring != NULL) substring->among = x; if (x->function_count > 0) ++a->among_with_function_count; return p; } static int is_just_true(struct node * q) { if (!q) return 1; if (q->type != c_bra && q->type != c_true) return 0; return is_just_true(q->left) && is_just_true(q->right); } static struct node * read_among(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_among); struct node * p_end = NULL; int previous_token = -1; struct node * substring = a->substring; a->substring = NULL; p->number = 0; /* counts the number of literals */ if (!get_token(a, c_bra)) return p; while (true) { struct node * q; int token = read_token(t); switch (token) { case c_literalstring: q = read_literalstring(a); if (read_token(t) == c_name) { struct node * r = new_node(a, c_name); name_to_node(a, r, 'r'); q->left = r; } else { hold_token(t); } p->number++; break; case c_bra: if (previous_token == c_bra) error(a, e_adjacent_bracketed_in_among); q = read_C_list(a); if (is_just_true(q->left)) { /* Convert anything equivalent to () to () so we handle it * the same way. 
*/ q->left = NULL; } break; default: error(a, e_unexpected_token_in_among); previous_token = token; continue; case c_ket: if (p->number == 0) error(a, e_empty_among); if (t->error_count == 0) p = make_among(a, p, substring); return p; } previous_token = token; if (p_end == NULL) p->left = q; else p_end->right = q; p_end = q; } } static struct node * read_substring(struct analyser * a) { struct node * p = new_node(a, c_substring); if (a->substring != NULL) error2(a, e_substring_preceded_by_substring, a->substring->line_number); a->substring = p; return p; } static void check_modifyable(struct analyser * a) { if (!a->modifyable) error(a, e_not_allowed_inside_reverse); } static int ae_uses_name(struct node * p, struct name * q) { if (!p) { // AE is NULL after a syntax error, e.g. `$x = $y` return 0; } switch (p->type) { case c_name: case c_lenof: case c_sizeof: if (p->name == q) return 1; break; case c_neg: return ae_uses_name(p->right, q); case c_multiply: case c_plus: case c_minus: case c_divide: return ae_uses_name(p->left, q) || ae_uses_name(p->right, q); } return 0; } static struct node * read_C(struct analyser * a) { struct tokeniser * t = a->tokeniser; int token = read_token(t); switch (token) { case c_bra: { struct node * p = read_C_list(a); if (p->type != c_bra) { fprintf(stderr, "read_C_list returned unexpected type %s\n", name_of_token(p->type)); exit(1); } if (p->left && !p->left->right) { // Replace a single entry command list with the command it // contains in order to make subsequent optimisations easier. p = p->left; } return p; } case c_backwards: { int mode = a->mode; if (a->mode == m_backward) error(a, e_already_backwards); else a->mode = m_backward; { struct node * p = C_style(a, "C", token); a->mode = mode; return p; } } case c_reverse: { int mode = a->mode; int modifyable = a->modifyable; a->modifyable = false; a->mode = mode == m_forward ? 
m_backward : m_forward; { struct node * p = C_style(a, "C", token); a->mode = mode; a->modifyable = modifyable; return p; } } case c_not: case c_try: case c_fail: case c_test: case c_do: case c_repeat: return C_style(a, "C", token); case c_goto: case c_gopast: { struct node * subcommand = read_C(a); if (subcommand->type == c_grouping || subcommand->type == c_non) { /* We synthesise special commands for "goto" or "gopast" when * used on a grouping or an inverted grouping - the movement of * c by the matching action is exactly what we want! * * Adding the tokens happens to give unique values (the code * would fail to compile if it didn't!) */ switch (token + subcommand->type) { case c_goto + c_grouping: subcommand->type = c_goto_grouping; break; case c_gopast + c_grouping: subcommand->type = c_gopast_grouping; break; case c_goto + c_non: subcommand->type = c_goto_non; break; case c_gopast + c_non: subcommand->type = c_gopast_non; break; default: fprintf(stderr, "Unexpected go/grouping combination: %s %s", name_of_token(token), name_of_token(subcommand->type)); exit(1); } return subcommand; } struct node * p = new_node(a, token); p->left = subcommand; return p; } case c_loop: { struct node * n = C_style(a, "AC", token); // n->AE is NULL after a syntax error, e.g. `loop next`. if (n->AE && n->AE->type == c_number) { if (n->AE->number <= 0) { // `loop N C`, where N <= 0 is a no-op. if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: loop %d C is a no-op\n", t->file, n->AE->line_number, n->AE->number); } n->AE = NULL; n->left = NULL; n->type = c_true; } else if (n->AE->number == 1) { // `loop 1 C` -> `C`. if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: loop 1 C is just C\n", t->file, n->AE->line_number); } n = n->left; } } return n; } case c_atleast: { struct node * n = C_style(a, "AC", token); // n->AE is NULL after a syntax error, e.g. `loop next`. 
if (n->AE && n->AE->type == c_number) { if (n->AE->number <= 0) { // `atleast N C` where N <= 0 -> `repeat C`. if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: atleast %d C is just repeat C\n", t->file, n->AE->line_number, n->AE->number); } n->AE = NULL; n->type = c_repeat; } } return n; } case c_setmark: { struct node * n = C_style(a, "i", token); if (n->name) n->name->initialised = true; return n; } case c_tomark: case c_atmark: return C_style(a, "A", token); case c_hop: { struct node * n = C_style(a, "A", token); // n->AE is NULL after a syntax error, e.g. `hop hop`. if (n->AE && n->AE->type == c_number) { if (n->AE->number == 1) { // Convert `hop 1` to `next`. n->AE = NULL; n->type = c_next; } else if (n->AE->number == 0) { if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: hop 0 is a no-op\n", t->file, n->AE->line_number); } n->AE = NULL; n->type = c_true; } else if (n->AE->number < 0) { fprintf(stderr, "%s:%d: warning: hop %d now signals f (as was " "always documented) rather than moving the cursor " "in the opposite direction\n", t->file, n->AE->line_number, n->AE->number); n->AE = NULL; n->type = c_false; } } return n; } case c_delete: check_modifyable(a); /* fall through */ case c_next: case c_tolimit: case c_atlimit: case c_leftslice: case c_rightslice: case c_true: case c_false: case c_debug: return new_node(a, token); case c_assignto: case c_sliceto: { check_modifyable(a); struct node *n = C_style(a, "s", token); if (n->name) n->name->initialised = true; if (token == c_assignto) { fprintf(stderr, "%s:%d: warning: Use of `=>` is not recommended, " "see https://snowballstem.org/compiler/snowman.html " "section 13.3 for details\n", t->file, n->line_number); } return n; } case c_assign: case c_insert: case c_attach: case c_slicefrom: { struct node *n; check_modifyable(a); n = C_style(a, "S", token); if (n->name) n->name->value_used = true; return n; } case c_setlimit: return C_style(a, "CfD", token); case c_set: case c_unset: { 
struct node * n = C_style(a, "b", token); if (n->name) n->name->initialised = true; return n; } case c_dollar: { read_token(t); if (t->token == c_bra) { /* Handle newer $(AE REL_OP AE) syntax. */ struct node * n = read_AE(a, NULL, 0); read_token(t); token = t->token; switch (token) { case c_assign: count_error(a); fprintf(stderr, "%s:%d: Expected relational operator (did you mean '=='?)\n", t->file, t->line_number); /* Assume it was == to try to avoid an error avalanche. */ token = c_eq; /* FALLTHRU */ case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: { struct node * lhs = n; struct node * rhs = read_AE(a, NULL, 0); if (lhs->type == c_number && rhs->type == c_number) { // Evaluate constant numeric test expression. int result; switch (token) { case c_eq: result = (lhs->number == rhs->number); break; case c_ne: result = (lhs->number != rhs->number); break; case c_gt: result = (lhs->number > rhs->number); break; case c_ge: result = (lhs->number >= rhs->number); break; case c_lt: result = (lhs->number < rhs->number); break; case c_le: result = (lhs->number <= rhs->number); break; default: fprintf(stderr, "Unexpected numeric test operator %s\n", name_of_token(t->token)); exit(1); } n = new_node(a, result ? c_true : c_false); } else { n = new_node(a, token); n->left = lhs; n->AE = rhs; } get_token(a, c_ket); break; } default: error(a, e_unexpected_token); hold_token(t); break; } return n; } if (t->token == c_name) { struct node * p; struct name * q = find_name(a); int mode = a->mode; int modifyable = a->modifyable; if (q && q->type == t_string) { /* Assume for now that $ on string both initialises and * uses the string variable. FIXME: Can we do better? 
*/ q->initialised = true; q->value_used = true; a->mode = m_forward; a->modifyable = true; p = new_node(a, c_dollar); p->left = read_C(a); p->name = q; } else { if (q && q->type != t_integer) { /* If $ is used on an unknown name or a name which * isn't a string or an integer then we assume the * unknown name is an integer as $ is used more often * on integers than strings, so hopefully this it less * likely to cause an error avalanche. * * For an unknown name, we'll already have reported an * error. */ error(a, e_not_of_type_string_or_integer); q = NULL; } p = new_node(a, read_AE_test(a)); switch (p->type) { case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: p->left = new_node(a, c_name); p->left->name = q; if (q) { q->value_used = true; } p->AE = read_AE(a, NULL, 0); break; default: /* +=, etc don't "initialise" as they only * amend an existing value. Similarly, they * don't count as using the value. */ p->name = q; p->AE = read_AE(a, q, 0); if (p->type == c_mathassign && q) { /* $x = x + 1 doesn't initialise x. 
*/ q->initialised = !ae_uses_name(p->AE, q); } break; } } if (q) mark_used_in(a, q, p); a->mode = mode; a->modifyable = modifyable; return p; } error(a, e_unexpected_token); hold_token(t); return new_node(a, c_dollar); } case c_name: { struct name * q = find_name(a); struct node * p = new_node(a, c_name); if (q) { mark_used_in(a, q, p); switch (q->type) { case t_boolean: p->type = c_booltest; q->value_used = true; break; case t_integer: error(a, e_misplaced); /* integer name misplaced */ break; case t_string: q->value_used = true; break; case t_routine: case t_external: p->type = c_call; check_routine_mode(a, q, a->mode); break; case t_grouping: p->type = c_grouping; break; } } p->name = q; return p; } case c_non: { struct node * p = new_node(a, token); read_token(t); if (t->token == c_minus) read_token(t); if (!check_token(a, c_name)) { omission_error(a, c_name); return p; } name_to_node(a, p, 'g'); return p; } case c_literalstring: return read_literalstring(a); case c_among: return read_among(a); case c_substring: return read_substring(a); default: error(a, e_unexpected_token); return NULL; } } static int next_symbol(symbol * p, symbol * W, int utf8) { if (utf8) { int ch; int j = get_utf8(p, & ch); *W = ch; return j; } else { *W = *p; return 1; } } static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) { int j = 0; symbol W; int width; if (style == c_plus) { while (j < SIZE(q)) { width = next_symbol(q + j, &W, utf8); p = add_symbol_to_b(p, W); j += width; } } else { while (j < SIZE(q)) { int i; width = next_symbol(q + j, &W, utf8); for (i = 0; i < SIZE(p); i++) { if (p[i] == W) { memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * sizeof(symbol)); SIZE(p)--; } } j += width; } } return p; } static void read_define_grouping(struct analyser * a, struct name * q) { struct tokeniser * t = a->tokeniser; int style = c_plus; { NEW(grouping, p); if (a->groupings == NULL) a->groupings = p; else a->groupings_end->next = p; a->groupings_end = p; if (q) { if 
(q->grouping != NULL) { error(a, e_redefined); FREE(q->grouping); } q->grouping = p; } p->next = NULL; p->name = q; p->line_number = t->line_number; p->b = create_b(0); while (true) { switch (read_token(t)) { case c_name: { struct name * r = find_name(a); if (!r) break; check_name_type(a, r, 'g'); if (r == q) { count_error(a); r->s[SIZE(r->s)] = 0; fprintf(stderr, "%s:%d: %s defined in terms of itself\n", t->file, t->line_number, r->s); } else if (!r->grouping) { count_error(a); r->s[SIZE(r->s)] = 0; fprintf(stderr, "%s:%d: %s undefined\n", t->file, t->line_number, r->s); } else { p->b = alter_grouping(p->b, r->grouping->b, style, false); } r->used_in_definition = true; break; } case c_literalstring: p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8)); break; default: error(a, e_unexpected_token); return; } switch (read_token(t)) { case c_plus: case c_minus: style = t->token; break; default: goto label0; } } label0: { int i; int max = 0; int min = 1<<16; for (i = 0; i < SIZE(p->b); i++) { if (p->b[i] > max) max = p->b[i]; if (p->b[i] < min) min = p->b[i]; } p->largest_ch = max; p->smallest_ch = min; if (min == 1<<16) error(a, e_empty_grouping); } hold_token(t); } } static void read_define_routine(struct analyser * a, struct name * q) { struct node * p = new_node(a, c_define); a->amongvar_needed = false; if (q) { check_name_type(a, q, 'R'); if (q->definition != NULL) error(a, e_redefined); if (q->mode == m_unknown) q->mode = a->mode; else if (q->mode != a->mode) error2(a, e_declared_as_different_mode, q->mode); } p->name = q; if (a->program == NULL) a->program = p; else a->program_end->right = p; a->program_end = p; get_token(a, c_as); p->left = read_C(a); if (q) q->definition = p->left; /* We should get a node with a NULL right pointer from read_C() for the * routine's code. We synthesise a "functionend" node there so * optimisations such as dead code elimination and tail call optimisation * can easily see where the function ends. 
*/ assert(p->left->right == NULL); p->left->right = new_node(a, c_functionend); if (a->substring != NULL) { error2(a, e_unresolved_substring, a->substring->line_number); a->substring = NULL; } p->amongvar_needed = a->amongvar_needed; } static void read_define(struct analyser * a) { if (get_token(a, c_name)) { struct name * q = find_name(a); int type; if (q) { type = q->type; } else { /* No declaration so sniff next token - if it is a string or name * we parse as a grouping, otherwise we parse as a routine. This * avoids an avalanche of further errors if `as` is missing from a * routine definition. */ switch (peek_token(a->tokeniser)) { case c_literalstring: case c_name: type = t_grouping; break; default: type = t_routine; } } if (type == t_grouping) { read_define_grouping(a, q); } else { read_define_routine(a, q); } } } static void read_backwardmode(struct analyser * a) { int mode = a->mode; a->mode = m_backward; if (get_token(a, c_bra)) { read_program_(a, c_ket); check_token(a, c_ket); } a->mode = mode; } static void read_program_(struct analyser * a, int terminator) { struct tokeniser * t = a->tokeniser; while (true) { switch (read_token(t)) { case c_strings: read_names(a, t_string); break; case c_booleans: read_names(a, t_boolean); break; case c_integers: read_names(a, t_integer); break; case c_routines: read_names(a, t_routine); break; case c_externals: read_names(a, t_external); break; case c_groupings: read_names(a, t_grouping); break; case c_define: read_define(a); break; case c_backwardmode:read_backwardmode(a); break; case c_ket: if (terminator == c_ket) return; /* fall through */ default: error(a, e_unexpected_token); break; case -1: if (terminator >= 0) omission_error(a, c_ket); return; } } } static void remove_dead_assignments(struct node * p, struct name * q) { if (p->name == q) { switch (p->type) { case c_assignto: case c_sliceto: case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_setmark: case 
c_set: case c_unset: case c_dollar: /* c_true is a no-op. */ p->type = c_true; p->AE = NULL; break; default: /* There are no read accesses to this variable, so any * references must be assignments. */ fprintf(stderr, "Unhandled type of dead assignment via %s\n", name_of_token(p->type)); exit(1); } } if (p->AE) remove_dead_assignments(p->AE, q); if (p->left) remove_dead_assignments(p->left, q); if (p->aux) remove_dead_assignments(p->aux, q); if (p->right) remove_dead_assignments(p->right, q); } extern void read_program(struct analyser * a) { read_program_(a, -1); { struct name * q = a->names; while (q) { switch (q->type) { case t_external: case t_routine: if (q->used && q->definition == NULL) error4(a, q); break; case t_grouping: if (q->used && q->grouping == NULL) error4(a, q); break; } q = q->next; } } if (a->tokeniser->error_count == 0) { struct name * q = a->names; struct name ** ptr = &(a->names); while (q) { if (!q->referenced) { q->s[SIZE(q->s)] = 0; fprintf(stderr, "%s:%d: warning: %s '%s' ", a->tokeniser->file, q->declaration_line_number, name_of_name_type(q->type), q->s); if (q->type == t_routine || q->type == t_external || q->type == t_grouping) { fprintf(stderr, "declared but not defined\n"); } else { fprintf(stderr, "defined but not used\n"); } q = q->next; *ptr = q; continue; } else if (q->type == t_routine || q->type == t_grouping) { /* It's OK to define a grouping but only use it to define other * groupings. */ if (!q->used && !q->used_in_definition) { int line_num; if (q->type == t_routine) { line_num = q->definition->line_number; } else { line_num = q->grouping->line_number; } q->s[SIZE(q->s)] = 0; fprintf(stderr, "%s:%d: warning: %s '%s' defined but not used\n", a->tokeniser->file, line_num, name_of_name_type(q->type), q->s); q = q->next; *ptr = q; continue; } } else if (q->type == t_external) { /* Unused is OK. 
*/ } else if (!q->initialised) { q->s[SIZE(q->s)] = 0; fprintf(stderr, "%s:%d: warning: %s '%s' is never initialised\n", a->tokeniser->file, q->declaration_line_number, name_of_name_type(q->type), q->s); } else if (!q->value_used) { q->s[SIZE(q->s)] = 0; fprintf(stderr, "%s:%d: warning: %s '%s' is set but never used\n", a->tokeniser->file, q->declaration_line_number, name_of_name_type(q->type), q->s); remove_dead_assignments(a->program, q); q = q->next; *ptr = q; continue; } ptr = &(q->next); q = q->next; } { /* Now we've eliminated variables whose values are never used we * can number the variables, which is used by some generators. */ int * name_count = a->name_count; struct name * n; for (n = a->names; n; n = n->next) { n->count = name_count[n->type]++; } } } } extern struct analyser * create_analyser(struct tokeniser * t) { NEW(analyser, a); a->tokeniser = t; a->nodes = NULL; a->names = NULL; a->literalstrings = NULL; a->program = NULL; a->amongs = NULL; a->among_count = 0; a->among_with_function_count = 0; a->groupings = NULL; a->mode = m_forward; a->modifyable = true; { int i; for (i = 0; i < t_size; i++) a->name_count[i] = 0; } a->substring = NULL; a->int_limits_used = false; return a; } extern void close_analyser(struct analyser * a) { { struct node * q = a->nodes; while (q) { struct node * q_next = q->next; FREE(q); q = q_next; } } { struct name * q = a->names; while (q) { struct name * q_next = q->next; lose_s(q->s); FREE(q); q = q_next; } } { struct literalstring * q = a->literalstrings; while (q) { struct literalstring * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } { struct among * q = a->amongs; while (q) { struct among * q_next = q->next; FREE(q->b); FREE(q->commands); FREE(q); q = q_next; } } { struct grouping * q = a->groupings; while (q) { struct grouping * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } FREE(a); } 
/* Return 1 if the NUL-terminated strings `s1` and `s2` have identical
 * contents, 0 otherwise.
 */
static int eq(const char * s1, const char * s2) {
    if (strcmp(s1, s2) != 0) {
        return 0;
    }
    return 1;
}
[OPTIONS]\n\n" "Supported options:\n" " -o, -output OUTPUT_BASE\n" " -s, -syntax\n" " -comments\n" #ifndef DISABLE_JAVA " -j, -java\n" #endif #ifndef DISABLE_CSHARP " -cs, -csharp\n" #endif " -c++\n" #ifndef DISABLE_PASCAL " -pascal\n" #endif #ifndef DISABLE_PYTHON " -py, -python\n" #endif #ifndef DISABLE_JS " -js generate Javascript\n" #endif #ifndef DISABLE_RUST " -rust\n" #endif #ifndef DISABLE_GO " -go\n" #endif #ifndef DISABLE_ADA " -ada\n" #endif " -w, -widechars\n" " -u, -utf8\n" " -n, -name CLASS_NAME\n" " -ep, -eprefix EXTERNAL_PREFIX\n" " -vp, -vprefix VARIABLE_PREFIX\n" " -i, -include DIRECTORY\n" " -r, -runtime DIRECTORY\n" " -p, -parentclassname CLASS_NAME fully qualified parent class name\n" #if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) " -P, -Package PACKAGE_NAME package name for stemmers\n" " -S, -Stringclass STRING_CLASS StringBuffer-compatible class\n" " -a, -amongclass AMONG_CLASS fully qualified name of the Among class\n" #endif #ifndef DISABLE_GO " -gop, -gopackage PACKAGE_NAME Go package name for stemmers\n" " -gor, -goruntime PACKAGE_NAME Go snowball runtime package\n" #endif " --help display this help and exit\n" " --version output version information and exit\n" ); exit(exit_code); } static void check_lim(int i, int argc) { if (i >= argc) { fprintf(stderr, "argument list is one short\n"); print_arglist(1); } } static FILE * get_output(byte * s) { s[SIZE(s)] = 0; const char * filename = (const char *)s; FILE * output = fopen(filename, "w"); if (output == NULL) { fprintf(stderr, "Can't open output %s\n", filename); exit(1); } return output; } static int read_options(struct options * o, int argc, char * argv[]) { int i = 1; int new_argc = 1; /* Note down the last option used to specify an explicit encoding so * we can warn we ignored it for languages with a fixed encoding. 
*/ const char * encoding_opt = NULL; /* set defaults: */ o->output_file = NULL; o->syntax_tree = false; o->comments = false; o->js_esm = false; o->externals_prefix = NULL; o->variables_prefix = NULL; o->runtime_path = NULL; o->parent_class_name = NULL; o->string_class = NULL; o->among_class = NULL; o->package = NULL; o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; o->name = NULL; o->make_lang = LANG_C; o->includes = NULL; o->includes_end = NULL; o->encoding = ENC_SINGLEBYTE; /* read options: */ while (i < argc) { char * s = argv[i++]; if (s[0] != '-') { /* Non-option argument - shuffle down. */ argv[new_argc++] = s; continue; } { if (eq(s, "-o") || eq(s, "-output")) { check_lim(i, argc); o->output_file = argv[i++]; continue; } if (eq(s, "-n") || eq(s, "-name")) { char * new_name; size_t len; check_lim(i, argc); /* Take a copy of the argument here, because * later we will free o->name */ len = strlen(argv[i]); new_name = MALLOC(len + 1); memcpy(new_name, argv[i++], len); new_name[len] = '\0'; o->name = new_name; continue; } #ifndef DISABLE_JS if (eq(s, "-js")) { o->make_lang = LANG_JAVASCRIPT; o->js_esm = false; continue; } #endif #ifndef DISABLE_RUST if (eq(s, "-rust")) { o->make_lang = LANG_RUST; continue; } #endif #ifndef DISABLE_GO if (eq(s, "-go")) { o->make_lang = LANG_GO; continue; } #endif #ifndef DISABLE_JAVA if (eq(s, "-j") || eq(s, "-java")) { o->make_lang = LANG_JAVA; continue; } #endif #ifndef DISABLE_CSHARP if (eq(s, "-cs") || eq(s, "-csharp")) { o->make_lang = LANG_CSHARP; continue; } #endif if (eq(s, "-c++")) { o->make_lang = LANG_CPLUSPLUS; continue; } #ifndef DISABLE_PASCAL if (eq(s, "-pascal")) { o->make_lang = LANG_PASCAL; continue; } #endif #ifndef DISABLE_PYTHON if (eq(s, "-py") || eq(s, "-python")) { o->make_lang = LANG_PYTHON; continue; } #endif #ifndef DISABLE_ADA if (eq(s, "-ada")) { o->make_lang = LANG_ADA; continue; } #endif if (eq(s, "-w") || eq(s, "-widechars")) { encoding_opt = s; o->encoding = ENC_WIDECHARS; continue; } if 
(eq(s, "-s") || eq(s, "-syntax")) { o->syntax_tree = true; continue; } if (eq(s, "-comments")) { o->comments = true; continue; } if (eq(s, "-ep") || eq(s, "-eprefix")) { check_lim(i, argc); o->externals_prefix = argv[i++]; continue; } if (eq(s, "-vp") || eq(s, "-vprefix")) { check_lim(i, argc); o->variables_prefix = argv[i++]; continue; } if (eq(s, "-i") || eq(s, "-include")) { check_lim(i, argc); { NEW(include, p); byte * include_dir = add_sz_to_s(NULL, argv[i++]); include_dir = add_char_to_s(include_dir, '/'); p->next = NULL; p->s = include_dir; if (o->includes == NULL) { o->includes = p; } else { o->includes_end->next = p; } o->includes_end = p; } continue; } if (eq(s, "-r") || eq(s, "-runtime")) { check_lim(i, argc); o->runtime_path = argv[i++]; continue; } if (eq(s, "-u") || eq(s, "-utf8")) { encoding_opt = s; o->encoding = ENC_UTF8; continue; } if (eq(s, "-p") || eq(s, "-parentclassname")) { check_lim(i, argc); o->parent_class_name = argv[i++]; continue; } #if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) if (eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; continue; } if (eq(s, "-S") || eq(s, "-stringclass")) { check_lim(i, argc); o->string_class = argv[i++]; continue; } if (eq(s, "-a") || eq(s, "-amongclass")) { check_lim(i, argc); o->among_class = argv[i++]; continue; } #endif #ifndef DISABLE_GO if (eq(s, "-gop") || eq(s, "-gopackage")) { check_lim(i, argc); o->package = argv[i++]; continue; } if (eq(s, "-gor") || eq(s, "-goruntime")) { check_lim(i, argc); o->go_snowball_runtime = argv[i++]; continue; } #endif if (eq(s, "--help")) { print_arglist(0); } if (eq(s, "--version")) { printf("Snowball compiler version " SNOWBALL_VERSION "\n"); exit(0); } fprintf(stderr, "'%s' misplaced\n", s); print_arglist(1); } } if (new_argc == 1) { fprintf(stderr, "no source files specified\n"); print_arglist(1); } argv[new_argc] = NULL; /* Set language-dependent defaults. 
*/ switch (o->make_lang) { case LANG_C: case LANG_CPLUSPLUS: encoding_opt = NULL; break; case LANG_CSHARP: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_CS_BASE_CLASS; if (!o->string_class) o->string_class = DEFAULT_CS_STRING_CLASS; if (!o->among_class) o->among_class = DEFAULT_CS_AMONG_CLASS; if (!o->package) o->package = DEFAULT_CS_NAMESPACE; break; case LANG_GO: o->encoding = ENC_UTF8; if (!o->package) o->package = DEFAULT_GO_PACKAGE; break; case LANG_ADA: o->encoding = ENC_UTF8; if (!o->package) o->package = DEFAULT_ADA_PACKAGE; break; case LANG_JAVA: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; if (!o->string_class) o->string_class = DEFAULT_JAVA_STRING_CLASS; if (!o->among_class) o->among_class = DEFAULT_JAVA_AMONG_CLASS; if (!o->package) o->package = DEFAULT_JAVA_PACKAGE; break; case LANG_JAVASCRIPT: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_JS_BASE_CLASS; break; case LANG_PYTHON: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; break; case LANG_RUST: o->encoding = ENC_UTF8; break; default: break; } if (encoding_opt) { fprintf(stderr, "warning: %s only meaningful for C and C++\n", encoding_opt); } if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { if (o->runtime_path) { fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); } if (o->externals_prefix) { fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n"); } } if (!o->externals_prefix) o->externals_prefix = ""; if (!o->name && o->output_file) { /* Default class name to basename of output_file - this is the standard * convention for at least Java and C#. */ const char * slash = strrchr(o->output_file, '/'); size_t len; const char * leaf = (slash == NULL) ? 
o->output_file : slash + 1; slash = strrchr(leaf, '\\'); if (slash != NULL) leaf = slash + 1; { const char * dot = strchr(leaf, '.'); len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); } { char * new_name = MALLOC(len + 1); switch (o->make_lang) { case LANG_CSHARP: case LANG_PASCAL: /* Upper case initial letter. */ memcpy(new_name, leaf, len); new_name[0] = toupper(new_name[0]); break; case LANG_JAVASCRIPT: case LANG_PYTHON: { /* Upper case initial letter and change each * underscore+letter or hyphen+letter to an upper case * letter. */ size_t new_len = 0; int uc_next = true; for (size_t j = 0; j != len; ++j) { unsigned char ch = leaf[j]; if (ch == '_' || ch == '-') { uc_next = true; } else { if (uc_next) { new_name[new_len] = toupper(ch); uc_next = false; } else { new_name[new_len] = ch; } ++new_len; } } len = new_len; break; } default: /* Just copy. */ memcpy(new_name, leaf, len); break; } new_name[len] = '\0'; o->name = new_name; } } return new_argc; } extern int main(int argc, char * argv[]) { int i; NEW(options, o); argc = read_options(o, argc, argv); { char * file = argv[1]; byte * u = get_input(file); if (u == NULL) { fprintf(stderr, "Can't open input %s\n", file); exit(1); } { struct tokeniser * t = create_tokeniser(u, file); struct analyser * a = create_analyser(t); struct input ** next_input_ptr = &(t->next); a->encoding = t->encoding = o->encoding; t->includes = o->includes; /* If multiple source files are specified, set up the others to be * read after the first in order, using the same mechanism as * 'get' uses. 
*/ for (i = 2; i != argc; ++i) { NEW(input, q); file = argv[i]; u = get_input(file); if (u == NULL) { fprintf(stderr, "Can't open input %s\n", file); exit(1); } q->p = u; q->c = 0; q->file = file; q->file_owned = 0; q->line_number = 1; *next_input_ptr = q; next_input_ptr = &(q->next); } *next_input_ptr = NULL; read_program(a); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); if (!o->syntax_tree) { struct generator * g; const char * output_base = o->output_file; if (!output_base) { fprintf(stderr, "Please include the -o option\n"); print_arglist(1); } g = create_generator(a, o); if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".h"); o->output_h = get_output(s); s[SIZE(s) - 1] = 'c'; if (o->make_lang == LANG_CPLUSPLUS) { s = add_char_to_s(s, 'c'); } o->output_src = get_output(s); lose_s(s); generate_program_c(g); fclose(o->output_src); fclose(o->output_h); } #ifndef DISABLE_JAVA if (o->make_lang == LANG_JAVA) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".java"); o->output_src = get_output(s); lose_s(s); generate_program_java(g); fclose(o->output_src); } #endif #ifndef DISABLE_PASCAL if (o->make_lang == LANG_PASCAL) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".pas"); o->output_src = get_output(s); lose_s(s); generate_program_pascal(g); fclose(o->output_src); } #endif #ifndef DISABLE_PYTHON if (o->make_lang == LANG_PYTHON) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".py"); o->output_src = get_output(s); lose_s(s); generate_program_python(g); fclose(o->output_src); } #endif #ifndef DISABLE_JS if (o->make_lang == LANG_JAVASCRIPT) { byte * s = add_sz_to_s(NULL, output_base); if (o->js_esm) { s = add_literal_to_s(s, ".mjs"); } else { s = add_literal_to_s(s, ".js"); } o->output_src = get_output(s); lose_s(s); generate_program_js(g); fclose(o->output_src); } #endif #ifndef DISABLE_CSHARP if 
(o->make_lang == LANG_CSHARP) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".cs"); o->output_src = get_output(s); lose_s(s); generate_program_csharp(g); fclose(o->output_src); } #endif #ifndef DISABLE_RUST if (o->make_lang == LANG_RUST) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".rs"); o->output_src = get_output(s); lose_s(s); generate_program_rust(g); fclose(o->output_src); } #endif #ifndef DISABLE_GO if (o->make_lang == LANG_GO) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".go"); o->output_src = get_output(s); lose_s(s); generate_program_go(g); fclose(o->output_src); } #endif #ifndef DISABLE_ADA if (o->make_lang == LANG_ADA) { byte * s = add_sz_to_s(NULL, output_base); s = add_literal_to_s(s, ".ads"); o->output_h = get_output(s); s[SIZE(s) - 1] = 'b'; o->output_src = get_output(s); lose_s(s); generate_program_ada(g); fclose(o->output_src); fclose(o->output_h); } #endif close_generator(g); } close_tokeniser(t); close_analyser(a); } lose_s(u); } { struct include * p = o->includes; while (p) { struct include * q = p->next; lose_s(p->s); FREE(p); p = q; } } FREE(o->name); FREE(o); if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } snowball-3.0.1/compiler/generator.c000066400000000000000000002015031500727106100173000ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for free etc */ #include /* for strlen */ #include "header.h" /* Define this to get warning messages when optimisations can't be used. 
/* Write integer `i` preceded by one space if it is below 100 and a second
 * space if it is below 10, i.e. right-aligned in a field of width 3 for
 * values in 0..999.
 */
static void wi3(struct generator * g, int i) {
    int padding = (i < 100) + (i < 10);
    while (padding > 0) {
        write_char(g, ' ');
        padding--;
    }
    write_int(g, i);
}
/* Write `ch` as a C character constant.
 *
 * Values from 32 to 126 inclusive are written as a quoted character, with a
 * backslash inserted before a single quote or a backslash; any other value
 * is written as "0x" followed by its hexadecimal digits.
 */
static void wlitch(struct generator * g, int ch) {
    if (ch < 32 || ch >= 127) {
        write_string(g, "0x");
        write_hex(g, ch);
        return;
    }

    write_char(g, '\'');
    switch (ch) {
        case '\'':
        case '\\':
            /* These two need escaping inside a character constant. */
            write_char(g, '\\');
            break;
        default:
            break;
    }
    write_char(g, ch);
    write_char(g, '\'');
}
write_string(g, " > "); break; case c_ge: write_string(g, " >= "); break; case c_lt: write_string(g, " < "); break; case c_le: write_string(g, " <= "); break; default: fprintf(stderr, "Unexpected type #%d in generate_integer_test\n", relop); exit(1); } } void write_comment_content(struct generator * g, struct node * p) { switch (p->type) { case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: if (p->name) { write_char(g, '$'); write_s(g, p->name->s); write_char(g, ' '); } write_string(g, name_of_token(p->type)); write_string(g, " "); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: write_string(g, "$( "); write_string(g, name_of_token(p->type)); write_string(g, " )"); break; case c_define: if (p->mode == m_forward) { write_string(g, "forwardmode "); } else { write_string(g, "backwardmode "); } /* FALLTHRU */ default: write_string(g, name_of_token(p->type)); if (p->name) { write_char(g, ' '); write_s(g, p->name->s); } } write_string(g, ", line "); write_int(g, p->line_number); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "/* "); write_comment_content(g, p); write_string(g, " */"); write_newline(g); } /* margin + string */ static void wms(struct generator * g, const char * s) { write_margin(g); write_string(g, s); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { if (g->line_labelled == g->line_count) { // Before C23, `;` is required between a label and the block end. 
w(g, "~M;~N"); } w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "z->l - "; writef(g, "~Mint ~B0 = ~S1z->c;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "z->c = "); if (p->mode != m_forward) str_append_string(out, "z->l - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void winc(struct generator * g, struct node * p) { /* increment c */ write_string(g, p->mode == m_forward ? "z->c++;" : "z->c--;"); } static void wsetl(struct generator * g, int n) { g->margin--; wms(g, "lab"); write_int(g, n); write_char(g, ':'); write_newline(g); g->line_labelled = g->line_count; g->margin++; } static void wgotol(struct generator * g, int n) { wms(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_string(g, "{ "); write_str(g, g->failure_str); write_char(g, ' '); } switch (g->failure_label) { case x_return: write_string(g, "return 0;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_char(g, ';'); g->label_used = 1; } if (str_len(g->failure_str) != 0) write_string(g, " }"); } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { write_string(g, p->mode == m_forward ? "if (z->c >= z->l) " : "if (z->c <= z->lb) "); write_failure(g); } static void write_data_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_int(g, SIZE(b)); w(g, ", "); wlitref(g, b); } else { write_varref(g, p->name); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'i': winc(g, p); continue; case 'l': write_check_limit(g, p); continue; case 'f': write_failure(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': case 'J': case 'c': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; if (ch == 'I') write_int(g, g->I[j]); else if (ch == 'J') wi3(g, g->I[j]); else wlitch(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': case 'A': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; if (ch == 'L') wlitref(g, g->L[j]); else wlitarray(g, g->L[j]); continue; } case 'a': write_data_address(g, p); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case '$': /* insert_s, insert_v etc */ write_char(g, p->literalstring == NULL ? 
'v' : 's'); continue; case 'p': write_string(g, g->options->externals_prefix); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "INT_MAX"); break; case c_minint: write_string(g, "INT_MIN"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "z->c"); break; case c_limit: w(g, p->mode == m_forward ? "z->l" : "z->lb"); break; case c_len: if (g->options->encoding == ENC_UTF8) { w(g, "len_utf8(z->p)"); break; } /* FALLTHRU */ case c_size: w(g, "SIZE(z->p)"); break; case c_lenof: if (g->options->encoding == ENC_UTF8) { g->V[0] = p->name; w(g, "len_utf8(~V0)"); break; } /* FALLTHRU */ case c_sizeof: g->V[0] = p->name; w(g, "SIZE(~V0)"); break; } } // Return 0 for always f. // Return 1 for always t. // Return -1 for don't know (or can raise t or f). static int check_possible_signals(struct generator * g, struct node * p, int call_depth) { switch (p->type) { case c_fail: case c_false: /* Always gives signal f. 
*/ return 0; case c_assign: case c_attach: case c_debug: case c_delete: case c_do: case c_insert: case c_leftslice: case c_repeat: case c_rightslice: case c_set: case c_setmark: case c_slicefrom: case c_sliceto: case c_tolimit: case c_tomark: case c_true: case c_try: case c_unset: case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_functionend: /* Always gives signal t. */ return 1; case c_not: { int res = check_possible_signals(g, p->left, call_depth); if (res >= 0) res = !res; if (res == 0 && p->right) { if (p->right->type != c_functionend) { fprintf(stderr, "%s:%d: warning: 'not' always signals f so following commands are unreachable\n", g->analyser->tokeniser->file, p->line_number); } p->right = NULL; } return res; } case c_setlimit: { /* If p->left signals f, setlimit does. */ int res = check_possible_signals(g, p->left, call_depth); if (res == 0) { return res; } /* Otherwise gives same signal as p->aux. */ int res2 = check_possible_signals(g, p->aux, call_depth); if (res2 <= 0) return res2; return res; } case c_and: case c_bra: /* Gives same signal as list p->left. */ return check_possible_signals_list(g, p->left, p->type, call_depth); case c_atleast: case c_backwards: case c_loop: case c_reverse: case c_test: /* Give same signal as p->left. */ return check_possible_signals(g, p->left, call_depth); case c_call: if (call_depth >= 100) { /* Recursive functions aren't typical in snowball programs, * so make the pessimistic assumption that both t and f are * possible if we hit a generous limit on recursion. It's * not likely to make a difference to any real world * program, but means we won't recurse until we run out of * stack for pathological cases. 
*/ return -1; } return check_possible_signals_list(g, p->name->definition, c_define, call_depth + 1); case c_gopast: case c_goto: case c_goto_grouping: case c_gopast_grouping: case c_goto_non: case c_gopast_non: /* FIXME: unless we can prove that c is either definitely atlimit * or definitely not atlimit... */ return -1; case c_atlimit: case c_atmark: case c_booltest: case c_hop: case c_literalstring: case c_next: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: case c_grouping: case c_non: case c_name: /* FIXME: unless we can prove... */ return -1; case c_substring: { struct among * x = p->among; if (x->always_matches) { return 1; } return -1; } case c_among: { struct among * x = p->among; int r = 1; if (x->substring == NULL) { if (!x->always_matches) { r = -1; } } if (x->command_count > 0) { int trues = (x->nocommand_count > 0); int falses = false; for (int i = 1; i <= x->command_count; i++) { int res = check_possible_signals(g, x->commands[i - 1], call_depth); if (res == 0) { falses = true; } else if (res > 0) { trues = true; } else { falses = trues = true; } if (falses && trues) break; } if (!trues) { // All commands in among always fail. return 0; } if (falses) { // Commands in among can succeed or fail. return -1; } } return r; } case c_or: { int r = 0; for (struct node * q = p->left; q; q = q->right) { // Just check this node - q->right is a separate clause of // the OR. int res = check_possible_signals(g, q, call_depth); if (res > 0) { // If any clause of the OR always signals t, then the OR // always signals t. if (q->right) { if (q->right->type != c_functionend) { fprintf(stderr, "%s:%d: warning: command always signals t here so rest of 'or' is unreachable\n", g->analyser->tokeniser->file, q->line_number); } q->right = NULL; } return 1; } if (res < 0) { r = res; } } return r; } default: return -1; } } // Return 0 for always f. // Return 1 for always t. // Return -1 for don't know (or can raise t or f). 
int check_possible_signals_list(struct generator * g, struct node * p, int type, int call_depth) { int r = 1; while (p) { int res = check_possible_signals(g, p, call_depth); if (res == 0) { // If any command always signals f, then the list always signals f. if (p->right) { if (p->right->type != c_functionend) { fprintf(stderr, "%s:%d: warning: command always signals f here so rest of %s is unreachable\n", g->analyser->tokeniser->file, p->line_number, (type == c_and ? "'and'" : "command list")); } p->right = NULL; } return res; } if (res < 0) r = res; p = p->right; } return r; } /* K_needed() tests to see if we really need to keep c. Not true when the command does not touch the cursor. This and repeat_score() could be elaborated almost indefinitely. */ static int K_needed_(struct node * p, int call_depth) { while (p) { switch (p->type) { case c_atlimit: case c_do: case c_dollar: case c_leftslice: case c_rightslice: case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: case c_sliceto: case c_booltest: case c_set: case c_unset: case c_true: case c_false: case c_debug: case c_functionend: case c_not: break; case c_call: /* Recursive functions aren't typical in snowball programs, so * make the pessimistic assumption that keep is needed if we * hit a generous limit on recursion. It's not likely to make * a difference to any real world program, but means we won't * recurse until we run out of stack for pathological cases. 
*/
                if (call_depth >= 100) return true;
                if (K_needed_(p->name->definition, call_depth + 1)) return true;
                break;

            case c_bra:
                /* A bracketed group needs the cursor kept iff its contents do. */
                if (K_needed_(p->left, call_depth)) return true;
                break;

            default:
                /* Any command type not listed above is treated as needing
                 * the cursor to be kept. */
                return true;
        }
        p = p->right;
    }
    return false;
}

/* Public entry point for the keep-cursor test: walks the command list
 * starting at p with a call depth of 0.  g is accepted for interface
 * reasons but is not used.
 */
extern int K_needed(struct generator * g, struct node * p) {
    (void)g;
    return K_needed_(p, 0);
}

/* Score the command list starting at p for repeat_restore().
 *
 * Commands in the first case group contribute nothing; routine calls and
 * bracketed groups contribute the score of their contents.  The function
 * returns as soon as the running total reaches 2, so callers should only
 * rely on the result being 0, 1 or "at least 2".
 */
static int repeat_score(struct generator * g, struct node * p, int call_depth) {
    int score = 0;
    while (p) {
        switch (p->type) {
            case c_dollar:
            case c_leftslice:
            case c_rightslice:
            case c_mathassign:
            case c_plusassign:
            case c_minusassign:
            case c_multiplyassign:
            case c_divideassign:
            case c_eq:
            case c_ne:
            case c_gt:
            case c_ge:
            case c_lt:
            case c_le:
            case c_sliceto:   /* case c_not: must not be included here! */
            case c_debug:
            case c_functionend:
                break;

            case c_call:
                /* Recursive functions aren't typical in snowball programs, so
                 * make the pessimistic assumption that repeat requires cursor
                 * reinstatement if we hit a generous limit on recursion. It's
                 * not likely to make a difference to any real world program,
                 * but means we won't recurse until we run out of stack for
                 * pathological cases.
*/
                if (call_depth >= 100) {
                    return 2;
                }
                score += repeat_score(g, p->name->definition, call_depth + 1);
                if (score >= 2)
                    return score;
                break;

            case c_bra:
                score += repeat_score(g, p->left, call_depth);
                if (score >= 2)
                    return score;
                break;

            /* Each of these commands adds one to the score. */
            case c_name:
            case c_literalstring:
            case c_next:
            case c_grouping:
            case c_non:
            case c_goto_grouping:
            case c_gopast_grouping:
            case c_goto_non:
            case c_gopast_non:
            case c_hop:
                if (++score >= 2)
                    return score;
                break;

            default:
                /* Anything else is scored pessimistically. */
                return 2;
        }
        p = p->right;
    }
    return score;
}

/* tests if an expression requires cursor reinstatement in a repeat */

extern int repeat_restore(struct generator * g, struct node * p) {
    return repeat_score(g, p, 0) >= 2;
}

/* Generate each command of a bracketed group in order. */
static void generate_bra(struct generator * g, struct node * p) {
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Generate an 'and' list: every operand is generated in turn, and when
 * K_needed() reports that the operands may need the cursor kept, the saved
 * cursor is restored after each operand except the last.
 */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    while (p) {
        generate(g, p);
        /* No restore after the final operand. */
        if (savevar && p->right != NULL) write_restorecursor(g, p, savevar);
        p = p->right;
    }

    if (savevar) {
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Generate an 'or' list.  Every operand but the last gets a fresh failure
 * label: on success control jumps to out_lab, on failure it lands on that
 * label (emitted only if used), the cursor is restored if it was saved, and
 * the next operand is tried.  The last operand is generated with the
 * caller's own failure label and failure string reinstated.
 */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    /* Remember the caller's failure state so it can be reinstated for the
     * final operand. */
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int out_lab = new_label(g);
    write_comment(g, p);

    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    str_clear(g->failure_str);

    while (p->right != NULL) {
        g->failure_label = new_label(g);
        g->label_used = 0;
        generate(g, p);
        wgotol(g, out_lab);
        if (g->label_used)
            wsetl(g, g->failure_label);
        if (savevar) write_restorecursor(g, p, savevar);
        p = p->right;
    }

    g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    generate(g, p);

    if (savevar) {
write_block_end(g);
        str_delete(savevar);
    }
    /* Successful operands jump here. */
    wsetl(g, out_lab);
}

/* Generate 'backwards': the emitted code sets lb to the cursor and moves the
 * cursor to l, runs the body, then sets the cursor back to lb.
 */
static void generate_backwards(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~Mz->lb = z->c; z->c = z->l;~N", p);
    generate(g, p->left);
    w(g, "~Mz->c = z->lb;~N");
}

/* Generate 'not': the operand is generated with a fresh failure label and an
 * empty failure string.  Falling out of the operand (it succeeded) runs the
 * caller's failure action; the operand's failure label, if it was used, is
 * placed after that, followed by the cursor restore when one was saved.
 */
static void generate_not(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    /* Caller's failure state, reinstated before emitting the failure action. */
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    write_comment(g, p);
    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    g->label_used = 0;
    generate(g, p->left);

    /* Capture the operand's label before switching back to the caller's. */
    int l = g->failure_label;
    int u = g->label_used;

    g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    writef(g, "~M~f~N", p);
    if (u)
        wsetl(g, l);
    if (savevar) {
        write_restorecursor(g, p, savevar);
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Generate 'try': the operand gets its own failure label; when the cursor is
 * saved, the restore code is appended to the failure string so that it is
 * part of the operand's failure action.
 */
static void generate_try(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    g->label_used = 0;
    if (savevar) append_restore_string(p, g->failure_str, savevar);

    generate(g, p->left);

    if (g->label_used)
        wsetl(g, g->failure_label);

    if (savevar) {
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Generate 'set': assign 1 to the named variable. */
static void generate_set(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = 1;~N", p);
}

/* Generate 'unset': assign 0 to the named variable. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = 0;~N", p);
}

/* Generate 'fail': run the operand, then unconditionally take the failure
 * action. */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    writef(g, "~M~f~N", p);
}

/* generate_test() also implements 'reverse' */

static void generate_test(struct
generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; writef(g, "~{~Mint ret = ~V0(z);~N", p->left); w(g, "~Mif (ret < 0) return ret;~N~}"); } else { g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); } if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->encoding == ENC_UTF8) { if (p->mode == m_forward) w(g, "~{~Mint ret = skip_utf8(z->p, z->c, z->l, 1"); else w(g, "~{~Mint ret = skip_b_utf8(z->p, z->c, z->lb, 1"); writef(g, ");~N" "~Mif (ret < 0) ~f~N" "~Mz->c = ret;~N" "~}", p); } else writef(g, "~M~l~N" "~M~i~N", p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; g->S[2] = g->options->encoding == ENC_UTF8 ? 
"_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f~N", p); } else { writef(g, "~{" "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" "~Mif (ret < 0) ~f~N", p); if (p->mode == m_forward) w(g, "~Mz->c += ret;~N"); else w(g, "~Mz->c -= ret;~N"); w(g, "~}"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); w(g, "~Mwhile (1) {~N~+"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (style == 1) { /* include for goto; omit for gopast */ write_restorecursor(g, p, savevar); } w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate_next(g, p); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~{~Mint i; for (i = "); generate_AE(g, p->AE); writef(g, "; i > 0; i--) {~N~+", p); generate(g, p->left); w(g, "~}" "~}"); } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { writef(g, "~Mwhile (1) {~+~N", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); int possible_signals = check_possible_signals_list(g, p->left, p->type, 0); if (possible_signals != -1) { fprintf(stderr, "%s:%d: warning: body of '%s' always signals '%c'\n", g->analyser->tokeniser->file, p->line_number, loopvar ? 
"atleast" : "repeat", possible_signals ? 't' : 'f'); } generate(g, p->left); if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~{~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); writef(g, "~Mif (~B0 > 0) ~f~N" "~}", p); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = z->c;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); writef(g, ") ~f~N", p); w(g, "~Mz->c = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (z->c != "); generate_AE(g, p->AE); writef(g, ") ~f~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->encoding == ENC_UTF8) { g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = p->mode == m_forward ? 
"z->l" : "z->lb"; w(g, "~{~Mint ret = skip~S0_utf8(z->p, z->c, ~S1, "); generate_AE(g, p->AE); writef(g, ");~N", p); writef(g, "~Mif (ret < 0) ~f~N", p); writef(g, "~Mz->c = ret;~N" "~}", p); } else { // Fixed-width characters. g->S[0] = p->mode == m_forward ? "+" : "-"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. // // Note that if we signal f then z->c will be reset when this is // handled - we rely on this here and unconditionally update z->c. w(g, "z->c = z->c ~S0 "); generate_AE(g, p->AE); writef(g, ";~N", p); if (p->mode == m_forward) { writef(g, "~Mif (z->c > z->l) ~f~N", p); } else { writef(g, "~Mif (z->c < z->lb) ~f~N", p); } } else { w(g, "~{~Mint ret = z->c ~S0 "); generate_AE(g, p->AE); writef(g, ";~N", p); if (p->mode == m_forward) { writef(g, "~Mif (ret > z->l || ret < z->c) ~f~N", p); } else { writef(g, "~Mif (ret < z->lb || ret > z->c) ~f~N", p); } writef(g, "~Mz->c = ret;~N" "~}", p); } } } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~{~Mint ret = slice_del(z);~N", p); writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; writef(g, "~Mz->c = z->l~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; g->S[1] = p->mode == m_forward ? "<" : ">"; writef(g, "~Mif (z->c ~S1 z->l~S0) ~f~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~Mz->~S0 = z->c;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; writef(g, "~Mz->~S0 = z->c;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = assign_to(z, ~V0);~N" "~Mif (~V0 == 0) return -1;~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = slice_to(z, ~V0);~N" "~Mif (~V0 == 0) return -1;~N", p); } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; writef(g, "~{", p); if (keep_c) w(g, "~Mint saved_c = z->c;~N"); writef(g, "~Mint ret = insert_~$(z, z->c, z->c, ~a);~N", p); if (keep_c) w(g, "~Mz->c = saved_c;~N"); writef(g, "~Mif (ret < 0) return ret;~N~}", p); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); writef(g, "~{", p); if (keep_c) w(g, "~Mint saved_c = z->c;~N"); w(g, "~Mint ret = "); writef(g, keep_c ? "insert_~$(z, z->c, z->l, ~a);~N" : "insert_~$(z, z->lb, z->c, ~a);~N", p); if (keep_c) w(g, "~Mz->c = saved_c;~N"); writef(g, "~Mif (ret < 0) return ret;~N~}", p); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~{~Mint ret = slice_from_~$(z, ~a);~N", p); writef(g, "~Mif (ret < 0) return ret;~N~}", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); assert(q->right == NULL); g->B[0] = str_data(varname); writef(g, "~N~{~Mint ~B0;~N", p); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q); g->B[0] = str_data(varname); w(g, "~M~B0 = "); if (p->mode == m_forward) { w(g, "z->l - z->c; z->l = "); } else { w(g, "z->lb; z->lb = "); } generate_AE(g, q->AE); w(g, ";~N"); if (p->mode == m_forward) { str_assign(g->failure_str, "z->l += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "z->lb = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = z->l - z->c; z->l = z->c;~N"); } else { w(g, "~Mint ~B0 = z->lb; z->lb = z->c;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "z->l += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "z->lb = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } str_delete(savevar); } generate(g, p->aux); w(g, "~M"); write_str(g, g->failure_str); w(g, "~N" "~}"); str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~Mstruct SN_env en~B0 = * z;~N", p); // only copy start - we don't need to copy variables g->V[0] = p->name; /* Assume failure. */ writef(g, "~Mint failure = 1;~N" "~Mz->p = ~V0;~N" "~Mz->lb = z->c = 0;~N" "~Mz->l = SIZE(z->p);~N", p); generate(g, p->left); /* Mark success. 
*/ w(g, "~Mfailure = 0;~N"); if (g->label_used) wsetl(g, g->failure_label); g->V[0] = p->name; /* necessary */ g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~M~V0 = z->p;~N" "~M* z = en~B0;~N" "~Mif (failure) ~f~N~}", p); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { writef(g, ";~N", p); } else { writef(g, ") ~f~N", p); } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (str_len(g->failure_str) == 0 && g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~V0(z);~N", p); return; } writef(g, "~{~Mint ret = ~V0(z);~N", p); if (str_len(g->failure_str) == 0 && g->failure_label == x_return) { /* Combine the two tests in this special case for better optimisation * and clearer generated code. */ writef(g, "~Mif (ret <= 0) return ret;~N", p); } else { if (signals == 1) { /* Always succeeds - just need to handle runtime errors. */ writef(g, "~Mif (ret < 0) return ret;~N", p); } else if (signals == 0) { /* Always fails. 
*/ writef(g, "~Mif (ret < 0) return ret;~N", p); writef(g, "~M~f~N", p); } else { writef(g, "~Mif (ret == 0) ~f~N", p); writef(g, "~Mif (ret < 0) return ret;~N", p); } } writef(g, "~}", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~N", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; writef(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~N", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); if (SIZE(b) == 1) { /* It's quite common to compare with a single character literal string, * so just inline the simpler code for this case rather than making a * function call. In UTF-8 mode, only do this for the ASCII subset, * since multi-byte characters are more complex to test against. */ if (g->options->encoding == ENC_UTF8 && *b >= 128) { printf("single byte %d\n", *b); exit(1); } g->I[0] = *b; if (p->mode == m_forward) { writef(g, "~Mif (z->c == z->l || z->p[z->c] != ~c0) ~f~N" "~Mz->c++;~N", p); } else { writef(g, "~Mif (z->c <= z->lb || z->p[z->c - 1] != ~c0) ~f~N" "~Mz->c--;~N", p); } } else { g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = SIZE(b); g->L[0] = b; writef(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~N", p); } } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); g->next_label = 0; g->var_number = 0; g->S[0] = q->type == t_routine ? 
"static" : "extern"; g->V[0] = q; w(g, "~S0 int ~V0(struct SN_env * z) {~N~+"); if (p->amongvar_needed) w(g, "~Mint among_var;~N"); str_clear(g->failure_str); g->failure_label = x_return; g->label_used = 0; int signals = check_possible_signals_list(g, p->left, c_define, 0); /* Generate function body. */ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~}"); } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn 1;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = x->shortest_size; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; g->I[1] = x->literalstring_count; /* In forward mode with non-ASCII UTF-8 characters, the first byte * of the string will often be the same, so instead look at the last * common byte position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. 
*/
    /* Survey the character each string has at the inspected position.  While
     * all of them fall in the same block of 32 values (ch >> 5), `block`
     * holds that block number and `bitmap` has one bit set per distinct
     * character seen.  Once they span more than one block, `block` is -1 and
     * only up to two distinct characters are tracked in `cases`.
     */
    for (int c = 0; c < x->literalstring_count; ++c) {
        symbol ch;
        if (among_cases[c].size == 0) {
            empty_case = c;
            continue;
        }
        if (p->mode == m_forward) {
            ch = among_cases[c].b[shortest_size - 1];
        } else {
            ch = among_cases[c].b[among_cases[c].size - 1];
        }
        if (n_cases == 0) {
            block = ch >> 5;
        } else if (ch >> 5 != block) {
            block = -1;
            /* More than two distinct characters over several blocks: no
             * shortcut test is possible. */
            if (n_cases > 2) break;
        }
        if (block == -1) {
            if (n_cases > 0 && ch == cases[0]) continue;
            if (n_cases < 2) {
                cases[n_cases++] = ch;
            } else if (ch != cases[1]) {
                ++n_cases;
                break;
            }
        } else {
            if ((bitmap & (1u << (ch & 0x1f))) == 0) {
                bitmap |= 1u << (ch & 0x1f);
                /* Keep the first two distinct characters so the one- and
                 * two-case forms below can compare against them directly. */
                if (n_cases < 2)
                    cases[n_cases] = ch;
                ++n_cases;
            }
        }
    }

    if (block != -1 || n_cases <= 2) {
        /* buf only ever holds the fixed text below plus one decimal int, so
         * 64 bytes is ample.  It is installed in g->S[1] and so has to stay
         * alive for the writef() calls in this block which use ~S1. */
        char buf[64];
        g->I[2] = block;
        g->I[3] = bitmap;
        g->I[4] = shortest_size - 1;
        if (p->mode == m_forward) {
            sprintf(buf, "z->p[z->c + %d]", shortest_size - 1);
            g->S[1] = buf;
            if (shortest_size == 1) {
                writef(g, "~Mif (z->c >= z->l", p);
            } else {
                writef(g, "~Mif (z->c + ~I4 >= z->l", p);
            }
        } else {
            g->S[1] = "z->p[z->c - 1]";
            if (shortest_size == 1) {
                writef(g, "~Mif (z->c <= z->lb", p);
            } else {
                writef(g, "~Mif (z->c - ~I4 <= z->lb", p);
            }
        }
        /* g->I[4] is reused from here on for the first case character. */
        if (n_cases == 0) {
            /* We get this for the degenerate case: among ( '' )
             * This doesn't seem to be a useful construct, but it is
             * syntactically valid.
             */
        } else if (n_cases == 1) {
            g->I[4] = cases[0];
            writef(g, " || ~S1 != ~I4", p);
        } else if (n_cases == 2) {
            g->I[4] = cases[0];
            g->I[5] = cases[1];
            writef(g, " || (~S1 != ~I4 && ~S1 != ~I5)", p);
        } else {
            writef(g, " || ~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p);
        }
        write_string(g, ") ");
        if (empty_case != -1) {
            /* If the among includes the empty string, it can never fail
             * so not matching the bitmap means we match the empty string.
*/
            g->I[4] = among_cases[empty_case].result;
            writef(g, "among_var = ~I4; else~N", p);
        } else {
            writef(g, "~f~N", p);
        }
    } else {
#ifdef OPTIMISATION_WARNINGS
        printf("Couldn't shortcut among %d\n", x->number);
#endif
    }

    /* The lookup itself.  The result is only stored when among_var is
     * needed, and only tested when the among can fail to match. */
    if (x->amongvar_needed) {
        writef(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);~N", p);
        if (!x->always_matches) {
            writef(g, "~Mif (!among_var) ~f~N", p);
        }
    } else if (x->always_matches) {
        writef(g, "~Mfind_among~S0(z, a_~I0, ~I1);~N", p);
    } else {
        writef(g, "~Mif (!find_among~S0(z, a_~I0, ~I1)) ~f~N", p);
    }
}

/* Emit an 'among': the lookup (unless a separate 'substring' already did
 * it), followed by the per-case commands - inline when there is exactly one
 * possible outcome, otherwise as a switch on among_var.
 */
static void generate_among(struct generator * g, struct node * p) {
    struct among * x = p->among;

    if (x->substring == NULL) {
        generate_substring(g, p);
    } else {
        write_comment(g, p);
    }

    if (x->command_count == 1 && x->nocommand_count == 0) {
        /* Only one outcome ("no match" already handled). */
        generate(g, x->commands[0]);
    } else if (x->command_count > 0) {
        writef(g, "~Mswitch (among_var) {~N~+", p);
        /* Case labels are 1-based; x->commands is 0-based. */
        for (int i = 1; i <= x->command_count; i++) {
            g->I[0] = i;
            w(g, "~Mcase ~I0:~N~+");
            generate(g, x->commands[i - 1]);
            w(g, "~Mbreak;~N~-");
        }
        w(g, "~}");
    }
}

/* Emit a test of a boolean variable: fail when it is false. */
static void generate_booltest(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~Mif (!(~V0)) ~f~N", p);
}

/* Emit 'false': unconditionally take the failure action. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~f~N", p);
}

/* Emit a debug() call tagged with a running counter and the source line. */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~Mdebug(z, ~I0, ~I1);~N", p);
}

/* Dispatch on the node type to the matching generate_*() function.  The
 * failure label, failure string and label_used flag in force on entry are
 * captured here and reinstated after the switch, so the individual
 * generators may change them freely.
 */
static void generate(struct generator * g, struct node * p) {
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define:        generate_define(g, p); break;
        case c_bra:           generate_bra(g, p); break;
        case c_and:           generate_and(g, p); break;
        case c_or:            generate_or(g, p); break;
        case c_backwards:     generate_backwards(g, p); break;
        case c_not:           generate_not(g, p); break;
        case c_set:           generate_set(g, p);
break;
        case c_unset:         generate_unset(g, p); break;
        case c_try:           generate_try(g, p); break;
        case c_fail:          generate_fail(g, p); break;
        /* 'reverse' shares the implementation of 'test'. */
        case c_reverse:
        case c_test:          generate_test(g, p); break;
        case c_do:            generate_do(g, p); break;
        /* generate_GO's last argument: 1 for goto, 0 for gopast. */
        case c_goto:          generate_GO(g, p, 1); break;
        case c_gopast:        generate_GO(g, p, 0); break;
        /* generate_GO_grouping's last two arguments: is_goto, complement. */
        case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break;
        case c_gopast_grouping:
                              generate_GO_grouping(g, p, 0, 0); break;
        case c_goto_non:      generate_GO_grouping(g, p, 1, 1); break;
        case c_gopast_non:    generate_GO_grouping(g, p, 0, 1); break;
        case c_repeat:        generate_repeat(g, p); break;
        case c_loop:          generate_loop(g, p); break;
        case c_atleast:       generate_atleast(g, p); break;
        case c_setmark:       generate_setmark(g, p); break;
        case c_tomark:        generate_tomark(g, p); break;
        case c_atmark:        generate_atmark(g, p); break;
        case c_hop:           generate_hop(g, p); break;
        case c_delete:        generate_delete(g, p); break;
        case c_next:          generate_next(g, p); break;
        case c_tolimit:       generate_tolimit(g, p); break;
        case c_atlimit:       generate_atlimit(g, p); break;
        case c_leftslice:     generate_leftslice(g, p); break;
        case c_rightslice:    generate_rightslice(g, p); break;
        case c_assignto:      generate_assignto(g, p); break;
        case c_sliceto:       generate_sliceto(g, p); break;
        case c_assign:        generate_assignfrom(g, p); break;
        /* The node type itself tells generate_insert which one it is. */
        case c_insert:
        case c_attach:        generate_insert(g, p, p->type); break;
        case c_slicefrom:     generate_slicefrom(g, p); break;
        case c_setlimit:      generate_setlimit(g, p); break;
        case c_dollar:        generate_dollar(g, p); break;
        /* Integer assignments differ only in the C operator emitted. */
        case c_mathassign:    generate_integer_assign(g, p, "="); break;
        case c_plusassign:    generate_integer_assign(g, p, "+="); break;
        case c_minusassign:   generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:  generate_integer_assign(g, p, "/="); break;
        /* generate_integer_test reads the comparison from p->type. */
        case c_eq:
        case c_ne:
        case c_gt:
        case c_ge:
        case c_lt:
        case c_le:
            generate_integer_test(g, p);
            break;
        case c_call:          generate_call(g, p); break;
        case c_grouping:
generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } void write_generated_comment_content(struct generator * g) { // Report only the leafname of the Snowball source file to make output // reproducible even if an absolute path to the source file is specified. write_string(g, "Generated from "); const char * leaf = g->analyser->tokeniser->file; const char * p = strrchr(leaf, '/'); if (p) leaf = p + 1; p = strrchr(leaf, '\\'); if (p) leaf = p + 1; write_string(g, leaf); write_string(g, " by Snowball " SNOWBALL_VERSION " - https://snowballstem.org/"); } void write_start_comment(struct generator * g, const char * comment_start, const char * comment_end) { write_margin(g); w(g, comment_start); write_generated_comment_content(g); if (comment_end) { w(g, comment_end); } w(g, "~N~N"); } static void generate_head(struct generator * g) { w(g, "#include \""); if (g->options->runtime_path) { write_string(g, g->options->runtime_path); if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/') write_char(g, '/'); } w(g, "header.h\"~N~N"); } static void generate_routine_headers(struct generator * g) { for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_routine: w(g, "static int ~W0(struct SN_env * z);~N"); break; case t_external: w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N" 
"extern int ~W0(struct SN_env * z);~N" "#ifdef __cplusplus~N" "}~N" "#endif~N" ); break; } } } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; for (int i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v[i].size; g->L[0] = v[i].b; if (v[i].size) w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N"); } g->I[1] = x->literalstring_count; w(g, "~Mstatic const struct among a_~I0[~I1] = {~N"); for (int i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v[i].size; g->I[3] = (v[i].i >= 0 ? v[i].i - i : 0); g->I[4] = v[i].result; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; if (g->options->comments) { w(g, "/*~J1 */ "); } w(g, "{ ~I2, "); if (v[i].size == 0) { w(g, "0,"); } else { w(g, "s_~I0_~I1,"); } w(g, " ~I3, ~I4, "); if (v[i].function == NULL) { write_char(g, '0'); } else { write_varname(g, v[i].function); } w(g, "}~S0~N"); } w(g, "};~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "static const unsigned char ~V0[] = { "); for (int i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, " };~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_create(struct generator * g) { int * p = 
g->analyser->name_count; g->I[0] = p[t_string]; g->I[1] = p[t_integer] + p[t_boolean]; w(g, "~N" "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1); }" "~N"); } static void generate_close(struct generator * g) { int * p = g->analyser->name_count; g->I[0] = p[t_string]; w(g, "~Nextern void ~pclose_env(struct SN_env * z) { SN_close_env(z, ~I0); }~N~N"); } static void generate_create_and_close_templates(struct generator * g) { w(g, "~N" "extern struct SN_env * ~pcreate_env(void);~N" "extern void ~pclose_env(struct SN_env * z);~N" "~N"); } static void generate_header_file(struct generator * g) { const char * vp = g->options->variables_prefix; g->S[0] = vp; w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N"); /* for C++ */ generate_create_and_close_templates(g); for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_external: w(g, "extern int ~W0(struct SN_env * z);~N"); break; case t_string: case t_integer: case t_boolean: if (vp) { int count = q->count; if (count < 0) { /* Unused variables should get removed from `names`. */ q->s[SIZE(q->s)] = 0; fprintf(stderr, "Optimised out variable %s still in names list\n", q->s); exit(1); } if (q->type == t_boolean) { /* We use a single array for booleans and integers, * with the integers first. 
*/ count += g->analyser->name_count[t_integer]; } g->I[0] = count; g->I[1] = "SIIrxg"[q->type]; w(g, "#define ~S0"); write_s(g, q->s); w(g, " (~c1[~I0])~N"); } break; } } w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); /* for C++ */ w(g, "~N"); } extern void generate_program_c(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "/* ", " */"); if (g->analyser->int_limits_used) { w(g, "#include ~N"); } generate_head(g); generate_routine_headers(g); w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N" "~N"); generate_create_and_close_templates(g); w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); generate_amongs(g); generate_groupings(g); g->declarations = g->outbuf; g->outbuf = str_new(); g->literalstring_count = 0; for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); } generate_create(g); generate_close(g); output_str(g->options->output_src, g->declarations); str_delete(g->declarations); output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); write_start_comment(g, "/* ", " */"); generate_header_file(g); output_str(g->options->output_h, g->outbuf); str_delete(g->outbuf); str_delete(g->failure_str); } /* Generator functions common to multiple languages. */ extern struct generator * create_generator(struct analyser * a, struct options * o) { NEW(generator, g); g->analyser = a; g->options = o; g->margin = 0; g->debug_count = 0; g->copy_from_count = 0; g->line_count = 0; g->line_labelled = 0; g->failure_label = -1; g->unreachable = false; #ifndef DISABLE_PYTHON g->max_label = 0; #endif return g; } extern void close_generator(struct generator * g) { FREE(g); } /* Write routines for simple entities */ extern void write_char(struct generator * g, int ch) { str_append_ch(g->outbuf, ch); /* character */ } extern void write_newline(struct generator * g) { /* Avoid generating trailing whitespace. 
*/ while (true) { int ch = str_back(g->outbuf); if (ch != ' ' && ch != '\t') break; str_pop(g->outbuf); } str_append_ch(g->outbuf, '\n'); /* newline */ g->line_count++; } extern void write_string(struct generator * g, const char * s) { str_append_string(g->outbuf, s); } extern void write_wchar_as_utf8(struct generator * g, symbol ch) { str_append_wchar_as_utf8(g->outbuf, ch); } extern void write_int(struct generator * g, int i) { str_append_int(g->outbuf, i); } extern void write_s(struct generator * g, const byte * s) { str_append_s(g->outbuf, s); } extern void write_str(struct generator * g, struct str * str) { str_append(g->outbuf, str); } snowball-3.0.1/compiler/generator_ada.c000066400000000000000000001454301500727106100201130ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { if (p->type != t_external) { write_char(g, "SBIRXG"[p->type]); write_char(g, '_'); } { char save_initial = p->s[0]; p->s[0] = toupper(save_initial); str_append_s(g->outbuf, p->s); p->s[0] = save_initial; } if (p->s[SIZE(p->s) - 1] == '_') { write_char(g, 'E'); } } static void write_varref(struct generator * g, struct name * p) { /* reference to variable */ if (p->type < t_routine) write_string(g, "Z."); write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { // Ada 
supports UTF-8 literal strings, we only need to escape the quote and // special characters. write_char(g, '"'); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (ch == '"') { write_string(g, "\"\""); } else if (ch < 32 || ch == 127) { printf("In write_literal_string, can't handle non-graphic character 0x%02x currently\n", (int)p[i]); exit(1); } else if (ch <= 255) { write_char(g, ch); } else { printf("In write_literal_string, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } } write_char(g, '"'); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_relop(struct generator * g, int relop) { switch (relop) { case c_eq: write_string(g, " = "); break; case c_ne: write_string(g, " /= "); break; case c_gt: write_string(g, " > "); break; case c_ge: write_string(g, " >= "); break; case c_lt: write_string(g, " < "); break; case c_le: write_string(g, " <= "); break; default: fprintf(stderr, "Unexpected type #%d in generate_integer_test\n", relop); exit(1); } } /* Write a variable declaration. 
*/ static void write_declare(struct generator * g, const char * declaration, struct node * p) { struct str * temp = g->outbuf; g->outbuf = g->declarations; write_string(g, " "); writef(g, declaration, p); write_string(g, ";"); write_newline(g); g->outbuf = temp; } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "-- "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~Mbegin~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~Mend;~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "Z.L - "; write_declare(g, "~B0 : Char_Index", p); writef(g, "~M~B0 := ~S1Z.C;~N" , p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "Z.C := "); if (p->mode != m_forward) str_append_string(out, "Z.L - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void wsetl(struct generator * g, int n) { write_newline(g); write_margin(g); write_string(g, "<>"); write_newline(g); g->line_labelled = g->line_count; } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "Result := False;"); write_newline(g); write_margin(g); write_string(g, "return;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); 
write_string(g, ";"); g->label_used = 1; } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " then~N~+", p); write_failure(g); writef(g, "~-~Mend if;~N", p); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "Z.C >= Z.L", p); } else { write_failure_if(g, "Z.C <= Z.Lb", p); } } /* Formatted write. */ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_failure(g); g->unreachable = false; continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; 
continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static int need_among_var(struct node *p) { while (p) { if (p->type == c_among) { return 1; } if (p->left && need_among_var(p->left)) { return 1; } if (p->aux && need_among_var(p->aux)) { return 1; } p = p->right; } return 0; } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: // Avoid `parentheses required for unary minus` error from gnat. if (p->number < 0) write_char(g, '('); write_int(g, p->number); if (p->number < 0) write_char(g, ')'); break; case c_maxint: write_string(g, "Integer'Last"); break; case c_minint: write_string(g, "Integer'First"); break; case c_neg: write_string(g, "(-"); generate_AE(g, p->right); write_char(g, ')'); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "Z.C"); break; case c_limit: w(g, p->mode == m_forward ? 
"Z.L" : "Z.Lb"); break; case c_len: w(g, "Length_Utf8 (Z)"); break; case c_size: w(g, "Z.Len"); break; case c_lenof: g->V[0] = p->name; w(g, "Length_Utf8 (Ada.Strings.Unbounded.To_String (~V0))"); break; case c_sizeof: g->V[0] = p->name; w(g, "Ada.Strings.Unbounded.Length (~V0)"); break; default: break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right) { g->failure_label = new_label(g); g->label_used = 0; generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetl(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MZ.Lb := Z.C; Z.C := Z.L;~N", p); generate(g, p->left); w(g, "~MZ.C := Z.Lb;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); int l = g->failure_label; generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); if (g->label_used) wsetl(g, l); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); 
g->V[0] = p->name; writef(g, "~M~V0 := True;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := False;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0 (Z, Result);~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) w(g, "~MC := Skip_Utf8 (Z);~N"); else w(g, "~MC := Skip_Utf8_Backward (Z);~N"); write_failure_if(g, "C < 0", p); w(g, "~MZ.C := C;~N"); g->temporary_used = true; } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->S[1] = complement ? 
"In" : "Out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { writef(g, "~M~S1_Grouping~S0 (Z, ~V0, ~I0, ~I1, True, C);~N", p); write_failure_if(g, "C < 0", p); } else { writef(g, "~M~S1_Grouping~S0 (Z, ~V0, ~I0, ~I1, True, C);~N", p); write_failure_if(g, "C < 0", p); if (p->mode == m_forward) w(g, "~MZ.C := Z.C + C;~N"); else w(g, "~MZ.C := Z.C - C;~N"); } g->temporary_used = true; } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; int end_unreachable = false; int golab = new_label(g); w(g, "~Mloop~N~+"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mexit;~N"); } g->unreachable = false; if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->label_used = used; g->failure_label = a0; write_check_limit(g, p); generate_next(g, p); w(g, "~-~Mend loop;~N"); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mfor ~B0 in reverse 1 .. 
"); generate_AE(g, p->AE); writef(g, " loop~N~+", p); generate(g, p->left); w(g, "~-~Mend loop;~N"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); wsetl(g, replab); writef(g, "~N~Mloop~N~+", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0 := ~B0 - 1;~N"); } g->I[0] = replab; w(g, "~Mgoto lab~I0;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~N~Mexit;~N~-~Mend loop;~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); write_declare(g, "~B0 : Integer", p); w(g, "~M~B0 := "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := Z.C;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
">" : "<"; w(g, "~Mif Z.C ~S0 "); generate_AE(g, p->AE); w(g, " then~N"); write_failure(g); w(g, "~Mend if;~N"); g->unreachable = false; w(g, "~MZ.C := "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif Z.C /= "); generate_AE(g, p->AE); writef(g, " then~N~+", p); write_failure(g); w(g, "~-~Mend if;~N"); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_Backward"; w(g, "~MC := Skip_Utf8~S0 (Z, "); generate_AE(g, p->AE); writef(g, ");~N", p); write_failure_if(g, "C < 0", p); writef(g, "~MZ.C := C;~N", p); g->temporary_used = true; } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MSlice_Del (Z);~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; writef(g, "~MZ.C := Z.L~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "Z.C ~S1 Z.L~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "Bra" : "Ket"; writef(g, "~MZ.~S0 := Z.C;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"Ket" : "Bra"; writef(g, "~MZ.~S0 := Z.C;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := Assign_To (Z, ~V0);~N", p); write_failure_if(g, "~V0 == 0", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := Ada.Strings.Unbounded.To_Unbounded_String (Slice_To (Z));~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { w(g, "Ada.Strings.Unbounded.To_String ("); write_varref(g, p->name); w(g, ")"); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~MC := Z.C;~N"); writef(g, "~MInsert (Z, Z.C, Z.C, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~MZ.C := C;~N"); g->temporary_used = true; } } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~MC := Z.C;~N", p); if (p->mode == m_forward) { writef(g, "~MInsert (Z, Z.C, Z.L, ", p); } else { writef(g, "~MInsert (Z, Z.Lb, Z.C, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~MZ.C := C;~N"); g->temporary_used = true; } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MSlice_From (Z, "); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); g->B[0] = str_data(varname); write_declare(g, "~B0 : Integer", p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship 
follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif Z.C ~S0 "); generate_AE(g, q->AE); writef(g, " then~N~+", q); w(g, "~MResult := False;~N"); w(g, "~Mreturn;~-~N"); w(g, "~Mend if;~N"); w(g, "~M~B0"); g->unreachable = false; if (p->mode == m_forward) { w(g, " := Z.L - Z.C; Z.L := "); } else { w(g, " := Z.Lb; Z.Lb := "); } generate_AE(g, q->AE); w(g, ";~N"); if (p->mode == m_forward) { str_assign(g->failure_str, "Z.L := Z.L + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "Z.Lb := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~M~B0 := Z.L - Z.C;~N"); w(g, "~MZ.L := Z.C;~N"); } else { w(g, "~M~B0 := Z.Lb;~N"); w(g, "~MZ.Lb := Z.C;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "Z.L := Z.L + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "Z.Lb := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); g->V[0] = p->name; { struct str * saved_output = g->outbuf; str_clear(g->failure_str); g->outbuf = g->failure_str; writef(g, "~V0 := FCurrent; " "FCurrent := ~B0_Current; " 
"FCursor := ~B0_Cursor; " "FLimit := ~B0_Limit; " "FBkLimit := ~B0_BkLimit; " "FBra := ~B0_Bra; " "FKet := ~B0_Ket;", p); g->failure_str = g->outbuf; g->outbuf = saved_output; } write_declare(g, "~B0_Current : AnsiString", p); write_declare(g, "~B0_Cursor : Integer", p); write_declare(g, "~B0_Limit : Integer", p); write_declare(g, "~B0_BkLimit : Integer", p); write_declare(g, "~B0_Bra : Integer", p); write_declare(g, "~B0_Ket : Integer", p); writef(g, "~{" "~M~B0_Current := FCurrent;~N" "{ ~M~B0_Current := Copy(FCurrent, 1, FLimit); }~N" "~M~B0_Cursor := FCursor;~N" "~M~B0_Limit := FLimit;~N" "~M~B0_BkLimit := FBkLimit;~N" "~M~B0_Bra := FBra;~N" "~M~B0_Ket := FKet;~N" "~MFCurrent := ~V0;~N" "~MFCursor := 0;~N" "~MFLimit := Length(current);~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; w(g, "~M~V0 := "); if (s != NULL) { g->S[0] = s; w(g, "~V0 ~S0 "); } generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~MResult := ("); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. 
relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ");~N"); } else { w(g, " then~+~N"); write_failure(g); w(g, "~-~Mend if;~N"); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return) { if (p->right && p->right->type == c_functionend) { /* Tail call. */ writef(g, "~M~V0 (Z, Result);~N", p); return; } if (signals == 0) { /* Always fails. */ writef(g, "~M~V0 (Z, Result);~N", p); w(g, "~Mreturn~N"); return; } } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V0 (Z, Result);~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V0 (Z, Result);~N", p); write_failure(g); } else { writef(g, "~M~V0 (Z, Result);~N", p); write_failure_if(g, "not Result", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->S[1] = complement ? "Out_" : "In_"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; writef(g, "~M~S1Grouping~S0 (Z, ~V0, ~I0, ~I1, False, C);~N", p); write_failure_if(g, "C /= 0", p); g->temporary_used = true; } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->V[0] = p->name; writef(g, "~MC := Eq_S~S0 (Z, Ada.Strings.Unbounded.To_String (~V0));", p); write_failure_if(g, "C = 0", p); g->temporary_used = true; } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_Backward"; g->L[0] = b; writef(g, "~MC := Eq_S~S0 (Z, ~L0);~N", p); write_failure_if(g, "C = 0", p); if (p->mode == m_forward) { writef(g, "~MZ.C := Z.C + C;~N", p); } else { writef(g, "~MZ.C := Z.C - C;~N", p); } g->temporary_used = true; } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); /* Generate function header. */ g->V[0] = q; w(g, "~Mprocedure ~W0 (Z : in out Context_Type; Result : out Boolean) is~N"); /* Save output. */ struct str *saved_output = g->outbuf; struct str *saved_declarations = g->declarations; g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; g->failure_label = x_return; g->unreachable = false; /* Generate function body. */ w(g, "~{"); int signals = check_possible_signals_list(g, p->left, c_define, 0); g->temporary_used = false; generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } g->V[0] = q; w(g, "~-~Mend ~W0;~N"); if (g->temporary_used) { str_append_string(saved_output, " C : Result_Index;\n"); } if (need_among_var(p->left)) { str_append_string(saved_output, " A : Integer;\n"); } if (g->var_number) { str_append(saved_output, g->declarations); } str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~MResult := True;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = x->shortest_size; int call_done = 0; int need_among_handler = (x->function_count > 0); g->S[0] = 
p->mode == m_forward ? "" : "_Backward"; g->I[0] = x->number; /* In forward mode with non-ASCII UTF-8 characters, the first byte * of the string will often be the same, so instead look at the last * common byte position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. */ for (int c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } if (block != -1 || n_cases <= 2) { char buf[64]; char buf2[128]; char buf3[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; g->S[3] = buf3; snprintf(buf3, sizeof(buf3), "16#%x#", bitmap); if (p->mode == m_forward) { if (shortest_size == 1) sprintf(buf, "Z.C"); else sprintf(buf, "Z.C + %d", shortest_size - 1); snprintf(buf2, sizeof(buf2), "Character'Pos (Z.P (%s + 1))", buf); g->S[1] = buf; g->S[2] = buf2; if (shortest_size == 1) { writef(g, "~Mif Z.C >= Z.L", p); } else { writef(g, "~Mif Z.C + ~I4 >= Z.L", p); } } else { g->S[1] = "Z.C - 1"; g->S[2] = "Character'Pos (Z.P (Z.C))"; if (shortest_size == 1) { writef(g, "~Mif Z.C <= Z.Lb", p); } else { writef(g, "~Mif Z.C - ~I4 <= Z.Lb", p); } } if (n_cases == 0) { /* We get this for the degenerate case: among ( '' ) * This doesn't seem to be a useful construct, but it is * syntactically valid. 
*/ } else if (n_cases == 1) { g->I[4] = cases[0]; writef(g, " or else ~S2 /= ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; writef(g, " or else (~S2 /= ~I4 and then ~S2 /= ~I5)", p); } else { writef(g, " or else Check_Among (Z, ~S1, ~I2, ~S3)", p); } writef(g, " then~+~N", p); if (empty_case != -1) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. */ g->I[4] = among_cases[empty_case].result; writef(g, "~MA := ~I4;~-~N~Melse~+~N", p); if (need_among_handler) { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, Among_Handler'Access, A);~N", p); } else { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, null, A);~N", p); } if (!x->always_matches) { write_failure_if(g, "A = 0", p); } call_done = 1; } else { write_failure(g); } writef(g, "~-~Mend if;~N", p); } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } if (!call_done) { if (need_among_handler) { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, Among_Handler'Access, A);~N", p); } else { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, null, A);~N", p); } if (!x->always_matches) { write_failure_if(g, "A = 0", p); } } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mcase A is~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mwhen ~I0 =>~N"); g->margin++; generate(g, x->commands[i - 1]); g->margin--; g->unreachable = false; } w(g, "~Mwhen others =>~N"); w(g, "~M null;~N"); w(g, "~-~Mend case;~N"); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "not ~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: 
generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, NULL); break; case c_plusassign: generate_integer_assign(g, p, "+"); break; case c_minusassign: generate_integer_assign(g, p, "-"); break; case c_multiplyassign:generate_integer_assign(g, p, "*"); break; case c_divideassign: generate_integer_assign(g, p, "/"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } /* Class declaration generation. 
*/

/* Begin a new output unit: reset the margin to zero and write the
 * start-of-file comment using "-- " as the comment prefix.
 */
static void generate_unit_start(struct generator * g) {
    g->margin = 0;
    write_start_comment(g, "-- ", NULL);
}

/* Emit the forward declaration of the procedure generated for name q.
 *
 * q is stored in g->V[0] so that the ~W0 escape in the format string can
 * refer to it.
 */
static void generate_method_decl(struct generator * g, struct name * q) {
    g->V[0] = q;
    w(g, "~Mprocedure ~W0 (Z : in out Context_Type; Result : out Boolean);~N");
}

/* Emit a forward declaration for every name whose type equals `type`, then
 * declare Among_Handler if at least one among has function_count > 0.
 */
static void generate_method_decls(struct generator * g, enum name_types type) {
    struct among * a = g->analyser->amongs;
    int need_among_handler = 0;

    for (struct name * q = g->analyser->names; q; q = q->next) {
        if ((enum name_types)q->type == type) {
            generate_method_decl(g, q);
        }
    }

    /* Walk the among list, stopping at the first among that has functions. */
    while (a != NULL && need_among_handler == 0) {
        need_among_handler = (a->function_count > 0);
        a = a->next;
    }
    if (need_among_handler) {
        w(g, "~N~Mprocedure Among_Handler (Context : in out Stemmer.Context_Type'Class; Operation : in Operation_Index; Result : out Boolean);~N");
    }
}

/* Emit the full declaration of Context_Type.
 *
 * If the program declares any string, integer or boolean names, a record
 * extension is written with one component per such name; otherwise a
 * "null record" extension is written.
 */
static void generate_member_decls(struct generator * g) {
    w(g, " type Context_Type is new Stemmer.Context_Type with");
    if (g->analyser->name_count[t_string] > 0 ||
        g->analyser->name_count[t_integer] > 0 ||
        g->analyser->name_count[t_boolean] > 0) {
        w(g, " record~N~+");
        for (struct name * q = g->analyser->names; q; q = q->next) {
            g->V[0] = q;
            /* Names of any other type produce no record component. */
            switch (q->type) {
                case t_string:
                    w(g, "~M~W0 : Ada.Strings.Unbounded.Unbounded_String;~N");
                    break;
                case t_integer:
                    w(g, "~M~W0 : Integer;~N");
                    break;
                case t_boolean:
                    w(g, "~M~W0 : Boolean;~N");
                    break;
            }
        }
        w(g, "~-");
        w(g, "~- end record;~N");
    } else {
        w(g, " null record;~N");
    }
}

static int generate_among_string(struct generator * g, struct among * x, int count) {
    struct amongvec * v = x->b;
    int limit = count == 0 ? 38 : 80;
    g->I[0] = x->number;

    for (int i = 0; i < x->literalstring_count; i++, v++) {
        /* Write among's string.
*/ g->L[0] = v->b; g->I[1] = i; if (count + SIZE(v->b) > limit) { w(g, "~N~M& "); count = 3; limit = 80; } else if (count > 0) { w(g, " & "); } w(g, "~L0"); count += SIZE(v->b) + 5; } return count; } static int generate_among_table(struct generator * g, struct among * x, int start_pos, int *operation) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count - 1; w(g, "~MA_~I0 : constant Among_Array_Type (0 .. ~I1) := ~+(~N"); v = x->b; for (int i = 0; i < x->literalstring_count; i++) { g->I[1] = start_pos; /* Write among's string position. */ if (x->literalstring_count == 1) { w(g, "~Mothers => (~I1, "); } else { w(g, "~M(~I1, "); } start_pos = start_pos + SIZE(v[i].b); g->I[1] = start_pos - 1; w(g, "~I1, "); /* Write among's index & result. */ g->I[2] = v[i].i; w(g, "~I2, "); g->I[2] = v[i].result; w(g, "~I2, "); /* Write among's handler. */ if (v[i].function == NULL) { w(g, "0)"); } else { *operation = *operation + 1; g->I[1] = *operation; w(g, "~I1)"); } if (i + 1 < x->literalstring_count) { w(g, ",~N"); } } w(g, ");~-~N~N"); return start_pos; } static int generate_amongs(struct generator * g) { struct among * a = g->analyser->amongs; if (!a) return 0; int count; int start_pos; w(g, "~MAmong_String : constant String := ~+"); count = 0; while (a != NULL) { count = generate_among_string(g, a, count); a = a->next; } w(g, ";~N~-~N"); int operation = 0; start_pos = 1; a = g->analyser->amongs; while (a != NULL) { start_pos = generate_among_table(g, a, start_pos, &operation); a = a->next; } return operation; } static int generate_constructor(struct generator * g) { return generate_amongs(g); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } static int generate_operations_dispatcher(struct generator * g) { int operation = 0; w(g, "~N~Mprocedure Among_Handler (Context : in out Stemmer.Context_Type'Class; 
Operation : in Operation_Index; Result : out Boolean) is~N"); w(g, "~Mbegin~+~N~M"); w(g, "case Operation is~+~N~M"); for (struct among * x = g->analyser->amongs; x; x = x->next) { struct amongvec * v = x->b; for (int i = 0; i < x->literalstring_count; i++) { if (v[i].function != NULL) { operation++; g->I[2] = operation; w(g, "when ~I2 =>~N~M"); g->V[0] = v[i].function; w(g, " ~W0 (Context_Type (Context), Result);~N~M"); } } } w(g, "when others =>~N~M"); w(g, " Result := False;~-~N~Mend case;~-~N~M"); w(g, "end Among_Handler;~N~-"); return operation; } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int need_comma = 0; for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; g->I[0] = 8 * size - 1; w(g, "~N~M~W0 : constant Grouping_Array (0 .. ~I0) := (~N~+~M"); for (int i = 0; i < size; i++) { unsigned char m = map[i]; if (i != 0) { w(g, ",~N~M"); need_comma = 0; } for (int j = 0; j < 8; j++) { if (need_comma) w(g, ", "); if (m & (1 << j)) { w(g, "True"); } else { w(g, "False"); } need_comma = 1; } } w(g, "~N~-~M);~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } extern void generate_program_ada(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); generate_unit_start(g); /* generate implementation. 
*/ w(g, "package body Stemmer."); w(g, g->options->package); w(g, " is~N~+~N"); w(g, "~Mpragma Style_Checks (\"-mr\");~N"); w(g, "~Mpragma Warnings (Off, \"*variable*is never read and never assigned*\");~N"); w(g, "~Mpragma Warnings (Off, \"*mode could be*instead of*\");~N"); w(g, "~Mpragma Warnings (Off, \"*formal parameter.*is not modified*\");~N"); w(g, "~Mpragma Warnings (Off, \"*this line is too long*\");~N"); w(g, "~Mpragma Warnings (Off, \"*is not referenced*\");~N"); w(g, "~N"); generate_method_decls(g, t_routine); generate_groupings(g); int operations = generate_constructor(g); generate_methods(g); if (operations > 0) { generate_operations_dispatcher(g); } w(g, "end Stemmer."); w(g, g->options->package); w(g, ";~N"); output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); g->margin = 0; write_start_comment(g, "-- ", NULL); if (g->analyser->name_count[t_string]) { w(g, "private with Ada.Strings.Unbounded;~N"); } w(g, "package Stemmer."); w(g, g->options->package); w(g, " with SPARK_Mode is~N~+"); w(g, " type Context_Type is new Stemmer.Context_Type with private;~N"); w(g, " procedure Stem (Z : in out Context_Type; Result : out Boolean);~N"); w(g, "private~N"); generate_member_decls(g); w(g, "end Stemmer."); w(g, g->options->package); w(g, ";~N"); output_str(g->options->output_h, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_csharp.c000066400000000000000000001157331500727106100206510ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = 
str_new(); str_append_string(output, "c"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } write_s(g, p->s); } static void write_varref(struct generator * g, struct name * p) { /* In c#, references look just the same */ write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_string(g, "\""); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 0x590 && ch != 127) { if (ch == '"' || ch == '\\') write_char(g, '\\'); // Our C# generator uses ENC_WIDECHARS so we need to convert. write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. write_string(g, "\\u"); write_hex4(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "limit - "; writef(g, "~Mint ~B0 = ~S1cursor;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "cursor = "); if (p->mode != m_forward) str_append_string(out, "limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * 
g, struct node * p, struct str * savevar) {
    /* Emit the cursor-restoring statement as a line of its own. */
    write_margin(g);
    append_restore_string(p, g->outbuf, savevar);
    write_newline(g);
}

/* Emit a statement moving the cursor one place: "cursor++;" in forward
 * mode, "cursor--;" otherwise.
 */
static void write_inc_cursor(struct generator * g, struct node * p) {
    write_margin(g);
    write_string(g, p->mode == m_forward ? "cursor++;" : "cursor--;");
    write_newline(g);
}

/* Emit the definition of label "lab<n>", followed by an empty statement.
 *
 * The margin is decremented before the label is written and incremented
 * again afterwards, so the label sits one level out from the code around it.
 */
static void wsetl(struct generator * g, int n) {
    w(g, "~-~Mlab~+");
    write_int(g, n);
    w(g, ": ; ~N");
}

/* Emit "goto lab<n>;" on its own line. */
static void wgotol(struct generator * g, int n) {
    write_margin(g);
    write_string(g, "goto lab");
    write_int(g, n);
    write_string(g, ";");
    write_newline(g);
}

/* Emit the code to run when the current command fails.
 *
 * If g->failure_str is non-empty it is written first and the whole sequence
 * is wrapped in a block.  Then either "return false;" is written (when
 * g->failure_label is x_return) or a goto to the failure label, in which
 * case g->label_used is set so that the label's definition gets emitted.
 */
static void write_failure(struct generator * g) {
    if (str_len(g->failure_str) != 0) {
        write_block_start(g);
        write_margin(g);
        write_str(g, g->failure_str);
        write_newline(g);
    }
    write_margin(g);
    switch (g->failure_label) {
        case x_return:
            write_string(g, "return false;");
            break;
        default:
            write_string(g, "goto lab");
            write_int(g, g->failure_label);
            write_string(g, ";");
            g->label_used = 1;
    }
    write_newline(g);
    if (str_len(g->failure_str) != 0)
        write_block_end(g);
}

/* Emit "if (<s>)" followed by a block containing the failure code.
 *
 * s is itself passed through writef(), so it may contain ~ escapes.
 */
static void write_failure_if(struct generator * g, const char * s, struct node * p) {
    writef(g, "~Mif (", p);
    writef(g, s, p);
    writef(g, ")~N", p);
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
}

/* if at limit fail */
static void write_check_limit(struct generator * g, struct node * p) {
    if (p->mode == m_forward) {
        write_failure_if(g, "cursor >= limit", p);
    } else {
        write_failure_if(g, "cursor <= limit_backward", p);
    }
}

/* Formatted write.
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, 
p->number); break; case c_maxint: write_string(g, "int.MaxValue"); break; case c_minint: write_string(g, "int.MinValue"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_lenof: /* Same as sizeof() for C#. */ case c_sizeof: g->V[0] = p->name; w(g, "~V0.Length"); break; case c_len: /* Same as size() for C#. */ case c_size: w(g, "current.Length"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { g->failure_label = new_label(g); g->label_used = 0; generate(g, p); wgotol(g, out_lab); if (g->label_used) wsetl(g, g->failure_label); if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); if (savevar) { write_block_end(g); str_delete(savevar); } wsetl(g, out_lab); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); { int l = g->failure_label; int u = g->label_used; g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_failure(g); if (u) wsetl(g, l); } if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_block_end(g); str_delete(savevar); } } static void generate_set(struct generator * g, struct 
node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = true;~N", p);
}

/* Emit an assignment of false to the variable named by p. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = false;~N", p);
}

/* Emit the code for p's operand, then unconditionally emit the failure
 * code.
 */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    write_failure(g);
}

/* generate_test() also implements 'reverse' */
/* Emit the code for p's operand; when K_needed() says so, the emitted code
 * is wrapped in a block that saves the cursor first and restores it
 * afterwards.
 */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        write_restorecursor(g, p, savevar);
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Emit the code for p's operand so that its failure does not propagate.
 *
 * A fresh failure label with an empty failure string is installed for the
 * operand, and the label is defined straight after the operand's code if
 * anything jumped to it.  As in generate_test(), the cursor is saved and
 * restored around the operand when K_needed() says so.
 */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    if (p->left->type == c_call) {
        /* Optimise do applied directly to a call: emit the bare call and
         * ignore its result, so no failure label is needed.
         */
        write_comment(g, p->left);
        g->V[0] = p->left->name;
        w(g, "~M~V0();~N");
    } else {
        g->failure_label = new_label(g);
        g->label_used = 0;
        str_clear(g->failure_str);

        generate(g, p->left);

        if (g->label_used)
            wsetl(g, g->failure_label);
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Emit code that fails if the cursor is at the limit and otherwise moves
 * it one place in the direction of p->mode.
 */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ?
"in" : "out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { w(g, "~Mif (~S1_grouping~S0(~V0, ~I0, ~I1, true) < 0)~N~f~N"); } else { w(g, "~{~N" "~Mint ret = ~S1_grouping~S0(~V0, ~I0, ~I1, true);~N" "~Mif (ret < 0)~N~f~N"); if (p->mode == m_forward) w(g, "~Mcursor += ret;~N"); else w(g, "~Mcursor -= ret;~N"); w(g, "~}"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); w(g, "~Mwhile (true)~N~{"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (style == 1) { /* include for goto; omit for gopast */ write_restorecursor(g, p, savevar); } w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mfor (int ~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); write_block_start(g); generate(g, p->left); write_block_end(g); str_delete(loopvar); } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * atleast_case) { writef(g, "~Mwhile (true)~N~{", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (atleast_case != NULL) { g->B[0] = 
str_data(atleast_case); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. 
// // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Massign_to(~V0);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Mslice_to(~V0);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = "); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } str_delete(savevar); } generate(g, p->aux); write_margin(g); write_str(g, g->failure_str); write_newline(g); str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); g->V[0] = p->name; str_assign(g->failure_str, "copy_from("); str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->B[0] = str_data(savevar); writef(g, "~{~MEnv ~B0 = new Env(this);~N" "~Mcurrent = ~V0;~N" "~Mcursor = 0;~N" "~Mlimit = current.Length;~N", p); generate(g, p->left); write_margin(g); write_str(g, g->failure_str); 
write_newline(g); w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); // Relational operators are the same as C. write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~V0();~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V0();~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V0();~N", p); write_failure(g); } else { w(g, "~Mif (!~V0())~N~+"); write_failure(g); w(g, "~-"); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; w(g, "~Mif (~S1_grouping~S0(~V0, ~I0, ~I1, false) != 0)~N~f"); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(eq_s~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->L[0] = b; write_failure_if(g, "!(eq_s~S0(~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); if (q->type == t_routine) { g->S[0] = "private"; } else { g->S[0] = "protected override"; } g->V[0] = q; w(g, "~M~S0 bool ~V0()~N~M{~+~N"); /* Save output. */ struct str * saved_output = g->outbuf; g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; if (p->amongvar_needed) w(g, "~Mint among_var;~N"); str_clear(g->failure_str); g->failure_label = x_return; g->label_used = 0; int signals = check_possible_signals_list(g, p->left, c_define, 0); /* Generate function body. */ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~}"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = find_among~S0(a_~I0);~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mfind_among~S0(a_~I0);~N", p); } else { write_failure_if(g, "find_among~S0(a_~I0) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch (among_var) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; /* Put a block around each case which seems to workaround bogus * compiler errors (typically with repeat reports at the same * location): * * dutchStemmer.generated.cs(543,25): error CS0165: Use of unassigned local variable `c5' * * The c5 variable is initialised at point of declaration and we * don't `goto` into the block it is declared in from outside so * this seems to be buggy code flow analysis in the compiler. * Unclear where to usefully report mono bugs in 2025 so I've * not tried. 
*/ w(g, "~Mcase ~I0: {~N~+"); generate(g, x->commands[i - 1]); w(g, "~Mbreak;~N~-~M}~N"); } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!(~V0)", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: 
generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "#pragma warning disable 0164~N"); w(g, "#pragma warning disable 0162~N~N"); w(g, "~Mnamespace "); w(g, g->options->package); w(g, "~N~{"); w(g, "~Musing System;~N"); w(g, "~Musing System.Text;~N"); w(g, "~M~N"); w(g, 
"~M///~N"); w(g, "~M/// This class implements the stemming algorithm defined by a snowball script.~N"); w(g, "~M/// "); write_generated_comment_content(g); w(g, "~N" "~M///~N"); w(g, "~M/// ~N"); w(g, "~M[System.CodeDom.Compiler.GeneratedCode(\"Snowball\", \"" SNOWBALL_VERSION "\")]~N"); w(g, "~Mpublic partial class ~n : "); w(g, g->options->parent_class_name); w(g, "~N~{"); } static void generate_class_end(struct generator * g) { w(g, "~N"); w(g, "~}"); w(g, "~}"); w(g, "~N"); } static void generate_among_table(struct generator * g, struct among * x, const char * type) { write_newline(g); write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; g->S[0] = type; w(g, "~M~S0a_~I0 = new[] ~N~M{~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; g->L[0] = v[i].b; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; w(g, "~Mnew Among(~L0, ~I0, ~I1"); if (v[i].function != NULL) { w(g, ", "); write_varname(g, v[i].function); } w(g, ")~S0~N"); } w(g, "~-~M};~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { if (x->function_count) { g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~N~Mprivate readonly Among[] a_~I0;~N"); } else { generate_among_table(g, x, "private static readonly Among[] "); } } w(g, "~N"); if (g->analyser->among_with_function_count == 0) return; w(g, "~M/// ~N"); w(g, "~M/// Initializes a new instance of the class.~N"); w(g, "~M/// ~N"); w(g, "~M/// ~N"); w(g, "~Mpublic ~n()~N~{"); for (struct among * x = g->analyser->amongs; x; x = x->next) { if (x->function_count) { generate_among_table(g, x, ""); } } w(g, "~}~N~N"); } static void generate_grouping_table(struct generator * g, struct grouping * q) { symbol * b = q->b; g->V[0] = q->name; w(g, "~Mprivate const string ~V0 = "); write_literal_string(g, b); w(g, ";~N"); } static void generate_groupings(struct generator * g) { for (struct grouping * q = 
g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { int wrote_members = false; for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~Mprivate "); w(g, g->options->string_class); w(g, " ~W0 = new "); w(g, g->options->string_class); w(g, "();~N"); wrote_members = true; break; case t_integer: w(g, "~Mprivate int ~W0;~N"); wrote_members = true; break; case t_boolean: w(g, "~Mprivate bool ~W0;~N"); wrote_members = true; break; } } if (wrote_members) w(g, "~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); } } extern void generate_program_csharp(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "// ", NULL); generate_class_begin(g); generate_members(g); generate_groupings(g); generate_amongs(g); generate_methods(g); generate_class_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_go.c000066400000000000000000001161741500727106100177760ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include /* for toupper */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { 
switch (p->type) { case t_external: { char save_initial = p->s[0]; p->s[0] = toupper(save_initial); str_append_s(g->outbuf, p->s); p->s[0] = save_initial; return; } default: { int ch = "SbirxG"[p->type]; write_char(g, ch); write_char(g, '_'); break; } } write_s(g, p->s); } static void write_varref(struct generator * g, struct name * p) { write_string(g, "context."); write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_string(g, "\""); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); if (32 <= ch && ch < 0x590 && ch != 127) { if (ch == '"' || ch == '\\') write_char(g, '\\'); write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. write_string(g, "\\u"); write_hex4(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; /* FIXME could use Go //line syntax if we had original filename */ write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+{~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "env.Limit - "; writef(g, "~Mvar ~B0 = ~S1env.Cursor~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "env.Cursor = "); if (p->mode != m_forward) str_append_string(out, "env.Limit - "); str_append(out, savevar); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); 
append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? "env.NextChar();" : "env.PrevChar();"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~Mlab~I0: for {~N~+"); } static void wsetlab_end(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak lab~I0~N"); w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak lab~I0~N"); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn false~N"); g->unreachable = true; break; default: g->I[0] = g->failure_label; w(g, "~Mbreak lab~I0~N"); g->unreachable = true; } } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "env.Cursor >= env.Limit", p); } else { write_failure_if(g, "env.Cursor <= env.LimitBackward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case 
c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "snowballRuntime.MaxInt"); break; case c_minint: write_string(g, "snowballRuntime.MinInt"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "env.Cursor"); break; case c_limit: w(g, p->mode == m_forward ? "env.Limit" : "env.LimitBackward"); break; case c_lenof: g->V[0] = p->name; w(g, "snowballRuntime.RuneCountInString(~V0)"); break; case c_sizeof: g->V[0] = p->name; w(g, "len(~V0)"); break; case c_len: w(g, "snowballRuntime.RuneCountInString(env.Current())"); break; case c_size: w(g, "len(env.Current())"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } w(g, "~-~M}~N"); g->unreachable = false; if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.LimitBackward = env.Cursor~N" "~Menv.Cursor = env.Limit~N", p); generate(g, p->left); w(g, "~Menv.Cursor = env.LimitBackward~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); w(g, "~-~M}~N"); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator * g, 
struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = true~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~W0(env, context)~N"); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "B"; g->S[1] = complement ? "In" : "Out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.Go~S1Grouping~S0(~W0, ~I0, ~I1)", p); if (!is_goto) { write_string(g, p->mode == m_forward ? 
"env.NextChar();" : "env.PrevChar();"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~Mgolab~I0: for {~N~+"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0~N"); } g->unreachable = false; w(g, "~-~M}~N"); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~Mfor _ = range make([]struct{},"); generate_AE(g, p->AE); writef(g, ") {~+", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); g->I[0] = replab; writef(g, "~Mreplab~I0: for{~N~+", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); g->I[0] = g->failure_label; w(g, "~Mlab~I0: for _ = range [2]struct{}{} {~N~+"); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = 
str_data(loopvar); w(g, "~M~B0--~N"); } g->I[0] = replab; w(g, "~Mcontinue replab~I0~N"); } w(g, "~-~M}~N"); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->I[0] = replab; w(g, "~Mbreak replab~I0~N~-~M}~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mvar ~B0 = "); generate_AE(g, p->AE); w(g, "~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.Cursor~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif env.Cursor ~S0 "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.Cursor = "); generate_AE(g, p->AE); writef(g, "~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif env.Cursor != "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? 
"" : "Back"; g->S[1] = p->AE->type == c_number ? "" : "Checked"; write_failure_if(g, "!env.Hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif !env.SliceDel() {~N" "~+~Mreturn false~N~-" "~M}~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Limit" : "env.LimitBackward"; writef(g, "~Menv.Cursor = ~S0~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Limit" : "env.LimitBackward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "env.Cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Bra" : "env.Ket"; writef(g, "~M~S0 = env.Cursor~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"env.Ket" : "env.Bra"; writef(g, "~M~S0 = env.Cursor~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.AssignTo()~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.SliceTo()~N" "~Mif ~V0 == \"\" {~N" "~+~Mreturn false~N~-~M}~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_block_start(g); write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mvar c = env.Cursor~N"); w(g, "~Mbra, ket := env.Cursor, env.Cursor~N"); writef(g, "~Menv.Insert(bra, ket, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Menv.Cursor = c~N"); write_block_end(g); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_block_start(g); write_comment(g, p); if (keep_c) writef(g, "~Mvar c = env.Cursor~N", p); if (p->mode == m_forward) { writef(g, "~Menv.Insert(env.Cursor, env.Limit, ", p); } else { writef(g, "~Menv.Insert(env.LimitBackward, env.Cursor, ", p); } generate_address(g, p); writef(g, ")~N", p); if (keep_c) w(g, "~Menv.Cursor = c~N"); write_block_end(g); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif !env.SliceFrom("); generate_address(g, p); writef(g, ") {~N" "~+~Mreturn false~N~-~M}~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current 
stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif env.Cursor ~S0 "); generate_AE(g, q->AE); w(g, " "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mvar ~B0 = env.Limit - env.Cursor~N"); w(g, "~Menv.Limit = "); } else { w(g, "~Mvar ~B0 = env.LimitBackward~N"); w(g, "~Menv.LimitBackward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "env.Limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.LimitBackward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mvar ~B0 = env.Limit - env.Cursor~N"); w(g, "~Menv.Limit = env.Cursor~N"); } else { w(g, "~Mvar ~B0 = env.LimitBackward~N"); w(g, "~Menv.LimitBackward = env.Cursor~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "env.Limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.LimitBackward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); 
g->V[0] = p->name; g->B[0] = str_data(savevar); writef(g, "~Mvar ~B0 = env.Clone()~N" "~Menv.SetCurrent(~V0)~N", p); generate(g, p->left); if (!g->unreachable) { g->V[0] = p->name; g->B[0] = str_data(savevar); /* Update string variable. */ w(g, "~M~V0 = env.Current()~N"); /* Reset env */ w(g, "~M*env = *~B0~N"); } str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, "~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); // Relational operators are the same as C. write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~W0(env, context)~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~W0(env, context)~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~W0(env, context)~N", p); write_failure(g); } else { write_failure_if(g, "!~W0(env, context)", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? 
"" : "B"; g->S[1] = complement ? "Out" : "In"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.~S1Grouping~S0(~W0, ~I0, ~I1)", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "B"; g->V[0] = p->name; write_failure_if(g, "!env.EqS~S0(~V0)", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "B"; g->L[0] = b; write_failure_if(g, "!env.EqS~S0(~L0)", p); } static void generate_setup_context(struct generator * g) { w(g, "~Mvar context = &Context {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0: \"\",~N"); break; case t_integer: w(g, "~M~W0: 0,~N"); break; case t_boolean: w(g, "~M~W0: false,~N"); break; } } w(g, "~-~M}~N"); w(g, "~M_ = context~N"); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); g->V[0] = q; if (q->type == t_routine) { w(g, "~Mfunc ~W0(env *snowballRuntime.Env, ctx interface{}) bool {~+~N"); w(g, "~Mcontext := ctx.(*Context)~N"); w(g, "~M_ = context~N"); } else { w(g, "~Mfunc ~W0(env *snowballRuntime.Env) bool {~+~N"); generate_setup_context(g); } if (p->amongvar_needed) w(g, "~Mvar among_var int32~N"); /* Save output. */ struct str * saved_output = g->outbuf; g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; int signals = check_possible_signals_list(g, p->left, c_define, 0); /* Generate function body. 
*/ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~-~M}~N"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? "" : "B"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = env.FindAmong~S0(A_~I0, context)~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Menv.FindAmong~S0(A_~I0, context)~N", p); } else { write_failure_if(g, "env.FindAmong~S0(A_~I0, context) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch among_var {~N"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); w(g, "~-"); g->unreachable = false; } w(g, "~M}~N"); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Menv.Debug(~I0, ~I1)~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); 
break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "package "); w(g, g->options->package); w(g, "~N~N"); w(g, "import(~N"); w(g, " snowballRuntime \""); w(g, 
g->options->go_snowball_runtime); w(g, "\"~N)~N~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Mvar A_~I0 = []*snowballRuntime.Among{~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; g->L[0] = v[i].b; g->S[0] = ","; w(g, "~M&snowballRuntime.Among{Str:~L0, A:~I0, B:~I1, "); if (v[i].function != NULL) { w(g, "F:"); write_varname(g, v[i].function); } else { w(g, "F:nil"); } w(g, "}~S0~N"); } w(g, "~-~M}~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "~Mvar ~W0 = []byte{"); for (int i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, "}~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { w(g, "type Context struct {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0 string~N"); break; case t_integer: w(g, "~M~W0 int~N"); break; case t_boolean: w(g, "~M~W0 bool~N"); break; } } w(g, "~-}~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void 
generate_program_go(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "//! ", NULL); if (g->analyser->int_limits_used) { /* std::usize is used in the code generated for usize::MAX and usize::MIN */ w(g, "use std::usize;~N~N"); } generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_java.c000066400000000000000000001167131500727106100203110ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } write_s(g, p->s); } static void write_varref(struct generator * g, struct name * p) { /* In java, references look just the same */ write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_string(g, "\""); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_string(g, "\\u"); write_hex4(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); 
}

/* Writes a "// ..." comment line for node p at the current margin.
 * Does nothing unless g->options->comments is set. */
static void write_comment(struct generator * g, struct node * p) {
    if (!g->options->comments) return;
    write_margin(g);
    write_string(g, "// ");
    write_comment_content(g, p);
    write_newline(g);
}

/* Opens a brace block: "{" at the current margin, then one more level of
 * margin for what follows. */
static void write_block_start(struct generator * g) {
    w(g, "~M{~+~N");
}

/* Closes a brace block: drops one level of margin, then writes "}". */
static void write_block_end(struct generator * g) {
    w(g, "~-~M}~N");
}

/* Declares the int variable named by savevar and initialises it with the
 * cursor position: "cursor" when p->mode is m_forward, otherwise
 * "limit - cursor". */
static void write_savecursor(struct generator * g, struct node * p,
                             struct str * savevar) {
    g->B[0] = str_data(savevar);
    g->S[1] = "";
    if (p->mode != m_forward) g->S[1] = "limit - ";
    writef(g, "~Mint ~B0 = ~S1cursor;~N", p);
}

/* Appends to `out` the statement that undoes write_savecursor():
 * "cursor = <savevar>;" or, when p->mode is not m_forward,
 * "cursor = limit - <savevar>;". No margin or newline is added. */
static void append_restore_string(struct node * p, struct str * out,
                                  struct str * savevar) {
    str_append_string(out, "cursor = ");
    if (p->mode != m_forward) str_append_string(out, "limit - ");
    str_append(out, savevar);
    str_append_string(out, ";");
}

/* Writes the restore statement built by append_restore_string() as a line of
 * its own, directly into g->outbuf. */
static void write_restorecursor(struct generator * g, struct node * p,
                                struct str * savevar) {
    write_margin(g);
    append_restore_string(p, g->outbuf, savevar);
    write_newline(g);
}

static void write_inc_cursor(struct generator * g, struct node * p) {
    write_margin(g);
    write_string(g, p->mode == m_forward ?
"cursor++;" : "cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { w(g, "~Mlab"); write_int(g, n); w(g, ": {~+~N"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); g->unreachable = true; break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_string(g, ";"); g->unreachable = true; } write_newline(g); } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case 
c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "Integer.MAX_VALUE"); break; case c_minint: write_string(g, "Integer.MIN_VALUE"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_lenof: /* Same as sizeof() for Java. */ case c_sizeof: g->V[0] = p->name; w(g, "~V0.length()"); break; case c_len: /* Same as size() for Java. */ case c_size: w(g, "length"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { g->failure_label = new_label(g); wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g); g->unreachable = false; if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } g->failure_label = new_label(g); str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { 
write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"in" : "out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!go_~S1_grouping~S0(~V0, ~I0, ~I1)", p); if (!is_goto) { if (p->mode == m_forward) w(g, "~Mcursor++;~N"); else w(g, "~Mcursor--;~N"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~Mgolab~I0: while(true)~N"); w(g, "~{"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0;~N"); } g->unreachable = false; wsetlab_end(g); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mfor (int ~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { writef(g, "~Mwhile(true)~N~{", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, 
savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); } wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Massign_to(~V0);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Mslice_to(~V0);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = "); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~N" "~MSnowballProgram ~B0 = new SnowballProgram(this);~N", p); ++g->copy_from_count; str_assign(g->failure_str, "copy_from("); str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->V[0] = 
p->name; writef(g, "~Mcurrent = ~V0;~N" "~Mcursor = 0;~N" "~Mlimit = current.length();~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); // Relational operators are the same as C. write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~V0();~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V0();~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V0();~N", p); write_failure(g); } else { write_failure_if(g, "!~V0()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(eq_s~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->L[0] = b; write_failure_if(g, "!(eq_s~S0(~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); if (q->type == t_routine) { g->S[0] = "private"; } else { w(g, "~M@Override~N"); g->S[0] = "public"; } g->V[0] = q; w(g, "~M~S0 boolean ~V0() {~+~N"); /* Save output. */ struct str * saved_output = g->outbuf; g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; if (p->amongvar_needed) w(g, "~Mint among_var;~N"); str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; int signals = check_possible_signals_list(g, p->left, c_define, 0); /* Generate function body. */ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~}"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = find_among~S0(a_~I0);~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mfind_among~S0(a_~I0);~N", p); } else { write_failure_if(g, "find_among~S0(a_~I0) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch (among_var) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); if (!g->unreachable) w(g, "~Mbreak;~N"); w(g, "~-"); g->unreachable = false; } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!(~V0)", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: 
generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: 
generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "package "); w(g, g->options->package); w(g, ";~N~N"); if (g->analyser->among_count > 0) { w(g, "import "); w(g, g->options->among_class); w(g, ";~N~N"); } if (g->copy_from_count > 0) { w(g, "import org.tartarus.snowball.SnowballProgram;~N~N"); } w(g, "/**~N" " * This class implements the stemming algorithm defined by a snowball script.~N" " *

~N" " * "); write_generated_comment_content(g); w(g, "~N" " *

~N" " */~N" "@SuppressWarnings(\"unused\")~N" "public class ~n extends "); w(g, g->options->parent_class_name); w(g, " {~+~N" "~N" "~Mprivate static final long serialVersionUID = 1L;~N"); if (g->analyser->among_with_function_count > 0) { w(g, "~Mprivate static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();~N"); } write_newline(g); } static void generate_class_end(struct generator * g) { w(g, "~N}"); w(g, "~N"); } static void generate_equals(struct generator * g) { w(g, "~N" "~M@Override~N" "~Mpublic boolean equals( Object o ) {~N" "~+~Mreturn o instanceof "); w(g, g->options->name); w(g, ";~N~-~M}~N" "~N" "~M@Override~N" "~Mpublic int hashCode() {~N" "~+~Mreturn "); w(g, g->options->name); w(g, ".class.getName().hashCode();~N" "~-~M}~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Mprivate final static Among[] a_~I0 = {~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; g->L[0] = v[i].b; g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; w(g, "~Mnew Among(~L0, ~I0, ~I1"); if (v[i].function != NULL) { w(g, ", \""); write_varname(g, v[i].function); w(g, "\", methodObject"); } w(g, ")~S0~N"); } w(g, "~-~M};~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "~Mprivate static final char[] ~V0 = {"); for (int i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, " };~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { int wrote_members = false; for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~Mprivate "); w(g, g->options->string_class); w(g, " ~W0 = new "); w(g, g->options->string_class); w(g, "();~N"); wrote_members = true; break; case t_integer: w(g, "~Mprivate int ~W0;~N"); wrote_members = true; break; case t_boolean: w(g, "~Mprivate boolean ~W0;~N"); wrote_members = true; break; } } if (wrote_members) w(g, "~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_java(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); w(g, "~+"); generate_amongs(g); generate_groupings(g); generate_members(g); 
generate_methods(g); generate_equals(g); generate_class_end(g); w(g, "~-"); { /* We need to call generate_class_begin() after we've generated the * methods so we know if copy_from_count > 0. */ struct str * body = g->outbuf; g->outbuf = str_new(); write_start_comment(g, "// ", NULL); generate_class_begin(g); str_append(g->outbuf, body); str_delete(body); } output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_js.c000066400000000000000000001225421500727106100200010ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } write_s(g, p->s); } static void write_varref(struct generator * g, struct name * p) { write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_string(g, "\""); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_string(g, "\\u"); write_hex4(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if 
(!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "base.limit - "; writef(g, "~M~C /** number */ ~B0 = ~S1base.cursor;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "base.cursor = "); if (p->mode != m_forward) str_append_string(out, "base.limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"base.cursor++;" : "base.cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~Mlab~I0: {~N~+"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); g->unreachable = true; break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_string(g, ";"); g->unreachable = true; } write_newline(g); } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "base.cursor >= base.limit", p); } else { write_failure_if(g, "base.cursor <= base.limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; case 'P': write_string(g, g->options->parent_class_name); continue; case 'C': { // Constant. if (g->options->js_esm) { w(g, "const"); } else { w(g, "/** @const */ var"); } continue; } case 'D': { // Declare variable. 
if (g->options->js_esm) { w(g, "let"); } else { w(g, "var"); } continue; } default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "(-1>>>1)"); break; case c_minint: write_string(g, "(~(-1>>>1))"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_divide: /* Snowball specifies integer division with semantics matching C, * so we need to use `Math.trunc(x/y)` here. */ write_string(g, "Math.trunc("); generate_AE(g, p->left); write_string(g, " / "); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "base.cursor"); break; case c_limit: w(g, p->mode == m_forward ? "base.limit" : "base.limit_backward"); break; case c_lenof: /* Same as sizeof() for Javascript. */ case c_sizeof: g->V[0] = p->name; w(g, "~V0.length"); break; case c_len: /* Same as size() for Javascript. 
*/ case c_size: w(g, "base.current.length"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { g->failure_label = new_label(g); wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g); g->unreachable = false; if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mbase.limit_backward = base.cursor; base.cursor = base.limit;~N", p); generate(g, p->left); w(g, "~Mbase.cursor = base.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } g->failure_label = new_label(g); str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator 
* g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"in" : "out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!base.go_~S1_grouping~S0(~V0, ~I0, ~I1)", p); if (!is_goto) { if (p->mode == m_forward) w(g, "~Mbase.cursor++;~N"); else w(g, "~Mbase.cursor--;~N"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~Mgolab~I0: while(true)~N"); w(g, "~{"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0;~N"); } g->unreachable = false; wsetlab_end(g); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mfor (~D /** number */ ~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { writef(g, "~Mwhile(true)~N~{", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = 
vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); } wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); w(g, "~M~D ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = base.cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (base.cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mbase.cursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (base.cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { int c_count = ++g->keep_count; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"+" : "-"; g->I[0] = c_count; w(g, "~{~M~C /** number */ c~I0 = base.cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->I[0] = c_count; g->S[1] = p->mode == m_forward ? "> base.limit" : "< base.limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c~I0 ~S1", p); } else { write_failure_if(g, "c~I0 ~S1 || c~I0 ~S2 base.cursor", p); } writef(g, "~Mbase.cursor = c~I0;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif (!base.slice_del())~N" "~M{~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mbase.cursor = base.limit;~N", p); } else { writef(g, "~Mbase.cursor = base.limit_backward;~N", p); } } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { write_failure_if(g, "base.cursor < base.limit", p); } else { write_failure_if(g, "base.cursor > base.limit_backward", p); } } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mbase.bra = base.cursor;~N", p); } else { writef(g, "~Mbase.ket = base.cursor;~N", p); } } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mbase.ket = base.cursor;~N", p); } else { writef(g, "~Mbase.bra = base.cursor;~N", p); } } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = base.assign_to();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = base.slice_to();~N" "~Mif (~V0 == 
'')~N" "~M{~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int c_count; int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) { c_count = ++g->keep_count; g->I[0] = c_count; w(g, "~{~M~C /** number */ c~I0 = base.cursor;~N"); } writef(g, "~Mbase.insert(base.cursor, base.cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) { g->I[0] = c_count; w(g, "~Mbase.cursor = c~I0;~N~}"); } } static void generate_assignfrom(struct generator * g, struct node * p) { int c_count; int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) { c_count = ++g->keep_count; g->I[0] = c_count; w(g, "~{~M~C /** number */ c~I0 = base.cursor;~N"); } if (p->mode == m_forward) { writef(g, "~Mbase.insert(base.cursor, base.limit, ", p); } else { writef(g, "~Mbase.insert(base.limit_backward, base.cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { g->I[0] = c_count; w(g, "~Mbase.cursor = c~I0;~N~}"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (!base.slice_from("); generate_address(g, p); writef(g, "))~N" "~M{~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (base.cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); w(g, "~M~C /** number */ ~B0 = "); if (p->mode == m_forward) { w(g, "base.limit - base.cursor;~N"); w(g, "~Mbase.limit = "); } else { w(g, "base.limit_backward;~N"); w(g, "~Mbase.limit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "base.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "base.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); w(g, "~M~C /** number */ ~B0 = "); if (p->mode == m_forward) { w(g, "base.limit - base.cursor;~N"); w(g, "~Mbase.limit = base.cursor;~N"); } else { w(g, "base.limit_backward;~N"); w(g, "~Mbase.limit_backward = base.cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "base.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "base.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~N" "~M~D /** !Object */ ~B0 = new ~P();~N", p); writef(g, "~M~B0.copy_from(base);~N", p); ++g->copy_from_count; 
str_assign(g->failure_str, "base.copy_from("); str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->V[0] = p->name; writef(g, "~Mbase.current = ~V0;~N" "~Mbase.cursor = 0;~N" "~Mbase.limit_backward = 0;~N" "~Mbase.limit = base.current.length;~N", p); generate(g, p->left); if (!g->unreachable) { g->V[0] = p->name; writef(g, "~M~V0 = base.current;~N", p); write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); // Relational operators are the same as C. write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~V0();~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V0();~N", p); } else if (signals == 0) { /* Always fails. 
*/ writef(g, "~M~V0();~N", p); write_failure(g); } else { write_failure_if(g, "!~V0()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!(base.~S1_grouping~S0(~V0, ~I0, ~I1))", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(base.eq_s~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->L[0] = b; write_failure_if(g, "!(base.eq_s~S0(~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); g->V[0] = q; if (q->type == t_routine) { w(g, "~M/** @return {boolean} */~N" "~Mfunction ~W0() {~+~N"); } else { w(g, "~Mthis.~W0 = /** @return {boolean} */ function() {~+~N"); } /* Save output. */ struct str * saved_output = g->outbuf; struct str * saved_declarations = g->declarations; g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; if (p->amongvar_needed) { w(g, "~M~D /** number */ among_var;~N"); } str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; g->keep_count = 0; int signals = check_possible_signals_list(g, p->left, c_define, 0); /* Generate function body. 
*/ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~-~M};~N"); str_append(saved_output, g->declarations); str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = base.find_among~S0(a_~I0);~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mbase.find_among~S0(a_~I0);~N", p); } else { write_failure_if(g, "base.find_among~S0(a_~I0) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch (among_var) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); if (!g->unreachable) w(g, "~Mbreak;~N"); w(g, "~-"); g->unreachable = false; } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mbase.debug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, 
p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: /* Snowball specifies integer division with semantics matching C, * so we need to use `Math.trunc(x/y)` here. 
*/ g->V[0] = p->name; w(g, "~M~V0 = Math.trunc(~V0 / "); generate_AE(g, p->AE); w(g, ");~N"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { if (g->options->js_esm) { w(g, "// deno-lint-ignore-file~N" "import ~P from './base-stemmer.mjs'~N" "~N" "/** @typedef {{ stemWord(word: string): string }} Stemmer */~N" "~N" "/** @type {{ new(): Stemmer }} */~N" "~C ~n = function() {~+~N" "~M~D base = new ~P();~N"); } else { w(g, "/**@constructor*/~N" "~D ~n = function() {~+~N" "~M~C ~P = require('./base-stemmer.js');~N" "~M~D base = new ~P();~N"); } write_newline(g); } static void generate_class_end(struct generator * g) { w(g, "~N"); w(g, "~M/**@return{string}*/~N"); w(g, "~Mthis['stemWord'] = function(/**string*/word) {~+~N"); w(g, "~Mbase.setCurrent(word);~N"); w(g, "~Mthis.stem();~N"); w(g, "~Mreturn base.getCurrent();~N"); w(g, "~-~M};~N"); w(g, "~-};~N"); if (g->options->js_esm) { w(g, "~N" "export default ~n~N"); } else { w(g, "~N" "if (typeof module === 'object' && module.exports) module.exports = ~n;~N"); } } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, 
"~M~C a_~I0 = [~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; g->L[0] = v[i].b; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; w(g, "~M[~L0, ~I0, ~I1"); if (v[i].function != NULL) { w(g, ", "); write_varname(g, v[i].function); } w(g, "]~S0~N"); } w(g, "~-~M];~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "~M~C /** Array */ ~W0 = ["); for (int i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, "];~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { int wrote_members = false; for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~D /** string */ ~W0 = '';~N"); wrote_members = true; break; case t_integer: w(g, "~M~D /** number */ ~W0 = 0;~N"); wrote_members = true; break; case t_boolean: w(g, "~M~D /** boolean */ ~W0 = false;~N"); wrote_members = true; break; } } if (wrote_members) w(g, "~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_js(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, 
"// ", NULL); generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_class_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_pascal.c000066400000000000000000001304351500727106100206300ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" #define BASE_UNIT "SnowballProgram" #define BASE_CLASS "T" BASE_UNIT /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { if (p->type != t_external) { /* Pascal identifiers are case-insensitive but Snowball identifiers * should be case-sensitive. To address this, we encode the case of * the identifier. For readability of the generated code, the * encoding tries to be minimally intrusive for common cases. * * After the letter which indicates the type and before the "_" we * encode the case pattern in the Snowball identifier using "U" for * an upper-case letter, "l" for a lower-case letter and nothing for * other characters. Any trailing string of "l" is omitted (since * it's redundant and decreases readability). * * Identifiers without any upper-case encode most simply, e.g. I_foo2 * * A capitalised identifier is also concise, e.g. IU_Foo2 * * All-caps gives a string of Us, e.g. IUUUUUUUU_SHOUTING * * But any example can be handled, e.g. 
IUllU_Foo79_Bar * * We don't try to solve this problem for external identifiers - it * seems more helpful to leave those alone and encourage snowball * program authors to avoid naming externals which only differ by * case. */ int len = SIZE(p->s); int lower_pending = 0; write_char(g, "SBIrxg"[p->type]); for (int i = 0; i != len; ++i) { int ch = p->s[i]; if (ch >= 'a' && ch <= 'z') { ++lower_pending; } else if (ch >= 'A' && ch <= 'Z') { while (lower_pending) { write_char(g, 'l'); --lower_pending; } write_char(g, 'U'); } } write_char(g, '_'); } write_s(g, p->s); } static void write_literal_string(struct generator * g, symbol * p) { write_char(g, '\''); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (ch == '\'') { write_string(g, "''"); } else if (32 <= ch && ch < 127) { write_char(g, ch); } else { write_char(g, '\''); write_char(g, '#'); write_int (g, ch); write_char(g, '\''); } } write_char(g, '\''); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_relop(struct generator * g, int relop) { switch (relop) { case c_eq: write_string(g, " = "); break; case c_ne: write_string(g, " <> "); break; case c_gt: write_string(g, " > "); break; case c_ge: write_string(g, " >= "); break; case c_lt: write_string(g, " < "); break; case c_le: write_string(g, " <= "); break; default: fprintf(stderr, "Unexpected type #%d in generate_integer_test\n", relop); exit(1); } } /* Write a variable declaration. 
*/ static void write_declare(struct generator * g, const char * declaration, struct node * p) { struct str * temp = g->outbuf; g->outbuf = g->declarations; write_string(g, " "); writef(g, declaration, p); write_string(g, ";"); write_newline(g); g->outbuf = temp; } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "{ "); write_comment_content(g, p); write_string(g, " }"); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~MBegin~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~MEnd;~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "FLimit - "; write_declare(g, "~B0 : Integer", p); writef(g, "~M~B0 := ~S1FCursor;~N" , p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "FCursor := "); if (p->mode != m_forward) str_append_string(out, "FLimit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"Inc(FCursor);" : "Dec(FCursor);"); write_newline(g); } static void wsetlab_begin(struct generator * g) { w(g, "~MRepeat~N~+"); } static void wsetlab_end(struct generator * g, int n) { w(g, "~-~MUntil True;~N"); w(g, "lab"); write_int(g, n); w(g, ":~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "Begin Result := False; Exit; End;"); g->unreachable = true; break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_string(g, ";"); g->unreachable = true; } write_newline(g); } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~MIf (", p); writef(g, s, p); writef(g, ") Then~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "FCursor >= FLimit", p); } else { write_failure_if(g, "FCursor <= FBkLimit", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varname(g, p->name); break; case c_number: write_int(g, p->number); break; 
case c_maxint: write_string(g, "MAXINT"); break; case c_minint: write_string(g, "(-MAXINT - 1)"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " div "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "FCursor"); break; case c_limit: w(g, p->mode == m_forward ? "FLimit" : "FBkLimit"); break; case c_len: case c_size: w(g, "Length(current)"); break; case c_lenof: case c_sizeof: g->V[0] = p->name; w(g, "Length(~V0)"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g); if (savevar) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right) { g->failure_label = new_label(g); wsetlab_begin(g); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~MFBkLimit := FCursor; FCursor := FLimit;~N", p); generate(g, p->left); w(g, "~MFCursor := FBkLimit;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); int l = g->failure_label; generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, l); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } g->failure_label = new_label(g); str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { 
write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := True;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := False;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, g->failure_label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "Bk"; g->S[1] = complement ? "In" : "Out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "Not (Go~S1Grouping~S0(~V0, ~I0, ~I1))", p); if (!is_goto) { write_string(g, p->mode == m_forward ? 
"Inc(FCursor);" : "Dec(FCursor);"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int end_unreachable = false; int golab = new_label(g); w(g, "~MWhile True Do~N"); w(g, "~{"); struct str * savevar = NULL; if (style == 1 || repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mgoto lab~I0;~N"); } g->unreachable = false; wsetlab_end(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); g->I[0] = golab; w(g, "~}lab~I0:~N"); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); write_declare(g, "~B0 : Integer", p); w(g, "~MFor ~B0 := "); generate_AE(g, p->AE); writef(g, " DownTo 1 Do~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); g->I[0] = replab; writef(g, "lab~I0:~N~MWhile True Do~N~{", p); struct str * savevar = NULL; if (repeat_restore(g, p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { 
g->B[0] = str_data(loopvar); w(g, "~MDec(~B0);~N"); } g->I[0] = replab; w(g, "~Mgoto lab~I0;~N"); } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~MBreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); write_declare(g, "~B0 : Integer", p); w(g, "~M~B0 := "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~W0 := FCursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~MIf (FCursor ~S0 "); generate_AE(g, p->AE); w(g, ") Then~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~MFCursor := "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MIf (FCursor <> "); generate_AE(g, p->AE); writef(g, ") Then~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~MC := FCursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> FLimit" : "< FBkLimit"; g->S[2] = p->mode == m_forward ? 
"<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "C ~S1", p); } else { write_failure_if(g, "(C ~S1) Or (C ~S2 FCursor)", p); } writef(g, "~MFCursor := C;~N", p); writef(g, "~}", p); g->temporary_used = true; } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MSliceDel;~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "FLimit" : "FBkLimit"; writef(g, "~MFCursor := ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "FLimit" : "FBkLimit"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "FCursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "FBra" : "FKet"; writef(g, "~M~S0 := FCursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"FKet" : "FBra"; writef(g, "~M~S0 := FCursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := AssignTo();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := SliceTo();~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varname(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) { w(g, "~{~MC := FCursor;~N"); g->temporary_used = true; } writef(g, "~Minsert(FCursor, FCursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~MFCursor := C;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) { writef(g, "~{~MC := FCursor;~N", p); g->temporary_used = true; } if (p->mode == m_forward) { writef(g, "~Minsert(FCursor, FLimit, ", p); } else { writef(g, "~Minsert(FBkLimit, FCursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~MFCursor := c;~N~}"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MSliceFrom("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~MIf (FCursor ~S0 "); generate_AE(g, q->AE); w(g, ") Then~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); write_declare(g, "~B0 : Integer", p); if (p->mode == m_forward) { w(g, "~M~B0 := FLimit - FCursor;~N"); w(g, "~MFLimit := "); } else { w(g, "~M~B0 := FBkLimit;~N"); w(g, "~MFBkLimit := "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "FLimit := FLimit + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "FBkLimit := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); write_declare(g, "~B0 : Integer", p); if (p->mode == m_forward) { w(g, "~M~B0 := FLimit - FCursor;~N"); w(g, "~MFLimit := FCursor;~N"); } else { w(g, "~M~B0 := FBkLimit;~N"); w(g, "~MFBkLimit := FCursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "FLimit := FLimit + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "FBkLimit := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); g->V[0] = p->name; { struct str * saved_output = g->outbuf; str_clear(g->failure_str); g->outbuf = g->failure_str; writef(g, "~V0 := FCurrent; " "FCurrent := 
~B0_Current; " "FCursor := ~B0_Cursor; " "FLimit := ~B0_Limit; " "FBkLimit := ~B0_BkLimit; " "FBra := ~B0_Bra; " "FKet := ~B0_Ket;", p); g->failure_str = g->outbuf; g->outbuf = saved_output; } write_declare(g, "~B0_Current : AnsiString", p); write_declare(g, "~B0_Cursor : Integer", p); write_declare(g, "~B0_Limit : Integer", p); write_declare(g, "~B0_BkLimit : Integer", p); write_declare(g, "~B0_Bra : Integer", p); write_declare(g, "~B0_Ket : Integer", p); writef(g, "~{" "~M~B0_Current := FCurrent;~N" "~M~B0_Cursor := FCursor;~N" "~M~B0_Limit := FLimit;~N" "~M~B0_BkLimit := FBkLimit;~N" "~M~B0_Bra := FBra;~N" "~M~B0_Ket := FKet;~N" "~MFCurrent := ~V0;~N" "~MFCursor := 0;~N" "~MFLimit := Length(current);~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; w(g, "~M~W0 := "); if (s != NULL) { g->S[0] = s; w(g, "~W0 ~S0 "); } generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~MResult := "); p->right = NULL; } else { w(g, "~MIf "); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { w(g, " Then~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return) { if (p->right && p->right->type == c_functionend) { /* Tail call. 
*/ writef(g, "~MResult = ~V0;~N", p); return; } if (signals == 0) { /* Always fails. */ writef(g, "~MBegin; Result = ~V0; Exit; End;~N", p); return; } } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V0;~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V0;~N", p); write_failure(g); } else { write_failure_if(g, "Not ~V0", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "Bk"; g->S[1] = complement ? "Out" : "In"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "Not (~S1Grouping~S0(~V0, ~I0, ~I1))", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "Bk"; g->V[0] = p->name; write_failure_if(g, "Not (EqV~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "Bk"; g->I[0] = SIZE(b); g->L[0] = b; write_failure_if(g, "Not (EqS~S0(~I0, ~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); /* Generate function header. */ g->V[0] = q; w(g, "~MFunction T~n.~W0 : Boolean;~N"); /* Save output. */ struct str *saved_output = g->outbuf; struct str *saved_declarations = g->declarations; g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; /* Generate function body. 
*/ w(g, "~{"); int signals = check_possible_signals_list(g, p->left, c_define, 0); g->temporary_used = false; generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~}"); if (g->temporary_used) { str_append_string(g->declarations, " C : Integer;\n"); } if (p->amongvar_needed) { str_append_string(g->declarations, " AmongVar : Integer;\n"); } if (str_len(g->declarations) > 0) { str_append_string(saved_output, "Var\n"); str_append(saved_output, g->declarations); } if (g->next_label) { str_append_string(saved_output, "Label\n"); int num = g->next_label; for (int i = 0; i < num; ++i) { str_append_string(saved_output, " lab"); str_append_int(saved_output, i); str_append_string(saved_output, i == num - 1 ? ";\n" : ",\n"); } } str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~MResult := True;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? "" : "Bk"; g->I[0] = x->number; g->I[1] = x->literalstring_count; if (x->amongvar_needed) { writef(g, "~MAmongVar := FindAmong~S0(a_~I0, ~I1);~N", p); if (!x->always_matches) { write_failure_if(g, "AmongVar = 0", p); } } else if (x->always_matches) { writef(g, "~MFindAmong~S0(a_~I0, ~I1);~N", p); } else { write_failure_if(g, "FindAmong~S0(a_~I0, ~I1) = 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~MCase AmongVar Of~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~M~I0:~N~{"); generate(g, x->commands[i - 1]); w(g, "~}"); g->unreachable = false; } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "Not (~V0)", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); 
break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, NULL); break; case c_plusassign: generate_integer_assign(g, p, "+"); break; case c_minusassign: generate_integer_assign(g, p, "-"); break; case c_multiplyassign:generate_integer_assign(g, p, "*"); break; case c_divideassign: generate_integer_assign(g, p, "div"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } /* Class declaration generation. 
*/ static void generate_unit_start(struct generator * g) { write_start_comment(g, "{ ", " }"); w(g, "Unit ~n;~N~N{$HINTS OFF}~N~NInterface~N~NUses " BASE_UNIT ";~N"); } static void generate_unit_end(struct generator * g) { w(g, "~NEnd.~N"); } static void generate_class_begin(struct generator * g) { w(g, "~NType~N~+~MT~n = Class(" BASE_CLASS ")~N"); } static void generate_class_end(struct generator * g) { w(g, "~}~NImplementation~N"); } static void generate_method_decl(struct generator * g, struct name * q) { g->V[0] = q; w(g, "~MFunction ~W0 : Boolean;"); if (q->type == t_external) { w(g, " Override;"); } w(g, "~N"); } static void generate_method_decls(struct generator * g) { w(g, "~Mpublic~N~+"); w(g, "~MConstructor Create;~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->type == t_external) { generate_method_decl(g, q); } } w(g, "~-"); int first = true; for (struct name * q = g->analyser->names; q; q = q->next) { if (q->type == t_routine) { if (first) { w(g, "~Mprivate~N~+"); first = false; } generate_method_decl(g, q); } } if (!first) w(g, "~-"); } static void generate_member_decls(struct generator * g) { int first = true; for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: case t_integer: case t_boolean: if (first) { w(g, "~Mprivate~N~+"); first = false; } switch (q->type) { case t_string: w(g, "~M~W0 : AnsiString;~N"); break; case t_integer: w(g, "~M~W0 : Integer;~N"); break; case t_boolean: w(g, "~M~W0 : Boolean;~N"); break; } } } if (!first) w(g, "~-"); } static void generate_among_decls(struct generator * g) { struct among *a = g->analyser->amongs; if (a == NULL) return; w(g, "~Mprivate~N~+"); while (a != NULL) { g->I[0] = a->number; w(g, "~Ma_~I0 : Array Of TAmong;~N"); a = a->next; } w(g, "~-"); } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = 
x->literalstring_count; w(g, "~MSetLength(a_~I0, ~I1);~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[1] = i; /* Write among's string. */ g->L[0] = v[i].b; w(g, "~Ma_~I0[~I1].Str := ~L0;~N"); /* Write among's index & result. */ g->I[2] = v[i].i; w(g, "~Ma_~I0[~I1].Index := ~I2;~N"); g->I[2] = v[i].result; w(g, "~Ma_~I0[~I1].Result := ~I2;~N"); /* Write among's handler. */ w(g, "~Ma_~I0[~I1].Method := "); if (v[i].function == NULL) { w(g, "nil;~N~N"); } else { g->V[0] = v[i].function; w(g, "Self.~W0;~N~N"); } } w(g, "~-"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void generate_constructor(struct generator * g) { w(g, "~N~MConstructor T~n.Create;~N~{"); generate_amongs(g); w(g, "~}"); } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != NULL) { generate(g, p); g->unreachable = false; p = p->right; } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; g->I[0] = size - 1; w(g, "~N~MConst~+~N~M~W0 : Array [0..~I0] Of Char = (~N~+"); for (int i = 0; i < size; i++) { if (i != 0) w(g, ",~N"); g->I[0] = map[i]; w(g, "~MChr(~I0)"); } w(g, "~N~-~M);~N~-"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } extern void generate_program_pascal(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); generate_unit_start(g); /* Generate class declaration. 
*/ generate_class_begin(g); generate_member_decls(g); generate_among_decls(g); generate_method_decls(g); generate_class_end(g); /* generate implementation. */ generate_groupings(g); generate_constructor(g); generate_methods(g); generate_unit_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_python.c000066400000000000000000001206521500727106100207060ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { int next_label = g->next_label++; g->max_label = (next_label > g->max_label) ? next_label : g->max_label; return next_label; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { switch (p->type) { case t_external: write_char(g, '_'); break; case t_routine: write_string(g, "__"); /* FALLTHRU */ default: { int ch = "SBIrxg"[p->type]; write_char(g, ch); write_char(g, '_'); break; } } write_s(g, p->s); } static void write_varref(struct generator * g, struct name * p) { write_string(g, "self."); write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_string(g, "u\""); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 0x590 && ch != 127) { if (ch == '"' || ch == '\\') write_char(g, '\\'); // Our Python generator uses ENC_WIDECHARS so we need to convert. 
write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. write_string(g, "\\u"); write_hex4(g, ch); } } write_string(g, "\""); } static void write_literal_char(struct generator * g, symbol ch) { write_string(g, "u\""); if (32 <= ch && ch < 0x590 && ch != 127) { if (ch == '"' || ch == '\\') write_char(g, '\\'); // Python uses ENC_WIDECHARS so we need to convert. write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. write_string(g, "\\u"); write_hex4(g, ch); } write_string(g, "\""); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "# "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "self.limit - "; writef(g, "~M~B0 = ~S1self.cursor~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "self.cursor = "); if (p->mode != m_forward) str_append_string(out, "self.limit - "); str_append(out, savevar); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); 
write_string(g, p->mode == m_forward ? "self.cursor += 1" : "self.cursor -= 1"); write_newline(g); } static void wsetlab_begin(struct generator * g) { w(g, "~Mtry:~N~+"); } static void wsetlab_end(struct generator * g, int n) { g->I[0] = n; w(g, "~-~Mexcept lab~I0: pass~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mraise lab~I0()~N"); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn False~N"); g->unreachable = true; break; default: g->I[0] = g->failure_label; w(g, "~Mraise lab~I0()~N"); g->unreachable = true; } } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, ":", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "self.cursor >= self.limit", p); } else { write_failure_if(g, "self.cursor <= self.limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case 
c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "sys.maxsize"); break; case c_minint: write_string(g, "(~sys.maxsize)"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_divide: /* Snowball specifies integer division with semantics matching C, * so Python's `/` or `//` isn't suitable (`//` would be in cases * where we knew that the arguments had the same sign). * * The `float(`...`)` is needed for Python2. */ write_string(g, "int(float("); generate_AE(g, p->left); write_string(g, ") / "); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "self.cursor"); break; case c_limit: w(g, p->mode == m_forward ? "self.limit" : "self.limit_backward"); break; case c_lenof: /* Same as sizeof() for Python. */ case c_sizeof: g->V[0] = p->name; w(g, "len(~V0)"); break; case c_len: /* Same as size() for Python. 
*/ case c_size: w(g, "len(self.current)"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mself.limit_backward = self.cursor~N" "~Mself.cursor = self.limit~N", p); generate(g, p->left); w(g, "~Mself.cursor = self.limit_backward~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } wsetlab_begin(g); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); restore_string(p, g->failure_str, savevar); } wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, 
"~M~V0 = True~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = False~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0()~N"); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"in" : "out"; g->V[0] = p->name; write_failure_if(g, "not self.go_~S1_grouping~S0(~n.~W0)", p); if (!is_goto) { if (p->mode == m_forward) w(g, "~Mself.cursor += 1~N"); else w(g, "~Mself.cursor -= 1~N"); } } static void generate_GO(struct generator * g, struct node * p, int style) { write_comment(g, p); int end_unreachable = false; struct str * savevar = vars_newname(g); int keep_c = style == 1 || repeat_restore(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int golab = new_label(g); w(g, "~Mtry:~N~+" "~Mwhile True:~N~+"); if (keep_c) write_savecursor(g, p, savevar); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mraise lab~I0()~N"); } g->unreachable = false; wsetlab_end(g, label); if (keep_c) write_restorecursor(g, p, savevar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); w(g, "~-~-"); g->I[0] = golab; w(g, "~Mexcept lab~I0: pass~N"); str_delete(savevar); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); if (p->AE->type == c_number && p->AE->number <= 4) { // Use a tuple instead of range() for small constant numbers of // iterations. 
w(g, "~Mfor ~B0 in "); for (int i = p->AE->number; i > 0; --i) { w(g, "0"); if (i > 1) w(g, ", "); } writef(g, ":~N", p); } else { w(g, "~Mfor ~B0 in range("); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "):~N", p); } writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { struct str * savevar = vars_newname(g); int keep_c = repeat_restore(g, p->left); int label = new_label(g); g->failure_label = label; writef(g, "~Mwhile True:~N~+", p); if (keep_c) write_savecursor(g, p, savevar); str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0 -= 1~N"); } w(g, "~Mcontinue~N"); } wsetlab_end(g, label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); w(g, "~Mbreak~N~}"); str_delete(savevar); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~M~B0 = "); generate_AE(g, p->AE); w(g, "~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = self.cursor~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
">" : "<"; w(g, "~Mif self.cursor ~S0 "); generate_AE(g, p->AE); w(g, ":"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mself.cursor = "); generate_AE(g, p->AE); writef(g, "~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif self.cursor != "); generate_AE(g, p->AE); writef(g, ":", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~Mc = self.cursor ~S0 "); generate_AE(g, p->AE); w(g, "~N"); g->S[1] = p->mode == m_forward ? "> self.limit" : "< self.limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 or c ~S2 self.cursor", p); } writef(g, "~Mself.cursor = c~N", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif not self.slice_del():~N" "~+~Mreturn False~N~-" "~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward"; writef(g, "~Mself.cursor = ~S0~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "self.cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"self.bra" : "self.ket"; writef(g, "~M~S0 = self.cursor~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "self.ket" : "self.bra"; writef(g, "~M~S0 = self.cursor~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = self.assign_to()~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = self.slice_to()~N" "~Mif ~V0 == '':~N" "~+~Mreturn False~N~-", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mc = self.cursor~N"); writef(g, "~Mself.insert(self.cursor, self.cursor, ", p); generate_address(g, p); writef(g, ")~N", p); if (keep_c) w(g, "~Mself.cursor = c~N"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~Mc = self.cursor~N", p); if (p->mode == m_forward) { writef(g, "~Mself.insert(self.cursor, self.limit, ", p); } else { writef(g, "~Mself.insert(self.limit_backward, self.cursor, ", p); } generate_address(g, p); writef(g, ")~N", p); if (keep_c) w(g, "~Mself.cursor = c~N"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif not self.slice_from("); generate_address(g, p); writef(g, "):~N" "~+~Mreturn False~N~-", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && 
p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif self.cursor ~S0 "); generate_AE(g, q->AE); w(g, ":"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~M~B0 = self.limit - self.cursor~N"); w(g, "~Mself.limit = "); } else { w(g, "~M~B0 = self.limit_backward~N"); w(g, "~Mself.limit_backward = "); } generate_AE(g, q->AE); writef(g, "~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "self.limit += "); str_append(g->failure_str, varname); } else { str_assign(g->failure_str, "self.limit_backward = "); str_append(g->failure_str, varname); } } else { write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~M~B0 = self.limit - self.cursor~N"); w(g, "~Mself.limit = self.cursor~N"); } else { w(g, "~M~B0 = self.limit_backward~N"); w(g, "~Mself.limit_backward = self.cursor~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "self.limit += "); str_append(g->failure_str, varname); } else { str_assign(g->failure_str, "self.limit_backward = "); str_append(g->failure_str, varname); } } } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); write_comment(g, p); writef(g, "~M~B0 = BaseStemmer()~N" 
"~M~B0.copy_from(self)~N", p); { struct str * saved_output = g->outbuf; str_clear(g->failure_str); g->outbuf = g->failure_str; g->V[0] = p->name; writef(g, "~V0 = self.current; ", p); /* For Python 3, this can just be: super().copy_from(~B0) */ writef(g, "super(~n, self).copy_from(~B0)", p); g->failure_str = g->outbuf; g->outbuf = saved_output; } writef(g, "~Mself.current = ~V0~N" "~Mself.cursor = 0~N" "~Mself.limit = len(self.current)~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, "~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); // Relational operators are the same as C. write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { w(g, ":"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~V0()~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V0()~N", p); } else if (signals == 0) { /* Always fails. 
*/ writef(g, "~M~V0()~N", p); write_failure(g); } else { write_failure_if(g, "not ~V0()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; write_failure_if(g, "not self.~S1_grouping~S0(~n.~W0)", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "not self.eq_s~S0(~V0)", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->L[0] = b; write_failure_if(g, "not self.eq_s~S0(~L0)", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); g->V[0] = q; w(g, "~Mdef ~W0(self):~+~N"); /* Save output. */ struct str * saved_output = g->outbuf; g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; int signals = check_possible_signals_list(g, p->left, c_define, 0); generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~-"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn True~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = self.find_among~S0(~n.a_~I0)~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mself.find_among~S0(~n.a_~I0)~N", p); } else { write_failure_if(g, "self.find_among~S0(~n.a_~I0) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { /* We dispatch the integer result in `among_var` with an if-chain, * which is O(n) unless Python has a special optimisation (and * profiling with the `timeit` module suggests it doesn't). There * doesn't appear to be a good alternative in Python (3.10 added * `match` but that seems to be aimed more at pattern matching rather * than O(1) dispatch of an integer and it was actually slower when we * tried generating it here). */ for (int i = 1; i <= x->command_count; i++) { if (i == x->command_count && x->nocommand_count == 0) { w(g, "~Melse:~N~+"); } else { g->I[0] = i; w(g, (i > 1 ? 
"~Melif" : "~Mif")); w(g, " among_var == ~I0:~N~+"); } generate(g, x->commands[i - 1]); w(g, "~-"); g->unreachable = false; } } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "not ~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mself.debug(~I0, ~I1)~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case 
c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: /* Snowball specifies integer division with semantics matching C, * so Python's `/=` or `//=` isn't suitable (`//=` would be in * cases where we knew that the arguments had the same sign). * * The `float(`...`)` is needed for Python2. 
*/ g->V[0] = p->name; w(g, "~M~V0 = int(float(~V0) / "); generate_AE(g, p->AE); w(g, ")~N"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "from .basestemmer import "); w(g, g->options->parent_class_name); w(g, "~N" "from .among import Among~N" "~N" "~N" "class ~n("); w(g, g->options->parent_class_name); w(g, "):~N" "~+~M'''~N" "~MThis class implements the stemming algorithm defined by a snowball script.~N" "~M"); write_generated_comment_content(g); w(g, "~N" "~M'''~N" "~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_newline(g); write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Ma_~I0 = [~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; g->L[0] = v[i].b; g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; w(g, "~MAmong(~L0, ~I0, ~I1"); if (v[i].function != NULL) { w(g, ", "); write_varname(g, v[i].function); } w(g, ")~S0~N"); } w(g, "~-~M]~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void generate_grouping_table(struct generator * g, struct grouping * q) { symbol * b = q->b; g->V[0] = q->name; // We could use frozenset, but it seems slightly slower to construct which // adds to startup time. w(g, "~M~W0 = {"); for (int i = 0; i < SIZE(b); i++) { if (i > 0) w(g, ", "); write_literal_char(g, b[i]); } w(g, "}~N~N"); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, " ~W0 = \"\"~N"); break; case t_integer: w(g, " ~W0 = 0~N"); break; case t_boolean: w(g, " ~W0 = False~N"); break; } } } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != NULL) { generate(g, p); g->unreachable = false; p = p->right; } } static void generate_label_classes(struct generator * g) { for (int i = 0; i <= g->max_label; i++) { g->I[0] = i; w(g, "~N~Nclass lab~I0(BaseException): pass~N"); } } extern void generate_program_python(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); // Only needed for Python 2, which defaults to ASCII. 
w(g, "#-*- coding: utf-8 -*-~N"); write_start_comment(g, "# ", NULL); if (g->analyser->int_limits_used) { /* sys.maxsize is used in the code generated for maxint and minint */ w(g, "import sys~N~N"); } generate_class_begin(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_amongs(g); generate_label_classes(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/generator_rust.c000066400000000000000000001273671500727106100203740ustar00rootroot00000000000000#include #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { switch (p->type) { case t_external: break; default: { int ch = "SbirxG"[p->type]; write_char(g, ch); write_char(g, '_'); break; } } write_s(g, p->s); } static void write_varref(struct generator * g, struct name * p) { write_string(g, "context."); write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_string(g, "\""); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); if (32 <= ch && ch < 0x590 && ch != 127) { if (ch == '"' || ch == '\\') write_char(g, '\\'); write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. 
write_string(g, "\\u{"); write_hex4(g, ch); write_string(g, "}"); } } write_string(g, "\""); } static void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+{~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "env.limit - "; writef(g, "~Mlet ~B0 = ~S1env.cursor;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "env.cursor = "); if (p->mode != m_forward) str_append_string(out, "env.limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"env.next_char();" : "env.previous_char();"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~M'lab~I0: loop {~N~+"); } static void wsetlab_end(struct generator * g, int n) { if (!g->unreachable) { g->I[0] = n; w(g, "~Mbreak 'lab~I0;~N"); } w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak 'lab~I0;~N"); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn false;~N"); g->unreachable = true; break; default: w(g, "~Mbreak 'lab"); write_int(g, g->failure_label); w(g, ";~N"); g->unreachable = true; } } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "env.cursor >= env.limit", p); } else { write_failure_if(g, "env.cursor <= env.limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { (void)p; int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->V) / sizeof(g->V[0]))) goto invalid_escape2; if (ch == 'V') write_varref(g, g->V[j]); else write_varname(g, g->V[j]); continue; } case 'L': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->L) / sizeof(g->L[0]))) goto invalid_escape2; write_literal_string(g, g->L[j]); continue; } case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case 
c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "i32::MAX"); break; case c_minint: write_string(g, "i32::MIN"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "env.cursor"); break; case c_limit: w(g, p->mode == m_forward ? "env.limit" : "env.limit_backward"); break; case c_lenof: g->V[0] = p->name; w(g, "(~V0.chars().count() as i32)"); break; case c_sizeof: g->V[0] = p->name; w(g, "(~V0.len() as i32)"); break; case c_len: w(g, "(env.current.chars().count() as i32)"); break; case c_size: w(g, "(env.current.len() as i32)"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } w(g, "~-~M}~N"); g->unreachable = false; if (savevar) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.limit_backward = env.cursor;~N" "~Menv.cursor = env.limit;~N", p); generate(g, p->left); w(g, "~Menv.cursor = env.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); w(g, "~-~M}~N"); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(g, p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator 
* g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = true;~N", p);
}

/* Emit an assignment of `false` to the variable p->name. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = false;~N", p);
}

/* Generate the child, then write_failure() if the end of the child is
 * reachable.
 */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    if (!g->unreachable) write_failure(g);
}

/* generate_test() also implements 'reverse' */
/* Generate the child; when K_needed() says the cursor must be kept it is
 * saved first and restored afterwards, provided the end of the child is
 * reachable.
 */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        if (!g->unreachable) {
            write_restorecursor(g, p, savevar);
        }
        str_delete(savevar);
    }
}

/* Generate the child of a `do` node.  The code after it is always treated
 * as reachable, and the cursor is restored there if it was saved.
 */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(g, p->left)) {
        savevar = vars_newname(g);
    }
    write_comment(g, p);
    if (savevar) write_savecursor(g, p, savevar);

    if (p->left->type == c_call) {
        /* Optimise `do` of a routine call: emit the call as a plain
         * statement so that its result is discarded, with no labelled block.
         */
        write_comment(g, p->left);
        g->V[0] = p->left->name;
        w(g, "~M~W0(env, context);~N");
    } else {
        int label = new_label(g);
        g->failure_label = label;
        str_clear(g->failure_str);

        wsetlab_begin(g, label);
        generate(g, p->left);
        wsetlab_end(g, label);
        g->unreachable = false;
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }
}

/* Write a limit check followed by a cursor increment, via
 * write_check_limit() and write_inc_cursor().
 */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

/* Emit a failure test on env.go_in_grouping / env.go_out_grouping (with a
 * "_b" suffix when not in forward mode) for the grouping p->name, passing
 * the grouping's smallest and largest character values.
 *
 * complement selects "in" (non-zero) or "out" (zero).  When is_goto is zero
 * a call to env.next_char() / env.previous_char() is emitted afterwards.
 */
static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ? "in" : "out";
    g->V[0] = p->name;
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    write_failure_if(g, "!env.go_~S1_grouping~S0(~W0, ~I0, ~I1)", p);
    if (!is_goto) {
        write_string(g, p->mode == m_forward ? "env.next_char();" : "env.previous_char();");
    }
}

/* Generate a `goto` (style == 1) or `gopast` (style == 0) node.
 *
 * The output is a labelled `loop` whose body tries the child under a fresh
 * failure label.  If the end of the child is reachable, a `break` out of
 * the loop is written (preceded, for goto only, by a cursor restore).  After
 * the child's block the cursor is restored if it was saved, then a limit
 * check and cursor increment are written before the loop is closed.
 *
 * The failure label and string in force on entry are reinstated before the
 * limit check is written.  If the child's end is unreachable, the loop can
 * never be left, so the code after it is marked unreachable.
 */
static void generate_GO(struct generator * g, struct node * p, int style) {
    write_comment(g, p);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int end_unreachable = false;
    int golab = new_label(g);
    g->I[0] = golab;
    w(g, "~M'golab~I0: loop {~N~+");

    struct str * savevar = NULL;
    /* goto always needs the cursor saved, since it restores it on success. */
    if (style == 1 || repeat_restore(g, p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable.*/
        end_unreachable = true;
    } else {
        /* include for goto; omit for gopast */
        if (style == 1) write_restorecursor(g, p, savevar);
        /* Reassign: generating the child may have overwritten g->I[0]. */
        g->I[0] = golab;
        w(g, "~Mbreak 'golab~I0;~N");
    }
    g->unreachable = false;
    w(g, "~-~M}~N");
    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    write_block_end(g);
    g->unreachable = end_unreachable;
}

/* Emit `for _ in 0..<AE> { ... }` around the child, where <AE> is the text
 * generate_AE() writes for p->AE.  The code after the loop is always treated
 * as reachable.
 */
static void generate_loop(struct generator * g, struct node * p) {
    /* NOTE(review): loopvar is obtained and released without appearing in
     * the output; presumably the vars_newname() call is kept for its effect
     * on variable numbering - confirm before removing.
     */
    struct str * loopvar = vars_newname(g);
    write_comment(g, p);
    w(g, "~Mfor _ in 0..");
    generate_AE(g, p->AE);
    writef(g, " {~+~N", p);

    generate(g, p->left);

    w(g, "~-~M}~N");
    str_delete(loopvar);
    g->unreachable = false;
}

/* Shared body of `repeat` and `atleast`: a labelled `loop` containing a
 * labelled single-pass `for` block in which the child is generated under a
 * fresh failure label.  loopvar is NULL for `repeat`; for `atleast` it names
 * the counter variable.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    int replab = new_label(g);
    g->I[0] = replab;
    writef(g, "~M'replab~I0: loop{~N~+", p);

    struct str * savevar = NULL;
    if (repeat_restore(g, p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    g->I[0] = g->failure_label;
    w(g, "~M'lab~I0: for _ in 0..1 {~N~+");
    generate(g, p->left);

    if (!g->unreachable) {
        if (loopvar != NULL) {
            g->B[0] = str_data(loopvar);
            w(g,
"~M~B0 -= 1;~N"); } g->I[0] = replab; w(g, "~Mcontinue 'replab~I0;~N"); } w(g, "~-~M}~N"); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->I[0] = replab; w(g, "~Mbreak 'replab~I0;~N~-~M}~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mlet mut ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif env.cursor ~S0 "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.cursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif env.cursor != "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? 
"" : "_back"; g->S[1] = p->AE->type == c_number ? "" : "_checked"; write_failure_if(g, "!env.hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif !env.slice_del() {~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.limit" : "env.limit_backward"; writef(g, "~Menv.cursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.limit" : "env.limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "env.cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.bra" : "env.ket"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.ket" : "env.bra"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.assign_to();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.slice_to();~N" "~Mif ~V0.is_empty() {~N" "~+~Mreturn false;~N~-~M}~N", p); } static void generate_address(struct generator * g, struct node * p) { /* If we deal with a string variable which is of type String we need to * pass it by reference not by value. Literalstrings on the other hand are * of type &'static str so we can pass them by value. 
*/
    symbol * b = p->literalstring;
    if (b != NULL) {
        write_literal_string(g, b);
    } else {
        write_char(g, '&');
        write_varref(g, p->name);
    }
}

/* Emit an env.insert() at the cursor for an `insert` or `attach` node;
 * style is the node type (c_insert or c_attach).
 *
 * keep_c is set for `attach` in forward mode and for `insert` in backward
 * mode; when set, the emitted code saves env.cursor in `c` before the insert
 * and assigns it back afterwards.
 */
static void generate_insert(struct generator * g, struct node * p, int style) {
    int keep_c = style == c_attach;
    write_comment(g, p);
    if (p->mode == m_backward) keep_c = !keep_c;
    if (keep_c) w(g, "~Mlet c = env.cursor;~N");
    /* bra and ket are both the cursor, i.e. an empty range at that point. */
    w(g, "~Mlet (bra, ket) = (env.cursor, env.cursor);~N");
    writef(g, "~Menv.insert(bra, ket, ", p);
    generate_address(g, p);
    writef(g, ");~N", p);
    if (keep_c) w(g, "~Menv.cursor = c;~N");
}

/* Emit an env.insert() over the range from the cursor to the limit (forward
 * mode) or from limit_backward to the cursor (backward mode).  In forward
 * mode the emitted code also saves env.cursor in `c` and assigns it back
 * after the insert.
 */
static void generate_assignfrom(struct generator * g, struct node * p) {
    int keep_c = p->mode == m_forward; /* like 'attach' */

    write_comment(g, p);
    if (keep_c) writef(g, "~Mlet c = env.cursor;~N", p);
    /* Copying limits and cursors is necessary here because the rust
     * borrowchecker does not like taking something from someone you are about
     * to mutate... */
    if (p->mode == m_forward) {
        writef(g, "~Mlet (bra, ket) = (env.cursor, env.limit);~N", p);
    } else {
        writef(g, "~Mlet (bra, ket) = (env.limit_backward, env.cursor);~N", p);
    }
    writef(g, "~Menv.insert(bra, ket, ", p);
    generate_address(g, p);
    writef(g, ");~N", p);
    if (keep_c) w(g, "~Menv.cursor = c;~N");
}

/* Emit `if !env.slice_from(<address>) { return false; }`, where <address> is
 * the text written by generate_address().
 */
static void generate_slicefrom(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~Mif !env.slice_from(");
    generate_address(g, p);
    writef(g, ") {~N"
              "~+~Mreturn false;~N~-~M}~N", p);
}

/* Generate a `setlimit` node.  A fresh variable name (varname) is used in
 * the output to remember the limit being replaced.
 */
static void generate_setlimit(struct generator * g, struct node * p) {
    struct str * varname = vars_newname(g);
    write_comment(g, p);
    if (p->left && p->left->type == c_tomark) {
        /* Special case for:
         *
         *   setlimit tomark AE for C
         *
         * All uses of setlimit in the current stemmers we ship follow this
         * pattern, and by special-casing we can avoid having to save and
         * restore c.
         */
        struct node * q = p->left;
        write_comment(g, q);
        g->S[0] = q->mode == m_forward ?
">" : "<"; w(g, "~Mif env.cursor ~S0 "); generate_AE(g, q->AE); w(g, " "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet ~B0 = env.limit - env.cursor;~N"); w(g, "~Menv.limit = "); } else { w(g, "~Mlet ~B0 = env.limit_backward;~N"); w(g, "~Menv.limit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "env.limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.limit_backward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet ~B0 = env.limit - env.cursor;~N"); w(g, "~Menv.limit = env.cursor;~N"); } else { w(g, "~Mlet ~B0 = env.limit_backward;~N"); w(g, "~Menv.limit_backward = env.cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "env.limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.limit_backward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); struct str * savevar = vars_newname(g); g->V[0] = p->name; g->B[0] = str_data(savevar); writef(g, "~Mlet ~B0 = env.clone();~N" "~Menv.set_current_s(~V0.clone());~N" "~Menv.cursor = 0;~N" "~Menv.limit = env.current.len() as 
i32;~N", p); generate(g, p->left); if (!g->unreachable) { g->V[0] = p->name; g->B[0] = str_data(savevar); /* Update string variable. */ w(g, "~M~V0 = env.current.clone().into_owned();~N"); /* Reset env */ w(g, "~M*env = ~B0;~N"); } str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = (g->failure_label == x_return && p->right && p->right->type == c_functionend); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); // Relational operators are the same as C. write_c_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = check_possible_signals_list(g, p->name->definition, c_define, 0); write_comment(g, p); g->V[0] = p->name; if (g->failure_label == x_return && (signals == 0 || (p->right && p->right->type == c_functionend))) { /* Always fails or tail call. */ writef(g, "~Mreturn ~W0(env, context);~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~W0(env, context);~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~W0(env, context);~N", p); write_failure(g); } else { write_failure_if(g, "!~W0(env, context)", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.~S1_grouping~S0(~W0, ~I0, ~I1)", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!env.eq_s~S0(&~V0)", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->L[0] = b; write_failure_if(g, "!env.eq_s~S0(&~L0)", p); } static void generate_setup_context(struct generator * g) { w(g, "~Mlet mut context = &mut Context {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0: String::new(),~N"); break; case t_integer: w(g, "~M~W0: 0,~N"); break; case t_boolean: w(g, "~M~W0: false,~N"); break; } } w(g, "~-~M};~N"); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; if (q->type == t_routine && !q->used) return; write_newline(g); write_comment(g, p); g->V[0] = q; if (q->type == t_routine) { w(g, "~Mfn ~W0(env: &mut SnowballEnv, context: &mut Context) -> bool {~+~N"); } else { w(g, "~Mpub fn ~W0(env: &mut SnowballEnv) -> bool {~+~N"); generate_setup_context(g); } if (p->amongvar_needed) w(g, "~Mlet mut among_var;~N"); /* Save output. */ struct str * saved_output = g->outbuf; g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; int signals = check_possible_signals_list(g, p->left, c_define, 0); /* Generate function body. 
*/ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } w(g, "~-~M}~N"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = x->shortest_size; int block_opened = 0; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; g->I[1] = x->literalstring_count; /* In forward mode with non-ASCII UTF-8 characters, the first byte * of the string will often be the same, so instead look at the last * common byte position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. 
*/ for (int c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } if (block != -1 || n_cases <= 2) { char buf[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; if (p->mode == m_forward) { sprintf(buf, "env.current.as_bytes()[(env.cursor + %d) as usize]", shortest_size - 1); g->S[1] = buf; if (shortest_size == 1) { writef(g, "~Mif (env.cursor >= env.limit", p); } else { writef(g, "~Mif (env.cursor + ~I4 >= env.limit", p); } } else { g->S[1] = "env.current.as_bytes()[(env.cursor - 1) as usize]"; if (shortest_size == 1) { writef(g, "~Mif (env.cursor <= env.limit_backward", p); } else { writef(g, "~Mif (env.cursor - ~I4 <= env.limit_backward", p); } } if (n_cases == 0) { /* We get this for the degenerate case: among ( '' ) * This doesn't seem to be a useful construct, but it is * syntactically valid. */ } else if (n_cases == 1) { g->I[4] = cases[0]; writef(g, " || ~S1 as u8 != ~I4 as u8", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; writef(g, " || (~S1 as u8 != ~I4 as u8 && ~S1 as u8 != ~I5 as u8)", p); } else { writef(g, " || ~S1 as u8 >> 5 != ~I2 as u8 || ((~I3 as i32 >> (~S1 as u8 & 0x1f)) & 1) == 0", p); } write_string(g, ") "); if (empty_case != -1) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. 
*/ g->I[4] = among_cases[empty_case].result; writef(g, "{among_var = ~I4;}~N~Melse ", p); write_block_start(g); block_opened = 1; } else { writef(g, "~f~N", p); } } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } if (x->amongvar_needed) { writef(g, "~Mamong_var = env.find_among~S0(A_~I0, context);~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Menv.find_among~S0(A_~I0, context);~N", p); } else { write_failure_if(g, "env.find_among~S0(A_~I0, context) == 0", p); } if (block_opened) write_block_end(g); } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mmatch among_var {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~M~I0 => {~N~+"); generate(g, x->commands[i - 1]); w(g, "~-~M}~N"); g->unreachable = false; } w(g, "~M_ => ()~N"); w(g, "~-~M}~N"); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Menv.debug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); 
break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case 
c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } /* rustc emits warnings if variables don't match the style guide */ /* (i.e. upper-case for globals, snake case for fields etc.) */ /* To allow warning free compilation of generated code and */ /* consistency with snowball variable namings we allow some kind of warnings here */ static void generate_allow_warnings(struct generator * g) { w(g, "#![allow(non_snake_case)]~N"); w(g, "#![allow(non_upper_case_globals)]~N"); w(g, "#![allow(unused_mut)]~N"); w(g, "#![allow(unused_parens)]~N"); w(g, "#![allow(unused_variables)]~N"); } static void generate_class_begin(struct generator * g) { w(g, "use snowball::SnowballEnv;~N"); if (g->analyser->among_count > 0) { w(g, "use snowball::Among;~N~N"); } } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~Mstatic A_~I0: &'static [Among; ~I1] = &[~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; g->L[0] = v[i].b; g->S[0] = ","; w(g, "~MAmong(~L0, ~I0, ~I1, "); if (v[i].function != NULL) { w(g, "Some(&"); write_varname(g, v[i].function); w(g, ")"); } else { w(g, "None"); } w(g, ")~S0~N"); } w(g, 
"~-~M];~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; g->I[0] = size; w(g, "~Mstatic ~W0: &'static [u8; ~I0] = &["); for (int i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, "];~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { w(g, "#[derive(Clone)]~N"); w(g, "struct Context {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0: String,~N"); break; case t_integer: w(g, "~M~W0: i32,~N"); break; case t_boolean: w(g, "~M~W0: bool,~N"); break; } } w(g, "~-}~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_rust(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "//! 
", NULL); generate_allow_warnings(g); if (g->analyser->int_limits_used) { /* std::i32 is used in the code generated for i32::MAX and i32::MIN */ w(g, "use std::i32;~N~N"); } generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.0.1/compiler/header.h000066400000000000000000000363001500727106100165500ustar00rootroot00000000000000#include #define SNOWBALL_VERSION "3.0.1" typedef unsigned char byte; typedef unsigned short symbol; #define true 1 #define false 0 #define MALLOC check_malloc #define FREE check_free #define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) #define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n)) #define SIZE(p) ((int *)(p))[-1] #define CAPACITY(p) ((int *)(p))[-2] extern symbol * create_b(int n); extern void report_b(FILE * out, const symbol * p); extern void lose_b(symbol * p); extern symbol * increase_capacity_b(symbol * p, int n); extern symbol * add_to_b(symbol * p, const symbol * q, int n); extern symbol * copy_b(const symbol * p); extern char * b_to_sz(const symbol * p); extern symbol * add_symbol_to_b(symbol * p, symbol ch); // These routines are like those above but work in byte instead of symbol. extern byte * create_s(int n); extern void report_s(FILE * out, const byte * p); extern void lose_s(byte * p); extern byte * increase_capacity_s(byte * p, int n); extern byte * copy_s(const byte * p); extern byte * add_s_to_s(byte * p, const char * s, int n); extern byte * add_sz_to_s(byte * p, const char * s); extern byte * add_char_to_s(byte * p, char ch); // "" LIT is a trick to make compilation fail if LIT is not a string literal. 
#define add_literal_to_s(P, LIT) add_s_to_s(P, "" LIT, sizeof(LIT) - 1) struct str; /* defined in space.c */ extern struct str * str_new(void); extern void str_delete(struct str * str); extern void str_append(struct str * str, const struct str * add); extern void str_append_ch(struct str * str, char add); extern void str_append_s(struct str * str, const byte * q); extern void str_append_string(struct str * str, const char * s); extern void str_append_int(struct str * str, int i); extern void str_append_wchar_as_utf8(struct str * str, symbol ch); extern void str_clear(struct str * str); extern void str_assign(struct str * str, const char * s); extern struct str * str_copy(const struct str * old); extern byte * str_data(const struct str * str); extern int str_len(const struct str * str); extern int str_back(const struct str *str); extern void str_pop(const struct str *str); extern void output_str(FILE * outfile, struct str * str); extern int get_utf8(const symbol * p, int * slot); extern int put_utf8(int ch, symbol * p); typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc; /* stringdef name and value */ struct m_pair { struct m_pair * next; byte * name; symbol * value; }; /* struct input must be a prefix of struct tokeniser. */ struct input { struct input * next; byte * p; int c; char * file; // -1 : Release file with: lose_s((byte *)file) // 0 : We don't own file. // 1 : Release file with: free(file) int file_owned; int line_number; }; struct include { struct include * next; byte * s; }; enum token_codes { /* The relational operator token values are chosen such that we can * invert the relation with a simple xor with 1. */ c_gt = 0, c_le, c_ge, c_lt, c_eq, c_ne, /* Other token values just need to be unique. 
*/ c_among, c_and, c_as, c_assign, c_assignto, c_atleast, c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards, c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug, c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, c_dollar, c_externals, c_fail, c_false, c_for, c_get, c_gopast, c_goto, c_groupings, c_hex, c_hop, c_insert, c_integers, c_ket, c_leftslice, c_len, c_lenof, c_limit, c_loop, c_maxint, c_minint, c_minus, c_minusassign, c_multiply, c_multiplyassign, c_next, c_non, c_not, c_or, c_plus, c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring, c_test, c_tolimit, c_tomark, c_true, c_try, c_unset, /* These token values don't directly correspond to a keyword. */ c_name, c_number, c_literalstring, /* These token values are synthesised by the analyser. */ c_mathassign, c_neg, c_call, c_grouping, c_booltest, c_functionend, c_goto_grouping, c_gopast_grouping, c_goto_non, c_gopast_non, NUM_TOKEN_CODES }; enum uplus_modes { UPLUS_NONE, UPLUS_DEFINED, UPLUS_UNICODE }; /* struct input must be a prefix of struct tokeniser. */ struct tokeniser { struct input * next; byte * p; int c; char * file; // -1 : Release file with: lose_s((byte *)file) // 0 : We don't own file. // 1 : Release file with: free(file) int file_owned; int line_number; // Used for c_literalstring values. symbol * b; // Used for c_name names. byte * s; int number; int m_start; int m_end; struct m_pair * m_pairs; int get_depth; int error_count; int token; int previous_token; byte token_held; byte token_reported_as_unexpected; enc encoding; int omission; struct include * includes; /* Mode in which U+ has been used: * UPLUS_NONE - not used yet * UPLUS_DEFINED - stringdef U+xxxx .... 
* UPLUS_UNICODE - {U+xxxx} used with implicit meaning */ int uplusmode; char token_disabled[NUM_TOKEN_CODES]; }; extern byte * get_input(const char * filename); extern struct tokeniser * create_tokeniser(byte * b, char * file); extern int read_token(struct tokeniser * t); extern int peek_token(struct tokeniser * t); #define hold_token(T) ((T)->token_held = true) extern const char * name_of_token(int code); extern void disable_token(struct tokeniser * t, int code); extern void close_tokeniser(struct tokeniser * t); extern int space_count; extern void * check_malloc(size_t n); extern void check_free(void * p); struct node; struct name { struct name * next; byte * s; byte type; /* t_string etc */ byte mode; /* for routines, externals (m_forward, etc) */ byte referenced; byte used_in_among; /* Function used in among? */ byte value_used; /* (For variables) is its value ever used? */ byte initialised; /* (For variables) is it ever initialised? */ byte used_in_definition; /* (grouping) used in grouping definition? */ struct node * definition; /* for routines, externals */ int count; /* 0, 1, 2 for each type */ struct grouping * grouping; /* for grouping names */ struct node * used; /* First use, or NULL if not used */ struct name * local_to; /* Local to one routine/external */ int declaration_line_number;/* Line number of declaration */ }; struct literalstring { struct literalstring * next; symbol * b; }; struct amongvec { symbol * b; /* the string giving the case */ int size; /* - and its size */ struct node * action; /* the corresponding action */ int i; /* the amongvec index of the longest substring of b */ int result; /* the numeric result for the case */ int line_number; /* for diagnostics and stable sorting */ struct name * function; }; struct among { struct among * next; struct amongvec * b; /* pointer to the amongvec */ int number; /* amongs are numbered 0, 1, 2 ... 
*/ int literalstring_count; /* in this among */ int command_count; /* in this among (includes "no command" entries) */ int nocommand_count; /* number of "no command" entries in this among */ int function_count; /* in this among */ int amongvar_needed; /* do we need to set among_var? */ int always_matches; /* will this among always match? */ int shortest_size; /* smallest non-zero string length in this among */ struct node * substring; /* i.e. substring ... among ( ... ) */ struct node ** commands; /* array with command_count entries */ struct node * node; /* pointer to the node for this among */ }; struct grouping { struct grouping * next; symbol * b; /* the characters of this group */ int largest_ch; /* character with max code */ int smallest_ch; /* character with min code */ struct name * name; /* so g->name->grouping == g */ int line_number; }; struct node { struct node * next; struct node * left; struct node * aux; /* used in setlimit */ struct among * among; /* used in among */ struct node * right; byte type; byte mode; // We want to distinguish constant AEs which have the same value everywhere // (e.g. 42, 2+2, lenof '{U+0246}') from constant AEs which can have a // different value depending on platform and/or target language and/or // Unicode mode (e.g. maxint, sizeof '{U+0246}') - some warnings which // depend on a constant AEs value should only fire for the first set. 
byte fixed_constant; struct node * AE; struct name * name; symbol * literalstring; int number; int line_number; int amongvar_needed; /* used in routine definitions */ }; enum name_types { t_size = 6, t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4, t_grouping = 5 /* If this list is extended, adjust write_varname in generator.c */ }; /* In name_count[i] below, remember that type is ----+---- 0 | string 1 | boolean 2 | integer 3 | routine 4 | external 5 | grouping */ struct analyser { struct tokeniser * tokeniser; struct node * nodes; struct name * names; struct literalstring * literalstrings; byte mode; byte modifyable; /* false inside reverse(...) */ struct node * program; struct node * program_end; int name_count[t_size]; /* name_count[i] counts the number of names of type i */ struct among * amongs; struct among * amongs_end; int among_count; int amongvar_needed; /* used in reading routine definitions */ int among_with_function_count; /* number of amongs with functions */ struct grouping * groupings; struct grouping * groupings_end; struct node * substring; /* pending 'substring' in current routine definition */ enc encoding; byte int_limits_used; /* are maxint or minint used? */ }; enum analyser_modes { // m_unknown is used as the initial value for struct node's mode member. // When a routine (or external) is used or defined we check the mode // member matches, but for the first use/definition we see we want to // instead set it to the mode of that use/definition. m_forward = 0, m_backward, m_unknown }; extern void print_program(struct analyser * a); extern struct analyser * create_analyser(struct tokeniser * t); extern void close_analyser(struct analyser * a); extern void read_program(struct analyser * a); struct generator { struct analyser * analyser; struct options * options; int unreachable; /* 0 if code can be reached, 1 if current code * is unreachable. */ int var_number; /* Number of next variable to use. 
*/ struct str * outbuf; /* temporary str to store output */ struct str * declarations; /* str storing variable declarations */ int next_label; #ifndef DISABLE_PYTHON int max_label; #endif int margin; /* Target language code to execute in case of failure. */ struct str * failure_str; int label_used; /* Keep track of whether the failure label is used. */ int failure_label; int debug_count; int copy_from_count; /* count of calls to copy_from() */ const char * S[10]; /* strings */ byte * B[10]; /* byte blocks */ int I[10]; /* integers */ struct name * V[5]; /* variables */ symbol * L[5]; /* literals, used in formatted write */ int line_count; /* counts number of lines output */ int line_labelled; /* in ISO C, will need extra ';' if it is a block end */ int literalstring_count; int keep_count; /* used to number keep/restore pairs to avoid compiler warnings about shadowed variables */ int temporary_used; /* track if temporary variable used (Ada and Pascal) */ }; /* Special values for failure_label in struct generator. */ enum special_labels { x_return = -1 }; struct options { /* for the command line: */ const char * output_file; char * name; FILE * output_src; FILE * output_h; byte syntax_tree; byte comments; byte js_esm; enc encoding; enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO, LANG_ADA } make_lang; const char * externals_prefix; const char * variables_prefix; const char * runtime_path; const char * parent_class_name; const char * package; const char * go_snowball_runtime; const char * string_class; const char * among_class; struct include * includes; struct include * includes_end; }; /* Generator functions common to several backends. 
*/ extern struct generator * create_generator(struct analyser * a, struct options * o); extern void close_generator(struct generator * g); extern void write_char(struct generator * g, int ch); extern void write_newline(struct generator * g); extern void write_string(struct generator * g, const char * s); extern void write_wchar_as_utf8(struct generator * g, symbol ch); extern void write_int(struct generator * g, int i); extern void write_hex4(struct generator * g, int ch); extern void write_symbol(struct generator * g, symbol s); extern void write_s(struct generator * g, const byte * b); extern void write_str(struct generator * g, struct str * str); extern void write_c_relop(struct generator * g, int relop); extern void write_comment_content(struct generator * g, struct node * p); extern void write_generated_comment_content(struct generator * g); extern void write_start_comment(struct generator * g, const char * comment_start, const char * comment_end); extern int K_needed(struct generator * g, struct node * p); extern int repeat_restore(struct generator * g, struct node * p); extern int check_possible_signals_list(struct generator * g, struct node * p, int type, int call_depth); /* Generator for C code. */ extern void generate_program_c(struct generator * g); #ifndef DISABLE_JAVA /* Generator for Java code. */ extern void generate_program_java(struct generator * g); #endif #ifndef DISABLE_CSHARP /* Generator for C# code. */ extern void generate_program_csharp(struct generator * g); #endif #ifndef DISABLE_PASCAL extern void generate_program_pascal(struct generator * g); #endif #ifndef DISABLE_PYTHON /* Generator for Python code. 
*/ extern void generate_program_python(struct generator * g); #endif #ifndef DISABLE_JS extern void generate_program_js(struct generator * g); #endif #ifndef DISABLE_RUST extern void generate_program_rust(struct generator * g); #endif #ifndef DISABLE_GO extern void generate_program_go(struct generator * g); #endif #ifndef DISABLE_ADA extern void generate_program_ada(struct generator * g); #endif snowball-3.0.1/compiler/space.c000066400000000000000000000212701500727106100164060ustar00rootroot00000000000000 #include /* for printf */ #include /* malloc, free */ #include /* memmove */ #include "header.h" #define HEAD 2*sizeof(int) #define EXTENDER 40 /* This modules provides a simple mechanism for arbitrary length writable strings, called 'blocks'. They are 'symbol *' items rather than 'char *' items however. The calls are: symbol * b = create_b(n); - create an empty block b with room for n symbols b = increase_capacity_b(b, n); - increase the capacity of block b by n symbols (b may change) b2 = copy_b(b) - copy block b into b2 lose_b(b); - lose block b b = add_to_b(b, p, n); - add the n symbols at address p to the end of the data in b SIZE(b) - is the number of symbols in b For example: symbol * b = create_b(0); { symbol i; for (i = 'A'; i <= 'Z'; i++) { add_symbol_to_b(b, i); } } After running the above code b contains: { (symbol)'A', (symbol)'B', ..., (symbol)'Z' } */ /* For a block b, SIZE(b) is the number of symbols so far written into it, CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b). In fact blocks have 1 extra character over the promised capacity so they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of overwriting. 
*/ extern symbol * create_b(int n) { symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol))); CAPACITY(p) = n; SIZE(p) = 0; return p; } extern void report_b(FILE * out, const symbol * p) { int i; for (i = 0; i < SIZE(p); i++) { if (p[i] > 255) { printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } putc(p[i], out); } } extern void output_str(FILE * outfile, struct str * str) { report_s(outfile, str_data(str)); } extern void lose_b(symbol * p) { if (p == NULL) return; FREE((char *) p - HEAD); } extern symbol * increase_capacity_b(symbol * p, int n) { symbol * q = create_b(CAPACITY(p) + n + EXTENDER); memmove(q, p, CAPACITY(p) * sizeof(symbol)); SIZE(q) = SIZE(p); lose_b(p); return q; } extern symbol * add_to_b(symbol * p, const symbol * q, int n) { int x = SIZE(p) + n - CAPACITY(p); if (x > 0) p = increase_capacity_b(p, x); memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; } extern symbol * copy_b(const symbol * p) { int n = SIZE(p); symbol * q = create_b(n); add_to_b(q, p, n); return q; } int space_count = 0; static void * xmalloc(size_t n) { void * result = malloc(n); if (result == NULL) { fprintf(stderr, "Failed to allocate %lu bytes\n", (unsigned long)n); exit(1); } return result; } extern void * check_malloc(size_t n) { space_count++; return xmalloc(n); } extern void check_free(void * p) { space_count--; free(p); } /* To convert a block to a zero terminated string: */ extern char * b_to_sz(const symbol * p) { int n = SIZE(p); char * s = (char *)xmalloc(n + 1); { int i; for (i = 0; i < n; i++) { if (p[i] > 255) { printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } s[i] = (char)p[i]; } } s[n] = 0; return s; } /* Add a single symbol to a block. If p = 0 the block is created. 
*/ extern symbol * add_symbol_to_b(symbol * p, symbol ch) { int k; if (p == NULL) p = create_b(1); k = SIZE(p); { int x = k + 1 - CAPACITY(p); if (x > 0) p = increase_capacity_b(p, x); } p[k] = ch; SIZE(p)++; return p; } extern byte * create_s(int n) { byte * p = (byte *) (HEAD + (byte *) MALLOC(HEAD + (n + 1))); CAPACITY(p) = n; SIZE(p) = 0; return p; } extern void report_s(FILE * out, const byte * p) { fwrite(p, 1, SIZE(p), out); } extern void lose_s(byte * p) { if (p == NULL) return; FREE((byte *) p - HEAD); } extern byte * increase_capacity_s(byte * p, int n) { byte * q = create_s(CAPACITY(p) + n + EXTENDER); memmove(q, p, CAPACITY(p)); SIZE(q) = SIZE(p); lose_s(p); return q; } extern byte * copy_s(const byte * p) { return add_s_to_s(NULL, (const char*)p, SIZE(p)); } /* Add a string with given length to a byte block. If p = 0 the block is created. */ extern byte * add_s_to_s(byte * p, const char * s, int n) { int k; if (p == NULL) p = create_s(n); k = SIZE(p); { int x = k + n - CAPACITY(p); if (x > 0) p = increase_capacity_s(p, x); } memcpy(p + k, s, n); SIZE(p) += n; return p; } /* Add a zero terminated string to a byte block. If p = 0 the block is created. */ extern byte * add_sz_to_s(byte * p, const char * s) { return add_s_to_s(p, s, strlen(s)); } /* Add a single character to a byte block. If p = 0 the block is created. */ extern byte * add_char_to_s(byte * p, char ch) { int k; if (p == NULL) p = create_s(1); k = SIZE(p); { int x = k + 1 - CAPACITY(p); if (x > 0) p = increase_capacity_s(p, x); } p[k] = ch; SIZE(p)++; return p; } /* The next section defines string handling capabilities in terms of the lower level byte block handling capabilities of space.c */ /* -------------------------------------------------------------*/ struct str { byte * data; }; /* Create a new string. */ extern struct str * str_new(void) { struct str * output = (struct str *) xmalloc(sizeof(struct str)); output->data = create_s(0); return output; } /* Delete a string. 
*/ extern void str_delete(struct str * str) { lose_s(str->data); free(str); } /* Append a str to this str. */ extern void str_append(struct str * str, const struct str * add) { byte * q = add->data; str->data = add_s_to_s(str->data, (char *)q, SIZE(q)); } /* Append a character to this str. */ extern void str_append_ch(struct str * str, char add) { str->data = add_char_to_s(str->data, add); } /* Append a low level byte block to a str. */ extern void str_append_s(struct str * str, const byte * q) { str->data = add_s_to_s(str->data, (const char *)q, SIZE(q)); } /* Append a (char *, null terminated) string to a str. */ extern void str_append_string(struct str * str, const char * s) { str->data = add_sz_to_s(str->data, s); } /* Append an integer to a str. */ extern void str_append_int(struct str * str, int i) { char s[30]; sprintf(s, "%d", i); str_append_string(str, s); } /* Append wide character to a string as UTF-8. */ extern void str_append_wchar_as_utf8(struct str * str, symbol ch) { if (ch < 0x80) { str_append_ch(str, ch); return; } if (ch < 0x800) { str_append_ch(str, (ch >> 6) | 0xC0); str_append_ch(str, (ch & 0x3F) | 0x80); return; } str_append_ch(str, (ch >> 12) | 0xE0); str_append_ch(str, ((ch >> 6) & 0x3F) | 0x80); str_append_ch(str, (ch & 0x3F) | 0x80); } /* Clear a string */ extern void str_clear(struct str * str) { SIZE(str->data) = 0; } /* Set a string */ extern void str_assign(struct str * str, const char * s) { str_clear(str); str_append_string(str, s); } /* Copy a string. */ extern struct str * str_copy(const struct str * old) { struct str * newstr = str_new(); str_append(newstr, old); return newstr; } /* Get the data stored in this str. */ extern byte * str_data(const struct str * str) { return str->data; } /* Get the length of the str. */ extern int str_len(const struct str * str) { return SIZE(str->data); } /* Get the last character of the str. * * Or -1 if the string is empty. */ extern int str_back(const struct str *str) { return SIZE(str->data) ? 
str->data[SIZE(str->data) - 1] : -1; } /* Remove the last character of the str. * * Or do nothing if the string is empty. */ extern void str_pop(const struct str *str) { if (SIZE(str->data)) --SIZE(str->data); } extern int get_utf8(const symbol * p, int * slot) { int b0, b1; b0 = *p++; if (b0 < 0xC0) { /* 1100 0000 */ * slot = b0; return 1; } b1 = *p++; if (b0 < 0xE0) { /* 1110 0000 */ * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; } * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3; } extern int put_utf8(int ch, symbol * p) { if (ch < 0x80) { p[0] = ch; return 1; } if (ch < 0x800) { p[0] = (ch >> 6) | 0xC0; p[1] = (ch & 0x3F) | 0x80; return 2; } p[0] = (ch >> 12) | 0xE0; p[1] = ((ch >> 6) & 0x3F) | 0x80; p[2] = (ch & 0x3F) | 0x80; return 3; } snowball-3.0.1/compiler/syswords.h000066400000000000000000000101151500727106100172110ustar00rootroot00000000000000static const struct system_word vocab[82+1] = { { 0, (const byte *)"", 82+1}, { 1, (const byte *)"$", c_dollar }, { 1, (const byte *)"(", c_bra }, { 1, (const byte *)")", c_ket }, { 1, (const byte *)"*", c_multiply }, { 1, (const byte *)"+", c_plus }, { 1, (const byte *)"-", c_minus }, { 1, (const byte *)"/", c_divide }, { 1, (const byte *)"<", c_lt }, { 1, (const byte *)"=", c_assign }, { 1, (const byte *)">", c_gt }, { 1, (const byte *)"?", c_debug }, { 1, (const byte *)"[", c_leftslice }, { 1, (const byte *)"]", c_rightslice }, { 2, (const byte *)"!=", c_ne }, { 2, (const byte *)"*=", c_multiplyassign }, { 2, (const byte *)"+=", c_plusassign }, { 2, (const byte *)"-=", c_minusassign }, { 2, (const byte *)"->", c_sliceto }, { 2, (const byte *)"/*", c_comment2 }, { 2, (const byte *)"//", c_comment1 }, { 2, (const byte *)"/=", c_divideassign }, { 2, (const byte *)"<+", c_insert }, { 2, (const byte *)"<-", c_slicefrom }, { 2, (const byte *)"<=", c_le }, { 2, (const byte *)"==", c_eq }, { 2, (const byte *)"=>", c_assignto }, { 2, (const byte *)">=", c_ge }, { 2, (const byte *)"as", c_as }, { 
2, (const byte *)"do", c_do }, { 2, (const byte *)"or", c_or }, { 3, (const byte *)"and", c_and }, { 3, (const byte *)"for", c_for }, { 3, (const byte *)"get", c_get }, { 3, (const byte *)"hex", c_hex }, { 3, (const byte *)"hop", c_hop }, { 3, (const byte *)"len", c_len }, { 3, (const byte *)"non", c_non }, { 3, (const byte *)"not", c_not }, { 3, (const byte *)"set", c_set }, { 3, (const byte *)"try", c_try }, { 4, (const byte *)"fail", c_fail }, { 4, (const byte *)"goto", c_goto }, { 4, (const byte *)"loop", c_loop }, { 4, (const byte *)"next", c_next }, { 4, (const byte *)"size", c_size }, { 4, (const byte *)"test", c_test }, { 4, (const byte *)"true", c_true }, { 5, (const byte *)"among", c_among }, { 5, (const byte *)"false", c_false }, { 5, (const byte *)"lenof", c_lenof }, { 5, (const byte *)"limit", c_limit }, { 5, (const byte *)"unset", c_unset }, { 6, (const byte *)"atmark", c_atmark }, { 6, (const byte *)"attach", c_attach }, { 6, (const byte *)"cursor", c_cursor }, { 6, (const byte *)"define", c_define }, { 6, (const byte *)"delete", c_delete }, { 6, (const byte *)"gopast", c_gopast }, { 6, (const byte *)"insert", c_insert }, { 6, (const byte *)"maxint", c_maxint }, { 6, (const byte *)"minint", c_minint }, { 6, (const byte *)"repeat", c_repeat }, { 6, (const byte *)"sizeof", c_sizeof }, { 6, (const byte *)"tomark", c_tomark }, { 7, (const byte *)"atleast", c_atleast }, { 7, (const byte *)"atlimit", c_atlimit }, { 7, (const byte *)"decimal", c_decimal }, { 7, (const byte *)"reverse", c_reverse }, { 7, (const byte *)"setmark", c_setmark }, { 7, (const byte *)"strings", c_strings }, { 7, (const byte *)"tolimit", c_tolimit }, { 8, (const byte *)"booleans", c_booleans }, { 8, (const byte *)"integers", c_integers }, { 8, (const byte *)"routines", c_routines }, { 8, (const byte *)"setlimit", c_setlimit }, { 9, (const byte *)"backwards", c_backwards }, { 9, (const byte *)"externals", c_externals }, { 9, (const byte *)"groupings", c_groupings }, { 9, (const byte 
*)"stringdef", c_stringdef }, { 9, (const byte *)"substring", c_substring }, { 12, (const byte *)"backwardmode", c_backwardmode }, { 13, (const byte *)"stringescapes", c_stringescapes } }; snowball-3.0.1/compiler/tokeniser.c000066400000000000000000000471611500727106100173250ustar00rootroot00000000000000 #include /* stderr etc */ #include /* malloc free */ #include /* strlen */ #include /* isalpha etc */ #include "header.h" struct system_word { int s_size; /* size of system word */ const byte * s; /* pointer to the system word */ int code; /* its internal code */ }; /* ASCII collating assumed in syswords.h */ #include "syswords.h" #define INITIAL_INPUT_BUFFER_SIZE 8192 static int hex_to_num(int ch); static int smaller(int a, int b) { return a < b ? a : b; } extern byte * get_input(const char * filename) { FILE * input = fopen(filename, "r"); if (input == NULL) { return NULL; } { byte * u = create_s(INITIAL_INPUT_BUFFER_SIZE); int size = 0; while (true) { int ch = getc(input); if (ch == EOF) break; if (size >= CAPACITY(u)) u = increase_capacity_s(u, size); u[size++] = ch; } fclose(input); SIZE(u) = size; return u; } } static void error(struct tokeniser * t, const char * s1, byte * p, int n, const char * s2) { if (t->error_count == 20) { fprintf(stderr, "... 
etc\n"); exit(1); } fprintf(stderr, "%s:%d: ", t->file, t->line_number); if (s1) fprintf(stderr, "%s", s1); if (p) { int i; for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); } if (s2) fprintf(stderr, "%s", s2); fprintf(stderr, "\n"); t->error_count++; } static void error1(struct tokeniser * t, const char * s) { error(t, s, NULL, 0, NULL); } static void error2(struct tokeniser * t, const char * s) { error(t, "unexpected end of text after ", NULL, 0, s); } static int compare_words(int m, const byte * p, int n, const byte * q) { if (m != n) return m - n; return memcmp(p, q, n); } static int find_word(int n, const byte * p) { int i = 0; int j = vocab->code; do { int k = i + (j - i)/2; const struct system_word * w = vocab + k; int diff = compare_words(n, p, w->s_size, w->s); if (diff == 0) return w->code; if (diff < 0) j = k; else i = k; } while (j - i != 1); return -1; } static int white_space(struct tokeniser * t, int ch) { switch (ch) { case '\n': t->line_number++; /* fall through */ case '\r': case '\t': case ' ': return true; } return false; } static symbol * find_in_m(struct tokeniser * t, int n, byte * p) { struct m_pair * q; for (q = t->m_pairs; q; q = q->next) { byte * name = q->name; if (n == SIZE(name) && memcmp(name, p, n) == 0) return q->value; } return NULL; } static int read_literal_string(struct tokeniser * t, int c) { byte * p = t->p; int ch; SIZE(t->b) = 0; while (true) { if (c >= SIZE(p) || p[c] == '\n') { error1(t, "string literal not terminated"); return c; } ch = p[c]; c++; if (ch == t->m_start) { /* Inside insert characters. 
             */
            int c0 = c;
            int newlines = false; /* no newlines as yet */
            int all_whitespace = true; /* no printing chars as yet */
            /* Scan to the closing insert character.  A newline is only
             * tolerated while nothing but white space has been seen. */
            while (true) {
                if (c >= SIZE(p) || (p[c] == '\n' && !all_whitespace)) {
                    error1(t, "string literal not terminated");
                    return c;
                }
                ch = p[c];
                if (ch == '\n') {
                    newlines = true;
                }
                c++;
                if (ch == t->m_end) break;
                if (!white_space(t, ch)) all_whitespace = false;
            }
            /* An insert containing a newline contributes nothing to the
             * string. */
            if (!newlines) {
                int n = c - c0 - 1;    /* macro size */
                int firstch = p[c0];
                symbol * q = find_in_m(t, n, p + c0);
                if (q == NULL) {
                    /* Not a stringdef: a lone quote or a lone start
                     * character stands for itself. */
                    if (n == 1 && (firstch == '\'' || firstch == t->m_start))
                        t->b = add_symbol_to_b(t->b, p[c0]);
                    else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
                        int codepoint = 0;
                        int x;
                        if (t->uplusmode == UPLUS_DEFINED) {
                            /* See if found with xxxx upper-cased. */
                            byte * uc = create_s(n);
                            int i;
                            for (i = 0; i != n; ++i) {
                                uc[i] = toupper(p[c0 + i]);
                            }
                            q = find_in_m(t, n, uc);
                            lose_s(uc);
                            if (q != NULL) {
                                t->b = add_to_b(t->b, q, SIZE(q));
                                continue;
                            }
                            error1(t, "Some U+xxxx stringdefs seen but not this one");
                        } else {
                            t->uplusmode = UPLUS_UNICODE;
                        }
                        /* Parse the hex digits after "U+". */
                        /* NOTE(review): a long run of hex digits can overflow
                         * the int before the range checks below - confirm
                         * whether that needs guarding. */
                        for (x = c0 + 2; x != c - 1; ++x) {
                            int hex = hex_to_num(p[x]);
                            if (hex < 0) {
                                error1(t, "Bad hex digit following U+");
                                break;
                            }
                            codepoint = (codepoint << 4) | hex;
                        }
                        if (t->encoding == ENC_UTF8) {
                            if (codepoint < 0 || codepoint > 0x01ffff) {
                                error1(t, "character values exceed 0x01ffff");
                            }
                            /* Ensure there's enough space for a max length
                             * UTF-8 sequence. */
                            if (CAPACITY(t->b) < SIZE(t->b) + 3) {
                                t->b = increase_capacity_b(t->b, 3);
                            }
                            SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
                        } else {
                            if (t->encoding == ENC_SINGLEBYTE) {
                                /* Only ISO-8859-1 is handled this way - for
                                 * other single-byte character sets you need
                                 * to stringdef all the U+xxxx codes you use
                                 * like - e.g.:
                                 *
                                 * stringdef U+0171   hex 'FB'
                                 */
                                if (codepoint < 0 || codepoint > 0xff) {
                                    error1(t, "character values exceed 256");
                                }
                            } else {
                                if (codepoint < 0 || codepoint > 0xffff) {
                                    error1(t, "character values exceed 64K");
                                }
                            }
                            t->b = add_symbol_to_b(t->b, (symbol)codepoint);
                        }
                    } else {
                        error(t, "string macro '", p + c0, n, "' undeclared");
                    }
                } else {
                    /* A defined stringdef: append its value. */
                    t->b = add_to_b(t->b, q, SIZE(q));
                }
            }
        } else {
            /* An unescaped quote ends the literal. */
            if (ch == '\'') return c;
            if (ch < 0 || ch >= 0x80) {
                if (t->encoding != ENC_WIDECHARS) {
                    /* We don't really want people using non-ASCII literal
                     * strings, but historically it's worked for single-byte
                     * and UTF-8 if the source encoding matches what the
                     * generated stemmer works in and it seems unfair to just
                     * suddenly make this a hard error.
                     */
                    fprintf(stderr,
                            "%s:%d: warning: Non-ASCII literal strings aren't "
                            "portable - use stringdef instead\n",
                            t->file, t->line_number);
                } else {
                    error1(t, "Non-ASCII literal strings aren't "
                              "portable - use stringdef instead");
                }
            }
            t->b = add_symbol_to_b(t->b, p[c - 1]);
        }
    }
}

/* Scan the next raw token from the current input, starting at t->c.
 *
 * Returns the token code, or -1 at the end of the input.  t->c is left just
 * past the token.  A name is stored in t->s, a number in t->number, and a
 * string literal in t->b.  Unknown characters are reported and skipped.
 */
static int next_token(struct tokeniser * t) {
    byte * p = t->p;
    int c = t->c;
    int ch;
    int code = -1;
    while (true) {
        if (c >= SIZE(p)) { t->c = c; return -1; }
        ch = p[c];
        if (white_space(t, ch)) { c++; continue; }
        if (isalpha(ch)) {
            int c0 = c;
            while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
            code = find_word(c - c0, p + c0);
            /* Anything that is not an enabled keyword is a name. */
            if (code < 0 || t->token_disabled[code]) {
                SIZE(t->s) = 0;
                t->s = add_s_to_s(t->s, (const char*)p + c0, c - c0);
                code = c_name;
            }
        } else
        if (isdigit(ch)) {
            /* NOTE(review): a long run of digits overflows `value` (signed
             * int) - confirm whether a range check is wanted here. */
            int value = ch - '0';
            while (++c < SIZE(p) && isdigit(p[c])) {
                value = 10 * value + (p[c] - '0');
            }
            t->number = value;
            code = c_number;
        } else
        if (ch == '\'') {
            c = read_literal_string(t, c + 1);
            code = c_literalstring;
        } else
        {
            /* Punctuation: try the longest match (2 characters) first. */
            int lim = smaller(2, SIZE(p) - c);
            int i;
            for (i = lim; i > 0; i--) {
                code = find_word(i, p + c);
                if (code >= 0) { c += i; break; }
            }
        }
        if (code >= 0) {
            t->c = c;
            return code;
        }
        error(t, "'", p + c, 1, "' unknown");
        c++;
        continue;
    }
}

/* Return the next input byte and advance, or -1 at the end of the input. */
static int next_char(struct tokeniser * t) {
    if (t->c >= SIZE(t->p)) return -1;
    return t->p[t->c++];
}

/* Return the next byte that is not white space, or -1 at the end of input. */
static int next_real_char(struct tokeniser * t) {
    while (true) {
        int ch = next_char(t);
        if (!white_space(t, ch)) return ch;
    }
}

/* Read the run of non-white-space characters that follows `stringdef` into
 * t->s.  Reports an error if the input ends first.
 */
static void read_chars(struct tokeniser * t) {
    int ch = next_real_char(t);
    if (ch < 0) { error2(t, "stringdef"); return; }
    {
        int c0 = t->c-1;
        while (true) {
            ch = next_char(t);
            if (white_space(t, ch) || ch < 0) break;
        }
        SIZE(t->s) = 0;
        t->s = add_s_to_s(t->s, (const char*)t->p + c0, t->c - c0 - 1);
    }
}

/* Value of decimal digit ch, or -1 if ch is not a decimal digit. */
static int decimal_to_num(int ch) {
    if ('0' <= ch && ch <= '9') return ch - '0';
    return -1;
}

static int hex_to_num(int ch) {
    if ('0' <= ch && ch <= '9') return ch - '0';
    if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
    if ('A' <=
ch && ch <= 'F') return ch - 'A' + 10; return -1; } static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { int c = 0; int d = 0; while (true) { while (c < SIZE(p) && p[c] == ' ') c++; if (c == SIZE(p)) break; { int number = 0; while (c != SIZE(p)) { int ch = p[c]; if (ch == ' ') break; if (base == 10) { ch = decimal_to_num(ch); if (ch < 0) { error1(t, "decimal string contains non-digits"); return; } } else { ch = hex_to_num(ch); if (ch < 0) { error1(t, "hex string contains non-hex characters"); return; } } number = base * number + ch; c++; } if (t->encoding == ENC_SINGLEBYTE) { if (number < 0 || number > 0xff) { error1(t, "character values exceed 256"); return; } } else { if (number < 0 || number > 0xffff) { error1(t, "character values exceed 64K"); return; } } if (t->encoding == ENC_UTF8) d += put_utf8(number, p + d); else p[d++] = number; } } SIZE(p) = d; } extern int read_token(struct tokeniser * t) { byte * p = t->p; int held = t->token_held; t->token_held = false; if (held) return t->token; t->token_reported_as_unexpected = false; while (true) { int code = next_token(t); switch (code) { case c_comment1: /* slash-slash comment */ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; continue; case c_comment2: { /* slash-star comment */ // Scan for a '*' stopping one before the end since we need a // '/' to follow it to close the comment. int size_less_one = SIZE(p) - 1; int c = t->c; while (true) { if (c >= size_less_one) { error1(t, "/* comment not terminated"); t->token = -1; return -1; } if (p[c] == '\n') { t->line_number++; } else if (p[c] == '*' && p[c + 1] == '/') { // Found '*/' to end of comment. 
t->c = c + 2; break; } ++c; } continue; } case c_stringescapes: { int ch1 = next_real_char(t); int ch2 = next_real_char(t); if (ch2 < 0) { error2(t, "stringescapes"); continue; } if (ch1 == '\'') { error1(t, "first stringescape cannot be '"); continue; } t->m_start = ch1; t->m_end = ch2; continue; } case c_stringdef: { int base = 0; read_chars(t); code = read_token(t); if (code == c_hex) { base = 16; code = read_token(t); } else if (code == c_decimal) { base = 10; code = read_token(t); } if (code != c_literalstring) { error1(t, "string omitted after stringdef"); continue; } if (base > 0) convert_numeric_string(t, t->b, base); { NEW(m_pair, q); q->next = t->m_pairs; q->name = copy_s(t->s); q->value = copy_b(t->b); t->m_pairs = q; if (t->uplusmode != UPLUS_DEFINED && (SIZE(t->s) >= 3 && t->s[0] == 'U' && t->s[1] == '+')) { if (t->uplusmode == UPLUS_UNICODE) { error1(t, "U+xxxx already used with implicit meaning"); } else { t->uplusmode = UPLUS_DEFINED; } } } continue; } case c_get: code = read_token(t); if (code != c_literalstring) { error1(t, "string omitted after get"); continue; } t->get_depth++; if (t->get_depth > 10) { error1(t, "get directives go 10 deep. 
Looping?"); exit(1); } { NEW(input, q); char * file = b_to_sz(t->b); int file_owned = 1; byte * u = get_input(file); if (u == NULL) { struct include * r; for (r = t->includes; r; r = r->next) { byte * s = copy_s(r->s); s = add_sz_to_s(s, file); s[SIZE(s)] = 0; if (file_owned > 0) { free(file); } else { lose_s((byte *)file); } file = (char*)s; file_owned = -1; u = get_input(file); if (u != NULL) break; } } if (u == NULL) { error(t, "Can't get '", (byte *)file, strlen(file), "'"); exit(1); } memmove(q, t, sizeof(struct input)); t->next = q; t->p = u; t->c = 0; t->file = file; t->file_owned = file_owned; t->line_number = 1; } p = t->p; continue; case -1: if (t->next) { lose_s(p); { struct input * q = t->next; memmove(t, q, sizeof(struct input)); p = t->p; FREE(q); } t->get_depth--; continue; } /* fall through */ default: t->previous_token = t->token; t->token = code; return code; } } } extern int peek_token(struct tokeniser * t) { int token = read_token(t); t->token_held = true; return token; } extern const char * name_of_token(int code) { int i; for (i = 1; i < vocab->code; i++) if ((vocab + i)->code == code) return (const char *)(vocab + i)->s; switch (code) { case c_mathassign: return "="; case c_name: return "name"; case c_number: return "number"; case c_literalstring:return "literal"; case c_neg: return "neg"; case c_grouping: return "grouping"; case c_call: return "call"; case c_booltest: return "Boolean test"; case c_functionend: return "Function end"; case c_goto_grouping: return "goto grouping"; case c_gopast_grouping: return "gopast grouping"; case c_goto_non: return "goto non"; case c_gopast_non: return "gopast non"; case -2: return "start of text"; case -1: return "end of text"; default: return "?"; } } extern void disable_token(struct tokeniser * t, int code) { t->token_disabled[code] = 1; } extern struct tokeniser * create_tokeniser(byte * p, char * file) { NEW(tokeniser, t); t->next = NULL; t->p = p; t->c = 0; t->file = file; t->file_owned = 0; 
t->line_number = 1; t->b = create_b(0); t->s = create_s(0); t->m_start = -1; t->m_pairs = NULL; t->get_depth = 0; t->error_count = 0; t->token_held = false; t->token_reported_as_unexpected = false; t->token = -2; t->previous_token = -2; t->uplusmode = UPLUS_NONE; memset(t->token_disabled, 0, sizeof(t->token_disabled)); return t; } extern void close_tokeniser(struct tokeniser * t) { lose_b(t->b); lose_s(t->s); { struct m_pair * q = t->m_pairs; while (q) { struct m_pair * q_next = q->next; lose_s(q->name); lose_b(q->value); FREE(q); q = q_next; } } { struct input * q = t->next; while (q) { struct input * q_next = q->next; FREE(q); q = q_next; } } if (t->file_owned > 0) { free(t->file); } else if (t->file_owned < 0) { lose_s((byte *)t->file); } FREE(t); } snowball-3.0.1/csharp/000077500000000000000000000000001500727106100146135ustar00rootroot00000000000000snowball-3.0.1/csharp/.gitignore000066400000000000000000000001061500727106100166000ustar00rootroot00000000000000*.o *.suo *.user *.GhostDoc.xml bin/ obj/ TestResults/ TestResult.xml snowball-3.0.1/csharp/Snowball/000077500000000000000000000000001500727106100163745ustar00rootroot00000000000000snowball-3.0.1/csharp/Snowball/Algorithms/000077500000000000000000000000001500727106100205055ustar00rootroot00000000000000snowball-3.0.1/csharp/Snowball/Algorithms/.gitignore000066400000000000000000000000171500727106100224730ustar00rootroot00000000000000*.generated.cs snowball-3.0.1/csharp/Snowball/Among.cs000066400000000000000000000076051500727106100177740ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // * this list of conditions and the following disclaimer. 
// * Redistributions in binary form must reproduce the above copyright
// * notice, this list of conditions and the following disclaimer in the
// * documentation and/or other materials provided with the distribution.
// * Neither the name of the copyright holders nor the names of its contributors
// * may be used to endorse or promote products derived from this software
// * without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace Snowball
{
    using System;
    using System.Text;

    /// <summary>
    ///   Snowball's among construction: one entry of a table of candidate
    ///   strings that the stemmer base class searches against the buffer.
    /// </summary>
    ///
    public sealed class Among
    {
        /// <summary>
        ///   Search string.
        /// </summary>
        ///
        public string SearchString { get; private set; }

        /// <summary>
        ///   Index to longest matching substring: the table entry to try
        ///   next when this one does not apply. A negative value means
        ///   there is no such entry.
        /// </summary>
        ///
        public int MatchIndex { get; private set; }

        /// <summary>
        ///   Result of the lookup, returned when this entry is selected.
        /// </summary>
        ///
        public int Result { get; private set; }

        /// <summary>
        ///   Action to be invoked before this entry is accepted, or
        ///   <c>null</c> if the entry is accepted unconditionally. The entry
        ///   is accepted only if the action returns <c>true</c>.
        /// </summary>
        ///
        public Func<bool> Action { get; private set; }

        /// <summary>
        ///   Initializes a new instance of the <see cref="Among"/> class
        ///   with no action.
        /// </summary>
        ///
        /// <param name="str">The search string.</param>
        /// <param name="index">The index to the longest matching substring.</param>
        /// <param name="result">The result of the lookup.</param>
        ///
        public Among(String str, int index, int result)
            : this(str, index, result, null)
        {
        }

        /// <summary>
        ///   Initializes a new instance of the <see cref="Among"/> class.
        /// </summary>
        ///
        /// <param name="str">The search string.</param>
        /// <param name="index">The index to the longest matching substring.</param>
        /// <param name="result">The result of the lookup.</param>
        /// <param name="action">The action to be performed, if any.</param>
        ///
        public Among(String str, int index, int result, Func<bool> action)
        {
            this.SearchString = str;
            this.MatchIndex = index;
            this.Result = result;
            this.Action = action;
        }

        /// <summary>
        ///   Returns a <see cref="System.String" /> that represents this instance.
        /// </summary>
        ///
        /// <returns>
        ///   The search string of this entry.
        /// </returns>
        ///
        public override string ToString()
        {
            return SearchString;
        }
    }
}
snowball-3.0.1/csharp/Snowball/AssemblyInfo.cs000066400000000000000000000026271500727106100213250ustar00rootroot00000000000000using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. [assembly: AssemblyTitle("Snowball")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] [assembly: AssemblyCompany("")] [assembly: AssemblyProduct("Snowball")] [assembly: AssemblyCopyright("Copyright © 2015-2019")] [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] // Setting ComVisible to false makes the types in this assembly not visible // to COM components. If you need to access a type in this assembly from // COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)] // The following GUID is for the ID of the typelib if this project is exposed to COM [assembly: Guid("5c54ebc8-a3a3-46f8-b732-60b1440c8b0b")] // Version information for an assembly consists of the following four values: // // Major Version // Minor Version // Build Number // Revision // // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] [assembly: AssemblyVersion(/*SNOWBALL_VERSION*/"3.0.1.0")] [assembly: AssemblyFileVersion(/*SNOWBALL_VERSION*/"3.0.1.0")] snowball-3.0.1/csharp/Snowball/Stemmer.cs000066400000000000000000000441071500727106100203450ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // Copyright (c) 2018, Olly Betts // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // * this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // * notice, this list of conditions and the following disclaimer in the // * documentation and/or other materials provided with the distribution. // * Neither the name of the copyright holders nor the names of its contributors // * may be used to endorse or promote products derived from this software // * without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace Snowball
{
    using System;
    using System.Linq;
    using System.Text;

    /// <summary>
    ///   Class holding current state: the buffer being stemmed together
    ///   with the cursor, limit and bracket positions into it.
    /// </summary>
    ///
    public class Env
    {
        /// <summary>
        ///   Initializes a new instance of the <see cref="Env"/> class.
        ///   No field is assigned here; derived classes set them up.
        /// </summary>
        ///
        protected Env() { }

        /// <summary>
        ///   Gets the current string.
        /// </summary>
        ///
        protected StringBuilder current;

        /// <summary>
        ///   Current cursor position.
        /// </summary>
        ///
        protected int cursor;

        /// <summary>
        ///   Forward limit for inspecting the buffer.
        /// </summary>
        ///
        protected int limit;

        /// <summary>
        ///   Backward limit for inspecting the buffer.
        /// </summary>
        ///
        protected int limit_backward;

        /// <summary>
        ///   Starting bracket position.
        /// </summary>
        ///
        protected int bra;

        /// <summary>
        ///   Ending bracket position.
        /// </summary>
        ///
        protected int ket;

        /// <summary>
        ///   Copy another Env object.
        /// </summary>
        ///
        /// <param name="other">The state to copy from.</param>
        ///
        public Env(Env other)
        {
            copy_from(other);
        }

        /// <summary>
        ///   Copy another Env object. The buffer itself is not duplicated:
        ///   afterwards both objects refer to the same
        ///   <see cref="StringBuilder"/> instance.
        /// </summary>
        ///
        /// <param name="other">The state to copy from.</param>
        ///
        protected void copy_from(Env other)
        {
            current          = other.current;
            cursor           = other.cursor;
            limit            = other.limit;
            limit_backward   = other.limit_backward;
            bra              = other.bra;
            ket              = other.ket;
        }
    }

    /// <summary>
    ///   Base class for Snowball's stemmer algorithms.
    /// </summary>
    ///
    public abstract class Stemmer : Env
    {
        /// <summary>
        ///   Initializes a new instance of the <see cref="Stemmer"/> class
        ///   with an empty buffer.
        /// </summary>
        ///
        protected Stemmer()
        {
            current = new StringBuilder();
            setBufferContents("");
        }

        /// <summary>
        ///   Calls the stemmer to process the next word.
        /// </summary>
        ///
        protected abstract bool stem();

        /// <summary>
        ///   Stems the buffer's contents.
        /// </summary>
        ///
        /// <returns>The value returned by the algorithm's <c>stem()</c>.</returns>
        ///
        public bool Stem()
        {
            return this.stem();
        }

        /// <summary>
        ///   Stems a given word.
        /// </summary>
        ///
        /// <param name="word">The word to be stemmed.</param>
        ///
        /// <returns>The stemmed word.</returns>
///
        public string Stem(string word)
        {
            // Load the word, run the algorithm, read the buffer back.
            Current = word;
            stem();
            return Current;
        }

        /// <summary>
        ///   Gets the current processing buffer.
        /// </summary>
        ///
        public StringBuilder Buffer
        {
            get { return current; }
        }

        /// <summary>
        ///   Gets or sets the current word to be stemmed
        ///   or the stemmed word, if the stemmer has been
        ///   processed.
        /// </summary>
        ///
        public string Current
        {
            get { return current.ToString(); }
            set { setBufferContents(value); }
        }

        /// <summary>
        ///   Replaces the buffer contents with <paramref name="value"/> and
        ///   resets every position: cursor, backward limit and opening
        ///   bracket to the start, limit and closing bracket to the end.
        /// </summary>
        private void setBufferContents(string value)
        {
            current.Clear();
            current.Append(value);

            int length = current.Length;
            limit_backward = 0;
            cursor = 0;
            bra = 0;
            limit = length;
            ket = length;
        }

        /// <summary>
        ///   Tests whether the character at the cursor belongs to the
        ///   grouping <paramref name="s"/> (and lies within
        ///   <paramref name="min"/>..<paramref name="max"/>), moving the
        ///   cursor forward past each matching character.
        /// </summary>
        /// <returns>
        ///   -1 if the cursor reaches the limit; 1 if a character outside
        ///   the grouping is met (the cursor stays on it); 0 if
        ///   <paramref name="repeat"/> is false and one character matched.
        /// </returns>
        protected int in_grouping(string s, int min, int max, bool repeat)
        {
            while (true)
            {
                if (cursor >= limit)
                    return -1;

                char ch = current[cursor];
                bool member = ch >= min && ch <= max && s.IndexOf(ch) >= 0;
                if (!member)
                    return 1;

                cursor++;
                if (!repeat)
                    return 0;
            }
        }

        /// <summary>
        ///   Backward counterpart of <c>in_grouping</c>: tests the character
        ///   just before the cursor and moves the cursor backwards over each
        ///   matching character.
        /// </summary>
        /// <returns>
        ///   -1 if the cursor reaches the backward limit; 1 if a character
        ///   outside the grouping is met; 0 if <paramref name="repeat"/> is
        ///   false and one character matched.
        /// </returns>
        protected int in_grouping_b(string s, int min, int max, bool repeat)
        {
            while (true)
            {
                if (cursor <= limit_backward)
                    return -1;

                char ch = current[cursor - 1];
                bool member = ch >= min && ch <= max && s.IndexOf(ch) >= 0;
                if (!member)
                    return 1;

                cursor--;
                if (!repeat)
                    return 0;
            }
        }

        /// <summary>
        ///   Tests whether the character at the cursor lies outside the
        ///   grouping <paramref name="s"/>, moving the cursor forward past
        ///   each character that is not in the grouping.
        /// </summary>
        /// <returns>
        ///   -1 if the cursor reaches the limit; 1 if a character of the
        ///   grouping is met (the cursor stays on it); 0 if
        ///   <paramref name="repeat"/> is false and one character was skipped.
        /// </returns>
        protected int out_grouping(string s, int min, int max, bool repeat)
        {
            while (true)
            {
                if (cursor >= limit)
                    return -1;

                char ch = current[cursor];
                if (ch >= min && ch <= max && s.IndexOf(ch) >= 0)
                    return 1;

                cursor++;
                if (!repeat)
                    return 0;
            }
        }

        /// <summary>
        ///   Determines whether the current character is
        ///   outside a given group of characters s.
        /// </summary>
///
        /// <returns>
        ///   -1 if the cursor reaches the backward limit; 1 if the character
        ///   before the cursor is in the grouping (the cursor does not move
        ///   over it); 0 if <paramref name="repeat"/> is false and one
        ///   character was skipped.
        /// </returns>
        protected int out_grouping_b(string s, int min, int max, bool repeat)
        {
            while (true)
            {
                if (cursor <= limit_backward)
                    return -1;

                char ch = current[cursor - 1];
                if (ch >= min && ch <= max && s.IndexOf(ch) >= 0)
                    return 1;

                cursor--;
                if (!repeat)
                    return 0;
            }
        }

        /// <summary>
        ///   Tests whether the buffer contains the string s starting at the
        ///   cursor and going forward. On a match the cursor is moved past
        ///   it; otherwise the cursor is left unchanged.
        /// </summary>
        protected bool eq_s(String s)
        {
            int length = s.Length;
            if (limit - cursor < length)
                return false;

            int k = 0;
            while (k != length)
            {
                if (current[cursor + k] != s[k])
                    return false;
                k++;
            }

            cursor += length;
            return true;
        }

        /// <summary>
        ///   Tests whether the buffer contains the characters of s starting
        ///   at the cursor and going forward. On a match the cursor is
        ///   moved past them; otherwise the cursor is left unchanged.
        /// </summary>
        protected bool eq_s(StringBuilder s)
        {
            int length = s.Length;
            if (limit - cursor < length)
                return false;

            int k = 0;
            while (k != length)
            {
                if (current[cursor + k] != s[k])
                    return false;
                k++;
            }

            cursor += length;
            return true;
        }

        /// <summary>
        ///   Tests whether the buffer contains the string s ending at the
        ///   cursor (going backwards). On a match the cursor is moved to
        ///   the start of it; otherwise the cursor is left unchanged.
        /// </summary>
        protected bool eq_s_b(String s)
        {
            int length = s.Length;
            if (cursor - limit_backward < length)
                return false;

            int start = cursor - length;
            int k = 0;
            while (k != length)
            {
                if (current[start + k] != s[k])
                    return false;
                k++;
            }

            cursor = start;
            return true;
        }

        /// <summary>
        ///   Tests whether the buffer contains the characters of s ending
        ///   at the cursor (going backwards). On a match the cursor is
        ///   moved to the start of them; otherwise the cursor is left
        ///   unchanged.
        /// </summary>
        protected bool eq_s_b(StringBuilder s)
        {
            int length = s.Length;
            if (cursor - limit_backward < length)
                return false;

            int start = cursor - length;
            int k = 0;
            while (k != length)
            {
                if (current[start + k] != s[k])
                    return false;
                k++;
            }

            cursor = start;
            return true;
        }

        /// <summary>
        ///   Searches if the current buffer matches against one of the
        ///   amongs, starting from the current cursor position and going
        ///   forward.
        /// </summary>
///
///
        // Returns the Result of the selected entry, or 0 if no entry of v
        // applies. On success the cursor is left just past the matched
        // search string.
        protected int find_among(Among[] v)
        {
            int i = 0;
            int j = v.Length;

            int c = cursor;
            int l = limit;

            int common_i = 0;
            int common_j = 0;

            bool first_key_inspected = false;

            // Binary search over v. common_i / common_j record how many
            // leading characters are already known to agree with the buffer
            // at bounds i and j, so each comparison can start from the
            // smaller of the two.
            while (true)
            {
                int k = i + ((j - i) >> 1);
                int diff = 0;
                int common = common_i < common_j ? common_i : common_j; // smaller
                Among w = v[k];

                for (int i2 = common; i2 < w.SearchString.Length; i2++)
                {
                    if (c + common == l)
                    {
                        // Ran out of buffer before the search string ended.
                        diff = -1;
                        break;
                    }

                    diff = current[c + common] - w.SearchString[i2];
                    if (diff != 0)
                        break;

                    common++;
                }

                if (diff < 0)
                {
                    j = k;
                    common_j = common;
                }
                else
                {
                    i = k;
                    common_i = common;
                }

                if (j - i <= 1)
                {
                    if (i > 0)
                        break; // v->s has been inspected

                    if (j == i)
                        break; // only one item in v

                    // - but now we need to go round once more to get
                    // v->s inspected. This looks messy, but is actually
                    // the optimal approach.

                    if (first_key_inspected)
                        break;

                    first_key_inspected = true;
                }
            }

            // Starting from entry i, follow MatchIndex links until an entry
            // is fully matched and (if it has an action) its action accepts.
            while (true)
            {
                Among w = v[i];

                if (common_i >= w.SearchString.Length)
                {
                    cursor = c + w.SearchString.Length;

                    if (w.Action == null)
                        return w.Result;

                    bool res = w.Action();
                    // Set the cursor again after the action has run.
                    cursor = c + w.SearchString.Length;

                    if (res)
                        return w.Result;
                }

                i = w.MatchIndex;

                if (i < 0)
                    return 0;
            }
        }

        /// <summary>
        ///   Searches if the current buffer matches against one of the
        ///   amongs, starting from the current cursor position and going
        ///   backwards.
        /// </summary>
        ///
        // Mirror image of find_among: search strings are compared from
        // their last character towards their first, against the characters
        // preceding the cursor. On success the cursor is left at the start
        // of the matched text.
        protected int find_among_b(Among[] v)
        {
            int i = 0;
            int j = v.Length;

            int c = cursor;
            int lb = limit_backward;

            int common_i = 0;
            int common_j = 0;

            bool first_key_inspected = false;

            while (true)
            {
                int k = i + ((j - i) >> 1);
                int diff = 0;
                int common = common_i < common_j ? common_i : common_j;
                Among w = v[k];

                for (int i2 = w.SearchString.Length - 1 - common; i2 >= 0; i2--)
                {
                    if (c - common == lb)
                    {
                        // Reached the backward limit before the search
                        // string was exhausted.
                        diff = -1;
                        break;
                    }

                    diff = current[c - 1 - common] - w.SearchString[i2];
                    if (diff != 0)
                        break;

                    common++;
                }

                if (diff < 0)
                {
                    j = k;
                    common_j = common;
                }
                else
                {
                    i = k;
                    common_i = common;
                }

                if (j - i <= 1)
                {
                    if (i > 0)
                        break;

                    if (j == i)
                        break;

                    if (first_key_inspected)
                        break;

                    first_key_inspected = true;
                }
            }

            while (true)
            {
                Among w = v[i];

                if (common_i >= w.SearchString.Length)
                {
                    cursor = c - w.SearchString.Length;

                    if (w.Action == null)
                        return w.Result;

                    bool res = w.Action();
                    cursor = c - w.SearchString.Length;

                    if (res)
                        return w.Result;
                }

                i = w.MatchIndex;

                if (i < 0)
                    return 0;
            }
        }

        /// <summary>
        ///   Replaces the characters between c_bra
        ///   and c_ket by the characters in s.
        /// </summary>
        ///
        /// <returns>
        ///   The change in buffer length (new length minus old length).
        /// </returns>
        ///
        protected int replace_s(int c_bra, int c_ket, String s)
        {
            int adjustment = s.Length - (c_ket - c_bra);
            Replace(current, c_bra, c_ket, s);
            limit += adjustment;
            // A cursor at or after the replaced region moves with the text
            // that follows it; a cursor inside the region snaps to its start.
            if (cursor >= c_ket)
                cursor += adjustment;
            else if (cursor > c_bra)
                cursor = c_bra;
            return adjustment;
        }

        /// <summary>
        ///   Checks if a slicing can be done. An inconsistent set of
        ///   positions is only reported through
        ///   <see cref="System.Diagnostics.Trace"/>; nothing is thrown.
        /// </summary>
        protected void slice_check()
        {
            if (bra < 0 || bra > ket || ket > limit || limit > current.Length)
            {
                System.Diagnostics.Trace.WriteLine("faulty slice operation");
            }
        }

        /// <summary>
        ///   Replaces the contents of the bracket with the string s.
        /// </summary>
        ///
        /// <param name="s">The replacement text.</param>
        protected void slice_from(String s)
        {
            slice_check();
            replace_s(bra, ket, s);
        }

        /// <summary>
        ///   Removes the current bracket contents.
        /// </summary>
        ///
        protected void slice_del()
        {
            slice_from("");
        }

        /// <summary>
        ///   Replaces the characters between c_bra and c_ket with the
        ///   string s, then shifts bra and ket by the change in length when
        ///   they lie at or after c_bra.
        /// </summary>
        ///
        protected void insert(int c_bra, int c_ket, String s)
        {
            int adjustment = replace_s(c_bra, c_ket, s);
            if (c_bra <= bra) bra += adjustment;
            if (c_bra <= ket) ket += adjustment;
        }

        /// <summary>
        ///   Replaces the characters between c_bra and c_ket with the
        ///   contents of s, then shifts bra and ket by the change in length
        ///   when they lie at or after c_bra.
        /// </summary>
///
///
        protected void insert(int c_bra, int c_ket, StringBuilder s)
        {
            int delta = replace_s(c_bra, c_ket, s.ToString());

            if (bra >= c_bra)
                bra += delta;

            if (ket >= c_bra)
                ket += delta;
        }

        /// <summary>
        ///   Copies the text currently inside the bracket (bra..ket) into s,
        ///   replacing whatever s held before.
        /// </summary>
        ///
        protected void slice_to(StringBuilder s)
        {
            slice_check();

            string slice = current.ToString(bra, ket - bra);
            s.Remove(0, s.Length);
            s.Insert(0, slice);
        }

        /// <summary>
        ///   Copies the buffer text from its start up to the limit into s,
        ///   replacing whatever s held before.
        /// </summary>
        ///
        protected void assign_to(StringBuilder s)
        {
            string text = current.ToString(0, limit);
            s.Remove(0, s.Length);
            s.Insert(0, text);
        }

        /// <summary>
        ///   Replaces a specific region of the buffer with another text.
        ///   Despite its name, <paramref name="length"/> is the exclusive
        ///   end position of the region: the characters from
        ///   <paramref name="index"/> up to, but not including,
        ///   <paramref name="length"/> are removed.
        /// </summary>
        /// <returns>The same builder, to allow chaining.</returns>
        public static StringBuilder Replace(StringBuilder sb, int index, int length, string text)
        {
            int count = length - index;
            return sb.Remove(index, count).Insert(index, text);
        }
    }
}
snowball-3.0.1/csharp/Stemwords/000077500000000000000000000000001500727106100166025ustar00rootroot00000000000000snowball-3.0.1/csharp/Stemwords/App.config000066400000000000000000000002661500727106100205150ustar00rootroot00000000000000 snowball-3.0.1/csharp/Stemwords/Program.cs000066400000000000000000000100651500727106100205420ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // Copyright (c) 2025, Olly Betts // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // * this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // * notice, this list of conditions and the following disclaimer in the // * documentation and/or other materials provided with the distribution.
// * Neither the name of the copyright holders nor the names of its contributors
// * may be used to endorse or promote products derived from this software
// * without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace Snowball
{
    using System;
    using System.IO;
    using System.Reflection;
    using System.Linq;

    /// <summary>
    ///   Snowball's Stemmer program: stems an input one line at a time with
    ///   the stemmer selected on the command line.
    /// </summary>
    ///
    public static class Program
    {
        /// <summary>
        ///   Prints the command line synopsis to standard output.
        /// </summary>
        private static void usage()
        {
            Console.WriteLine("Usage: stemwords.exe -l <language> [-i <input file>] [-o <output file>]");
        }

        /// <summary>
        ///   Main program entrypoint.
        /// </summary>
        ///
        /// <param name="args">
        ///   Command line: <c>-l</c> names the language (required),
        ///   <c>-i</c> an input file (default: standard input) and
        ///   <c>-o</c> an output file (default: standard output).
        /// </param>
        ///
        public static void Main(String[] args)
        {
            string language = null;
            string inputName = null;
            string outputName = null;

            for (int i = 0; i < args.Length; i++)
            {
                string arg = args[i];
                if (arg != "-l" && arg != "-i" && arg != "-o")
                    continue;

                // An option given as the last argument has no value;
                // report usage instead of indexing past the end of args.
                if (i + 1 >= args.Length)
                {
                    usage();
                    return;
                }

                if (arg == "-l")
                    language = args[i + 1];
                else if (arg == "-i")
                    inputName = args[i + 1];
                else
                    outputName = args[i + 1];
            }

            if (language == null)
            {
                usage();
                return;
            }

            // Pick the first concrete Stemmer subclass in the library
            // assembly whose type name matches the requested language.
            Stemmer stemmer =
                typeof(Stemmer).Assembly.GetTypes()
                    .Where(t => t.IsSubclassOf(typeof(Stemmer)) && !t.IsAbstract)
                    .Where(t => match(t.Name, language))
                    .Select(t => (Stemmer)Activator.CreateInstance(t)).FirstOrDefault();

            if (stemmer == null)
            {
                Console.WriteLine("Language not found.");
                return;
            }

            Console.WriteLine("Using " + stemmer.GetType());

            // Only streams opened here are disposed; the console streams
            // are left alone.
            StreamWriter fileOutput = null;
            StreamReader fileInput = null;

            try
            {
                if (outputName != null)
                    fileOutput = new StreamWriter(outputName);

                if (inputName != null)
                    fileInput = new StreamReader(inputName);

                TextWriter output = System.Console.Out;
                if (fileOutput != null)
                    output = fileOutput;

                TextReader input = System.Console.In;
                if (fileInput != null)
                    input = fileInput;

                while (true)
                {
                    var line = input.ReadLine();
                    if (line == null)
                        break;

                    var o = stemmer.Stem(line);
                    output.WriteLine(o);
                }

                output.Flush();
            }
            finally
            {
                if (fileInput != null)
                    fileInput.Dispose();

                if (fileOutput != null)
                    fileOutput.Dispose();
            }
        }

        /// <summary>
        ///   Tells whether a stemmer type name belongs to the requested
        ///   language, i.e. starts with the language name followed by
        ///   "Stemmer". The comparison ignores case and is ordinal, so it
        ///   does not depend on the current culture (under Turkish cultures
        ///   a culture-sensitive comparison does not treat 'i' and 'I' as
        ///   equal ignoring case).
        /// </summary>
        private static bool match(string stemmerName, string language)
        {
            string expectedName = language + "Stemmer";

            return stemmerName.StartsWith(expectedName,
                StringComparison.OrdinalIgnoreCase);
        }
    }
}
snowball-3.0.1/doc/000077500000000000000000000000001500727106100141005ustar00rootroot00000000000000snowball-3.0.1/doc/TODO000066400000000000000000000010221500727106100145630ustar00rootroot00000000000000Things to do: - Write documentation for how to use libstemmer (as opposed to how stemming algorithms themselves work). Currently, the documentation in the include/libstemmer.h header file is pretty clear and comprehensive, but an overview document wouldn't go amiss. Things that would be nice to include at some point. - Add version numbers to each stemming algorithm, and allow the interface to request a specific version of the stemming algorithms. Default to providing the latest version of the algorithm.
snowball-3.0.1/doc/libstemmer_c_README000066400000000000000000000145111500727106100175070ustar00rootroot00000000000000libstemmer_c ============ This document pertains to the C version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Compiling the library ===================== A simple makefile is provided for Unix style systems. On such systems, it should be possible simply to run "make", and the file "libstemmer.o" and the example program "stemwords" will be generated. If this doesn't work on your system, you need to write your own build system (or call the compiler directly). The files to compile are all contained in the "libstemmer", "runtime" and "src_c" directories, and the public header file is contained in the "include" directory. The library comes in two flavours; UTF-8 only, and UTF-8 plus other character sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of "libstemmer.c". 
For convenience "mkinc.mak" is a makefile fragment listing the source files and header files used to compile the standard version of the library. "mkinc_utf8.mak" is a comparable makefile fragment listing just the source files for the UTF-8-only version of the library. Using the library ================= The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. The library provides a simple C API. Essentially, a new stemmer can be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then used to stem a word, "sb_stemmer_length" returns the stemmed length of the last word processed, and "sb_stemmer_delete" is used to delete a stemmer. Generally you should create a stemmer object and reuse it rather than creating a fresh object for each word stemmed (since there's some cost to creating and destroying the object). The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. libstemmer does not currently incorporate any mechanism for caching the results of stemming operations. Such caching can greatly increase the performance of a stemmer in certain situations, so suitable patches will be considered for inclusion. The standard libstemmer sources contain an algorithm for each of the supported languages. The algorithm may be selected using the English name of the language, or using the 2- or 3-letter ISO 639 language codes.
In addition, the traditional "Porter" stemming algorithm for English is included for backwards compatibility purposes, but we recommend use of the "English" stemmer in preference for new projects. (Some minor algorithms which are included only as curiosities on the Snowball website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not included in the standard libstemmer sources. These are not really supported by the Snowball project, but it would be possible to compile a modified libstemmer library containing these if desired.) The stemwords example ===================== The stemwords example program allows you to run any of the stemmers compiled into the libstemmer library on a sample vocabulary. For details on how to use it, run it with the "-h" command line option. Using the library in a larger system ==================================== If you are incorporating the library into the build system of a larger program, I recommend copying the unpacked tarball without modification into a subdirectory of the sources of your program. Future versions of the library are intended to keep the same structure, so this will keep the work required to move to a new version of the library to a minimum. As an additional convenience, the list of source and header files used in the library is detailed in mkinc.mak - a file which is in a suitable format for inclusion by a Makefile. By including this file in your build system, you can link the Snowball system into your program with a few extra rules. Using the library in a system using GNU autotools ================================================= The libstemmer_c library can be integrated into a larger system which uses the GNU autotool framework (and in particular, automake and autoconf) as follows: 1) Unpack libstemmer_c-*.tar.gz in the top level project directory and rename the resulting directory to remove the version number so that there is a libstemmer_c subdirectory of the top level directory of the project.
2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: noinst_LTLIBRARIES = libstemmer.la include $(srcdir)/mkinc.mak noinst_HEADERS = $(snowball_headers) libstemmer_la_SOURCES = $(snowball_sources) (You may also need to add other lines to this, for example, if you are using compiler options which are not compatible with compiling the libstemmer library.) 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's configure.ac file. 4) Add to the top level makefile the following lines (or modify existing assignments to these variables appropriately): AUTOMAKE_OPTIONS = subdir-objects AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include SUBDIRS=libstemmer_c <name>_LIBADD = libstemmer_c/libstemmer.la (Where <name> is the name of the library or executable which links against libstemmer.) snowball-3.0.1/doc/libstemmer_csharp_README000066400000000000000000000042101500727106100205400ustar00rootroot00000000000000libstemmer_csharp ================= This document pertains to the C# version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve.
If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Compiling the library ===================== To build a library:: mcs -target:library -out:snowballstemmer.dll csharp/Snowball/*.cs csharp/Snowball/Algorithms/*cs And to build the example program using that library:: mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Program.cs Using the library ================= The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. There is currently no formal documentation on the use of the C# version of the library. Additionally, its interface is not guaranteed to be stable. The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. snowball-3.0.1/doc/libstemmer_java_README000066400000000000000000000054451500727106100202140ustar00rootroot00000000000000libstemmer_java =============== This document pertains to the Java version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. 
This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Requirements ============ The Java code generated by Snowball requires Java >= 7 (since Snowball 3.0.0). Java 7 was released in 2011, and Java 6's EOL was 2013 so we don't expect this to be a problematic requirement. Compiling the library ===================== Simply run the java compiler on all the java source files under the java directory. For example, this can be done under unix by changing directory into the java directory, and running: javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java This will compile the library and also an example program "TestApp" which provides a command line interface to the library. Using the library ================= The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. There is currently no formal documentation on the use of the Java version of the library. Additionally, its interface is not guaranteed to be stable. The best documentation of the library is the source of the TestApp example program. The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. 
The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. The TestApp example =================== The TestApp example program allows you to run any of the stemmers compiled into the libstemmer library on a sample vocabulary. For details on how to use it, run it with no command line parameters. snowball-3.0.1/doc/libstemmer_js_README000066400000000000000000000040511500727106100176770ustar00rootroot00000000000000Snowball stemming library collection for Javascript =================================================== What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. How to use library ------------------ The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. You can use each of the stemming modules from Javascript code - e.g. to use them with node: .. 
code-block:: javascript var EnglishStemmer = require('english-stemmer.js'); var stemmer = new EnglishStemmer(); console.log(stemmer.stemWord("testing")); You'll need to bundle ``base-stemmer.js`` and whichever languages you want stemmers for (e.g. ``english-stemmer.js`` for English). FIXME: Document how to use in a web browser. The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. snowball-3.0.1/doc/libstemmer_python_README000066400000000000000000000113061500727106100206050ustar00rootroot00000000000000Snowball stemming library collection for Python =============================================== Python 3 (>= 3.3) is supported. We no longer support Python 2 as the Python developers stopped supporting it at the start of 2020. Snowball 2.1.0 was the last release to officially support Python 2; Snowball 3.0.0 was the last release which had the code to support Python 2, but we were no longer testing it. What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. 
We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. How to use library ------------------ The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. The ``snowballstemmer`` module has two functions. The ``snowballstemmer.algorithms`` function returns a list of available algorithm names. The ``snowballstemmer.stemmer`` function takes an algorithm name and returns a ``Stemmer`` object. ``Stemmer`` objects have a ``Stemmer.stemWord(word)`` method and a ``Stemmer.stemWords(word[])`` method. .. code-block:: python import snowballstemmer stemmer = snowballstemmer.stemmer('english'); print(stemmer.stemWords("We are the world".split())); Generally you should create a stemmer object and reuse it rather than creating a fresh object for each word stemmed (since there's some cost to creating and destroying the object). The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar (e.g. `threading.Lock` in Python) but that's liable to slow your program down as threads can end up waiting for the lock. Automatic Acceleration ---------------------- `PyStemmer `_ is a wrapper module for Snowball's ``libstemmer_c`` and should provide results 100% compatible to **snowballstemmer**. 
**PyStemmer** is faster because it wraps generated C versions of the stemmers; **snowballstemmer** uses generated Python code and is slower but offers a pure Python solution. If PyStemmer is installed, ``snowballstemmer.stemmer`` returns a ``PyStemmer`` ``Stemmer`` object which provides the same ``Stemmer.stemWord()`` and ``Stemmer.stemWords()`` methods. Benchmark ~~~~~~~~~ This is a crude benchmark which measures the time for running each stemmer on every word in its sample vocabulary (10,787,583 words over 26 languages). It's not a realistic test of normal use as a real application would do much more than just stemming. It's also skewed towards the stemmers which do more work per word and towards those with larger sample vocabularies. * Python 2.7 + **snowballstemmer** : 13m00s (15.0 * PyStemmer) * Python 3.7 + **snowballstemmer** : 12m19s (14.2 * PyStemmer) * PyPy 7.1.1 (Python 2.7.13) + **snowballstemmer** : 2m14s (2.6 * PyStemmer) * PyPy 7.1.1 (Python 3.6.1) + **snowballstemmer** : 1m46s (2.0 * PyStemmer) * Python 2.7 + **PyStemmer** : 52s For reference the equivalent test for C runs in 9 seconds. These results are for Snowball 2.0.0. They're likely to evolve over time as the code Snowball generates for both Python and C continues to improve (for a much older test over a different set of stemmers using Python 2.7, **snowballstemmer** was 30 times slower than **PyStemmer**, or 9 times slower with **PyPy**). The message to take away is that if you're stemming a lot of words you should either install **PyStemmer** (which **snowballstemmer** will then automatically use for you as described above) or use PyPy. The TestApp example ------------------- The ``testapp.py`` example program allows you to run any of the stemmers on a sample vocabulary. Usage:: testapp.py <algorithm name> "sentences ... " .. code-block:: bash $ python testapp.py English "sentences... 
" snowball-3.0.1/examples/000077500000000000000000000000001500727106100151515ustar00rootroot00000000000000snowball-3.0.1/examples/stemwords.c000066400000000000000000000144061500727106100173510ustar00rootroot00000000000000/* This is a simple program which uses libstemmer to provide a command * line interface for stemming using any of the algorithms provided. */ #include #include /* for malloc, free */ #include /* for memmove */ #include /* for isupper, tolower */ #include "libstemmer.h" const char * progname; static int pretty = 1; static void stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) { #define INC 10 int lim = INC; sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); while (1) { int ch = getc(f_in); if (ch == EOF) { free(b); return; } { int i = 0; int inlen = 0; while (ch != '\n' && ch != EOF) { if (i == lim) { sb_symbol * newb; newb = (sb_symbol *) realloc(b, (lim + INC) * sizeof(sb_symbol)); if (newb == NULL) goto error; b = newb; lim = lim + INC; } /* Update count of utf-8 characters. */ if (ch < 0x80 || ch > 0xBF) inlen += 1; /* force lower case: */ ch = tolower(ch); b[i] = ch; i++; ch = getc(f_in); } { const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); if (stemmed == NULL) { fprintf(stderr, "Out of memory or internal error\n"); exit(1); } if (pretty == 1) { fwrite(b, i, 1, f_out); fputs(" -> ", f_out); } else if (pretty == 2) { fwrite(b, i, 1, f_out); if (sb_stemmer_length(stemmer) > 0) { int j; if (inlen < 30) { for (j = 30 - inlen; j > 0; j--) fputs(" ", f_out); } else { fputs("\n", f_out); for (j = 30; j > 0; j--) fputs(" ", f_out); } } } fputs((const char *)stemmed, f_out); putc('\n', f_out); } } } error: free(b); return; } /** Display the command line syntax, and then exit. * @param n The value to exit with. */ static void usage(int n) { printf("usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h]\n" "\n" "The input file consists of a list of words to be stemmed, one per\n" "line. 
Words should be in lower case, but (for English) A-Z letters\n" "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" "used.\n" "\n" "If -c is given, the argument is the character encoding of the input\n" "and output files. If it is omitted, the UTF-8 encoding is used.\n" "\n" "If -p is given the output file consists of each word of the input\n" "file followed by \"->\" followed by its stemmed equivalent.\n" "If -p2 is given the output file is a two column layout containing\n" "the input words in the first column and the stemmed equivalents in\n" "the second column.\n" "Otherwise, the output file consists of the stemmed words, one per\n" "line.\n" "\n" "-h displays this help\n", progname); exit(n); } int main(int argc, char * argv[]) { const char * in = NULL; const char * out = NULL; FILE * f_in; FILE * f_out; struct sb_stemmer * stemmer; const char * language = "english"; const char * charenc = NULL; int i = 1; pretty = 0; progname = argv[0]; while (i < argc) { const char * s = argv[i++]; if (s[0] == '-') { if (strcmp(s, "-o") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } out = argv[i++]; } else if (strcmp(s, "-i") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } in = argv[i++]; } else if (strcmp(s, "-l") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } language = argv[i++]; } else if (strcmp(s, "-c") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } charenc = argv[i++]; } else if (strcmp(s, "-p2") == 0) { pretty = 2; } else if (strcmp(s, "-p") == 0) { pretty = 1; } else if (strcmp(s, "-h") == 0) { usage(0); } else { fprintf(stderr, "option %s unknown\n", s); usage(1); } } else { fprintf(stderr, "unexpected parameter %s\n", s); usage(1); } } /* prepare the files */ f_in = (in == NULL) ? 
stdin : fopen(in, "r"); if (f_in == NULL) { fprintf(stderr, "file %s not found\n", in); exit(1); } f_out = (out == NULL) ? stdout : fopen(out, "w"); if (f_out == NULL) { fprintf(stderr, "file %s cannot be opened\n", out); exit(1); } /* do the stemming process: */ stemmer = sb_stemmer_new(language, charenc); if (stemmer == NULL) { if (charenc == NULL) { fprintf(stderr, "language `%s' not available for stemming\n", language); exit(1); } else { fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); exit(1); } } stem_file(stemmer, f_in, f_out); sb_stemmer_delete(stemmer); if (in != NULL) (void) fclose(f_in); if (out != NULL) (void) fclose(f_out); return 0; } snowball-3.0.1/go/000077500000000000000000000000001500727106100137405ustar00rootroot00000000000000snowball-3.0.1/go/README.md000066400000000000000000000034501500727106100152210ustar00rootroot00000000000000# Go Target for Snowball The initial implementation was built as a port of the Rust target. The initial focus has been on getting it to function, and making it work correctly. No attempt has been made to beautify the implementation, generated code, or address performance issues. ## Usage To generate Go source for a Snowball algorithm: ``` $ snowball path/to/algorithm.sbl -go -o algorithm ``` ### Go specific options `-gop[ackage]` the package name used in the generated go file (defaults to `snowball`) `-gor[untime]` the import path used for the Go Snowball runtime (defaults to `github.com/snowballstem/snowball/go`) ## Code Organization `compiler/generator_go.c` has the Go code generation logic `go/` contains the default Go Snowball runtime support `go/stemwords` contains the source for a Go version of the stemwords utility `go/algorithms` location where the makefile generated code will end up ## Using the Generated Stemmers Assuming you generated a stemmer, put that code in a package which is imported by this code as `english`. 
``` env := snowball.NewEnv("beautiful") english.Stem(env) fmt.Printf("stemmed word is: %s", env.Current()) ``` NOTE: you can use the env.SetCurrent("new_word") to reuse the env on subsequent calls to the stemmer. ## Testing Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language. Run: ``` $ make check_go ``` An initial pass of fuzz-testing has been performed on the generated stemmers for the algorithms in this repo. Each ran for 5 minutes and used an initial corpus seeded with 10k words from the algorithm's snowballstem-data voc.txt file. ## Known Limitations - Code going through generate_dollar production has not been tested - Code going through generate_debug production has not been tested snowball-3.0.1/go/among.go000066400000000000000000000004071500727106100153710ustar00rootroot00000000000000package snowball import "fmt" type AmongF func(env *Env, ctx interface{}) bool type Among struct { Str string A int32 B int32 F AmongF } func (a *Among) String() string { return fmt.Sprintf("str: `%s`, a: %d, b: %d, f: %p", a.Str, a.A, a.B, a.F) } snowball-3.0.1/go/env.go000066400000000000000000000211131500727106100150550ustar00rootroot00000000000000package snowball import ( "log" "strings" "unicode/utf8" ) // Env represents the Snowball execution environment type Env struct { current string Cursor int Limit int LimitBackward int Bra int Ket int } // NewEnv creates a new Snowball execution environment on the provided string func NewEnv(val string) *Env { return &Env{ current: val, Cursor: 0, Limit: len(val), LimitBackward: 0, Bra: 0, Ket: len(val), } } func (env *Env) Current() string { return env.current } func (env *Env) SetCurrent(s string) { env.current = s env.Cursor = 0 env.Limit = len(s) env.LimitBackward = 0 env.Bra = 0 env.Ket = len(s) } func (env *Env) ReplaceS(bra, ket int, s string) int32 { adjustment := int32(len(s)) - (int32(ket) - int32(bra)) result := env.current[:bra] result += s rsplit := ket if 
ket < bra { rsplit = bra } result += env.current[rsplit:] newLim := int32(env.Limit) + adjustment env.Limit = int(newLim) if env.Cursor >= ket { newCur := int32(env.Cursor) + adjustment env.Cursor = int(newCur) } else if env.Cursor > bra { env.Cursor = bra } env.current = result return adjustment } func (env *Env) EqS(s string) bool { if env.Cursor >= env.Limit { return false } if strings.HasPrefix(env.current[env.Cursor:], s) { env.Cursor += len(s) for !onCharBoundary(env.current, env.Cursor) { env.Cursor++ } return true } return false } func (env *Env) EqSB(s string) bool { if int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) { return false } else if !onCharBoundary(env.current, env.Cursor-len(s)) || !strings.HasPrefix(env.current[env.Cursor-len(s):], s) { return false } else { env.Cursor -= len(s) return true } } func (env *Env) SliceFrom(s string) bool { bra, ket := env.Bra, env.Ket env.ReplaceS(bra, ket, s) return true } func (env *Env) NextChar() { env.Cursor++ for !onCharBoundary(env.current, env.Cursor) { env.Cursor++ } } func (env *Env) PrevChar() { env.Cursor-- for !onCharBoundary(env.current, env.Cursor) { env.Cursor-- } } func (env *Env) Hop(delta int32) bool { res := env.Cursor for delta > 0 { delta-- if res >= env.Limit { return false } res++ for res < env.Limit && !onCharBoundary(env.current, res) { res++ } } env.Cursor = res return true } func (env *Env) HopChecked(delta int32) bool { return delta >= 0 && env.Hop(delta) } func (env *Env) HopBack(delta int32) bool { res := env.Cursor for delta > 0 { delta-- if res <= env.LimitBackward { return false } res-- for res > env.LimitBackward && !onCharBoundary(env.current, res) { res-- } } env.Cursor = res return true } func (env *Env) HopBackChecked(delta int32) bool { return delta >= 0 && env.HopBack(delta) } func (env *Env) InGrouping(chars []byte, min, max int32) bool { if env.Cursor >= env.Limit { return false } r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { 
return false } if r > max || r < min { return false } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return false } env.NextChar() return true } func (env *Env) GoInGrouping(chars []byte, min, max int32) bool { for env.Cursor < env.Limit { r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return true } env.NextChar() } return false } func (env *Env) InGroupingB(chars []byte, min, max int32) bool { if env.Cursor <= env.LimitBackward { return false } c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { env.Cursor = c return false } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.Cursor = c return false } return true } func (env *Env) GoInGroupingB(chars []byte, min, max int32) bool { for env.Cursor > env.LimitBackward { c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { env.Cursor = c return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.Cursor = c return true } } return false } func (env *Env) OutGrouping(chars []byte, min, max int32) bool { if env.Cursor >= env.Limit { return false } r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { env.NextChar() return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.NextChar() return true } return false } func (env *Env) GoOutGrouping(chars []byte, min, max int32) bool { for env.Cursor < env.Limit { r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r <= max && r >= min { r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 { return true } } env.NextChar() } return false } func 
(env *Env) OutGroupingB(chars []byte, min, max int32) bool { if env.Cursor <= env.LimitBackward { return false } c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return true } env.Cursor = c return false } func (env *Env) GoOutGroupingB(chars []byte, min, max int32) bool { for env.Cursor > env.LimitBackward { c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r <= max && r >= min { r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 { env.Cursor = c return true } } } return false } func (env *Env) SliceDel() bool { return env.SliceFrom("") } func (env *Env) Insert(bra, ket int, s string) { adjustment := env.ReplaceS(bra, ket, s) if bra <= env.Bra { env.Bra = int(int32(env.Bra) + adjustment) } if bra <= env.Ket { env.Ket = int(int32(env.Ket) + adjustment) } } func (env *Env) SliceTo() string { return env.current[env.Bra:env.Ket] } func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 { var i int32 j := int32(len(amongs)) c := env.Cursor l := env.Limit var commonI, commonJ int firstKeyInspected := false for { k := i + ((j - i) >> 1) var diff int32 common := min(commonI, commonJ) w := amongs[k] for lvar := common; lvar < len(w.Str); lvar++ { if c+common == l { diff-- break } diff = int32(env.current[c+common]) - int32(w.Str[lvar]) if diff != 0 { break } common++ } if diff < 0 { j = k commonJ = common } else { i = k commonI = common } if j-i <= 1 { if i > 0 { break } if j == i { break } if firstKeyInspected { break } firstKeyInspected = true } } for { w := amongs[i] if commonI >= len(w.Str) { env.Cursor = c + len(w.Str) if w.F != nil { res := w.F(env, ctx) env.Cursor = c + len(w.Str) if res { return w.B } } else { return w.B } } i = w.A if i < 0 { return 0 } } } func (env *Env) 
FindAmongB(amongs []*Among, ctx interface{}) int32 { var i int32 j := int32(len(amongs)) c := env.Cursor lb := env.LimitBackward var commonI, commonJ int firstKeyInspected := false for { k := i + ((j - i) >> 1) diff := int32(0) common := min(commonI, commonJ) w := amongs[k] for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- { if c-common == lb { diff-- break } diff = int32(env.current[c-common-1]) - int32(w.Str[lvar]) if diff != 0 { break } // Count up commons. But not one character but the byte width of that char common++ } if diff < 0 { j = k commonJ = common } else { i = k commonI = common } if j-i <= 1 { if i > 0 { break } if j == i { break } if firstKeyInspected { break } firstKeyInspected = true } } for { w := amongs[i] if commonI >= len(w.Str) { env.Cursor = c - len(w.Str) if w.F != nil { res := w.F(env, ctx) env.Cursor = c - len(w.Str) if res { return w.B } } else { return w.B } } i = w.A if i < 0 { return 0 } } } func (env *Env) Debug(count, lineNumber int) { log.Printf("snowball debug, count: %d, line: %d", count, lineNumber) } func (env *Env) Clone() *Env { clone := *env return &clone } func (env *Env) AssignTo() string { return env.Current() } snowball-3.0.1/go/stemwords/000077500000000000000000000000001500727106100157675ustar00rootroot00000000000000snowball-3.0.1/go/stemwords/generate.go000066400000000000000000000020421500727106100201060ustar00rootroot00000000000000// +build ignore package main import ( "flag" "fmt" "io" "io/ioutil" "log" "os" ) // tool to register all algorithms built with the stemwords tool func main() { flag.Parse() if flag.NArg() < 1 { log.Fatal("must specify algorithms directory") } var w io.Writer if flag.NArg() > 1 { var err error w, err = os.Create(flag.Arg(1)) if err != nil { log.Fatalf("error creating output file %v", err) } } else { w = os.Stdout } fmt.Fprintf(w, "%s", header) files, err := ioutil.ReadDir(flag.Arg(0)) if err != nil { log.Fatal(err) } for _, file := range files { fmt.Fprintf(w, " %s 
\"github.com/snowballstem/snowball/go/algorithms/%s\"\n", file.Name(), file.Name()) } fmt.Fprintf(w, closeImportStartInit) for _, file := range files { fmt.Fprintf(w, " languages[\"%s\"] = %s.Stem\n", file.Name(), file.Name()) } fmt.Fprintf(w, "%s", footer) } var header = `// generated list of supported algorithms, DO NOT EDIT package main import ( ` var closeImportStartInit = `) func init() {` var footer = `} ` snowball-3.0.1/go/stemwords/main.go000066400000000000000000000023251500727106100172440ustar00rootroot00000000000000//go:generate go run generate.go ../algorithms algorithms.go //go:generate gofmt -s -w algorithms.go package main import ( "bufio" "flag" "fmt" "log" "os" snowballRuntime "github.com/snowballstem/snowball/go" ) var language = flag.String("l", "", "language") var input = flag.String("i", "", "input file") var output = flag.String("o", "", "output file") func main() { flag.Parse() if *language == "" { log.Fatal("must specify language") } stemmer, ok := languages[*language] if !ok { log.Fatalf("no language support for %s", *language) } var reader = os.Stdin if *input != "" { var err error reader, err = os.Open(*input) if err != nil { log.Fatal(err) } defer reader.Close() } var writer = os.Stdout if *output != "" { var err error writer, err = os.Create(*output) if err != nil { log.Fatal(err) } defer writer.Close() } var err error scanner := bufio.NewScanner(reader) for scanner.Scan() { word := scanner.Text() env := snowballRuntime.NewEnv(word) stemmer(env) fmt.Fprintf(writer, "%s\n", env.Current()) } if err = scanner.Err(); err != nil { log.Fatal(err) } } type StemFunc func(env *snowballRuntime.Env) bool var languages = make(map[string]StemFunc) snowball-3.0.1/go/util.go000066400000000000000000000010561500727106100152460ustar00rootroot00000000000000package snowball import ( "math" "unicode/utf8" ) const MaxInt = math.MaxInt32 const MinInt = math.MinInt32 func min(a, b int) int { if a < b { return a } return b } func onCharBoundary(s string, pos 
#!/usr/bin/env python

# Simple (but slow) iconv replacement in Python.
#
# Usage: iconv.py -f IN_CHARSET -t OUT_CHARSET [-o OUT_FILE] [IN_FILE]
#
# An option's value may be attached ("-futf-8") or given as the next
# argument ("-f utf-8").  Input defaults to stdin and output to stdout.

import sys


def fail(message):
    """Print message to stderr and exit with status 1."""
    sys.stderr.write(message + "\n")
    sys.exit(1)


def parse_args(args):
    """Parse the command line arguments (without the program name).

    Returns a tuple (in_cs, out_cs, in_path, out_path) in which any item
    that was not specified is None.  Exits with status 1 on an unknown
    option or on more than one input file.
    """
    in_cs = out_cs = in_path = out_path = pending = None
    for arg in args:
        if pending is not None:
            # The previous argument was a bare "-f", "-t" or "-o": this
            # argument is its value.
            arg = pending + arg
            pending = None

        if arg.startswith('-'):
            # Slice rather than index so that a bare "-" gives an empty
            # option letter instead of raising IndexError.
            opt = arg[1:2]
            if opt in ('f', 't', 'o'):
                if len(arg) == 2:
                    pending = arg
                    continue
                value = arg[2:]
                if opt == 'f':
                    in_cs = value
                elif opt == 't':
                    out_cs = value
                else:
                    out_path = value
                continue

            fail("Unknown option: '%s'" % arg)

        if in_path is None:
            in_path = arg
            continue

        fail("Too many arguments")

    return in_cs, out_cs, in_path, out_path


def transcode(data, in_cs, out_cs):
    """Convert the bytes in data from charset in_cs to charset out_cs."""
    return data.decode(in_cs).encode(out_cs)


def main(argv):
    """Run the conversion described by the command line argv."""
    in_cs, out_cs, in_path, out_path = parse_args(argv[1:])

    if in_cs is None:
        fail("Need to specify input cs with -f")

    if out_cs is None:
        fail("Need to specify output cs with -t")

    if in_path is None:
        # Python 3 exposes the underlying byte stream as .buffer; Python 2
        # streams handle bytes directly.
        data = getattr(sys.stdin, 'buffer', sys.stdin).read()
    else:
        with open(in_path, 'rb') as in_file:
            data = in_file.read()

    converted = transcode(data, in_cs, out_cs)

    if out_path is None:
        out_file = getattr(sys.stdout, 'buffer', sys.stdout)
        out_file.write(converted)
        out_file.flush()
    else:
        with open(out_path, 'wb') as out_file:
            out_file.write(converted)


if __name__ == '__main__':
    main(sys.argv)
*/ /** Returns an array of the names of the available stemming algorithms. * Note that these are the canonical names - aliases (ie, other names for * the same algorithm) will not be included in the list. * The list is terminated with a null pointer. * * The list must not be modified in any way. */ const char ** sb_stemmer_list(void); /** Create a new stemmer object, using the specified algorithm, for the * specified character encoding. * * All algorithms will usually be available in UTF-8, but may also be * available in other character encodings. * * @param algorithm The algorithm name. This is either the english * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the * language. Note that case is significant in this parameter - the * value should be supplied in lower case. * * @param charenc The character encoding. NULL may be passed as * this value, in which case UTF-8 encoding will be assumed. Otherwise, * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is * significant in this parameter. * * @return NULL if the specified algorithm is not recognised, or the * algorithm is not available for the requested encoding. Otherwise, * returns a pointer to a newly created stemmer for the requested algorithm. * The returned pointer must be deleted by calling sb_stemmer_delete(). * * @note NULL will also be returned if an out of memory error occurs. */ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); /** Delete a stemmer object. * * This frees all resources allocated for the stemmer. After calling * this function, the supplied stemmer may no longer be used in any way. * * It is safe to pass a null pointer to this function - this will have * no effect. */ void sb_stemmer_delete(struct sb_stemmer * stemmer); /** Stem a word. 
* * The stemming algorithms generally expect the input text to use composed * accents (Unicode NFC or NFKC) and to have been folded to lower case * already. * * The return value is owned by the stemmer - it must not be freed or * modified, and it will become invalid when the stemmer is called again, * or if the stemmer is freed. * * The length of the return value can be obtained using sb_stemmer_length(). * * If an out-of-memory error occurs, this will return NULL. */ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size); /** Get the length of the result of the last stemmed word. * This should not be called before sb_stemmer_stem() has been called. */ int sb_stemmer_length(struct sb_stemmer * stemmer); #ifdef __cplusplus } #endif snowball-3.0.1/java/000077500000000000000000000000001500727106100142545ustar00rootroot00000000000000snowball-3.0.1/java/org/000077500000000000000000000000001500727106100150435ustar00rootroot00000000000000snowball-3.0.1/java/org/tartarus/000077500000000000000000000000001500727106100167105ustar00rootroot00000000000000snowball-3.0.1/java/org/tartarus/snowball/000077500000000000000000000000001500727106100205315ustar00rootroot00000000000000snowball-3.0.1/java/org/tartarus/snowball/Among.java000066400000000000000000000033431500727106100224400ustar00rootroot00000000000000package org.tartarus.snowball; import java.lang.invoke.MethodHandle; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; import java.util.Locale; /** * Internal class used by Snowball stemmers */ public class Among { public Among (String s, int substring_i, int result) { this.s = s.toCharArray(); this.substring_i = substring_i; this.result = result; this.method = null; } public Among (String s, int substring_i, int result, String methodname, MethodHandles.Lookup methodobject) { this.s = s.toCharArray(); this.substring_i = substring_i; this.result = result; final Class clazz = 
methodobject.lookupClass().asSubclass(SnowballProgram.class); if (methodname.length() > 0) { try { this.method = methodobject.findVirtual(clazz, methodname, MethodType.methodType(boolean.class)) .asType(MethodType.methodType(boolean.class, SnowballProgram.class)); } catch (NoSuchMethodException | IllegalAccessException e) { throw new RuntimeException(String.format(Locale.ENGLISH, "Snowball program '%s' is broken, cannot access method: boolean %s()", clazz.getSimpleName(), methodname ), e); } } else { this.method = null; } } final char[] s; /* search string */ final int substring_i; /* index to longest matching substring */ final int result; /* result of the lookup */ // Make sure this is not accessible outside package for Java security reasons! final MethodHandle method; /* method to use if substring matches */ } snowball-3.0.1/java/org/tartarus/snowball/SnowballProgram.java000066400000000000000000000321371500727106100245130ustar00rootroot00000000000000 package org.tartarus.snowball; import java.lang.reflect.UndeclaredThrowableException; import java.io.Serializable; import java.util.Arrays; /** * Base class for a snowball stemmer */ public class SnowballProgram implements Serializable { protected SnowballProgram() { cursor = 0; length = limit = 0; limit_backward = 0; bra = cursor; ket = limit; } static final long serialVersionUID = 2016072500L; /** * Set the current string. */ public void setCurrent(String value) { setCurrent(value.toCharArray(), value.length()); } /** * Get the current string. */ public String getCurrent() { return new String(current, 0, length); } /** * Set the current string. * @param text character array containing input * @param length valid length of text. */ public void setCurrent(char[] text, int length) { current = text; cursor = 0; this.length = limit = length; limit_backward = 0; bra = cursor; ket = limit; } /** * Get the current buffer containing the stem. *

* NOTE: this may be a reference to a different character array than the * one originally provided with setCurrent, in the exceptional case that * stemming produced a longer intermediate or result string. *

*

* It is necessary to use {@link #getCurrentBufferLength()} to determine * the valid length of the returned buffer. For example, many words are * stemmed simply by subtracting from the length to remove suffixes. *

* @see #getCurrentBufferLength() */ public char[] getCurrentBuffer() { return current; } /** * Get the valid length of the character array in * {@link #getCurrentBuffer()}. * @return valid length of the array. */ public int getCurrentBufferLength() { return length; } // current string private char[] current; protected int cursor; protected int length; protected int limit; protected int limit_backward; protected int bra; protected int ket; public SnowballProgram(SnowballProgram other) { current = other.current; cursor = other.cursor; length = other.length; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected void copy_from(SnowballProgram other) { current = other.current; cursor = other.cursor; length = other.length; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected boolean in_grouping(char[] s, int min, int max) { if (cursor >= limit) return false; int ch = current[cursor]; if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor++; return true; } protected boolean go_in_grouping(char[] s, int min, int max) { while (cursor < limit) { int ch = current[cursor]; if (ch > max || ch < min) return true; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return true; cursor++; } return false; } protected boolean in_grouping_b(char[] s, int min, int max) { if (cursor <= limit_backward) return false; int ch = current[cursor - 1]; if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor--; return true; } protected boolean go_in_grouping_b(char[] s, int min, int max) { while (cursor > limit_backward) { int ch = current[cursor - 1]; if (ch > max || ch < min) return true; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return true; cursor--; } return false; } protected boolean out_grouping(char[] s, int min, int max) { if (cursor >= limit) 
return false; int ch = current[cursor]; if (ch > max || ch < min) { cursor++; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor++; return true; } return false; } protected boolean go_out_grouping(char[] s, int min, int max) { while (cursor < limit) { int ch = current[cursor]; if (ch <= max && ch >= min) { ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) != 0) { return true; } } cursor++; } return false; } protected boolean out_grouping_b(char[] s, int min, int max) { if (cursor <= limit_backward) return false; int ch = current[cursor - 1]; if (ch > max || ch < min) { cursor--; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor--; return true; } return false; } protected boolean go_out_grouping_b(char[] s, int min, int max) { while (cursor > limit_backward) { int ch = current[cursor - 1]; if (ch <= max && ch >= min) { ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) != 0) { return true; } } cursor--; } return false; } protected boolean eq_s(CharSequence s) { if (limit - cursor < s.length()) return false; int i; for (i = 0; i != s.length(); i++) { if (current[cursor + i] != s.charAt(i)) return false; } cursor += s.length(); return true; } protected boolean eq_s_b(CharSequence s) { if (cursor - limit_backward < s.length()) return false; int i; for (i = 0; i != s.length(); i++) { if (current[cursor - s.length() + i] != s.charAt(i)) return false; } cursor -= s.length(); return true; } protected int find_among(Among[] v) { int i = 0; int j = v.length; int c = cursor; int l = limit; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; // smaller Among w = v[k]; int i2; for (i2 = common; i2 < w.s.length; i2++) { if (c + common == l) { diff = -1; break; } diff = current[c + common] - w.s[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.s.length) { cursor = c + w.s.length; if (w.method == null) return w.result; boolean res = false; try { res = (boolean) w.method.invokeExact(this); } catch (Error | RuntimeException e) { throw e; } catch (Throwable e) { throw new UndeclaredThrowableException(e); } cursor = c + w.s.length; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } // find_among_b is for backwards processing. Same comments apply protected int find_among_b(Among[] v) { int i = 0; int j = v.length; int c = cursor; int lb = limit_backward; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; Among w = v[k]; int i2; for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = current[c - 1 - common] - w.s[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.s.length) { cursor = c - w.s.length; if (w.method == null) return w.result; boolean res = false; try { res = (boolean) w.method.invokeExact(this); } catch (Error | RuntimeException e) { throw e; } catch (Throwable e) { throw new UndeclaredThrowableException(e); } cursor = c - w.s.length; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } /* to replace chars between c_bra and c_ket in current by the * chars in s. */ protected int replace_s(int c_bra, int c_ket, CharSequence s) { final int adjustment = s.length() - (c_ket - c_bra); final int newLength = length + adjustment; //resize if necessary if (newLength > current.length) { current = Arrays.copyOf(current, newLength); } // if the substring being replaced is longer or shorter than the // replacement, need to shift things around if (adjustment != 0 && c_ket < length) { System.arraycopy(current, c_ket, current, c_bra + s.length(), length - c_ket); } // insert the replacement text // Note, faster is s.getChars(0, s.length(), current, c_bra); // but would have to duplicate this method for both String and StringBuilder for (int i = 0; i < s.length(); i++) current[c_bra + i] = s.charAt(i); length += adjustment; limit += adjustment; if (cursor >= c_ket) cursor += adjustment; else if (cursor > c_bra) cursor = c_bra; return adjustment; } protected void slice_check() { assert bra >= 0 : "bra=" + bra; assert bra <= ket : "bra=" + bra + ",ket=" + ket; assert limit <= length : "limit=" + limit + ",length=" + length; assert ket <= limit : "ket=" + ket 
+ ",limit=" + limit; } protected void slice_from(CharSequence s) { slice_check(); replace_s(bra, ket, s); } protected void slice_del() { slice_from(""); } protected void insert(int c_bra, int c_ket, CharSequence s) { int adjustment = replace_s(c_bra, c_ket, s); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } /* Copy the slice into the supplied StringBuilder */ protected void slice_to(StringBuilder s) { slice_check(); int len = ket - bra; s.setLength(0); s.append(current, bra, len); } protected void assign_to(StringBuilder s) { s.setLength(0); s.append(current, 0, limit); } /* extern void debug(struct SN_env * z, int number, int line_count) { int i; int limit = SIZE(z->p); //if (number >= 0) printf("%3d (line %4d): '", number, line_count); if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i) printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) { int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } printf("'\n"); } */ } snowball-3.0.1/java/org/tartarus/snowball/SnowballStemmer.java000066400000000000000000000004301500727106100245070ustar00rootroot00000000000000 package org.tartarus.snowball; /** * Parent class of all snowball stemmers, which must implement stem */ public abstract class SnowballStemmer extends SnowballProgram { public abstract boolean stem(); static final long serialVersionUID = 2016072500L; }; snowball-3.0.1/java/org/tartarus/snowball/TestApp.java000066400000000000000000000053351500727106100227620ustar00rootroot00000000000000 package org.tartarus.snowball; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.Writer; import 
java.nio.charset.StandardCharsets; import java.util.Arrays; public class TestApp { private static void usage() { System.err.println("Usage: TestApp [] [-o ]"); } private static SnowballStemmer getStemmer(String lang) { try { String c = "org.tartarus.snowball.ext." + lang + "Stemmer"; return (SnowballStemmer) Class.forName(c).getDeclaredConstructor().newInstance(); } catch (ReflectiveOperationException e) { return null; } } public static void main(String[] args) throws Throwable { if (args.length < 1) { usage(); return; } SnowballStemmer stemmer = getStemmer(args[0]); if (stemmer == null) { System.err.println("Stemmer " + args[0] + " not found"); return; } int arg = 1; InputStream instream; if (args.length > arg && !args[arg].equals("-o")) { instream = new FileInputStream(args[arg++]); } else { instream = System.in; } OutputStream outstream; if (args.length > arg) { if (args.length != arg + 2 || !args[arg].equals("-o")) { usage(); return; } outstream = new FileOutputStream(args[arg + 1]); } else { outstream = System.out; } Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8); reader = new BufferedReader(reader); Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8); output = new BufferedWriter(output); char[] input = new char[8]; int length = 0; int character; while ((character = reader.read()) != -1) { char ch = (char) character; if (Character.isWhitespace(ch)) { stemmer.setCurrent(input, length); stemmer.stem(); output.write(stemmer.getCurrentBuffer(), 0, stemmer.getCurrentBufferLength()); output.write('\n'); length = 0; } else { if (length == input.length) { input = Arrays.copyOf(input, length + 1); } input[length++] = ch < 127 ? 
Character.toLowerCase(ch) : ch; } } output.close(); } } snowball-3.0.1/javascript/000077500000000000000000000000001500727106100155015ustar00rootroot00000000000000snowball-3.0.1/javascript/base-stemmer.js000066400000000000000000000300611500727106100204230ustar00rootroot00000000000000// @ts-check /**@constructor*/ const BaseStemmer = function() { /** @protected */ this.current = ''; this.cursor = 0; this.limit = 0; this.limit_backward = 0; this.bra = 0; this.ket = 0; /** * @param {string} value */ this.setCurrent = function(value) { this.current = value; this.cursor = 0; this.limit = this.current.length; this.limit_backward = 0; this.bra = this.cursor; this.ket = this.limit; }; /** * @return {string} */ this.getCurrent = function() { return this.current; }; /** * @param {BaseStemmer} other */ this.copy_from = function(other) { /** @protected */ this.current = other.current; this.cursor = other.cursor; this.limit = other.limit; this.limit_backward = other.limit_backward; this.bra = other.bra; this.ket = other.ket; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.in_grouping = function(s, min, max) { /** @protected */ if (this.cursor >= this.limit) return false; var ch = this.current.charCodeAt(this.cursor); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; this.cursor++; return true; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.go_in_grouping = function(s, min, max) { /** @protected */ while (this.cursor < this.limit) { var ch = this.current.charCodeAt(this.cursor); if (ch > max || ch < min) return true; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return true; this.cursor++; } return false; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.in_grouping_b = function(s, min, max) { /** @protected */ if (this.cursor <= this.limit_backward) 
return false; var ch = this.current.charCodeAt(this.cursor - 1); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; this.cursor--; return true; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.go_in_grouping_b = function(s, min, max) { /** @protected */ while (this.cursor > this.limit_backward) { var ch = this.current.charCodeAt(this.cursor - 1); if (ch > max || ch < min) return true; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return true; this.cursor--; } return false; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.out_grouping = function(s, min, max) { /** @protected */ if (this.cursor >= this.limit) return false; var ch = this.current.charCodeAt(this.cursor); if (ch > max || ch < min) { this.cursor++; return true; } ch -= min; if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) { this.cursor++; return true; } return false; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.go_out_grouping = function(s, min, max) { /** @protected */ while (this.cursor < this.limit) { var ch = this.current.charCodeAt(this.cursor); if (ch <= max && ch >= min) { ch -= min; if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) != 0) { return true; } } this.cursor++; } return false; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.out_grouping_b = function(s, min, max) { /** @protected */ if (this.cursor <= this.limit_backward) return false; var ch = this.current.charCodeAt(this.cursor - 1); if (ch > max || ch < min) { this.cursor--; return true; } ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) { this.cursor--; return true; } return false; }; /** * @param {number[]} s * @param {number} min * @param {number} max * @return {boolean} */ this.go_out_grouping_b = function(s, min, max) { /** @protected */ while (this.cursor 
> this.limit_backward) { var ch = this.current.charCodeAt(this.cursor - 1); if (ch <= max && ch >= min) { ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) != 0) { return true; } } this.cursor--; } return false; }; /** * @param {string} s * @return {boolean} */ this.eq_s = function(s) { /** @protected */ if (this.limit - this.cursor < s.length) return false; if (this.current.slice(this.cursor, this.cursor + s.length) != s) { return false; } this.cursor += s.length; return true; }; /** * @param {string} s * @return {boolean} */ this.eq_s_b = function(s) { /** @protected */ if (this.cursor - this.limit_backward < s.length) return false; if (this.current.slice(this.cursor - s.length, this.cursor) != s) { return false; } this.cursor -= s.length; return true; }; /** * @param {Among[]} v * @return {number} */ this.find_among = function(v) { /** @protected */ var i = 0; var j = v.length; var c = this.cursor; var l = this.limit; var common_i = 0; var common_j = 0; var first_key_inspected = false; while (true) { var k = i + ((j - i) >>> 1); var diff = 0; var common = common_i < common_j ? common_i : common_j; // smaller // w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional) var w = v[k]; var i2; for (i2 = common; i2 < w[0].length; i2++) { if (c + common == l) { diff = -1; break; } diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2); if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. 
if (first_key_inspected) break; first_key_inspected = true; } } do { var w = v[i]; if (common_i >= w[0].length) { this.cursor = c + w[0].length; if (w.length < 4) return w[2]; var res = w[3](this); this.cursor = c + w[0].length; if (res) return w[2]; } i = w[1]; } while (i >= 0); return 0; }; // find_among_b is for backwards processing. Same comments apply /** * @param {Among[]} v * @return {number} */ this.find_among_b = function(v) { /** @protected */ var i = 0; var j = v.length var c = this.cursor; var lb = this.limit_backward; var common_i = 0; var common_j = 0; var first_key_inspected = false; while (true) { var k = i + ((j - i) >> 1); var diff = 0; var common = common_i < common_j ? common_i : common_j; var w = v[k]; var i2; for (i2 = w[0].length - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2); if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } do { var w = v[i]; if (common_i >= w[0].length) { this.cursor = c - w[0].length; if (w.length < 4) return w[2]; var res = w[3](this); this.cursor = c - w[0].length; if (res) return w[2]; } i = w[1]; } while (i >= 0); return 0; }; /* to replace chars between c_bra and c_ket in this.current by the * chars in s. 
*/ /** * @param {number} c_bra * @param {number} c_ket * @param {string} s * @return {number} */ this.replace_s = function(c_bra, c_ket, s) { /** @protected */ var adjustment = s.length - (c_ket - c_bra); this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket); this.limit += adjustment; if (this.cursor >= c_ket) this.cursor += adjustment; else if (this.cursor > c_bra) this.cursor = c_bra; return adjustment; }; /** * @return {boolean} */ this.slice_check = function() { /** @protected */ if (this.bra < 0 || this.bra > this.ket || this.ket > this.limit || this.limit > this.current.length) { return false; } return true; }; /** * @param {number} c_bra * @return {boolean} */ this.slice_from = function(s) { /** @protected */ var result = false; if (this.slice_check()) { this.replace_s(this.bra, this.ket, s); result = true; } return result; }; /** * @return {boolean} */ this.slice_del = function() { /** @protected */ return this.slice_from(""); }; /** * @param {number} c_bra * @param {number} c_ket * @param {string} s */ this.insert = function(c_bra, c_ket, s) { /** @protected */ var adjustment = this.replace_s(c_bra, c_ket, s); if (c_bra <= this.bra) this.bra += adjustment; if (c_bra <= this.ket) this.ket += adjustment; }; /** * @return {string} */ this.slice_to = function() { /** @protected */ var result = ''; if (this.slice_check()) { result = this.current.slice(this.bra, this.ket); } return result; }; /** * @return {string} */ this.assign_to = function() { /** @protected */ return this.current.slice(0, this.limit); }; }; if (typeof module === 'object' && module.exports) module.exports = BaseStemmer; snowball-3.0.1/javascript/stemwords.js000066400000000000000000000055051500727106100200730ustar00rootroot00000000000000const fs = require('fs'); const readline = require('readline'); function usage() { console.log("usage: stemwords.js [-l ] -i -o [-c ] [-h]\n"); console.log("The input file consists of a list of words to be stemmed, one per"); 
console.log("line. Words should be in lower case.\n"); console.log("If -c is given, the argument is the character encoding of the input"); console.log("and output files. If it is omitted, the UTF-8 encoding is used.\n"); console.log("The output file consists of the stemmed words, one per line.\n"); console.log("-h displays this help"); } if (process.argv.length < 5) { usage(); } else { var input = ''; var output = ''; var encoding = 'utf8'; var language = 'English'; var show_help = false; while (process.argv.length > 0) { var arg = process.argv.shift(); switch (arg) { case "-h": show_help = true; process.argv.length = 0; break; case "-l": if (process.argv.length == 0) { show_help = true; break; } language = process.argv.shift(); break; case "-i": if (process.argv.length == 0) { show_help = true; break; } input = process.argv.shift(); break; case "-o": if (process.argv.length == 0) { show_help = true; break; } output = process.argv.shift(); break; case "-c": if (process.argv.length == 0) { show_help = true; break; } encoding = process.argv.shift(); break; } } if (show_help || input == '' || output == '') { usage(); } else { stemming(language, input, output, encoding); } } // function stemming (lang : string, input : string, output : string, encoding : string) { function stemming (lang, input, output, encoding) { const lines = readline.createInterface({ input: fs.createReadStream(input, encoding), terminal: false }); var out = fs.createWriteStream(output, encoding); var stemmer = create(lang); lines.on('line', (original) => { out.write(stemmer.stemWord(original) + '\n'); }); } function create (name) { var lc_name = name.toLowerCase(); if (!lc_name.match('\\W') && lc_name != 'base') { try { const Stemmer = require(lc_name + '-stemmer.js'); return new Stemmer(); } catch (error) { } } console.log('Unknown stemming language: ' + name + '\n'); usage(); process.exit(1); } 
snowball-3.0.1/libstemmer/000077500000000000000000000000001500727106100154765ustar00rootroot00000000000000snowball-3.0.1/libstemmer/libstemmer_c.in000066400000000000000000000043061500727106100204760ustar00rootroot00000000000000 #include #include #include "../include/libstemmer.h" #include "../runtime/api.h" #include "@MODULES_H@" struct sb_stemmer { struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); struct SN_env * env; }; extern const char ** sb_stemmer_list(void) { return algorithm_names; } static stemmer_encoding_t sb_getenc(const char * charenc) { const struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; } extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; const struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); if (stemmer == NULL) return NULL; stemmer->create = module->create; stemmer->close = module->close; stemmer->stem = module->stem; stemmer->env = stemmer->create(); if (stemmer->env == NULL) { sb_stemmer_delete(stemmer); return NULL; } return stemmer; } void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; if (stemmer->close) { stemmer->close(stemmer->env); stemmer->close = 0; } free(stemmer); } const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) { int ret; if (SN_set_current(stemmer->env, size, (const symbol *)(word))) { stemmer->env->l = 0; return NULL; } ret = 
stemmer->stem(stemmer->env); if (ret < 0) return NULL; stemmer->env->p[stemmer->env->l] = 0; return (const sb_symbol *)(stemmer->env->p); } int sb_stemmer_length(struct sb_stemmer * stemmer) { return stemmer->env->l; } snowball-3.0.1/libstemmer/mkalgorithms.pl000077500000000000000000000035441500727106100205450ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use 5.006; use warnings; my $progname = $0; if (scalar @ARGV != 2) { print "Usage: $progname \n"; exit 1; } my $outname = shift(@ARGV); my $descfile = shift(@ARGV); my %aliases = (); my %algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = shift(); if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while ($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT <{$enc}; } print OUT "\n"; } } readinput(); printoutput(); snowball-3.0.1/libstemmer/mkmodules.pl000077500000000000000000000137251500727106100200460ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use 5.006; use warnings; my $progname = $0; if (scalar @ARGV < 4 || scalar @ARGV > 5) { print "Usage: $progname []\n"; exit 1; } my $outname = shift(@ARGV); my $c_src_dir = shift(@ARGV); my $descfile = shift(@ARGV); my $srclistfile = shift(@ARGV); my $enc_only; my $extn = ''; if (@ARGV) { $enc_only = shift(@ARGV); $extn = '_'.$enc_only; } my %aliases = (); my %algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = 
shift(); if (defined $enc_only) { my $norm_enc = lc $enc; $norm_enc =~ s/_//g; if ($norm_enc ne $enc_only) { return; } } if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while ($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { # print "$alias, $enc\n"; $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT < 77) { print OUT ",\n * "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n */\n\n"; foreach $lang (@algorithms) { my $hashref = $algorithm_encs{$lang}; foreach $enc (sort keys (%$hashref)) { print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n"; } } print OUT <$srclistfile") or die "Can't open output file `$srclistfile': $!\n"; print OUT < 77) { print OUT ",\n# "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n\nsnowball_sources= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { print OUT " src_c/stem_${enc}_${lang}.c \\\n"; } } $need_sep = 0; for $srcfile ('runtime/api.c', 'runtime/utilities.c', "libstemmer/libstemmer${extn}.c") { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\nsnowball_headers= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { my $p = "${lang}_${enc}"; print OUT " src_c/stem_${enc}_${lang}.h \\\n"; } } $need_sep 
= 0; for $srcfile ('include/libstemmer.h', "libstemmer/modules${extn}.h", 'runtime/api.h', 'runtime/header.h') { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\n"; close OUT or die "Can't close ${srclistfile}: $!\n"; } readinput(); printoutput(); printsrclist(); snowball-3.0.1/libstemmer/modules.txt000066400000000000000000000067671500727106100177270ustar00rootroot00000000000000# This file contains a list of stemmers to include in the distribution. # The format is a set of space separated lines - on each line: # First item is name of stemmer. # Second item is comma separated list of character sets. # Third item is comma separated list of names to refer to the stemmer by. # # Lines starting with a #, or blank lines, are ignored. # List all the main algorithms for each language, in UTF-8, and also with # the most commonly used encoding. arabic UTF_8 arabic,ar,ara armenian UTF_8 armenian,hy,hye,arm basque UTF_8,ISO_8859_1 basque,eu,eus,baq catalan UTF_8,ISO_8859_1 catalan,ca,cat danish UTF_8,ISO_8859_1 danish,da,dan dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld,kraaij_pohlmann english UTF_8,ISO_8859_1 english,en,eng esperanto UTF_8 esperanto,eo,epo estonian UTF_8 estonian,et,est finnish UTF_8,ISO_8859_1 finnish,fi,fin french UTF_8,ISO_8859_1 french,fr,fre,fra german UTF_8,ISO_8859_1 german,de,ger,deu greek UTF_8 greek,el,gre,ell hindi UTF_8 hindi,hi,hin hungarian UTF_8,ISO_8859_2 hungarian,hu,hun indonesian UTF_8,ISO_8859_1 indonesian,id,ind irish UTF_8,ISO_8859_1 irish,ga,gle italian UTF_8,ISO_8859_1 italian,it,ita lithuanian UTF_8 lithuanian,lt,lit nepali UTF_8 nepali,ne,nep norwegian UTF_8,ISO_8859_1 norwegian,no,nor portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8 romanian,ro,rum,ron russian UTF_8,KOI8_R russian,ru,rus serbian UTF_8 serbian,sr,srp spanish UTF_8,ISO_8859_1 spanish,es,esl,spa swedish UTF_8,ISO_8859_1 swedish,sv,swe tamil UTF_8 tamil,ta,tam turkish UTF_8 turkish,tr,tur yiddish UTF_8 yiddish,yi,yid # Also 
include the traditional porter algorithm for english. # The porter algorithm is included in the libstemmer distribution to assist # with backwards compatibility, but for new systems the english algorithm # should be used in preference. porter UTF_8,ISO_8859_1 porter english # This is Martin Porter's Dutch stemmer. It was the default Dutch stemming # in Snowball 2.2.0 and earlier, but after user feedback and careful evaluation # we concluded that the Kraaij-Pohlmann Dutch stemmer was a better default. # We still provide this to help people who have a lot of existing data indexed # using it. dutch_porter UTF_8,ISO_8859_1 dutch_porter dutch # Some other stemmers in the snowball project are not included in the standard # distribution. To compile a libstemmer with them in, add them to this list, # and regenerate the distribution. (You will need a full source checkout for # this.) They are included in the snowball website as curiosities, but are not # intended for general use, and use of them is not fully supported. These # algorithms are: # # lovins - This is an english stemmer, but fairly outdated, and # only really applicable to a restricted type of input text # (keywords in academic publications). 
#lovins UTF_8,ISO_8859_1 lovins english snowball-3.0.1/libstemmer/test.c000066400000000000000000000020041500727106100166150ustar00rootroot00000000000000 #include "libstemmer.h" /* test code */ void error(const char * err) { printf("%s\n", err); exit(1); } int main () { const char * stemmed; const char * unstemmed; struct sb_stemmer * s; const char ** list = sb_stemmer_list(); if (*list == 0) error("TEST FAIL: empty list of stemmers"); s = sb_stemmer_new("e"); if (s != 0) error("TEST FAIL: non zero return for unrecognised language"); s = sb_stemmer_new("english"); if (s == 0) error("TEST FAIL: zero return for recognised language"); sb_stemmer_delete(s); s = sb_stemmer_new("en"); if (s == 0) error("TEST FAIL: zero return for recognised language"); unstemmed = "recognised"; stemmed = sb_stemmer_stem(s, unstemmed, 10); printf("%s -> %s\n", unstemmed, stemmed); if (sb_stemmer_length(s) != strlen(stemmed)) error("TEST FAIL: length not correct"); unstemmed = "recognized"; printf("%s -> %s\n", unstemmed, stemmed); sb_stemmer_delete(s); printf("Success\n"); return 0; } snowball-3.0.1/pascal/000077500000000000000000000000001500727106100145765ustar00rootroot00000000000000snowball-3.0.1/pascal/.gitignore000066400000000000000000000000571500727106100165700ustar00rootroot00000000000000/*.ppu /*Stemmer.pas /stemwords.dpr /stemwords snowball-3.0.1/pascal/SnowballProgram.pas000066400000000000000000000277051500727106100204270ustar00rootroot00000000000000unit SnowballProgram; interface Type TAmongHandler = Function : Boolean of Object; Type TAmong = record Str : AnsiString; // search string Index : Integer; // index to longest matching substring Result : Integer; // result of the lookup Method : TAmongHandler; // method to use if substring matches End; Type {$M+} TSnowballProgram = Class Protected FCurrent : AnsiString; FCursor : Integer; FLimit : Integer; FBkLimit : Integer; FBra : Integer; FKet : Integer; Procedure SetCurrent(Current: AnsiString); Protected Function InGrouping(s : 
array of char; min, max : Integer) : Boolean; Function GoInGrouping(s : array of char; min, max : Integer) : Boolean; Function InGroupingBk(s : array of char; min, max : Integer) : Boolean; Function GoInGroupingBk(s : array of char; min, max : Integer) : Boolean; Function OutGrouping(s : array of char; min, max : Integer) : Boolean; Function GoOutGrouping(s : array of char; min, max : Integer) : Boolean; Function OutGroupingBk(s : array of char; min, max : Integer) : Boolean; Function GoOutGroupingBk(s : array of char; min, max : Integer) : Boolean; Function EqS(s_size : Integer; s : AnsiString) : Boolean; Function EqSBk(s_size : Integer; s : AnsiString) : Boolean; Function EqV(s : AnsiString) : Boolean; Function EqVBk(s : AnsiString) : Boolean; Function FindAmong(v : array of TAmong; v_size : Integer) : Integer; Function FindAmongBk(v : array of TAmong; v_size : Integer) : Integer; Procedure SliceDel; Procedure SliceCheck; Procedure SliceFrom(s : AnsiString); Function ReplaceS(bra, ket : Integer; s : AnsiString) : Integer; Procedure Insert(bra, ket : Integer; s : AnsiString); Function SliceTo : AnsiString; Function AssignTo : AnsiString; Public { Set & Retrieve current string } Property Current: AnsiString Read FCurrent Write SetCurrent; { Method subclasses need to implement } Function stem : Boolean; Virtual; Abstract; End; Implementation Uses Math; Procedure TSnowballProgram.SetCurrent(Current: AnsiString); Begin FCurrent := Current; FCursor := 0; FLimit := Length(Current); FBkLimit := 0; FBra := FCursor; FKet := FLimit; End; Function TSnowballProgram.InGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor >= FLimit) Then Exit; ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Inc(FCursor); Result := True; End; Function TSnowballProgram.GoInGrouping(s : array of char; min, max : Integer) : Boolean; Var 
ch : Integer; Begin Result := True; While (FCursor < FLimit) Do Begin ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Inc(FCursor); End; Result := False; End; Function TSnowballProgram.InGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor <= FBkLimit) Then Exit; ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Dec(FCursor); Result := True; End; Function TSnowballProgram.GoInGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor > FBkLimit) Do Begin ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Dec(FCursor); End; Result := False; End; Function TSnowballProgram.OutGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor >= FLimit) Then Exit; ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Begin Inc(FCursor); Result := True; Exit; End; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Begin Inc(FCursor); Result := True; End; End; Function TSnowballProgram.GoOutGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor < FLimit) Do Begin ch := Ord(FCurrent[FCursor + 1]); If (ch <= max) And (ch >= min) Then Begin ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) <> 0 Then Begin Exit; End; End; Inc(FCursor); End; Result := False; End; Function TSnowballProgram.OutGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor <= FBkLimit) Then Exit; ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Begin Dec(FCursor); 
Result := True; Exit; End; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Begin Dec(FCursor); Result := True; End; End; Function TSnowballProgram.GoOutGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor > FBkLimit) Do Begin ch := Ord(FCurrent[FCursor]); If (ch <= max) And (ch >= min) Then Begin ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) <> 0 Then Begin Exit; End; End; Dec(FCursor); End; Result := False; End; Function TSnowballProgram.EqS(s_size : Integer; s : AnsiString) : Boolean; Var I : Integer; Begin Result := False; If (FLimit - FCursor) < s_size Then Exit; For I := 1 To s_size Do If FCurrent[FCursor + I] <> s[I] Then Exit; FCursor := FCursor + s_size; Result := True; End; Function TSnowballProgram.EqSBk(s_size : Integer; s : AnsiString) : Boolean; Var I : Integer; Begin Result := False; if (FCursor - FBkLimit) < s_size Then Exit; For I := 1 To s_size Do If FCurrent[FCursor - s_size + I] <> s[i] Then Exit; FCursor := FCursor - s_size; Result := True; End; Function TSnowballProgram.EqV(s : AnsiString) : Boolean; Begin Result := EqS(Length(s), s); End; Function TSnowballProgram.EqVBk(s : AnsiString) : Boolean; Begin Result := EqSBk(Length(s), s); End; Function TSnowballProgram.FindAmong(v : array of TAmong; v_size : Integer) : Integer; Var i, i2, j, c, l, common_i, common_j, k, diff, common : Integer; first_key_inspected, res : Boolean; w : TAmong; Begin i := 0; j := v_size; c := FCursor; l := FLimit; common_i := 0; common_j := 0; first_key_inspected := false; While True Do Begin k := i + ((j - i) Shr 1); diff := 0; common := Min(common_i, common_j); // smaller w := v[k]; For i2 := common To Length(w.Str) - 1 Do Begin if (c + common) = l Then Begin diff := -1; Break; End; diff := Ord(FCurrent[c + common + 1]) - Ord(w.Str[i2 + 1]); if diff <> 0 Then Break; Inc(common); End; if diff < 0 Then Begin j := k; common_j := common; End Else Begin i := k; 
common_i := common; End; If (j - i) <= 1 Then Begin If (i > 0) Then Break; // v->s has been inspected if (j = i) Then Break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) Then Break; first_key_inspected := True; End; End; While True Do Begin w := v[i]; If (common_i >= Length(w.Str)) Then Begin FCursor := c + Length(w.Str); If Not Assigned(w.Method) Then Begin Result := w.Result; Exit; End; res := w.Method; FCursor := c + Length(w.Str); if (res) Then Begin Result := w.Result; Exit; End; End; i := w.Index; if i < 0 Then Begin Result := 0; Exit; End; End; End; Function TSnowballProgram.FindAmongBk(v : array of TAmong; v_size : Integer) : Integer; Var i, j, c, lb, common_i, common_j, k, diff, common, i2 : Integer; first_key_inspected, res : Boolean; w : TAmong; Begin i := 0; j := v_size; c := FCursor; lb := FBkLimit; common_i := 0; common_j := 0; first_key_inspected := false; While True Do Begin k := i + ((j - i) Shr 1); diff := 0; common := Min(common_i, common_j); w := v[k]; For i2 := Length(w.Str) - 1 - common DownTo 0 Do Begin If (c - common) = lb Then Begin diff := -1; Break; End; diff := Ord(FCurrent[c - common]) - Ord(w.Str[i2 + 1]); if diff <> 0 Then Break; Inc(common); End; If diff < 0 Then Begin j := k; common_j := common; End Else Begin i := k; common_i := common; End; If (j - i) <= 1 Then Begin if i > 0 Then Break; if j = i Then Break; if first_key_inspected Then Break; first_key_inspected := True; End; End; While True Do Begin w := v[i]; if common_i >= Length(w.Str) Then Begin FCursor := c - Length(w.Str); If Not Assigned(w.Method) Then Begin Result := w.Result; Exit; End; res := w.Method; FCursor := c - Length(w.Str); If Res Then Begin Result := w.Result; Exit; End; End; i := w.Index; If i < 0 Then Begin Result := 0; Exit; End; End; End; Procedure TSnowballProgram.SliceCheck; Begin if (FBra < 0) Or (FBra > FKet) Or (FKet > 
FLimit) Or (FLimit > Length(FCurrent)) Then Begin WriteLn('Faulty slice operation.'); Halt; End; End; Procedure TSnowballProgram.SliceDel; Begin SliceFrom(''); End; Function TSnowballProgram.ReplaceS(bra, ket : Integer; s : AnsiString) : Integer; Var adjustment : Integer; Begin adjustment := Length(s) - (ket - bra); Delete(FCurrent, bra + 1, ket - bra); System.Insert(s, FCurrent, bra + 1); FLimit := FLimit + adjustment; if (FCursor >= ket) Then FCursor := FCursor + adjustment Else If (FCursor > bra) Then FCursor := bra; Result := adjustment; End; Procedure TSnowballProgram.Insert(bra, ket : Integer; s : AnsiString); Var adjustment : Integer; Begin adjustment := ReplaceS(bra, ket, s); If (bra <= FBra) Then FBra := FBra + adjustment; If (bra <= FKet) Then FKet := FKet + adjustment; End; Function TSnowballProgram.SliceTo() : AnsiString; Begin SliceCheck(); Result := Copy(FCurrent, FBra + 1, FKet - FBra); End; Procedure TSnowballProgram.SliceFrom(s : AnsiString); Begin SliceCheck(); ReplaceS(FBra, FKet, s); End; Function TSnowballProgram.AssignTo() : AnsiString; Begin Result := Copy(FCurrent, 1, FLimit); End; End. snowball-3.0.1/pascal/generate.pl000077500000000000000000000010201500727106100167210ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; # Generate Pascal stemwords source. 
my @sources = @ARGV; while (defined(my $line = )) { if ($line =~ /\{\s*BEGIN TEMPLATE\s*\}/) { my $template = ''; while (defined($line = ) && $line !~ /\{\s*END TEMPLATE\s*\}/) { $template .= $line; } foreach my $source(@sources) { my $out = $template; $out =~ s/%STEMMER%/$source/g; print $out; } next; } print $line; } snowball-3.0.1/pascal/stemwords-template.dpr000066400000000000000000000026751500727106100211570ustar00rootroot00000000000000program stemwords; {$ifdef windows} {$APPTYPE CONSOLE} {$endif} uses SnowballProgram, { BEGIN TEMPLATE } %STEMMER%Stemmer in '%STEMMER%Stemmer.pas', { END TEMPLATE } SysUtils; Var Stemmer : TSnowballProgram; CurWord : AnsiString; i : Integer; language : AnsiString; Const Delimiters : Set Of Char = [#10, #13]; Function NextWord : Boolean; Var C : Char; Begin CurWord := ''; Result := Not Eof; While Not Eof Do Begin Read(C); If IOResult <> 0 Then Break; If C In Delimiters Then Break; CurWord := CurWord + C; End; End; begin language := 'english'; i := 0; while i < ParamCount do begin i := i + 1; if ParamStr(i) = '-l' then begin i := i + 1; language := ParamStr(i); continue; end; WriteLn('option '+ParamStr(i)+' unknown'); Exit; end; if False then { BEGIN TEMPLATE } else if language = '%STEMMER%' then Stemmer := T%STEMMER%Stemmer.Create { END TEMPLATE } else begin WriteLn('Stemming language '+language+' unknown'); Exit; end; Try While Not Eof Do Begin While NextWord Do Begin Stemmer.Current := CurWord; Stemmer.Stem; WriteLn(Stemmer.Current); End; End; Finally Stemmer.Free; End; end. snowball-3.0.1/python/000077500000000000000000000000001500727106100146545ustar00rootroot00000000000000snowball-3.0.1/python/MANIFEST.in000066400000000000000000000001761500727106100164160ustar00rootroot00000000000000include *.rst include modules.txt include setup.* recursive-include src *.py include MANIFEST.in include COPYING include NEWS snowball-3.0.1/python/create_init.py000066400000000000000000000025611500727106100175200ustar00rootroot00000000000000#! 
/bin/sh/env python import sys import re import os python_out_folder = sys.argv[1] filematch = re.compile(r"(\w+)_stemmer\.py$") imports = [] languages = [] for pyscript in os.listdir(python_out_folder): match = filematch.match(pyscript) if (match): langname = match.group(1) titlecase = re.sub(r"_", "", langname.title()) languages.append(" '%(lang)s': %(title)sStemmer," % {'lang': langname, 'title': titlecase}) imports.append('from .%(lang)s_stemmer import %(title)sStemmer' % {'lang': langname, 'title': titlecase}) imports.sort() languages.sort() if len(languages) == 0: raise AssertionError('languages list is empty!') src = '''__all__ = ('language', 'stemmer') %(imports)s _languages = { %(languages)s } try: import Stemmer cext_available = True except ImportError: cext_available = False def algorithms(): if cext_available: return Stemmer.language() else: return list(_languages.keys()) def stemmer(lang): if cext_available: return Stemmer.Stemmer(lang) if lang.lower() in _languages: return _languages[lang.lower()]() else: raise KeyError("Stemming algorithm '%%s' not found" %% lang) ''' % {'imports': '\n'.join(imports), 'languages': '\n'.join(languages)} with open(os.path.join(python_out_folder, '__init__.py'), 'w') as out: out.write(src) snowball-3.0.1/python/setup.cfg000066400000000000000000000001321500727106100164710ustar00rootroot00000000000000[metadata] long_description = file: README.rst long_description_content_type = text/x-rst snowball-3.0.1/python/setup.py000066400000000000000000000055451500727106100163770ustar00rootroot00000000000000#!/usr/bin/env python from setuptools import setup import re SNOWBALL_VERSION = '3.0.1' n_stemmers = 0 langs = [] variants = {} with open('modules.txt') as fp: for line in fp.readlines(): if len(line) <= 1 or line[0] == '#': continue if line[-1:] == '\n': line = line[:-1] tokens = re.split(r'\s+', line) if len(tokens) < 3: print("Bad modules.txt line: " + line) continue (name, encs, codes) = tokens[:3] if len(tokens) > 3: 
variant_of = tokens[3] if variant_of in variants: variants[variant_of].append(name) else: variants[variant_of] = [name] else: langs.append(name) n_stemmers += 1 desc = 'This package provides ' + str(n_stemmers) + ' stemmers for ' + \ str(len(langs)) + ' languages generated from Snowball algorithms.' classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: BSD License' ] for lang in langs: lang_titlecase = lang.title() # Only classifiers listed in https://pypi.org/classifiers/ are allowed # Remove them here or submit them to https://github.com/pypa/trove-classifiers classifiers.append('Natural Language :: ' + lang_titlecase) classifiers.extend([ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Database', 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Linguistic' ]) setup(name='snowballstemmer', version=SNOWBALL_VERSION, description=desc, author='Snowball Developers', author_email='snowball-discuss@lists.tartarus.org', url='https://github.com/snowballstem/snowball', keywords="stemmer", license="BSD-3-Clause", packages=['snowballstemmer'], package_dir={"snowballstemmer": 
"src/snowballstemmer"}, python_requires='!=3.0.*, !=3.1.*, !=3.2.*', classifiers = classifiers ) snowball-3.0.1/python/snowballstemmer/000077500000000000000000000000001500727106100200725ustar00rootroot00000000000000snowball-3.0.1/python/snowballstemmer/among.py000066400000000000000000000006241500727106100215470ustar00rootroot00000000000000 class Among(object): def __init__(self, s, substring_i, result, method=None): """ @ivar s search string @ivar substring index to longest matching substring @ivar result of the lookup @ivar method method to use if substring matches """ self.s = s self.substring_i = substring_i self.result = result self.method = method snowball-3.0.1/python/snowballstemmer/basestemmer.py000066400000000000000000000177441500727106100227700ustar00rootroot00000000000000class BaseStemmer(object): def __init__(self): self.set_current("") def set_current(self, value): ''' Set the self.current string. ''' self.current = value self.cursor = 0 self.limit = len(self.current) self.limit_backward = 0 self.bra = self.cursor self.ket = self.limit def get_current(self): ''' Get the self.current string. 
''' return self.current def copy_from(self, other): self.current = other.current self.cursor = other.cursor self.limit = other.limit self.limit_backward = other.limit_backward self.bra = other.bra self.ket = other.ket def in_grouping(self, s): if self.cursor >= self.limit: return False if self.current[self.cursor] not in s: return False self.cursor += 1 return True def go_in_grouping(self, s): while self.cursor < self.limit: if self.current[self.cursor] not in s: return True self.cursor += 1 return False def in_grouping_b(self, s): if self.cursor <= self.limit_backward: return False if self.current[self.cursor - 1] not in s: return False self.cursor -= 1 return True def go_in_grouping_b(self, s): while self.cursor > self.limit_backward: if self.current[self.cursor - 1] not in s: return True self.cursor -= 1 return False def out_grouping(self, s): if self.cursor >= self.limit: return False if self.current[self.cursor] not in s: self.cursor += 1 return True return False def go_out_grouping(self, s): while self.cursor < self.limit: if self.current[self.cursor] in s: return True self.cursor += 1 return False def out_grouping_b(self, s): if self.cursor <= self.limit_backward: return False if self.current[self.cursor - 1] not in s: self.cursor -= 1 return True return False def go_out_grouping_b(self, s): while self.cursor > self.limit_backward: if self.current[self.cursor - 1] in s: return True self.cursor -= 1 return False def eq_s(self, s): if self.limit - self.cursor < len(s): return False if self.current[self.cursor:self.cursor + len(s)] != s: return False self.cursor += len(s) return True def eq_s_b(self, s): if self.cursor - self.limit_backward < len(s): return False if self.current[self.cursor - len(s):self.cursor] != s: return False self.cursor -= len(s) return True def find_among(self, v): i = 0 j = len(v) c = self.cursor l = self.limit common_i = 0 common_j = 0 first_key_inspected = False while True: k = i + ((j - i) >> 1) diff = 0 common = min(common_i, 
common_j) # smaller w = v[k] for i2 in range(common, len(w.s)): if c + common == l: diff = -1 break diff = ord(self.current[c + common]) - ord(w.s[i2]) if diff != 0: break common += 1 if diff < 0: j = k common_j = common else: i = k common_i = common if j - i <= 1: if i > 0: break # v->s has been inspected if j == i: break # only one item in v # - but now we need to go round once more to get # v->s inspected. This looks messy, but is actually # the optimal approach. if first_key_inspected: break first_key_inspected = True while True: w = v[i] if common_i >= len(w.s): self.cursor = c + len(w.s) if w.method is None: return w.result res = w.method(self) self.cursor = c + len(w.s) if res: return w.result i = w.substring_i if i < 0: return 0 return -1 # not reachable def find_among_b(self, v): ''' find_among_b is for backwards processing. Same comments apply ''' i = 0 j = len(v) c = self.cursor lb = self.limit_backward common_i = 0 common_j = 0 first_key_inspected = False while True: k = i + ((j - i) >> 1) diff = 0 common = min(common_i, common_j) w = v[k] for i2 in range(len(w.s) - 1 - common, -1, -1): if c - common == lb: diff = -1 break diff = ord(self.current[c - 1 - common]) - ord(w.s[i2]) if diff != 0: break common += 1 if diff < 0: j = k common_j = common else: i = k common_i = common if j - i <= 1: if i > 0: break if j == i: break if first_key_inspected: break first_key_inspected = True while True: w = v[i] if common_i >= len(w.s): self.cursor = c - len(w.s) if w.method is None: return w.result res = w.method(self) self.cursor = c - len(w.s) if res: return w.result i = w.substring_i if i < 0: return 0 return -1 # not reachable def replace_s(self, c_bra, c_ket, s): ''' to replace chars between c_bra and c_ket in self.current by the chars in s. 
@type c_bra int @type c_ket int @type s: string ''' adjustment = len(s) - (c_ket - c_bra) self.current = self.current[0:c_bra] + s + self.current[c_ket:] self.limit += adjustment if self.cursor >= c_ket: self.cursor += adjustment elif self.cursor > c_bra: self.cursor = c_bra return adjustment def slice_check(self): if self.bra < 0 or self.bra > self.ket or self.ket > self.limit or self.limit > len(self.current): return False return True def slice_from(self, s): ''' @type s string ''' result = False if self.slice_check(): self.replace_s(self.bra, self.ket, s) result = True return result def slice_del(self): return self.slice_from("") def insert(self, c_bra, c_ket, s): ''' @type c_bra int @type c_ket int @type s: string ''' adjustment = self.replace_s(c_bra, c_ket, s) if c_bra <= self.bra: self.bra += adjustment if c_bra <= self.ket: self.ket += adjustment def slice_to(self): ''' Return the slice as a string. ''' result = '' if self.slice_check(): result = self.current[self.bra:self.ket] return result def assign_to(self): ''' Return the current string up to the limit. ''' return self.current[0:self.limit] def stemWord(self, word): self.set_current(word) self._stem() return self.get_current() def stemWords(self, words): return [self.stemWord(word) for word in words] snowball-3.0.1/python/stemwords.py000066400000000000000000000064061500727106100172630ustar00rootroot00000000000000import sys import codecs import snowballstemmer def usage(): print('''usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h] The input file consists of a list of words to be stemmed, one per line. Words should be in lower case, but (for English) A-Z letters are mapped to their a-z equivalents anyway. If omitted, stdin is used. If -c is given, the argument is the character encoding of the input and output files. If it is omitted, the UTF-8 encoding is used. If -p is given the output file consists of each word of the input file followed by \"->\" followed by its stemmed equivalent. 
If -p2 is given the output file is a two column layout containing the input words in the first column and the stemmed equivalents in the second column. Otherwise, the output file consists of the stemmed words, one per line. -h displays this help''' % sys.argv[0]) def main(): argv = sys.argv[1:] if len(argv) < 5: usage() else: pretty = 0 input = '' output = '' encoding = 'utf_8' language = 'English' show_help = False while len(argv): arg = argv.pop(0) if arg == '-h': show_help = True break elif arg == "-p": pretty = 1 elif arg == "-p2": pretty = 2 elif arg == "-l": if len(argv) == 0: show_help = True break language = argv.pop(0) elif arg == "-i": if len(argv) == 0: show_help = True break input = argv.pop(0) elif arg == "-o": if len(argv) == 0: show_help = True break output = argv.pop(0) elif arg == "-c": if len(argv) == 0: show_help = True break encoding = argv.pop(0) if show_help or input == '' or output == '': usage() else: stemming(language, input, output, encoding, pretty) def stemming(lang, input, output, encoding, pretty): stemmer = snowballstemmer.stemmer(lang) with codecs.open(input, "r", encoding) as infile: with codecs.open(output, "w", encoding) as outfile: for original in infile.readlines(): original = original.strip() # Convert only ASCII-letters to lowercase, to match C behavior original = ''.join((c.lower() if 'A' <= c <= 'Z' else c for c in original)) stemmed = stemmer.stemWord(original) if pretty == 0: if stemmed != "": outfile.write(stemmed) elif pretty == 1: outfile.write(original, " -> ", stemmed) elif pretty == 2: outfile.write(original) if len(original) < 30: outfile.write(" " * (30 - len(original))) else: outfile.write("\n") outfile.write(" " * 30) outfile.write(stemmed) outfile.write('\n') main() snowball-3.0.1/python/testapp.py000066400000000000000000000012011500727106100167000ustar00rootroot00000000000000import sys import re import snowballstemmer def usage(): print("testapp.py \"sentence\"...") def main(): argv = sys.argv if len(argv) < 1: 
usage() return algorithm = 'english' if len(argv) > 2: algorithm = argv[1] argv = argv[2:] else: argv = argv[1:] stemmer = snowballstemmer.stemmer(algorithm) splitter = re.compile(r"[\s\.-]") for arg in argv: for word in splitter.split(arg): if word == '': continue original = word.lower() print(original + " -> " + stemmer.stemWord(original)) main() snowball-3.0.1/runtime/000077500000000000000000000000001500727106100150165ustar00rootroot00000000000000snowball-3.0.1/runtime/api.c000066400000000000000000000025151500727106100157360ustar00rootroot00000000000000 #include /* for malloc, calloc, free */ #include "header.h" extern struct SN_env * SN_create_env(int S_size, int I_size) { static const struct SN_env default_SN_env = {}; struct SN_env * z = (struct SN_env *) malloc(sizeof(struct SN_env)); if (z == NULL) return NULL; *z = default_SN_env; z->p = create_s(); if (z->p == NULL) goto error; if (S_size) { int i; z->S = (symbol * *) malloc(S_size * sizeof(symbol *)); if (z->S == NULL) goto error; for (i = 0; i < S_size; i++) { z->S[i] = create_s(); if (z->S[i] == NULL) { S_size = i; goto error; } } } if (I_size) { z->I = (int *) calloc(I_size, sizeof(int)); if (z->I == NULL) goto error; } return z; error: SN_close_env(z, S_size); return NULL; } extern void SN_close_env(struct SN_env * z, int S_size) { if (z == NULL) return; if (z->S) { int i; for (i = 0; i < S_size; i++) { lose_s(z->S[i]); } free(z->S); } free(z->I); if (z->p) lose_s(z->p); free(z); } extern int SN_set_current(struct SN_env * z, int size, const symbol * s) { int err = replace_s(z, 0, z->l, size, s, NULL); z->c = 0; return err; } snowball-3.0.1/runtime/api.h000066400000000000000000000014341500727106100157420ustar00rootroot00000000000000 typedef unsigned char symbol; /* Or replace 'char' above with 'short' for 16 bit characters. More precisely, replace 'char' with whatever type guarantees the character width you need. 
Note however that sizeof(symbol) should divide HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise there is an alignment problem. In the unlikely event of a problem here, consult Martin Porter. */ struct SN_env { symbol * p; int c; int l; int lb; int bra; int ket; symbol * * S; int * I; }; #ifdef __cplusplus extern "C" { #endif extern struct SN_env * SN_create_env(int S_size, int I_size); extern void SN_close_env(struct SN_env * z, int S_size); extern int SN_set_current(struct SN_env * z, int size, const symbol * s); #ifdef __cplusplus } #endif snowball-3.0.1/runtime/header.h000066400000000000000000000050561500727106100164250ustar00rootroot00000000000000 #include "api.h" #define HEAD 2*sizeof(int) #define SIZE(p) ((int *)(p))[-1] #define SET_SIZE(p, n) ((int *)(p))[-1] = n #define CAPACITY(p) ((int *)(p))[-2] struct among { /* Number of symbols in s. */ int s_size; /* Search string. */ const symbol * s; /* Delta of index to longest matching substring, or 0 if none. */ int substring_i; /* Result of the lookup. */ int result; /* Optional condition routine, or NULL if none. 
*/ int (* function)(struct SN_env *); }; extern symbol * create_s(void); extern void lose_s(symbol * p); extern int skip_utf8(const symbol * p, int c, int limit, int n); extern int skip_b_utf8(const symbol * p, int c, int limit, int n); extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int eq_s(struct SN_env * z, int s_size, const symbol * s); extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); extern int eq_v(struct SN_env * z, const symbol * p); extern int eq_v_b(struct SN_env * z, const symbol * p); extern int find_among(struct SN_env * z, const struct among * v, int v_size); extern int find_among_b(struct SN_env * z, const struct among * v, int v_size); extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s); extern int slice_from_v(struct SN_env * z, const symbol * p); extern int slice_del(struct SN_env * z); extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); extern symbol * slice_to(struct SN_env * z, symbol * p); extern symbol * assign_to(struct SN_env * z, 
symbol * p); extern int len_utf8(const symbol * p); extern void debug(struct SN_env * z, int number, int line_count); snowball-3.0.1/runtime/utilities.c000066400000000000000000000331061500727106100172000ustar00rootroot00000000000000 #include #include #include #include "header.h" #define CREATE_SIZE 1 extern symbol * create_s(void) { symbol * p; void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol)); if (mem == NULL) return NULL; p = (symbol *) (HEAD + (char *) mem); CAPACITY(p) = CREATE_SIZE; SET_SIZE(p, 0); return p; } extern void lose_s(symbol * p) { if (p == NULL) return; free((char *) p - HEAD); } /* new_p = skip_utf8(p, c, l, n); skips n characters forwards from p + c. new_p is the new position, or -1 on failure. -- used to implement hop and next in the utf8 case. */ extern int skip_utf8(const symbol * p, int c, int limit, int n) { int b; if (n < 0) return -1; for (; n > 0; n--) { if (c >= limit) return -1; b = p[c++]; if (b >= 0xC0) { /* 1100 0000 */ while (c < limit) { b = p[c]; if (b >= 0xC0 || b < 0x80) break; /* break unless b is 10------ */ c++; } } } return c; } /* new_p = skip_b_utf8(p, c, lb, n); skips n characters backwards from p + c - 1 new_p is the new position, or -1 on failure. -- used to implement hop and next in the utf8 case. 
/* Decode the UTF-8 character which starts at offset c in p, without reading
 * at or beyond offset l.
 *
 * On success the decoded value is stored in *slot and the number of bytes
 * consumed (1 to 4) is returned.  Returns 0, leaving *slot untouched, if c is
 * already at or past l.
 *
 * A byte below 0xC0 is returned as-is with width 1.  A multi-byte sequence
 * which is cut short by l is decoded from the bytes that are available.
 */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
    int lead, cont1, cont2;
    if (c >= l) return 0;
    lead = p[c];
    if (lead < 0xC0 || c + 1 == l) {   /* 1100 0000 */
        *slot = lead;
        return 1;
    }
    cont1 = p[c + 1] & 0x3F;
    if (lead < 0xE0 || c + 2 == l) {   /* 1110 0000 */
        *slot = ((lead & 0x1F) << 6) | cont1;
        return 2;
    }
    cont2 = p[c + 2] & 0x3F;
    if (lead < 0xF0 || c + 3 == l) {   /* 1111 0000 */
        *slot = ((lead & 0xF) << 12) | (cont1 << 6) | cont2;
        return 3;
    }
    *slot = ((lead & 0x7) << 18) | (cont1 << 12) | (cont2 << 6) | (p[c + 3] & 0x3F);
    return 4;
}
do { int ch; int w = get_utf8(z->p, z->c, z->l, & ch); if (!w) return -1; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) return w; z->c += w; } while (repeat); return 0; } extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; int w = get_b_utf8(z->p, z->c, z->lb, & ch); if (!w) return -1; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) return w; z->c -= w; } while (repeat); return 0; } /* Code for character groupings: non-utf8 cases */ extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c >= z->l) return -1; ch = z->p[z->c]; if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 1; z->c++; } while (repeat); return 0; } extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c <= z->lb) return -1; ch = z->p[z->c - 1]; if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 1; z->c--; } while (repeat); return 0; } extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c >= z->l) return -1; ch = z->p[z->c]; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) return 1; z->c++; } while (repeat); return 0; } extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c <= z->lb) return -1; ch = z->p[z->c - 1]; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) return 1; z->c--; } while (repeat); return 0; } extern int eq_s(struct SN_env * z, int s_size, const symbol * s) { if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0; z->c += s_size; return 1; } extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) { if (z->c - z->lb < s_size 
/* Look up the text following the cursor in the among table v (v_size entries).
 *
 * The table is binary searched by comparing each candidate key w->s against
 * the symbols from z->c up to the limit z->l (presumably v is sorted by key,
 * since the search relies on the sign of the comparison -- confirm against
 * the table generator).  The entry the search settles on is then tried, and
 * after it the entries reached by following substring_i offsets.
 *
 * Returns w->result for the first entry tried whose whole key matched and
 * whose function is either NULL or returns non-zero; the cursor is then
 * c + w->s_size, where c is the cursor on entry.  Returns 0 if no entry
 * qualifies; in that case the cursor is only moved if some candidate's
 * function was called and rejected the match.
 */
extern int find_among(struct SN_env * z, const struct among * v, int v_size) {

    /* i and j bracket the search: the answer lies in v[i..j). */
    int i = 0;
    int j = v_size;

    int c = z->c; int l = z->l;
    const symbol * q = z->p + c;

    const struct among * w;

    /* Number of leading symbols already known to match v[i] and v[j]. */
    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while (1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        /* Comparison with v[k] can resume after this many symbols. */
        int common = common_i < common_j ? common_i : common_j; /* smaller */
        w = v + k;
        {
            int i2; for (i2 = common; i2 < w->s_size; i2++) {
                /* Running out of input counts as sorting before the key. */
                if (c + common == l) { diff = -1; break; }
                diff = q[common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) {
            j = k;
            common_j = common;
        } else {
            i = k;
            common_i = common;
        }
        if (j - i <= 1) {
            if (i > 0) break; /* v->s has been inspected */
            if (j == i) break; /* only one item in v */

            /* - but now we need to go round once more to get
               v->s inspected. This looks messy, but is actually
               the optimal approach.  */

            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    w = v + i;
    while (1) {
        /* common_i >= s_size means every symbol of this key matched. */
        if (common_i >= w->s_size) {
            z->c = c + w->s_size;
            if (w->function == NULL) return w->result;
            {
                int res = w->function(z);
                /* Reset the cursor in case the function moved it. */
                z->c = c + w->s_size;
                if (res) return w->result;
            }
        }
        /* A zero offset ends the chain of alternative entries. */
        if (!w->substring_i) return 0;
        w += w->substring_i;
    }
}
common_i : common_j; w = v + k; { int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = q[- common] - w->s[i2]; if (diff != 0) break; common++; } } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = 1; } } w = v + i; while (1) { if (common_i >= w->s_size) { z->c = c - w->s_size; if (w->function == NULL) return w->result; { int res = w->function(z); z->c = c - w->s_size; if (res) return w->result; } } if (!w->substring_i) return 0; w += w->substring_i; } } /* Increase the size of the buffer pointed to by p to at least n symbols. * If insufficient memory, returns NULL and frees the old buffer. */ static symbol * increase_size(symbol * p, int n) { symbol * q; int new_size = n + 20; void * mem = realloc((char *) p - HEAD, HEAD + (new_size + 1) * sizeof(symbol)); if (mem == NULL) { lose_s(p); return NULL; } q = (symbol *) (HEAD + (char *)mem); CAPACITY(q) = new_size; return q; } /* to replace symbols between c_bra and c_ket in z->p by the s_size symbols at s. Returns 0 on success, -1 on error. Also, frees z->p (and sets it to NULL) on error. 
*/ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr) { int adjustment; int len; if (z->p == NULL) { z->p = create_s(); if (z->p == NULL) return -1; } adjustment = s_size - (c_ket - c_bra); len = SIZE(z->p); if (adjustment != 0) { if (adjustment + len > CAPACITY(z->p)) { z->p = increase_size(z->p, adjustment + len); if (z->p == NULL) return -1; } memmove(z->p + c_ket + adjustment, z->p + c_ket, (len - c_ket) * sizeof(symbol)); SET_SIZE(z->p, adjustment + len); z->l += adjustment; if (z->c >= c_ket) z->c += adjustment; else if (z->c > c_bra) z->c = c_bra; } if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); if (adjptr != NULL) *adjptr = adjustment; return 0; } static int slice_check(struct SN_env * z) { if (z->bra < 0 || z->bra > z->ket || z->ket > z->l || z->p == NULL || z->l > SIZE(z->p)) /* this line could be removed */ { #if 0 fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0); #endif return -1; } return 0; } extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) { if (slice_check(z)) return -1; return replace_s(z, z->bra, z->ket, s_size, s, NULL); } extern int slice_from_v(struct SN_env * z, const symbol * p) { return slice_from_s(z, SIZE(p), p); } extern int slice_del(struct SN_env * z) { return slice_from_s(z, 0, NULL); } extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) { int adjustment; if (replace_s(z, bra, ket, s_size, s, &adjustment)) return -1; if (bra <= z->bra) z->bra += adjustment; if (bra <= z->ket) z->ket += adjustment; return 0; } extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { return insert_s(z, bra, ket, SIZE(p), p); } extern symbol * slice_to(struct SN_env * z, symbol * p) { if (slice_check(z)) { lose_s(p); return NULL; } { int len = z->ket - z->bra; if (CAPACITY(p) < len) { p = increase_size(p, len); if (p == NULL) return NULL; } memmove(p, z->p + z->bra, len * sizeof(symbol)); 
/* Count the UTF-8 characters held in the buffer p.
 *
 * Every byte which is not a continuation byte (10------) is counted as the
 * start of a character; SIZE(p) gives the number of bytes to examine.
 */
extern int len_utf8(const symbol * p) {
    int size = SIZE(p);
    int count = 0;
    int i;
    for (i = 0; i < size; i++) {
        symbol b = p[i];
        if (b < 0x80 || b >= 0xC0) count++;
    }
    return count;
}
// We check which stemmers were generated and then produce the corresponding
// includes for src/algorithms/mod.rs and a closure for src/main.rs to match
// strings to stemmers.
//
// Every regular file in src/snowball/algorithms whose stem is
// `<language>_stemmer` is copied into OUT_DIR and gets one match arm in
// lang_matches.rs and one `pub mod` line in lang_include.rs.  `mod.rs` is
// skipped, and so is any other file whose stem does not have that form (a
// cargo warning is printed for it) - previously such a file made the script
// panic or emit a reference to a module which does not exist.
fn main() {
    let stemmer_suffix = "_stemmer";

    let out_dir = env::var("OUT_DIR").unwrap();
    let lang_match_path = Path::new(&out_dir).join("lang_matches.rs");
    let lang_include_path = Path::new(&out_dir).join("lang_include.rs");
    let mut lang_match_file = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&lang_match_path)
        .unwrap();
    let mut lang_include_file = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&lang_include_path)
        .unwrap();

    let src_dir = Path::new(&env::var("CARGO_MANIFEST_DIR").unwrap()).join("src");
    let algo_dir = src_dir.join("snowball/algorithms");

    lang_match_file
        .write_all(b" move |lang:String|{ match lang.as_str() {")
        .unwrap();

    for file in fs::read_dir(&algo_dir).unwrap() {
        let file = file.unwrap();
        let path = file.path();
        let filestem = path.file_stem().unwrap().to_str().unwrap();
        if !path.is_file() || filestem == "mod" {
            continue;
        }
        // Only `<language>_stemmer` files are stemmers; the language name is
        // whatever precedes the suffix, so it must not be empty.
        if !filestem.ends_with(stemmer_suffix) || filestem.len() == stemmer_suffix.len() {
            println!("cargo:warning=ignoring {} (not a *_stemmer file)", path.display());
            continue;
        }

        //Also we need to copy all the stemmer files into OUT_DIR...
        fs::copy(&path, Path::new(&out_dir).join(file.file_name())).unwrap();

        // The suffix is ASCII and was just matched, so this is a char boundary.
        let split = filestem.len() - stemmer_suffix.len();
        let langname = &filestem[..split];
        writeln!(&mut lang_match_file,
                 "\"{}\" => Stemmer {{ stemmer: snowball::algorithms::{}_stemmer::stem}},",
                 langname,
                 langname)
            .unwrap();
        writeln!(&mut lang_include_file, "pub mod {}_stemmer;", langname).unwrap();
    }

    lang_match_file
        .write_all(b" x => panic!(\"Unknown algorithm '{}'\", x) } } ")
        .unwrap();
}
diff --git a/rust/src/main.rs b/rust/src/main.rs index 064325a9..bf752795 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -56,9 +56,9 @@ fn main() { let mut output = if let Some(output_file) = output_arg { - Box::new(File::create(Path::new(&output_file)).unwrap()) as Box + Box::new(File::create(Path::new(&output_file)).unwrap()) as Box } else { - Box::new(std::io::stdout()) as Box + Box::new(std::io::stdout()) as Box }; if let Some(input_file) = input_arg { diff --git a/rust/src/snowball/among.rs b/rust/src/snowball/among.rs index 57fc8bae..70631933 100644 --- a/rust/src/snowball/among.rs +++ b/rust/src/snowball/among.rs @@ -3,4 +3,4 @@ use snowball::SnowballEnv; pub struct Among(pub &'static str, pub i32, pub i32, - pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); + pub Option<&'static (Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); snowball-3.0.1/rust/src/000077500000000000000000000000001500727106100151175ustar00rootroot00000000000000snowball-3.0.1/rust/src/main.rs000066400000000000000000000063041500727106100164140ustar00rootroot00000000000000use std::fs::File; use std::io::{BufRead, BufReader, Write}; use std::path::Path; use std::env; use std::borrow::Cow; pub mod snowball; use snowball::SnowballEnv; fn usage(name: &str) { println!("{} -l [-i ] [-o ] The input file consists of a list of words to be stemmed, one per line. Words should be in lower case, but (for English) A-Z letters are mapped to their a-z equivalents anyway. 
If omitted, stdin is used.", name); } fn main() { let args: Vec = env::args().collect(); if args.len() < 3 { usage(&args[0]); } else { let mut language = None; let mut input_arg = None; let mut output_arg = None; let mut i = 1; while i < args.len() { match args[i].as_str() { "-l" => { language = Some(args[i+1].clone()); i += 2; }, "-i" => { input_arg = Some(args[i+1].clone()); i += 2; }, "-o" => { output_arg = Some(args[i+1].clone()); i += 2; }, x => { println!("Unrecognized option '{}'", x); usage(&args[0]); return } } } if language.is_none() { println!("Please specify a language!"); usage(&args[0]); return; } let stemmer = Stemmer::create(language.unwrap()); let mut output = if let Some(output_file) = output_arg { Box::new(File::create(Path::new(&output_file)).unwrap()) as Box } else { Box::new(std::io::stdout()) as Box }; if let Some(input_file) = input_arg { for line in BufReader::new(File::open(Path::new(&input_file)).unwrap()).lines() { writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap(); } } else { let stdin = std::io::stdin(); for line in stdin.lock().lines() { writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap(); } } } } /// Wraps a usable interface around the actual stemmer implementation pub struct Stemmer { stemmer: fn(&mut SnowballEnv) -> bool, } impl Stemmer { /// Create a new stemmer from an algorithm pub fn create(lang: String) -> Self { // Have a look at ../build.rs // There we generate a file that is rust code for a closure that returns a stemmer. // We match against all the algorithms in src/snowball/algoritms/ folder. // Alas, this cannot be included as a match statement or function because of Rust's // hygenic macros. let match_language = include!(concat!(env!("OUT_DIR"), "/lang_matches.rs")); match_language(lang) } /// Stem a single word /// Please note, that the input is expected to be all lowercase (if that is applicable). 
/// The state a stemmer works on: the word being stemmed plus the positions
/// the Snowball runtime tracks in it.
///
/// All positions are byte offsets into `current` (they are used to index and
/// slice the string directly), stored as `i32`.
#[derive(Debug, Clone)]
pub struct SnowballEnv<'a> {
    /// The text being stemmed.  Borrowed from the caller by `create` and
    /// `set_current`; replaced by an owned `String` once it is modified.
    pub current: Cow<'a, str>,
    /// Current position in `current`.
    pub cursor: i32,
    /// Forward limit: operations moving forwards stop here.  `create` sets
    /// it to the length of the input.
    pub limit: i32,
    /// Backward limit: operations moving backwards stop here.  `create`
    /// sets it to 0.
    pub limit_backward: i32,
    /// Start of the slice which `slice_from` replaces and `slice_to` copies.
    pub bra: i32,
    /// End (exclusive) of that slice.
    pub ket: i32,
}
bra); let mut result = String::with_capacity(self.current.len()); { let (lhs, _) = self.current.split_at(bra as usize); let (_, rhs) = self.current.split_at(ket as usize); result.push_str(lhs); result.push_str(s); result.push_str(rhs); } // ... not very nice... let new_lim = self.limit + adjustment; self.limit = new_lim; if self.cursor >= ket { let new_cur = self.cursor + adjustment; self.cursor = new_cur; } else if self.cursor > bra { self.cursor = bra } self.current = Cow::from(result); adjustment } /// Check if s is after cursor. /// If so, move cursor to the end of s pub fn eq_s(&mut self, s: &str) -> bool { if self.cursor >= self.limit { return false; } if self.current[(self.cursor as usize)..].starts_with(s) { self.cursor += s.len() as i32; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor += 1; } true } else { false } } /// Check if 's' is before cursor /// If so, move cursor to the beginning of s pub fn eq_s_b(&mut self, s: &str) -> bool { if (self.cursor - self.limit_backward) < s.len() as i32 { false // Check if cursor -s.len is a char boundary. if not well... 
return false obv } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) || !self.current[self.cursor as usize - s.len()..].starts_with(s) { false } else { self.cursor -= s.len() as i32; true } } /// Replace string between `bra` and `ket` with s pub fn slice_from(&mut self, s: &str) -> bool { let (bra, ket) = (self.bra, self.ket); self.replace_s(bra, ket, s); true } /// Move cursor to next character pub fn next_char(&mut self) { self.cursor += 1; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor += 1; } } /// Move cursor to previous character pub fn previous_char(&mut self) { self.cursor -= 1; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor -= 1; } } pub fn hop(&mut self, mut delta: i32) -> bool { let mut res = self.cursor; while delta > 0 { delta -= 1; if res >= self.limit { return false; } res += 1; while res < self.limit && !self.current.is_char_boundary(res as usize) { res += 1; } } self.cursor = res; return true; } pub fn hop_checked(&mut self, delta: i32) -> bool { return delta >= 0 && self.hop(delta); } pub fn hop_back(&mut self, mut delta: i32) -> bool { let mut res = self.cursor; while delta > 0 { delta -= 1; if res <= self.limit_backward { return false; } res -= 1; while res > self.limit_backward && !self.current.is_char_boundary(res as usize) { res -= 1; } } self.cursor = res; return true; } pub fn hop_back_checked(&mut self, delta: i32) -> bool { return delta >= 0 && self.hop_back(delta); } // A grouping is represented by a minimum code point, a maximum code point, // and a bitfield of which code points in that range are in the grouping. // For example, in english.sbl, valid_LI is 'cdeghkmnrt'. // The minimum and maximum code points are 99 and 116, // so every time one of these grouping functions is called for g_valid_LI, // min must be 99 and max must be 116. 
There are 18 code points within that // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding: // // cdefghij klmnopqr st // 11101100 10110001 01000000 // // The first bit is the least significant. // Those three bytes become &[0b00110111, 0b10001101, 0b00000010], // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs. /// Check if the char the cursor points to is in the grouping pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor >= self.limit { return false; } if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return false; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return false; } self.next_char(); return true; } return false; } pub fn go_in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor < self.limit { if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return true; } self.next_char(); } else { return false; } } return false; } pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor <= self.limit_backward { return false; } let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.cursor = c; return false; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.cursor = c; return false; } return true; } return false; } pub fn go_in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor > self.limit_backward { let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() 
{ let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.cursor = c; return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.cursor = c; return true; } } else { return false; } } return false; } pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor >= self.limit { return false; } if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.next_char(); return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.next_char(); return true; } } return false; } pub fn go_out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor < self.limit { if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch <= max && ch >= min { ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) != 0 { return true; } } self.next_char(); } else { return false; } } return false; } pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor <= self.limit_backward { return false; } let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return true; } self.cursor = c; } return false; } pub fn go_out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor > self.limit_backward { let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch <= max && ch >= min { ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) != 0 { self.cursor = c; return true; } } } else { return false; } } return 
false; } /// Helper function that removes the string slice between `bra` and `ket` pub fn slice_del(&mut self) -> bool { self.slice_from("") } pub fn insert(&mut self, bra: i32, ket: i32, s: &str) { let adjustment = self.replace_s(bra, ket, s); if bra <= self.bra { self.bra = self.bra + adjustment; } if bra <= self.ket { self.ket = self.ket + adjustment; } } pub fn assign_to(&mut self) -> String { self.current[0..self.limit as usize].to_string() } pub fn slice_to(&mut self) -> String { self.current[self.bra as usize..self.ket as usize].to_string() } pub fn find_among(&mut self, amongs: &[Among], context: &mut T) -> i32 { use std::cmp::min; let mut i: i32 = 0; let mut j: i32 = amongs.len() as i32; let c = self.cursor; let l = self.limit; let mut common_i = 0i32; let mut common_j = 0i32; let mut first_key_inspected = false; loop { let k = i + ((j - i) >> 1); let mut diff: i32 = 0; let mut common = min(common_i, common_j); let w = &amongs[k as usize]; for lvar in common..w.0.len() as i32 { if c + common == l { diff = -1; break; } diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32; if diff != 0 { break; } common += 1; } if diff < 0 { j = k; common_j = common; } else { i = k; common_i = common; } if j - i <= 1 { if i > 0 { break; } if j == i { break; } if first_key_inspected { break; } first_key_inspected = true; } } loop { let w = &amongs[i as usize]; if common_i >= w.0.len() as i32{ self.cursor = c + w.0.len() as i32; if let Some(ref method) = w.3 { let res = method(self, context); self.cursor = c + w.0.len() as i32; if res { return w.2; } } else { return w.2; } } i = w.1; if i < 0 { return 0; } } } pub fn find_among_b(&mut self, amongs: &[Among], context: &mut T) -> i32 { let mut i: i32 = 0; let mut j: i32 = amongs.len() as i32; let c = self.cursor; let lb = self.limit_backward; let mut common_i = 0i32; let mut common_j = 0i32; let mut first_key_inspected = false; loop { let k = i + ((j - i) >> 1); let mut diff: i32 = 
0; let mut common = if common_i < common_j { common_i } else { common_j }; let w = &amongs[k as usize]; for lvar in (0..w.0.len() - common as usize).rev() { if c - common == lb { diff = -1; break; } diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32; if diff != 0 { break; } // Count up commons. But not one character but the byte width of that char common += 1; } if diff < 0 { j = k; common_j = common; } else { i = k; common_i = common; } if j - i <= 1 { if i > 0 { break; } if j == i { break; } if first_key_inspected { break; } first_key_inspected = true; } } loop { let w = &amongs[i as usize]; if common_i >= w.0.len() as i32 { self.cursor = c - w.0.len() as i32; if let Some(ref method) = w.3 { let res = method(self, context); self.cursor = c - w.0.len() as i32; if res { return w.2; } } else { return w.2; } } i = w.1; if i < 0 { return 0; } } } } snowball-3.0.1/tests/000077500000000000000000000000001500727106100144755ustar00rootroot00000000000000snowball-3.0.1/tests/stemtest.c000066400000000000000000000060531500727106100165150ustar00rootroot00000000000000/* Unit tests for handling of cases the vocabularies don't cover. */ #include #include #include /* for strlen, memcmp */ #include "libstemmer.h" #define EMOJI_FACE_THROWING_A_KISS "\xf0\x9f\x98\x98" #define U_40079 "\xf1\x80\x81\xb9" static const struct testcase { /* Stemmer to use, or 0 to test with all stemmers */ const char * language; /* Character encoding (can be 0 for UTF-8) */ const char * charenc; /* Input string (0 marks end of list) */ const char * input; /* Expected output string (0 means same as input) */ const char * expect; } testcases[] = { // Regression tests for C support code bug decoding 4 byte UTF-8 sequences. 
// https://github.com/snowballstem/snowball/issues/138 { "en", 0, "a" EMOJI_FACE_THROWING_A_KISS "ing", "a" EMOJI_FACE_THROWING_A_KISS "e" }, { "en", 0, U_40079 "wing", 0 }, // The Finnish stemmer used to damage numbers ending with two or more of // the same digit. Regression test, applied to all stemmers. // https://github.com/snowballstem/snowball/issues/66 { 0, 0, "2000", 0 }, { 0, 0, "999", 0 }, { 0, 0, "1000000000", 0 }, // The Danish stemmer used to damage a number at the end of a word. // Regression test, applied to all stemmers. // https://github.com/snowballstem/snowball/issues/81 { 0, 0, "space1999", 0 }, { 0, 0, "hal9000", 0 }, { 0, 0, "0x0e00", 0 }, { 0, 0, 0, 0 } }; static void run_testcase(const char * language, const struct testcase *test) { const char * charenc = test->charenc; const char * input = test->input; const char * expect = test->expect; struct sb_stemmer * stemmer = sb_stemmer_new(language, charenc); const sb_symbol * stemmed; int len; if (expect == NULL) expect = input; if (stemmer == 0) { if (charenc == NULL) { fprintf(stderr, "language `%s' not available for stemming\n", language); exit(1); } else { fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); exit(1); } } stemmed = sb_stemmer_stem(stemmer, (const unsigned char*)input, strlen(input)); if (stemmed == NULL) { fprintf(stderr, "Out of memory"); exit(1); } len = sb_stemmer_length(stemmer); if (len != (int)strlen(expect) || memcmp(stemmed, expect, len) != 0) { fprintf(stderr, "%s stemmer output for %s was %.*s not %s\n", language, input, len, stemmed, expect); exit(1); } sb_stemmer_delete(stemmer); } int main(int argc, char * argv[]) { const char ** all_languages = sb_stemmer_list(); const struct testcase * p; (void)argc; (void)argv; for (p = testcases; p->input; ++p) { const char * language = p->language; if (language) { run_testcase(language, p); } else { const char ** l; for (l = all_languages; *l; ++l) { run_testcase(*l, p); } } } 
return 0; }