pax_global_header00006660000000000000000000000064140271574210014515gustar00rootroot0000000000000052 comment=7bad6941bbbdf69b50645acd5568f43e6f457eba buftok-0.3.0/000077500000000000000000000000001402715742100130075ustar00rootroot00000000000000buftok-0.3.0/.github/000077500000000000000000000000001402715742100143475ustar00rootroot00000000000000buftok-0.3.0/.github/workflows/000077500000000000000000000000001402715742100164045ustar00rootroot00000000000000buftok-0.3.0/.github/workflows/ruby.yml000066400000000000000000000021451402715742100201120ustar00rootroot00000000000000# This workflow uses actions that are not certified by GitHub. # They are provided by a third-party and are governed by # separate terms of service, privacy policy, and support # documentation. # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby name: Ruby on: push: branches: [ master ] pull_request: branches: [ master ] jobs: test: runs-on: ubuntu-latest strategy: matrix: ruby-version: ['2.6', '2.7', '3.0'] steps: - uses: actions/checkout@v2 - name: Set up Ruby # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby, # change this to (see https://github.com/ruby/setup-ruby#versioning): # uses: ruby/setup-ruby@v1 uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e with: ruby-version: ${{ matrix.ruby-version }} bundler-cache: true # runs 'bundle install' and caches installed gems automatically - name: Run tests run: bundle exec rake buftok-0.3.0/.gitignore000066400000000000000000000002321402715742100147740ustar00rootroot00000000000000*.gem *.rbc .bundle .config .yardoc Gemfile.lock InstalledFiles _yardoc coverage doc/ lib/bundler/man pkg rdoc spec/reports test/tmp test/version_tmp tmp buftok-0.3.0/CONTRIBUTING.md000066400000000000000000000037231402715742100152450ustar00rootroot00000000000000## Contributing In the spirit of [free software][free-sw], **everyone** is encouraged to help improve this project. Here are some ways *you* can contribute: [free-sw]: http://www.fsf.org/licensing/essays/free-sw.html * Use alpha, beta, and pre-release versions. * Report bugs. * Suggest new features. * Write or edit documentation. * Write specifications. * Write code (**no patch is too small**: fix typos, add comments, clean up inconsistent whitespace). * Refactor code. * Fix [issues][]. * Review patches. [issues]: https://github.com/sferik/buftok/issues ## Submitting an Issue We use the [GitHub issue tracker][issues] to track bugs and features. Before submitting a bug report or feature request, check to make sure it hasn't already been submitted. When submitting a bug report, please include a [Gist][] that includes a stack trace and any details that may be necessary to reproduce the bug, including your gem version, Ruby version, and operating system. Ideally, a bug report should include a pull request with failing specs. [gist]: https://gist.github.com/ ## Submitting a Pull Request 1. [Fork the repository.][fork] 2. [Create a topic branch.][branch] 3. Add specs for your unimplemented feature or bug fix. 4. Run `bundle exec rake spec`. If your specs pass, return to step 3. 5. Implement your feature or bug fix. 6. Run `bundle exec rake spec`. If your specs fail, return to step 5. 7. Run `open coverage/index.html`. If your changes are not completely covered by your tests, return to step 3. 8. Run `RUBYOPT=W2 bundle exec rake spec 2>&1 | grep buftok`. If your changes produce any warnings, return to step 5. 9. Add documentation for your feature or bug fix. 10. Run `bundle exec rake yard`. If your changes are not 100% documented, go back to step 9. 11. Commit and push your changes. 12. [Submit a pull request.][pr] [fork]: http://help.github.com/fork-a-repo/ [branch]: http://learn.github.com/p/branching.html [pr]: http://help.github.com/send-pull-requests/ buftok-0.3.0/Gemfile000066400000000000000000000000471402715742100143030ustar00rootroot00000000000000source "https://rubygems.org" gemspec buftok-0.3.0/LICENSE.txt000066400000000000000000000021301402715742100146260ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2021 Tony Arcieri, Martin Emde, Erik Michaels-Ober Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. buftok-0.3.0/README.md000066400000000000000000000033011402715742100142630ustar00rootroot00000000000000# BufferedTokenizer [![Gem Version](http://img.shields.io/gem/v/buftok.svg)][gem] [![Build Status](https://github.com/sferik/buftok/actions/workflows/ruby.yml/badge.svg)][build] [gem]: https://rubygems.org/gems/buftok [build]: https://github.com/sferik/buftok/actions ###### Statefully split input data by a specifiable token BufferedTokenizer takes a delimiter upon instantiation, or acts line-based by default. It allows input to be spoon-fed from some outside source which receives arbitrary length datagrams which may-or-may-not contain the token by which entities are delimited. In this respect it's ideally paired with something like [EventMachine][]. [EventMachine]: http://rubyeventmachine.com/ ## Supported Ruby Versions This library aims to support and is [tested against][build] the following Ruby implementations: * Ruby 2.6 * Ruby 2.7 * Ruby 3.0 If something doesn't work on one of these interpreters, it's a bug. This code will likely still work on older versions since it has not undergone many changes since release. However, support will not be provided for end-of-life ruby versions. If you would like this library to support another Ruby version, you may volunteer to be a maintainer. Being a maintainer entails making sure all tests run and pass on that implementation. When something breaks on your implementation, you will be responsible for providing patches in a timely fashion. If critical issues for a particular implementation exist at the time of a major release, support for that Ruby version may be dropped. ## Copyright Copyright (c) 2006-2021 Tony Arcieri, Martin Emde, Erik Michaels-Ober. Distributed under the [MIT license][license]. [license]: https://opensource.org/licenses/MIT buftok-0.3.0/Rakefile000066400000000000000000000031271402715742100144570ustar00rootroot00000000000000require "bundler" require "rdoc/task" require "rake/testtask" task default: :test Bundler::GemHelper.install_tasks RDoc::Task.new do |task| task.rdoc_dir = "doc" task.title = "BufferedTokenizer" task.rdoc_files.include("lib/**/*.rb") end Rake::TestTask.new :test do |t| t.libs << "lib" t.test_files = FileList["test/**/*.rb"] end desc "Benchmark the current implementation" task :bench do require "benchmark" require File.expand_path("lib/buftok", File.dirname(__FILE__)) n = 50000 delimiter = "\n\n" frequency1 = 1000 puts "generating #{n} strings, with #{delimiter.inspect} every #{frequency1} strings..." data1 = (0...n).map do |i| ((i % frequency1 == 1 ? "\n" : "") + ("s" * i) + (i % frequency1 == 0 ? "\n" : "")).freeze end frequency2 = 10 puts "generating #{n} strings, with #{delimiter.inspect} every #{frequency2} strings..." data2 = (0...n).map do |i| ((i % frequency2 == 1 ? "\n" : "") + ("s" * i) + (i % frequency2 == 0 ? "\n" : "")).freeze end Benchmark.bmbm do |x| x.report("1 char, freq: #{frequency1}") do bt1 = BufferedTokenizer.new n.times { |i| bt1.extract(data1[i]) } end x.report("2 char, freq: #{frequency1}") do bt2 = BufferedTokenizer.new(delimiter) n.times { |i| bt2.extract(data1[i]) } end x.report("1 char, freq: #{frequency2}") do bt3 = BufferedTokenizer.new n.times { |i| bt3.extract(data2[i]) } end x.report("2 char, freq: #{frequency2}") do bt4 = BufferedTokenizer.new(delimiter) n.times { |i| bt4.extract(data2[i]) } end end end buftok-0.3.0/buftok.gemspec000066400000000000000000000015311402715742100156460ustar00rootroot00000000000000Gem::Specification.new do |spec| spec.version = "0.3.0" spec.authors = ["Tony Arcieri", "Martin Emde", "Erik Michaels-Ober"] spec.summary = %q{BufferedTokenizer extracts token delimited entities from a sequence of string inputs} spec.description = spec.summary spec.email = ["sferik@gmail.com", "martin.emde@gmail.com"] spec.files = %w(CONTRIBUTING.md LICENSE.txt README.md buftok.gemspec) + Dir["lib/**/*.rb"] spec.homepage = "https://github.com/sferik/buftok" spec.licenses = ["MIT"] spec.name = "buftok" spec.require_paths = ["lib"] spec.required_rubygems_version = ">= 1.3.5" spec.add_development_dependency "bundler", ">= 1.17" spec.add_development_dependency "rake", "~> 10.0" spec.add_development_dependency "rdoc" spec.add_development_dependency "test-unit" end buftok-0.3.0/lib/000077500000000000000000000000001402715742100135555ustar00rootroot00000000000000buftok-0.3.0/lib/buftok.rb000066400000000000000000000050161402715742100153760ustar00rootroot00000000000000# frozen_string_literal: true # # BufferedTokenizer takes a delimiter upon instantiation, or acts line-based # by default. It allows input to be spoon-fed from some outside source which # receives arbitrary length datagrams which may-or-may-not contain the token # by which entities are delimited. In this respect it's ideally paired with # something like EventMachine (http://rubyeventmachine.com/). class BufferedTokenizer # New BufferedTokenizers will operate on lines delimited by a delimiter, # which is by default the global input delimiter $/ ("\n"). # # The input buffer is stored as an array. This is by far the most efficient # approach given language constraints (in C a linked list would be a more # appropriate data structure). Segments of input data are stored in a list # which is only joined when a token is reached, substantially reducing the # number of objects required for the operation. def initialize(delimiter = $/) @delimiter = delimiter @input = [] @tail = String.new @trim = @delimiter.length - 1 end # Determine the size of the internal buffer. # # Size is not cached and is determined every time this method is called # in order to optimize throughput for extract. def size @tail.length + @input.inject(0) { |total, input| total + input.length } end # Extract takes an arbitrary string of input data and returns an array of # tokenized entities, provided there were any available to extract. This # makes for easy processing of datagrams using a pattern like: # # tokenizer.extract(data).map { |entity| Decode(entity) }.each do ... # # Using -1 makes split to return "" if the token is at the end of # the string, meaning the last element is the start of the next chunk. def extract(data) if @trim > 0 tail_end = @tail.slice!(-@trim, @trim) # returns nil if string is too short data = tail_end + data if tail_end end @input << @tail entities = data.split(@delimiter, -1) @tail = entities.shift unless entities.empty? @input << @tail entities.unshift @input.join @input.clear @tail = entities.pop end entities end # Flush the contents of the input buffer, i.e. return the input buffer even though # a token has not yet been encountered def flush @input << @tail buffer = @input.join @input.clear @tail = String.new # @tail.clear is slightly faster, but not supported on 1.8.7 buffer end end # The expected constant for a gem named buftok Buftok = BufferedTokenizer buftok-0.3.0/test/000077500000000000000000000000001402715742100137665ustar00rootroot00000000000000buftok-0.3.0/test/test_buftok.rb000066400000000000000000000027761402715742100166600ustar00rootroot00000000000000# frozen_string_literal: true # # Desipte the frozen_string_literal declaration, I'm leaving the explicit calls # to .freeze to be extra clear about treating input as immutable. require "test/unit" require "buftok" class TestBuftok < Test::Unit::TestCase def test_constant assert_same BufferedTokenizer, Buftok end def test_buftok tokenizer = BufferedTokenizer.new assert_equal %w[foo], tokenizer.extract("foo\nbar".freeze) assert_equal %w[barbaz qux], tokenizer.extract("baz\nqux\nquu".freeze) assert_equal "quu", tokenizer.flush assert_equal "", tokenizer.flush end def test_delimiter tokenizer = BufferedTokenizer.new("<>".freeze) assert_equal ["", "foo\n"], tokenizer.extract("<>foo\n<>".freeze) assert_equal %w[bar], tokenizer.extract("bar<>baz".freeze) assert_equal "baz", tokenizer.flush end def test_split_delimiter tokenizer = BufferedTokenizer.new("<>".freeze) assert_equal [], tokenizer.extract("foo<".freeze) assert_equal %w[foo], tokenizer.extract(">bar<".freeze) assert_equal %w[barqux<>".freeze) assert_equal "", tokenizer.flush end def test_size tokenizer = BufferedTokenizer.new("<>".freeze) assert_equal [], tokenizer.extract("foo<".freeze) assert_equal 4, tokenizer.size assert_equal %w[foo], tokenizer.extract(">bar<".freeze) assert_equal 4, tokenizer.size assert_equal %w[barqux<>".freeze) assert_equal 0, tokenizer.size end end