pax_global_header00006660000000000000000000000064150677654170014534gustar00rootroot0000000000000052 comment=4dc3520ab40f4be81088d9d80e8a8d68b8ca78a4 vsearch-2.30.1/000077500000000000000000000000001506776541700132525ustar00rootroot00000000000000vsearch-2.30.1/.dockerignore000066400000000000000000000000341506776541700157230ustar00rootroot00000000000000.git .gitignore .travis.yml vsearch-2.30.1/.github/000077500000000000000000000000001506776541700146125ustar00rootroot00000000000000vsearch-2.30.1/.github/workflows/000077500000000000000000000000001506776541700166475ustar00rootroot00000000000000vsearch-2.30.1/.github/workflows/jekyll-gh-pages.yml000066400000000000000000000032141506776541700223550ustar00rootroot00000000000000# Sample workflow for building and deploying a Jekyll site to GitHub Pages name: Deploy Jekyll with GitHub Pages dependencies preinstalled on: # Runs on pushes targeting the dev branch push: branches: ["dev"] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: contents: write pages: write id-token: write # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
concurrency: group: "pages" cancel-in-progress: false jobs: # Build job build: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 with: ref: dev - name: generate markdown files working-directory: ./man run: | sudo apt update sudo apt install -y pandoc bash ./scripts/generate_online_documentation.sh ls -lh ../docs/* - name: Setup Pages uses: actions/configure-pages@v5 - name: Build with Jekyll uses: actions/jekyll-build-pages@v1 with: source: ./docs destination: ./_site - name: Upload artifact uses: actions/upload-pages-artifact@v3 # Deployment job deploy: environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest needs: build steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 vsearch-2.30.1/.gitignore000066400000000000000000000004441506776541700152440ustar00rootroot00000000000000*.a *.clang *.json *.o *.pdf *.texinfo *~ .cache/ .deps .dirstamp /autom4te.cache /bin /docs /man/manpages /config.h /config.log /config.status /config.guess /config.sub /stamp-h1 Makefile aclocal.m4 config.h.in configure compile depcomp install-sh missing Makefile.in .vscode .DS_Store .Tpo vsearch-2.30.1/.travis.yml000066400000000000000000000004671506776541700153720ustar00rootroot00000000000000language: - cpp arch: - arm64 os: - linux addons: apt: packages: - ghostscript - groff compiler: - g++ - clang script: - ./autogen.sh - ./configure - make - export PATH=$PWD/bin:$PATH - git clone https://github.com/frederic-mahe/vsearch-tests.git - cd vsearch-tests - bash ./run_all_tests.sh vsearch-2.30.1/CITATION.cff000066400000000000000000000026171506776541700151520ustar00rootroot00000000000000cff-version: 1.2.0 message: "If you use this software, please cite it as below." 
authors: - family-names: "Rognes" given-names: "Torbjørn" orcid: "https://orcid.org/0000-0002-9329-9974" - family-names: "Flouri" given-names: "Tomas" orcid: "https://orcid.org/0000-0002-8474-9507" - family-names: "Nichols" given-names: "Benjamin" - family-names: "Quince" given-names: "Christopher" orcid: "https://orcid.org/0000-0003-1884-8440" - family-names: "Mahé" given-names: "Frédéric" orcid: "https://orcid.org/0000-0002-2808-0984" title: "VSEARCH: versatile open-source tool for microbiome analysis" version: 2.22.1 date-released: 2022-09-19 url: "https://github.com/torognes/vsearch" preferred-citation: type: article authors: - family-names: "Rognes" given-names: "Torbjørn" orcid: "https://orcid.org/0000-0002-9329-9974" - family-names: "Flouri" given-names: "Tomas" orcid: "https://orcid.org/0000-0002-8474-9507" - family-names: "Nichols" given-names: "Ben" - family-names: "Quince" given-names: "Christopher" orcid: "https://orcid.org/0000-0003-1884-8440" - family-names: "Mahé" given-names: "Frédéric" orcid: "https://orcid.org/0000-0002-2808-0984" doi: "10.7717/peerj.2584" journal: "Peer Journal" day: 18 month: 10 start: e2584 # First page number title: "VSEARCH: a versatile open source tool for metagenomic" volume: 4 year: 2016 vsearch-2.30.1/Dockerfile000066400000000000000000000006261506776541700152500ustar00rootroot00000000000000FROM alpine:latest WORKDIR /opt/vsearch COPY . . RUN apk add --no-cache \ libstdc++ zlib-dev bzip2-dev \ autoconf automake make g++ && \ ./autogen.sh && \ ./configure CFLAGS="-O2" CXXFLAGS="-O2" && \ make clean && \ make && \ make install && \ make clean && \ apk del autoconf automake make g++ && \ rm -rf /opt/vsearch ENTRYPOINT ["/usr/local/bin/vsearch"] vsearch-2.30.1/LICENSE.txt000066400000000000000000000046441506776541700151050ustar00rootroot00000000000000 VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vsearch-2.30.1/LICENSE_GNU_GPL3.txt000066400000000000000000001045131506776541700163770ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. 
Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. 
To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. 
If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. 
For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". 
c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . vsearch-2.30.1/Makefile.am000066400000000000000000000001051506776541700153020ustar00rootroot00000000000000AUTOMAKE_OPTIONS = foreign SUBDIRS = src man EXTRA_DIST = autogen.sh vsearch-2.30.1/README.md000066400000000000000000000575651506776541700145530ustar00rootroot00000000000000[![Build Status](https://app.travis-ci.com/torognes/vsearch.svg?branch=master)](https://app.travis-ci.com/torognes/vsearch) # VSEARCH ## Introduction The aim of this project is to create an alternative to the [USEARCH](https://www.drive5.com/usearch/) tool developed by Robert C. Edgar (2010). The new tool should: * have open source code with an appropriate open source license * be free of charge, gratis * have a 64-bit design that handles very large databases and much more than 4GB of memory * be as accurate or more accurate than usearch * be as fast or faster than usearch We have implemented a tool called VSEARCH which supports *de novo* and reference based chimera detection, clustering, full-length and prefix dereplication, rereplication, reverse complementation, masking, all-vs-all pairwise global alignment, exact and global alignment searching, shuffling, subsampling and sorting. 
It also supports FASTQ file analysis, filtering, conversion and merging of paired-end reads. VSEARCH stands for vectorized search, as the tool takes advantage of parallelism in the form of SIMD vectorization as well as multiple threads to perform accurate alignments at high speed. VSEARCH uses an optimal global aligner (full dynamic programming Needleman-Wunsch), in contrast to USEARCH which by default uses a heuristic seed and extend aligner. This usually results in more accurate alignments and overall improved sensitivity (recall) with VSEARCH, especially for alignments with gaps. [VSEARCH binaries](https://github.com/torognes/vsearch/releases/latest) are provided for GNU/Linux on five 64-bit processor architectures: x86_64, POWER8 (ppc64le), ARMv8 (aarch64), little-endian 64-bit RISC-V (riscv64), and little-endian 64-bit MIPS (mips64el). Binaries are also provided for macOS (version 10.9 Mavericks or later) on Intel (x86_64) and Apple Silicon (ARMv8), as well as Windows (64-bit, version 7 or higher, on x86_64). VSEARCH contains native SIMD code for three processor architectures (SSE2/SSSE3, AltiVec/VMX/VSX, Neon). In addition, VSEARCH uses the SIMD Everywhere (SIMDe) library to enable building on riscv64, mips64el, and other little-endian architectures, but the performance may be lower than a native implementation. | CPU \ OS | GNU/Linux | macOS | Windows | | ------------- | :-----------: | :----: | :-------: | | x86_64 | ✔ | ✔ | ✔ | | ARMv8 | ✔ | ✔ | | | POWER8 | ✔ | | | | RISC-V 64 LE | ✔ | | | | MIPS 64 LE | not tested | | | Various packages, plugins and wrappers for VSEARCH are also available from other sources - see [below](https://github.com/torognes/vsearch#packages-plugins-and-wrappers). The source code compiles correctly with `gcc` (versions 4.8.5 to 14.0) and `llvm-clang` (3.8 to 19.0). The source code should also compile on [FreeBSD](https://www.freebsd.org/) and [NetBSD](https://www.netbsd.org/) systems. 
VSEARCH can directly read input query and database files that are compressed using gzip (.gz) and bzip2 (.bz2) if the zlib and bzip2 libraries are available. Most of the nucleotide based commands and options in USEARCH version 7 are supported, as well as some in version 8. The same option names as in USEARCH version 7 have been used in order to make VSEARCH an almost drop-in replacement. VSEARCH does not support amino acid sequences or local alignments. These features may be added in the future. ## Getting Help If you can't find an answer in the [online documentation](https://torognes.github.io/vsearch/), or in the [manpage](https://github.com/torognes/vsearch/releases/download/v2.30.1/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. ## Example In the example below, VSEARCH will identify sequences in the file database.fsa that are at least 90% identical on the plus strand to the query sequences in the file queries.fsa and write the results to the file alnout.txt. `./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9 --alnout alnout.txt` ## Download and install **Source distribution** To download the source distribution from a [release](https://github.com/torognes/vsearch/releases) and build the executable and the documentation, use the following commands: ``` wget https://github.com/torognes/vsearch/archive/v2.30.1.tar.gz tar xzf v2.30.1.tar.gz cd vsearch-2.30.1 ./autogen.sh ./configure CFLAGS="-O2" CXXFLAGS="-O2" make ARFLAGS="cr" sudo make install ``` You may customize the installation directory using the `--prefix=DIR` option to `configure`. If the compression libraries [zlib](https://www.zlib.net) and/or [bzip2](https://www.sourceware.org/bzip2/) are installed on the system, they will be detected automatically and support for compressed files will be included in vsearch (see section **Dependencies** below). 
Support for compressed files may be disabled using the `--disable-zlib` and `--disable-bzip2` options to `configure`. A PDF version of the manual will be created from the `vsearch.1` manual file if `ps2pdf` is available, unless disabled using the `--disable-pdfman` option to `configure`. It is recommended to run configure with the options `CFLAGS="-O2"` and `CXXFLAGS="-O2"`. Other options may also be applied to `configure`; please run `configure -h` to see them all. GNU autoconf (version 2.63 or later), automake and the GCC C++ (`g++`) compiler are required to build vsearch. Version 3.82 or later of `make` may be required on Linux, while version 3.81 is sufficient on macOS. Warning: Compiling the `align_simd.cc` file on x86_64 systems using the GNU C++ compiler version 9 or later with the `-O3` optimization option could result in incorrect code that may cause bad alignments in some circumstances. This was due to the `-ftree-partial-pre` optimization enabled by `-O3`. A compiler pragma has been inserted in the code to specifically turn off this optimization for the affected code. Using `-O3` should now be safe. To build VSEARCH on Debian and similar Linux distributions (Ubuntu etc) you'll need the following packages: autoconf, automake, g++, ghostscript, groff, libbz2-dev, make, zlib1g-dev. Include libsimde-dev to build on riscv64 or mips64el. To build VSEARCH on Fedora and similar Linux distributions (RHEL, Centos etc) you'll need the following packages: autoconf, automake, bzip2-devel, gcc-c++, ghostscript, groff-base, make, zlib-devel. Instead of downloading the source distribution as a compressed archive, you could clone the repo and build it as shown below. The options to `configure` as described above are still valid. 
``` git clone https://github.com/torognes/vsearch.git cd vsearch ./autogen.sh ./configure CFLAGS="-O2" CXXFLAGS="-O2" make ARFLAGS="cr" sudo make install ``` **Binary distribution**: Starting with version 1.4.0, binary distribution files containing pre-compiled binaries as well as the documentation will be made available as part of each [release](https://github.com/torognes/vsearch/releases). The included executables include support for input files compressed by zlib and bzip2 (with files usually ending in `.gz` or `.bz2`). Binary distributions are provided for x86-64 systems running GNU/Linux, macOS (version 10.9 or higher) or Windows (64-bit, version 7 or higher), 64-bit ARMv8 (aarch64) systems running GNU/Linux or macOS, as well as POWER8 (ppc64le), 64-bit little-endian RISC-V (riscv64), and 64-bit little-endian MIPS (mips64el) systems running GNU/Linux. A universal macOS binary is also provided. In addition, an x86_64 binary built for the discontinued RHEL 7 and CentOS 7 Linux distributions is provided. The other Linux binaries are built on Debian 11 (oldstable, Bullseye). Static binaries are available for all Linux architectures except x86_64; these can be used on systems that do not have all the necessary libraries installed. The Windows binary was built with cross compilation using [Mingw-w64](http://mingw-w64.org/). Download the appropriate executable for your system using the following commands if you are using a Linux or macOS system: ```sh wget https://github.com/torognes/vsearch/releases/download/v{VERSION}/vsearch-{VERSION}-{OS}-{ARCH}.tar.gz tar xzf vsearch-{VERSION}-{OS}-{ARCH}.tar.gz ``` Replace `{VERSION}` with the VSEARCH version number (e.g. `2.30.1`), `{OS}` with the target operating system (`linux` or `macos`), and `{ARCH}` with the architecture (`x86_64`, `aarch64`, `ppc64le`, `riscv64`, or `mips64el`). You could add `-static` after `{ARCH}` to get a statically compiled version for Linux (except x86_64). 
The name of the binary for the RHEL 7 and CentOS 7 Linux distributions ends in `-ubi7`. Or, if you are using Windows, download and extract (unzip) the contents of this file: ``` https://github.com/torognes/vsearch/releases/download/v{VERSION}/vsearch-{VERSION}-win-x86_64.zip ``` **Linux and Mac**: You will now have the binary distribution in a folder called `vsearch-{VERSION}-{OS}-{ARCH}` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. **Windows**: You will now have the binary distribution in a folder called `vsearch-{VERSION}-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. If you want to be able to call `vsearch.exe` from any command prompt window, you can put the VSEARCH executable in a folder (for instance `C:\Users\<yourname>\bin`), and add the new folder to the user `Path`: open the `Environment Variables` window by searching for it in the Start menu, `Edit` user variables, add `;C:\Users\<yourname>\bin` to the end of the `Path` variable, and save your changes. The Windows distribution also includes the `libbz2.dll` and `zlib1.dll` files required for reading compressed input files. These DLLs have been obtained for mingw-w64 from the MSYS2 platform. **Documentation:** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.30.1/vsearch_manual.pdf)) will be generated by `make`. 
To install the manpage manually, copy the `vsearch.1` file or create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.30.1/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). ## Packages, plugins, and wrappers **Conda package** Thanks to the [BioConda](https://bioconda.github.io/) team, there is now a [vsearch package](https://anaconda.org/bioconda/vsearch) in [Conda](https://conda.io/). **Debian package** Thanks to the [Debian Med](https://www.debian.org/devel/debian-med/) team, there is now a [vsearch](https://packages.debian.org/sid/vsearch) package in [Debian](https://www.debian.org/). **FreeBSD ports package** Thanks to [Jason Bacon](https://github.com/outpaddling), a [vsearch](https://www.freebsd.org/cgi/ports.cgi?query=vsearch&stype=all) [FreeBSD ports](https://www.freebsd.org/ports/) package is available. Install the binary package with `pkg install vsearch`, or build from source with additional optimizations. **Galaxy wrapper** Thanks to the work of the [Intergalactic Utilities Commission](https://wiki.galaxyproject.org/IUC) members, VSEARCH is now part of the [Galaxy ToolShed](https://toolshed.g2.bx.psu.edu/view/iuc/vsearch/). **Homebrew package** Thanks to [Torsten Seemann](https://github.com/tseemann), a [vsearch package](https://formulae.brew.sh/formula/vsearch) for [Homebrew](http://brew.sh/) has been made. **Pkgsrc package** Thanks to [Jason Bacon](https://github.com/outpaddling), a vsearch [pkgsrc](https://www.pkgsrc.org) package is available for NetBSD and other UNIX-like systems. Install the binary package with `pkgin install vsearch`, or build from source with additional optimizations. 
**QIIME 2 plugin** Thanks to the [QIIME 2](https://github.com/qiime2) team, there is now a plugin called [q2-vsearch](https://github.com/qiime2/q2-vsearch) for [QIIME 2](https://qiime2.org). ## Converting output to a biom file for use in QIIME and other software With the `from-uc` command in [biom](http://biom-format.org/) 2.1.5 or later, it is possible to convert data in a `.uc` file produced by vsearch into a biom file that can be read by QIIME and other software. It is described [here](https://gist.github.com/gregcaporaso/f3c042e5eb806349fa18). Please note that VSEARCH version 2.2.0 and later are able to directly output OTU tables in biom 1.0 format as well as the classic and mothur formats. ## Implementation details and initial assessment Please see the paper for details: Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584 doi: [10.7717/peerj.2584](https://doi.org/10.7717/peerj.2584) ## Dependencies Compiling VSEARCH requires either GCC (`g++`) or `clang`, `make` and the autotools (`ui-auto` on Debian-based distributions). Optionally, the header files for the following two optional libraries are required if support for gzip and bzip2 compressed FASTA and FASTQ input files is needed: * libz (zlib library) (`zlib.h` header file, available as `zlib1g-dev` on Debian-based distributions) (optional) * libbz2 (bzip2lib library) (`bzlib.h` header file, available as `libbz2-dev` on Debian-based distributions) (optional) VSEARCH will automatically check whether these libraries are available and load them dynamically. On Windows these libraries are called `zlib1.dll` and `libbz2.dll`. These DLLs are included with the released distribution of vsearch 2.27.0 and later. To create the PDF file with the manual the ps2pdf tool is required. It is part of the `ghostscript` package. 
## VSEARCH license and third party licenses The VSEARCH code is dual-licensed either under the GNU General Public License version 3 or under the BSD 2-clause license. Please see LICENSE.txt for details. VSEARCH includes code from several other projects. We thank the authors for making their source code available. VSEARCH includes code from Google's [CityHash project](https://github.com/google/cityhash) by Geoff Pike and Jyrki Alakuijala, providing some excellent hash functions available under an MIT license. VSEARCH includes code derived from Tatusov and Lipman's DUST program that is in the public domain. VSEARCH includes public domain code written by Alexander Peslyak for the MD5 message digest algorithm. VSEARCH includes public domain code written by Steve Reid and others for the SHA1 message digest algorithm. The VSEARCH distribution includes code from GNU Autoconf which normally is available under the GNU General Public License, but may be distributed with the special autoconf configure script exception. VSEARCH may include code from the [zlib](https://www.zlib.net) library copyright Jean-loup Gailly and Mark Adler, distributed under the [zlib license](https://www.zlib.net/zlib_license.html). VSEARCH may include code from the [bzip2](https://www.sourceware.org/bzip2/) library copyright Julian R. Seward, distributed under a BSD-style license. ## Code The code is written mostly in C++. File | Description ---|--- **align_simd.cc** | SIMD parallel global alignment of 1 query with 8 database sequences **allpairs.cc** | All-vs-all optimal global pairwise alignment (no heuristics) **arch.cc** | Architecture specific code (Mac/Linux) **attributes.cc** | Extraction and printing of attributes in FASTA headers **bitmap.cc** | Implementation of bitmaps **chimera.cc** | Chimera detection **city.cc** | CityHash code **cluster.cc** | Clustering (cluster\_fast and cluster\_smallmem) **cpu.cc** | Code dependent on specific cpu features (e.g. 
ssse3) **cut.cc** | Restriction site cutting **db.cc** | Handles the database file read, access etc **dbhash.cc** | Database hashing for exact searches **dbindex.cc** | Indexes the database by identifying unique kmers in the sequences **derep.cc** | Dereplication, full-length **derep_prefix.cc** | Dereplication, prefix **derep_smallmem.cc** | Dereplication, small memory usage **dynlibs.cc** | Dynamic loading of compression libraries **eestats.cc** | Produce statistics for fastq_eestats command **fasta.cc** | FASTA file parser **fasta2fastq.cc** | FASTA to FASTQ conversion **fastq.cc** | FASTQ file parser **fastq_chars.cc** | FASTQ statistics **fastq_join.cc** | FASTQ paired-end reads joining **fastqops.cc** | FASTQ file statistics etc **fastx.cc** | Detection of FASTA and FASTQ files, wrapper for FASTA and FASTQ parsers **filter.cc** | Trimming and filtering of sequences in FASTA and FASTQ files **getseq.cc** | Extraction of sequences based on header labels **kmerhash.cc** | Hash for kmers used by paired-end read merger **linmemalign.cc** | Linear memory global sequence aligner **mask.cc** | Masking (DUST) **md5.c** | MD5 message digest **mergepairs.cc** | Paired-end read merging **minheap.cc** | A minheap implementation for the list of top kmer matches **msa.cc** | Simple multiple sequence alignment and consensus sequence computation for clusters **orient.cc** | Orient direction of sequences based on reference database **otutable.cc** | Generate OTU tables in various formats **rereplicate.cc** | Rereplication **results.cc** | Output results in various formats (alnout, userout, blast6, uc) **search.cc** | Implements search using global alignment **search_exact.cc** | Exact search functions **searchcore.cc** | Core search functions for searching, clustering and chimera detection **sff_convert.cc** | SFF to FASTQ file conversion **sha1.c** | SHA1 message digest **showalign.cc** | Output an alignment in a human-readable way given a CIGAR-string and the sequences 
**shuffle.cc** | Shuffle sequences **sintax.cc** | Taxonomic classification using Sintax method **sortbylength.cc** | Code for sorting by length **sortbysize.cc** | Code for sorting by size (abundance) **subsample.cc** | Subsampling reads from a FASTA file **tax.cc** | Taxonomy information parsing **udb.cc** | UDB database file handling **unique.cc** | Find unique kmers in a sequence **userfields.cc** | Code for parsing the userfields option argument **util.cc** | Various common utility functions **vsearch.cc** | Main program file, general initialization, reads arguments and parses options, writes info. **utils/maps.cc** | Utilities, maps for encoding of nucleotides **utils/seqcmp.cpp** | Utilities, sequence comparison VSEARCH may be compiled with zlib or bzip2 integration that allows it to read compressed FASTA files. The [zlib](http://www.zlib.net/) and the [bzip2](https://www.sourceware.org/bzip2/) libraries are needed for this. ## Bugs All bug reports are highly appreciated. You may submit a bug report here on GitHub as an [issue](https://github.com/torognes/vsearch/issues) (preferred), you could post a message on the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) or you could send an email to [torognes@ifi.uio.no](mailto:torognes@ifi.uio.no?subject=bug_in_vsearch). ## Limitations VSEARCH is designed for rather short sequences, and will be slow when sequences are longer than about 5,000 bp. This is because it always performs optimal global alignment on selected sequences. ## The VSEARCH team The main contributors to VSEARCH: * Torbjørn Rognes (Coding, testing, documentation, evaluation) * Frédéric Mahé (Documentation, testing, coding, feature suggestions) * Tomáš Flouri (Coding, testing) * Christopher Quince (Initiator, feature suggestions, evaluation) * Ben Nichols (Evaluation) ## Acknowledgements Special thanks to the following people for patches, suggestions, computer access etc: * Davide Albanese * Colin Brislawn * Michael R. 
Crusoe * Jeff Epler * Christopher M. Sullivan * Andreas Tille * Sarah Westcott ## Citing VSEARCH Please cite the following publication if you use VSEARCH: Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. doi: [10.7717/peerj.2584](https://doi.org/10.7717/peerj.2584) Please note that citing any of the underlying algorithms, e.g. UCHIME, may also be appropriate. ## Test datasets Test datasets (found in the separate vsearch-data repository) were obtained from the BioMarks project ([Logares et al. 2014](https://doi.org/10.1016/j.cub.2014.02.050)), the [TARA OCEANS project](https://oceans.taraexpeditions.org/en/) ([Karsenti et al. 2011](https://doi.org/10.1371/journal.pbio.1001177)) and the [Protist Ribosomal Reference Database (PR2)](https://github.com/pr2database/pr2database) ([Guillou et al. 2013](https://doi.org/10.1093/nar/gks1160)). ## References * Edgar RC (2010) **Search and clustering orders of magnitude faster than BLAST.** *Bioinformatics*, 26 (19): 2460-2461. doi:[10.1093/bioinformatics/btq461](https://doi.org/10.1093/bioinformatics/btq461) * Edgar RC (2016) **SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences.** *bioRxiv*. doi:[10.1101/074161](https://doi.org/10.1101/074161) * Edgar RC (2016) **UNOISE2: improved error-correction for Illumina 16S and ITS amplicon sequencing.** *bioRxiv*. doi:[10.1101/081257](https://doi.org/10.1101/081257) * Edgar RC, Flyvbjerg H (2015) **Error filtering, pair assembly and error correction for next-generation sequencing reads.** *Bioinformatics*, 31 (21): 3476-3482. doi:[10.1093/bioinformatics/btv401](https://doi.org/10.1093/bioinformatics/btv401) * Edgar RC, Haas BJ, Clemente JC, Quince C, Knight R (2011) **UCHIME improves sensitivity and speed of chimera detection.** *Bioinformatics*, 27 (16): 2194-2200. 
doi:[10.1093/bioinformatics/btr381](https://doi.org/10.1093/bioinformatics/btr381) * Guillou L, Bachar D, Audic S, Bass D, Berney C, Bittner L, Boutte C, Burgaud G, de Vargas C, Decelle J, del Campo J, Dolan J, Dunthorn M, Edvardsen B, Holzmann M, Kooistra W, Lara E, Lebescot N, Logares R, Mahé F, Massana R, Montresor M, Morard R, Not F, Pawlowski J, Probert I, Sauvadet A-L, Siano R, Stoeck T, Vaulot D, Zimmermann P & Christen R (2013) **The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy.** *Nucleic Acids Research*, 41 (D1), D597-D604. doi:[10.1093/nar/gks1160](https://doi.org/10.1093/nar/gks1160) * Karsenti E, González Acinas S, Bork P, Bowler C, de Vargas C, Raes J, Sullivan M B, Arendt D, Benzoni F, Claverie J-M, Follows M, Jaillon O, Gorsky G, Hingamp P, Iudicone D, Kandels-Lewis S, Krzic U, Not F, Ogata H, Pesant S, Reynaud E G, Sardet C, Sieracki M E, Speich S, Velayoudon D, Weissenbach J, Wincker P & the Tara Oceans Consortium (2011) **A holistic approach to marine eco-systems biology.** *PLoS Biology*, 9(10), e1001177. doi:[10.1371/journal.pbio.1001177](https://doi.org/10.1371/journal.pbio.1001177) * Logares R, Audic S, Bass D, Bittner L, Boutte C, Christen R, Claverie J-M, Decelle J, Dolan J R, Dunthorn M, Edvardsen B, Gobet A, Kooistra W H C F, Mahé F, Not F, Ogata H, Pawlowski J, Pernice M C, Romac S, Shalchian-Tabrizi K, Simon N, Stoeck T, Santini S, Siano R, Wincker P, Zingone A, Richards T, de Vargas C & Massana R (2014) **The patterning of rare and abundant community assemblages in coastal marine-planktonic microbial eukaryotes.** *Current Biology*, 24(8), 813-821. doi:[10.1016/j.cub.2014.02.050](https://doi.org/10.1016/j.cub.2014.02.050) * Rognes T (2011) **Faster Smith-Waterman database searches by inter-sequence SIMD parallelisation.** *BMC Bioinformatics*, 12: 221. 
doi:[10.1186/1471-2105-12-221](https://doi.org/10.1186/1471-2105-12-221) vsearch-2.30.1/_config.yml000066400000000000000000000006531506776541700154050ustar00rootroot00000000000000remote_theme: pages-themes/hacker@v0.2.0 # attempt to fix lack of CSS rendering in deployed mode baseurl: /vsearch url: https://torognes.github.io plugins: - jekyll-relative-links - jekyll-remote-theme relative_links: enabled: true collections: true include: - CONTRIBUTING.md - README.md - LICENSE.md - COPYING.md - CODE_OF_CONDUCT.md - CONTRIBUTING.md - ISSUE_TEMPLATE.md - PULL_REQUEST_TEMPLATE.md vsearch-2.30.1/autogen.sh000077500000000000000000000000471506776541700152540ustar00rootroot00000000000000#!/bin/sh autoreconf --force --install vsearch-2.30.1/configure.ac000066400000000000000000000062031506776541700155410ustar00rootroot00000000000000# -*- Autoconf -*- # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) AC_INIT([vsearch], [2.30.1], [torognes@ifi.uio.no], [vsearch], [https://github.com/torognes/vsearch]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C++]) AC_CONFIG_SRCDIR([src/vsearch.cc]) AC_CONFIG_HEADERS([config.h]) AC_SUBST(MACOSX_DEPLOYMENT_TARGET) MACOSX_DEPLOYMENT_TARGET="10.9" # Checks for programs. AC_PROG_CXX AC_PROG_RANLIB AC_PROG_INSTALL # Checks for libraries. AC_CHECK_LIB([pthread], [pthread_create]) AC_CHECK_LIB([dl], [dlopen]) AC_CHECK_LIB([psapi], [GetProcessMemoryInfo]) # Checks for header files. AC_CHECK_HEADERS([getopt.h fcntl.h regex.h string.h sys/time.h dlfcn.h pthread.h]) # Checks for typedefs, structures, and compiler characteristics. AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_UINT32_T AC_TYPE_INT64_T AC_TYPE_UINT64_T AC_TYPE_UINT8_T # Checks for library functions. 
AC_CHECK_FUNCS([memmove memcpy posix_memalign gettimeofday localtime memchr memset pow regcomp strchr strcspn sysinfo]) have_bzip2=no AC_ARG_ENABLE(bzip2, AS_HELP_STRING([--disable-bzip2], [Disable bzip2 support])) AS_IF([test "x$enable_bzip2" != "xno"], [ have_bzip2=yes ]) if test "x${have_bzip2}" = "xyes"; then AC_CHECK_HEADERS([bzlib.h], [], [have_bzip2=no]) fi have_zlib=no AC_ARG_ENABLE(zlib, AS_HELP_STRING([--disable-zlib], [Disable zlib support])) AS_IF([test "x$enable_zlib" != "xno"], [ have_zlib=yes ]) if test "x${have_zlib}" = "xyes"; then AC_CHECK_HEADERS([zlib.h], [], [have_zlib=no]) fi have_ps2pdf=no AC_ARG_ENABLE(pdfman, AS_HELP_STRING([--disable-pdfman], [Disable PDF manual creation])) AS_IF([test "x$enable_pdfman" != "xno"], [ have_ps2pdf=yes AC_CHECK_PROG(HAVE_PS2PDF, ps2pdf, yes, no) if test "x$HAVE_PS2PDF" = "xno"; then AC_MSG_WARN([*** ps2pdf is required to build a PDF version of the manual]) have_ps2pdf=no fi ]) # Check for --enable-profiling option AC_ARG_ENABLE([profiling], [AS_HELP_STRING([--enable-profiling], [Enable profiling build])], [enable_profiling=$enableval], [enable_profiling=no]) # Define AM_CONDITIONAL for profiling AM_CONDITIONAL([ENABLE_PROFILING], [test "x$enable_profiling" = "xyes"]) # Check for --enable-debug option AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable debug build])], [enable_debug=$enableval], [enable_debug=no]) # Define AM_CONDITIONAL for debug AM_CONDITIONAL([ENABLE_DEBUG], [test "x$enable_debug" = "xyes"]) have_man_html=no case $target in aarch64*) target_aarch64="yes" ;; powerpc64*) target_ppc="yes" ;; x86_64*) target_x86_64="yes" ;; esac AC_CHECK_HEADERS([windows.h], [AM_CONDITIONAL(TARGET_WIN, true)], [AM_CONDITIONAL(TARGET_WIN, false)]) AM_CONDITIONAL(HAVE_PS2PDF, test "x${have_ps2pdf}" = "xyes") AM_CONDITIONAL(HAVE_MAN_HTML, test "x${have_man_html}" = "xyes") AM_CONDITIONAL(TARGET_PPC, test "x${target_ppc}" = "xyes") AM_CONDITIONAL(TARGET_AARCH64, test "x${target_aarch64}" = "xyes") 
AM_CONDITIONAL(TARGET_X86_64, test "x${target_x86_64}" = "xyes") AM_PROG_CC_C_O AC_CONFIG_FILES([Makefile src/Makefile man/Makefile]) AC_OUTPUT vsearch-2.30.1/dockerfiles/000077500000000000000000000000001506776541700155445ustar00rootroot00000000000000vsearch-2.30.1/dockerfiles/Dockerfile.debian000066400000000000000000000005171506776541700207620ustar00rootroot00000000000000FROM debian:latest WORKDIR /opt/vsearch COPY . . RUN apt-get update RUN apt-get -y install \ autoconf \ automake \ g++ \ ghostscript \ groff \ libbz2-dev \ make \ zlib1g-dev RUN ./autogen.sh RUN ./configure CFLAGS="-O2" CXXFLAGS="-O2" RUN make clean RUN make ARFLAGS="cr" RUN make install ENTRYPOINT ["/usr/local/bin/vsearch"] vsearch-2.30.1/dockerfiles/Dockerfile.fedora000066400000000000000000000005301506776541700207730ustar00rootroot00000000000000FROM fedora:latest WORKDIR /opt/vsearch COPY . . RUN yum update -y RUN yum -y install \ autoconf \ automake \ bzip2-devel \ gcc-c++ \ ghostscript \ groff-base \ make \ zlib-devel RUN ./autogen.sh RUN ./configure CFLAGS="-O2" CXXFLAGS="-O2" RUN make clean RUN make ARFLAGS="cr" RUN make install ENTRYPOINT ["/usr/local/bin/vsearch"] vsearch-2.30.1/man/000077500000000000000000000000001506776541700140255ustar00rootroot00000000000000vsearch-2.30.1/man/Makefile.am000077500000000000000000000010671506776541700160700ustar00rootroot00000000000000# Makefile for creating PDF manual from man file dist_man_MANS = vsearch.1 doc_DATA = CLEANFILES = if HAVE_MAN_HTML doc_DATA += vsearch_manual.html vsearch_manual.html : vsearch.1 sed -e 's/\\-/-/g' $< | \ iconv -f UTF-8 -t ISO-8859-1 | \ groff -t -m mandoc -m www -Thtml > $@ CLEANFILES += vsearch_manual.html endif if HAVE_PS2PDF doc_DATA += vsearch_manual.pdf vsearch_manual.pdf : vsearch.1 sed -e 's/\\-/-/g' $< | \ iconv -f UTF-8 -t ISO-8859-1 | \ groff -W space -t -m mandoc -T ps -P -pa4 | ps2pdf - $@ CLEANFILES += vsearch_manual.pdf endif 
vsearch-2.30.1/man/commands/000077500000000000000000000000001506776541700156265ustar00rootroot00000000000000vsearch-2.30.1/man/commands/fragments/000077500000000000000000000000001506776541700176145ustar00rootroot00000000000000vsearch-2.30.1/man/commands/fragments/date.md000066400000000000000000000000241506776541700210470ustar00rootroot00000000000000% February 27, 2025 vsearch-2.30.1/man/commands/fragments/footer.md000066400000000000000000000074621506776541700214450ustar00rootroot00000000000000# CITATION Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. **PeerJ** 4:e2584 doi: [10.7717/peerj.2584](https://doi.org/10.7717/peerj.2584) # REPORTING BUGS Submit suggestions and bug-reports at [https://github.com/torognes/vsearch/issues](https://github.com/torognes/vsearch/issues), send a pull request on [https://github.com/torognes/vsearch](https://github.com/torognes/vsearch), or compose a friendly or curmudgeonly e-mail to Torbjørn Rognes (torognes@ifi.uio.no). # AVAILABILITY Source code and binaries are available at [https://github.com/torognes/vsearch](https://github.com/torognes/vsearch). # COPYRIGHT Copyright (C) 2014-2025, Torbjørn Rognes, Frédéric Mahé and Tomás Flouri All rights reserved. Contact: Torbjørn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. ## GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see [http://www.gnu.org/licenses/](http://www.gnu.org/licenses/). ## The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ACKNOWLEDGMENTS We would like to thank the authors of the following projects for making their source code available: - **vsearch** includes code from Google's CityHash project by Geoff Pike and Jyrki Alakuijala, providing some excellent hash functions available under a MIT license. 
- **vsearch** includes code derived from Tatusov and Lipman's DUST program that is in the public domain. - **vsearch** includes public domain code written by Alexander Peslyak for the MD5 message digest algorithm. - **vsearch** includes public domain code written by Steve Reid and others for the SHA1 message digest algorithm. - **vsearch** binaries may include code from the zlib library, copyright Jean-Loup Gailly and Mark Adler. - **vsearch** binaries may include code from the bzip2 library, copyright Julian R. Seward. vsearch-2.30.1/man/commands/fragments/option_abskew_1.md000066400000000000000000000006111506776541700232200ustar00rootroot00000000000000`--abskew` *real* : Set the minimum abundance skew ratio between a chimera and its potential parent. The assumption is that chimeras appear later in the PCR amplification process and are therefore less abundant than their parents. Any positive value equal or greater than 1.0 can be used. Default is 1.0, which means that the parents should be at least as abundant as the chimera. vsearch-2.30.1/man/commands/fragments/option_abskew_16.md000066400000000000000000000006251506776541700233130ustar00rootroot00000000000000`--abskew` *real* : Set the minimum abundance skew ratio between a chimera and its potential parent. The assumption is that chimeras appear later in the PCR amplification process and are therefore less abundant than their parents. Any positive value equal or greater than 1.0 can be used. Default is 16.0, which means that the parents should be at least 16 times more abundant than the chimera. vsearch-2.30.1/man/commands/fragments/option_abskew_2.md000066400000000000000000000006241506776541700232250ustar00rootroot00000000000000`--abskew` *real* : Set the minimum abundance skew ratio between a chimera and its potential parent. The assumption is that chimeras appear later in the PCR amplification process and are therefore less abundant than their parents.
Any positive value equal or greater than 1.0 can be used. Default is 2.0, which means that the parents should be at least 2 times more abundant than the chimera. vsearch-2.30.1/man/commands/fragments/option_alignwidth.md000066400000000000000000000003011506776541700236520ustar00rootroot00000000000000`--alignwidth` *positive integer* : Set maximal width of three-way alignments when writing alignments with `--alnout`. Default width is 60 nucleotides. Set to zero (0) to suppress folding. vsearch-2.30.1/man/commands/fragments/option_bzip2_decompress.md000066400000000000000000000003171506776541700250010ustar00rootroot00000000000000`--bzip2_decompress` : Specify that the *input pipe* is streaming data compressed using Huffman coding (see `bzip2(1)`). This option is not needed when reading from a regular file compressed with bzip2. vsearch-2.30.1/man/commands/fragments/option_chimeras.md000066400000000000000000000002101506776541700233120ustar00rootroot00000000000000`--chimeras` *filename* : Write chimeric sequences to *filename*, in fasta format. Output order may vary when using multiple threads. vsearch-2.30.1/man/commands/fragments/option_chimeras_diff_pct.md000066400000000000000000000003331506776541700251560ustar00rootroot00000000000000`--chimeras_diff_pct` *real* : Set the maximal mismatch percentage allowed in each chimeric region (excluding insertion and deletions). Accepted values range from 0.0 to 50.0%. Default is 0.0 (no mismatch allowed). vsearch-2.30.1/man/commands/fragments/option_chimeras_length_min.md000066400000000000000000000001651506776541700255270ustar00rootroot00000000000000`--chimeras_length_min` *positive non-null integer* : Set the minimum length of each chimeric region. Default is 10. vsearch-2.30.1/man/commands/fragments/option_chimeras_parents_max.md000066400000000000000000000002271506776541700257230ustar00rootroot00000000000000`--chimeras_parents_max` *positive non-null integer* : Set the maximum number of parent sequences. 
Accepted values range from 2 to 20. Default is 3. vsearch-2.30.1/man/commands/fragments/option_chimeras_parts.md000066400000000000000000000002041506776541700245260ustar00rootroot00000000000000`--chimeras_parts` *positive non-null integer* : Set the number of parts to divide sequences. Default is (sequence length / 100). vsearch-2.30.1/man/commands/fragments/option_dbmask.md000066400000000000000000000005151506776541700227700ustar00rootroot00000000000000`--dbmask` *none*|*dust*|*soft* : Mask regions in the database sequences using the *dust* method or the *soft* method, or *none* to suppress masking. See [`vsearch-mask(1)`](./vsearch-mask.1.md) for more details. Warning, when using *soft* masking, search commands become case sensitive. The default is to mask using *dust*. vsearch-2.30.1/man/commands/fragments/option_fasta_width.md000066400000000000000000000003421506776541700240220ustar00rootroot00000000000000`--fasta_width` *positive integer* : Set maximal width of sequences when writing fasta files. Longer sequences are folded and written on several lines. Default width is 80 nucleotides. Set to zero (0) to suppress folding. vsearch-2.30.1/man/commands/fragments/option_fastq_ascii.md000066400000000000000000000005531506776541700240170ustar00rootroot00000000000000`--fastq_ascii` 33|64 : Specify the *offset* used as the basis for the fastq quality score when reading fastq files. For example, an offset of 33 means that a quality value of 41 is represented by the 74th ASCII symbol (33 + 41 = 74), which is 'J'. See `ascii(7)` for a view of the ASCII character set. The offset value is either 33 or 64, default is 33. vsearch-2.30.1/man/commands/fragments/option_fastq_asciiout.md000066400000000000000000000005651506776541700245520ustar00rootroot00000000000000`--fastq_asciiout` 33|64 : Specify the *offset* used as the basis for the fastq quality score when writing fastq output files. 
For example, an offset of 33 means that a quality value of 41 is represented by the 74th ASCII symbol (33 + 41 = 74), which is 'J'. See `ascii(7)` for a view of the ASCII character set. The offset value is either 33 or 64, default is 33. vsearch-2.30.1/man/commands/fragments/option_fastq_qmax.md000066400000000000000000000006201506776541700236700ustar00rootroot00000000000000`--fastq_qmax` *positive integer* : Specify the maximal quality score accepted when reading fastq sequences. Stop with an error message if a quality score higher than the specified value is read. Accepted values range from 0 to 93 if the offset is 33 (see `--fastq_ascii`), or range from 0 to 62 if the offset is 64. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. vsearch-2.30.1/man/commands/fragments/option_fastq_qmax_ignored.md000066400000000000000000000001111506776541700253720ustar00rootroot00000000000000`--fastq_qmax` *positive integer* : Option is ignored and has no effect. vsearch-2.30.1/man/commands/fragments/option_fastq_qmin.md000066400000000000000000000006771506776541700237020ustar00rootroot00000000000000`--fastq_qmin` *positive integer* : Specify the minimal quality score accepted when reading fastq sequences. Stop with an error message if a quality score lower than the specified value is read. Accepted values range from 0 to 93 if the offset is 33 (see `--fastq_ascii`), or range from 0 to 62 if the offset is 64. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. vsearch-2.30.1/man/commands/fragments/option_fastq_qmin_ignored.md000066400000000000000000000001111506776541700253700ustar00rootroot00000000000000`--fastq_qmin` *positive integer* : Option is ignored and has no effect. vsearch-2.30.1/man/commands/fragments/option_gapext.md000066400000000000000000000005611506776541700230200ustar00rootroot00000000000000`--gapext` *string* : Set penalties for a gap extension. 
See [`vsearch-pairwise_alignment_parameters(7)`](./misc/vsearch-pairwise_alignment_parameters.7.md) for a complete description of the gap penalty declaration system. By default, the penalty is set to 2 for extending internal gaps and to 1 for extending terminal gaps, in both query and target sequences. vsearch-2.30.1/man/commands/fragments/option_gapopen.md000066400000000000000000000005551506776541700231640ustar00rootroot00000000000000`--gapopen` *string* : Set penalties for a gap opening. See [`vsearch-pairwise_alignment_parameters(7)`](./misc/vsearch-pairwise_alignment_parameters.7.md) for a complete description of the gap penalty declaration system. By default, the penalty is set to 20 for opening internal gaps and to 2 for opening terminal gaps, in both query and target sequences. vsearch-2.30.1/man/commands/fragments/option_gzip_decompress.md000066400000000000000000000003171506776541700247240ustar00rootroot00000000000000`--gzip_decompress` : Specify that the *input pipe* is streaming data compressed using Lempel-Ziv coding (see `gzip(1)`). This option is not needed when reading from a regular file compressed with gzip. vsearch-2.30.1/man/commands/fragments/option_hardmask.md000066400000000000000000000000631506776541700233170ustar00rootroot00000000000000`--hardmask` : Replace masked nucleotides with Ns. vsearch-2.30.1/man/commands/fragments/option_label_suffix.md000066400000000000000000000003351506776541700241720ustar00rootroot00000000000000`--label_suffix` *string* : Add the suffix *string* to sequence headers when writing fasta or fastq files. For example, with `--label_suffix ";status=healthy"`, sequence header '>seq1' becomes '>seq1;status=healthy'. vsearch-2.30.1/man/commands/fragments/option_lengthout.md000066400000000000000000000002021506776541700235310ustar00rootroot00000000000000`--lengthout` : Add a sequence length annotation (`;length=integer`) to each sequence header when writing fasta or fastq files. 
vsearch-2.30.1/man/commands/fragments/option_log.md000066400000000000000000000004731506776541700223130ustar00rootroot00000000000000`--log` *filename* : Write messages to *filename*. Messages include program version, start and finish times, elapsed time, amount of memory available, maximum amount of memory consumed, number of cores and command line options, and if need be, command-specific informational messages, warnings, and errors. vsearch-2.30.1/man/commands/fragments/option_match.md000066400000000000000000000002101506776541700226130ustar00rootroot00000000000000`--match` *integer* : Set the score assigned to a match (i.e. equivalent nucleotides) in pairwise alignments. The default value is 2. vsearch-2.30.1/man/commands/fragments/option_maxseqlength.md000066400000000000000000000001701506776541700242240ustar00rootroot00000000000000`--maxseqlength` *positive integer* : Discard sequences longer than *positive integer* (50,000 nucleotides by default). vsearch-2.30.1/man/commands/fragments/option_minseqlength_1.md000066400000000000000000000001631506776541700244440ustar00rootroot00000000000000`--minseqlength` *positive integer* : Discard sequences shorter than *positive integer* (1 nucleotide by default). vsearch-2.30.1/man/commands/fragments/option_minseqlength_32.md000066400000000000000000000001651506776541700245320ustar00rootroot00000000000000`--minseqlength` *positive integer* : Discard sequences shorter than *positive integer* (32 nucleotides by default). vsearch-2.30.1/man/commands/fragments/option_mismatch.md000066400000000000000000000002161506776541700233320ustar00rootroot00000000000000`--mismatch` *integer* : Set the score assigned to a mismatch (i.e. different nucleotides) in pairwise alignments. The default value is -4. 
vsearch-2.30.1/man/commands/fragments/option_no_progress.md000066400000000000000000000001771506776541700240730ustar00rootroot00000000000000`--no_progress` : Suppress the gradually increasing progress indicator normally written to the *standard error* `stderr(3)`. vsearch-2.30.1/man/commands/fragments/option_nonchimeras.md000066400000000000000000000002171506776541700240340ustar00rootroot00000000000000`--nonchimeras` *filename* : Write non-chimeric sequences to *filename*, in fasta format. Output order may vary when using multiple threads. vsearch-2.30.1/man/commands/fragments/option_notrunclabels.md000066400000000000000000000002671506776541700244060ustar00rootroot00000000000000`--notrunclabels` : Retain whole sequence headers in output files. By default, vsearch truncates sequence headers at first space or tabulation. This option suppresses truncation. vsearch-2.30.1/man/commands/fragments/option_output.md000066400000000000000000000000761506776541700230710ustar00rootroot00000000000000`--output` *filename* : Write to *filename*, in fasta format. vsearch-2.30.1/man/commands/fragments/option_qmask.md000066400000000000000000000005051506776541700226420ustar00rootroot00000000000000`--qmask` *none*|*dust*|*soft* : Mask regions in query sequences using the *dust* method or the *soft* method, or *none* to suppress masking. See [`vsearch-mask(1)`](./vsearch-mask.1.md) for more details. Warning, when using *soft* masking, search commands become case sensitive. The default is to mask using *dust*. vsearch-2.30.1/man/commands/fragments/option_quiet.md000066400000000000000000000002171506776541700226550ustar00rootroot00000000000000`--quiet` : Suppress messages to the *standard output* `stdout(3)` and *standard error* `stderr(3)`, except for warnings and error messages. vsearch-2.30.1/man/commands/fragments/option_randseed.md000066400000000000000000000004351506776541700233150ustar00rootroot00000000000000`--randseed` *integer* : Set the seed for the pseudo-random generator. 
A given seed always produces the same results, which is useful for replicability. By default, vsearch uses a pseudo-random seed if `--randseed` is not set, or if `--randseed` is set to the special value zero (0). vsearch-2.30.1/man/commands/fragments/option_relabel.md000066400000000000000000000006461506776541700231420ustar00rootroot00000000000000`--relabel` *string* : Replace sequence headers with the prefix *string* and a ticker (1, 2, 3, etc.). For example, with `--relabel "cluster:"`, the first sequence header becomes '>cluster:1', the second sequence header becomes '>cluster:2', and so on. To retain annotations, use their corresponding options (`--lengthout`, `--eeout`, and `--sizeout`). Use `--relabel_keep` to retain old sequence identifiers. vsearch-2.30.1/man/commands/fragments/option_relabel_keep.md000066400000000000000000000001651506776541700241420ustar00rootroot00000000000000`--relabel_keep` : Retain old sequence identifiers by including them at the end of the new headers, after a space. vsearch-2.30.1/man/commands/fragments/option_relabel_md5.md000066400000000000000000000012051506776541700236770ustar00rootroot00000000000000`--relabel_md5` : Replace each sequence header with the MD5 digest derived from the sequence itself. The sequence is converted to upper case, and each 'U' is replaced with a 'T' before computation of the digest. The MD5 digest is a 128-bit value, represented using a string of 32 ASCII characters. Each pair of characters encodes an hexadecimal value, ranging from `x00` to `xff`. See `md5(3)` for more details on MD5, and `--relabel_sha1` for an alternative algorithm. To retain annotations, use their corresponding options (`--lengthout`, `--eeout`, and `--sizeout`). Use `--relabel_keep` to retain old sequence identifiers. vsearch-2.30.1/man/commands/fragments/option_relabel_self.md000066400000000000000000000003561506776541700241510ustar00rootroot00000000000000`--relabel_self` : Replace each sequence header with the sequence itself. 
To retain annotations, use their corresponding options (`--lengthout`, `--eeout`, and `--sizeout`). Use `--relabel_keep` to retain old sequence identifiers. vsearch-2.30.1/man/commands/fragments/option_relabel_sha1.md000066400000000000000000000012111506776541700240430ustar00rootroot00000000000000`--relabel_sha1` : Replace each sequence header with the SHA1 digest derived from the sequence itself. The sequence is converted to upper case, and each 'U' is replaced with a 'T' before computation of the digest. The SHA1 digest is a 160-bit value, represented using a string of 40 ASCII characters. Each pair of characters encodes an hexadecimal value, ranging from `x00` to `xff`. See `sha1(3)` for more details on SHA1, and `--relabel_md5` for an alternative algorithm. To retain annotations, use their corresponding options (`--lengthout`, `--eeout`, and `--sizeout`). Use `--relabel_keep` to retain old sequence identifiers. vsearch-2.30.1/man/commands/fragments/option_sample.md000066400000000000000000000005611506776541700230110ustar00rootroot00000000000000`--sample` *string* : Add the given sample identifier *string* to sequence headers when writing fasta or fastq files. For instance, if *string* is 'ABC', the text `;sample=ABC` will be added to the headers. Note that *string* will be truncated at the first ';' or blank character. Other characters (alphabetical, numerical and punctuation) are accepted. vsearch-2.30.1/man/commands/fragments/option_sizein.md000066400000000000000000000003541506776541700230310ustar00rootroot00000000000000`--sizein` : Use the abundance annotations present in sequence headers when reading fasta or fastq file. Search for the pattern `[>@;]size=integer[;]`. Entries without abundance annotations are silently assumed to be of `size=1`.
vsearch-2.30.1/man/commands/fragments/option_sizein_implied.md000066400000000000000000000005531506776541700245350ustar00rootroot00000000000000`--sizein` : Use the abundance annotations present in sequence headers when reading fasta or fastq file. Search for the pattern `[>@;]size=integer[;]`. Entries without abundance annotations are silently assumed to be of `size=1`. With *de novo* clustering commands, option `--sizein` and `--sizeout` are always implied, and do not need to be specified. vsearch-2.30.1/man/commands/fragments/option_sizeout.md000066400000000000000000000004651506776541700232350ustar00rootroot00000000000000`--sizeout` : Add abundance annotations to sequence headers when writing fasta or fastq files. Add the pattern `;size=integer;`. If option `--sizein` is not used, abundance values are set to 1 for all entries. If `--sizein` is used, existing abundance annotations are simply reported to output files. vsearch-2.30.1/man/commands/fragments/option_threads_not_multithreaded.md000066400000000000000000000001361506776541700267530ustar00rootroot00000000000000`--threads` *positive non-null integer* : Command is not multithreaded, option has no effect. vsearch-2.30.1/man/commands/fragments/option_topn.md000066400000000000000000000003051506776541700225040ustar00rootroot00000000000000`--topn` *positive non-null integer* : Write only the *n* first entries. The parameter *n* must be greater than zero, and is silently ignored if it is greater than the total number of entries. vsearch-2.30.1/man/commands/fragments/option_wordlength.md000066400000000000000000000013201506776541700236770ustar00rootroot00000000000000`--wordlength` *positive integer* : Set the length of words (i.e. *k*-mers) used for sequence comparisons. The range of possible values goes from 3 to 15 (default value is 12). Longer words may reduce the sensitivity/recall for weak similarities, but can increase precision. 
On the other hand, shorter words may increase sensitivity or recall, but may reduce precision. Computation time generally increases with shorter words and decreases with longer words, but it increases again for very long words. Memory requirements for a part of the index increase by a factor of 4 each time word length increases by one nucleotide, and this generally becomes significant for words longer than 12. vsearch-2.30.1/man/commands/fragments/option_xee.md000066400000000000000000000005111506776541700223040ustar00rootroot00000000000000`--xee` : Strip expected error (ee) annotations from sequence headers when writing fasta or fastq files. Search for the pattern `[>@;]ee=float[;]`. Expected error annotations are added by the synonymous options `--fastq_eeout` and `--eeout` described in [`vsearch-fastx_filter(1)`](./commands/vsearch-fastx_filter.1.md). vsearch-2.30.1/man/commands/fragments/option_xlength.md000066400000000000000000000003431506776541700231770ustar00rootroot00000000000000`--xlength` : Strip sequence length annotations from sequence headers when writing fasta or fastq files. Search for the pattern `[>@;]length=integer[;]`. Sequence length annotations are added by the `--lengthout` option. vsearch-2.30.1/man/commands/fragments/option_xn.md000066400000000000000000000005701506776541700221550ustar00rootroot00000000000000`--xn` *strictly positive real number* : Set the weight of 'no' votes, corresponding to the parameter *beta* in the chimera scoring function. Default value is 8.0. Increasing `--xn` reduces the likelihood of tagging a sequence as a chimera (fewer false positives, but also more false negatives). Decreasing `--xn` reduces false negatives, but increases false positives. vsearch-2.30.1/man/commands/fragments/option_xsize.md000066400000000000000000000003211506776541700226640ustar00rootroot00000000000000`--xsize` : Strip abundance annotations from sequence headers when writing fasta or fastq files. Search for the pattern `[>@;]size=integer[;]`.
Abundance annotations are added by the `--sizeout` option. vsearch-2.30.1/man/commands/vsearch-chimeras_denovo.1.md000066400000000000000000000137261506776541700231160ustar00rootroot00000000000000% vsearch-chimeras_denovo(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-chimeras_denovo --- detect chimeras *de novo* in long exact sequences # SYNOPSIS | **vsearch** **\-\-chimeras_denovo** _inputfile_ (\-\-chimeras | \-\-nonchimeras | \-\-alnout | \-\-tabbedout) _outputfile_ \[_options_] # DESCRIPTION The vsearch command `--chimeras_denovo` detects chimeras *de novo* (i.e. without external references) in long exact sequences (*inputfile*, in fasta or fastq format). It uses a modified uchime algorithm that can automatically adapt to a wide range of sequence lengths. Abundance annotations (pattern '[>@;]size=integer[;]') present in sequence headers are taken into account by default. This means that option `--sizein` is always implied, and does not need to be specified. Sequences are sorted into chimeras and non-chimeras, and can be written to fasta files (see output options `--chimeras`, `--nonchimeras`). Additional information on each chimera can be collected with the output options `--tabbedout` and `--alnout`. The latter outputs for each chimera (i.e. 'Query') a multi-way alignment and a model showing the most likely parent sequence of each section of the chimera. Here is an example of a short chimeric sequence (Q), with two parents (A and B): ```text ------------------------------------------------------------------------ Query ( 20 nt) Q ParentA ( 20 nt) A ParentB ( 20 nt) B Q 1 GTAGGCCGTGCTGAGCCGTA 20 A 1 GTAGGCCGTGgTagGCCGTg 20 B 1 cTgaGCCGTaCTGAGCCGTA 20 Diffs A AA AB BB B Model AAAAAAAAAABBBBBBBBBB Ids. QA 80.00%, QB 80.00%, QC 0.00%, QT 80.00%, QModel 100.00%, Div. +25.00% ``` Lowercase positions indicate a mismatch between the parent and the query.
The line Diffs indicates the positions that favor a particular parent when modeling the chimera. The line Ids gives global similarity percentages with the different parents (QA, QB, and QC), the closest parent (QT), the model (QModel, always 100.00%), and the divergence of the model with the closest parent (Div). If there are only two parents (A and B), QC is set to 0.00%. If there are more than three parents, only QA, QB and QC are reported. # OPTIONS ## mandatory options At least one of `--alnout`, `--chimeras`, `--nonchimeras`, and `--tabbedout` must be specified. `--alnout` *filename* : Write multi-way alignments to *filename* using a human-readable format (see example above). Use `--alignwidth` to set the alignment width (60 nucleotides by default). #(./fragments/option_chimeras.md) #(./fragments/option_nonchimeras.md) `--tabbedout` *filename* : Write the results to an eighteen-column tab-delimited file with the specified *filename*. Columns are: 1. score: dummy value, always set to 99.9999 2. query header 3. parent A header 4. parent B header 5. parent C header ("*" if there are only two parents) 6. QModel: max global similarity percentage (always 100.0%) 7. QA: global similarity percentage with parent A 8. QB: global similarity percentage with parent B 9. QC: global similarity percentage with parent C (0.00 if there are only two parents) 10. QT: highest similarity percentage with a parent 11. left yes: ignored, always set to zero 12. left no: ignored, always set to zero 13. left abstain: ignored, always set to zero 14. right yes: ignored, always set to zero 15. right no: ignored, always set to zero 16. right abstain: ignored, always set to zero 17. dummy value, always set to 0.00 18.
chimeric status, always set to Y (only chimeras are reported) ## core options #(./fragments/option_abskew_1.md) #(./fragments/option_chimeras_diff_pct.md) #(./fragments/option_chimeras_length_min.md) #(./fragments/option_chimeras_parents_max.md) #(./fragments/option_chimeras_parts.md) #(./fragments/option_sizein_implied.md) #(./fragments/option_xn.md) ## secondary options #(./fragments/option_alignwidth.md) #(./fragments/option_fasta_width.md) #(./fragments/option_hardmask.md) #(./fragments/option_label_suffix.md) #(./fragments/option_lengthout.md) #(./fragments/option_log.md) #(./fragments/option_maxseqlength.md) #(./fragments/option_minseqlength_32.md) #(./fragments/option_no_progress.md) #(./fragments/option_notrunclabels.md) #(./fragments/option_qmask.md) #(./fragments/option_quiet.md) #(./fragments/option_relabel.md) #(./fragments/option_relabel_keep.md) #(./fragments/option_relabel_md5.md) #(./fragments/option_relabel_self.md) #(./fragments/option_relabel_sha1.md) #(./fragments/option_sample.md) #(./fragments/option_sizeout.md) #(./fragments/option_xee.md) #(./fragments/option_xlength.md) #(./fragments/option_xsize.md) ## pairwise alignment options These options modify the parameters of the pairwise alignment model. Modify with caution. 
#(./fragments/option_gapext.md) #(./fragments/option_gapopen.md) #(./fragments/option_match.md) #(./fragments/option_mismatch.md) ## ignored options #(./fragments/option_threads_not_multithreaded.md) ## unsupported options The following options are not yet supported by the `--chimeras_denovo` command: #(./fragments/option_bzip2_decompress.md) #(./fragments/option_gzip_decompress.md) # EXAMPLES A simple way to filter out chimeras: ```sh vsearch \ --chimeras_denovo input.fasta \ --quiet \ --nonchimeras clean.fasta ``` Add option `--tabbedout` to log the name of the sequences identified as chimeras, and option `--log` to record run parameters: ```sh vsearch \ --chimeras_denovo input.fasta \ --quiet \ --nonchimeras clean.fasta \ --tabbedout chimeras.tsv \ --log chimera_filtering.log ``` # SEE ALSO [`vsearch-uchime_denovo(1)`](./commands/vsearch-uchime_denovo.1.md), [`vsearch-uchime2_denovo(1)`](../formats/vsearch-uchime2_denovo.1.md), [`vsearch-uchime3_denovo(1)`](../formats/vsearch-uchime3_denovo.1.md), [`vsearch-uchime_ref(1)`](../formats/vsearch-uchime_ref.1.md) #(./fragments/footer.md) vsearch-2.30.1/man/commands/vsearch-cut.1.md000066400000000000000000000142111506776541700205320ustar00rootroot00000000000000% vsearch-cut(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-cut --- use a restriction pattern to cut fasta sequences # SYNOPSIS | **vsearch** **\-\-cut** *fastafile* \-\-cut_pattern *pattern* (\-\-fastaout | \-\-fastaout_rev | \-\-fastaout_discarded | \-\-fastaout_discarded_rev) *outputfile* \[*options*] # DESCRIPTION The vsearch command `--cut` uses a restriction pattern to cut input fasta sequences. Input sequences are cut into fragments at **each** restriction site matching the pattern given with the option `--cut_pattern`. Restriction patterns are only searched on the forward (or normal) strand, not on the reverse strand. 
Fragments on the forward strand are written to the file specified with the `--fastaout` option, and the reverse-complements of fragments are written to the file specified with the `--fastaout_rev` option. Fragments receive the name of their parent sequence. Input sequences with no match are written to the file specified with the option `--fastaout_discarded`, and their reverse-complements are also written to the file specified with the `--fastaout_discarded_rev` option. A typical restriction pattern is "`G^AATT_C`", representing the EcoRI restriction site. The nucleotide symbols represent the sequence to be matched. Lowercase or uppercase nucleotides, as well as ambiguous nucleotides (IUPAC) are accepted. The special character '^' (circumflex) indicates the cutting position on the forward strand, while '`_`' (underscore) indicates the cutting position on the reverse strand. Forward and reverse cutting positions can be the same (for example "`GG^_AA`"), but exactly one cutting position on each strand must be indicated (one '^' and one '`_`'). As noted above, restriction patterns are only searched on the forward (or normal) strand, not on the reverse strand. For palindromic patterns such as EcoRI, this is not an issue. A palindromic pattern is identical to its reverse-complement (same sequence and same cutting positions), so there are no copies on the reverse strand that do not have a counterpart on the forward strand. ``` forward 5' A...G^AATT_C...G 3' reverse 3' T...C_TTAA^G...C 5' fragments: A...G, AATTC...G, but also C...G and AATTC...T ``` All copies are detected. For asymmetrical or non-palindromic patterns, if the pattern appears on the reverse strand, it will not be detected.
For example, the pattern "`GG^_C`" is detected when present on the forward strand: ``` forward 5' A...GG^C...G 3' reverse 3' T...CC G...C 5' fragments: A...GG, C...G ``` but is not detected when present on the reverse strand: ``` forward 5' A...G CC...G 3' reverse 3' T...C_GG...C 5' <- present but not detected ``` To detect asymmetrical or non-palindromic patterns on the reverse strand, it is necessary to run the `--cut` command on the reverse-complemented input sequences (see [`vsearch-fastx_revcomp(1)`](./vsearch-fastx_revcomp.1.md)). Finally, pattern occurrences can overlap. For example, the pattern "`G^_G`" will be detected *three* times in the sequence "`GGGG`". # OPTIONS ## mandatory options `--cut_pattern` *pattern* : Specify the restriction site pattern (case insensitive IUPAC characters) and cutting positions ('^' and '_'). For example, "`G^AATT_C`". See the PATTERN EXAMPLES section for more details. At least one of `--fastaout`, `--fastaout_rev`, `--fastaout_discarded`, or `--fastaout_discarded_rev` must also be specified. ## core options `--fastaout` *filename* : Write the forward strand fragments to *filename*, in fasta format. `--fastaout_rev` *filename* : Write the reverse strand fragments to *filename*, in fasta format. `--fastaout_discarded` *filename* : Write the non-matching sequences to *filename*, in fasta format. `--fastaout_discarded_rev` *filename* : Write the reverse-complemented non-matching sequences to *filename*, in fasta format.
## secondary options #(./fragments/option_bzip2_decompress.md) #(./fragments/option_fasta_width.md) #(./fragments/option_gzip_decompress.md) #(./fragments/option_label_suffix.md) #(./fragments/option_lengthout.md) #(./fragments/option_log.md) #(./fragments/option_no_progress.md) #(./fragments/option_notrunclabels.md) #(./fragments/option_quiet.md) #(./fragments/option_relabel.md) #(./fragments/option_relabel_keep.md) #(./fragments/option_relabel_md5.md) #(./fragments/option_relabel_self.md) #(./fragments/option_relabel_sha1.md) #(./fragments/option_sample.md) #(./fragments/option_sizein.md) #(./fragments/option_sizeout.md) #(./fragments/option_xee.md) #(./fragments/option_xlength.md) #(./fragments/option_xsize.md) ## ignored options #(./fragments/option_threads_not_multithreaded.md) # EXAMPLES Cut the sequences in *query.fasta*, using the restriction pattern "`G^AATT_C`" (`--cut_pattern`). Write the fragments found on the reverse strand to *query_fragments.fasta*, in fasta format (`--fastaout_rev`): ```sh vsearch \ --cut query.fasta \ --cut_pattern "G^AATT_C" \ --fastaout_rev query_fragments.fasta ``` # PATTERN EXAMPLES (the symbol '|' is used to represent a cut on either strand) `EcoRI` : use the palindromic pattern "`G^AATT_C`" to represent the following restriction: ``` 5' G|AATT-C 3' 3' C-TTAA|G 5' ``` `EcoRII` : use the palindromic pattern "`^CCWGG_`" to represent the following restriction: ``` 5' |CCWGG- 3' 3' -GGWCC| 5' ``` `EcoRV` : use the palindromic pattern "`GAT^_ATC`" to represent the following restriction: ``` 5' GAT|ATC 3' 3' CTA|TAG 5' ``` `Fok1` : use the non-palindromic pattern "`N_NNNNNNNNNNNNGGATGNNNNNNNN^N`" to represent the following restriction, with a cut 9 nucleotides downstream of the motif on the forward strand, and 13 nucleotides downstream of the motif on the reverse strand: ``` 5' -NNNNNNNNNNNNGGATGNNNNNNNN| 3' 3' |NNNNNNNNNNNNCCTACNNNNNNNN- 5' ``` `HindIII` : use the palindromic pattern "`A^AGCT_T`" to represent the following 
restriction: ``` 5' A|AGCT-T 3' 3' T-TCGA|A 5' ``` `NspI` : use the asymmetric pattern "`RCATG^_Y`" to represent the following restriction: ``` 5' RCATG|Y 3' 3' YGTAC-R 5' ``` # SEE ALSO [`vsearch-fastx_revcomp(1)`](./vsearch-fastx_revcomp.1.md) #(./fragments/footer.md) vsearch-2.30.1/man/commands/vsearch-fasta2fastq.1.md000066400000000000000000000062141506776541700221620ustar00rootroot00000000000000% vsearch-fasta2fastq(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-fasta2fastq --- convert a fasta file to a fastq file with fake quality scores # SYNOPSIS | **vsearch** **\-\-fasta2fastq** _fastafile_ \-\-fastqout _fastqfile_ \[_options_] # DESCRIPTION The vsearch command `--fasta2fastq` converts fasta sequences into fastq sequences by adding fake quality scores. See [`vsearch-fasta(5)`](../formats/vsearch-fasta.5.md) and [`vsearch-fastq(5)`](../formats/vsearch-fastq.5.md) for more information on these formats. Sequences are written to the file specified with `--fastqout`. The quality score can be adjusted with the `--fastq_qmaxout` option (default is 41). The quality offset can be adjusted with the `--fastq_asciiout` option (either 33 or 64, default is 33). # OPTIONS ## mandatory options `--fastqout` *filename* : Write sequences to *filename* in fastq format, with fake quality scores. The default quality value (41) and offset (33) can be changed with options `--fastq_qmaxout` and `--fastq_asciiout`. ## core options #(./fragments/option_fastq_asciiout.md) `--fastq_qmaxout` *positive integer* : Specify the fake quality score used when writing the fastq file. The default is 41, which is the usual maximal quality score for recent Sanger/Illumina 1.8+ files (maximal quality score was 40 in older formats). Accepted values range from 0 to 93 when the quality offset is set to 33 (see `--fastq_asciiout`), and from 0 to 62 when the quality offset is set to 64.
## secondary options #(./fragments/option_bzip2_decompress.md) #(./fragments/option_gzip_decompress.md) #(./fragments/option_label_suffix.md) #(./fragments/option_lengthout.md) #(./fragments/option_log.md) #(./fragments/option_no_progress.md) #(./fragments/option_quiet.md) #(./fragments/option_relabel.md) #(./fragments/option_relabel_keep.md) #(./fragments/option_relabel_md5.md) #(./fragments/option_relabel_self.md) #(./fragments/option_relabel_sha1.md) #(./fragments/option_sample.md) #(./fragments/option_sizein.md) #(./fragments/option_sizeout.md) #(./fragments/option_xee.md) #(./fragments/option_xlength.md) #(./fragments/option_xsize.md) ## ignored options #(./fragments/option_threads_not_multithreaded.md) # EXAMPLES Convert sequences in *input.fasta* into fastq sequences, with fake quality values (Q = 40), and a quality offset of 64. Add sequence length annotations (`--lengthout`). Write the results to *output.fastq*: ```sh vsearch \ --fasta2fastq input.fasta \ --fastq_qmaxout 40 \ --fastq_asciiout 64 \ --lengthout \ --fastqout output.fastq ``` For instance, the following fasta input: ```text >s1 CACCGCGGTTATACGAGGGGCTCAAATTGATATT >s2 CACCGCGGTTATACGAGGGGCTCAAATTGATATT AATATCAATTTGAGCCCCTCGTATAACCGCGGTG ``` becomes: ```text @s1;length=34 CACCGCGGTTATACGAGGGGCTCAAATTGATATT + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh @s2;length=68 CACCGCGGTTATACGAGGGGCTCAAATTGATATTAATATCAATTTGAGCCCCTCGTATAACCGCGGTG + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh ``` (note that fastq output files are not wrapped) #(./fragments/footer.md) vsearch-2.30.1/man/commands/vsearch-fastq_chars.1.md000066400000000000000000000076601506776541700222470ustar00rootroot00000000000000% vsearch-fastq_chars(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-fastq_chars --- analyze fastq files to identify the quality encoding and the range of quality score values used # SYNOPSIS | **vsearch** 
**\-\-fastq_chars** _fastqfile_ \[_options_] # DESCRIPTION The vsearch command `--fastq_chars` summarizes the number and composition of sequences and quality strings contained in the input fastq file. Results are written to the *standard error* `stderr(3)`, and to *filename* if option `--log` *filename* is used. The command `--fastq_chars` tries to automatically detect the quality offset (33 or 64) and the fastq format (Solexa, Illumina 1.3+, Illumina 1.5+ or Illumina 1.8+/Sanger) by analyzing the range of observed quality score values. In case of success, `--fastq_chars` suggests values for the quality offset `--fastq_ascii` (33 or 64), as well as `--fastq_qmin` and `--fastq_qmax` values that could be used with other commands. If the quality encoding is ambiguous, an offset of 33 is favored. For example: ```text Qmin 45, QMax 73, Range 29 Guess: -fastq_qmin 12 -fastq_qmax 40 -fastq_ascii 33 Guess: Original Sanger format (phred+33) ``` For each sequence symbol, `--fastq_chars` gives the number of occurrences of the symbol, its relative frequency, and the length of the longest run of that symbol (lowercase symbols are converted to uppercase). For example: ```text Letter N Freq MaxRun ------ ---------- ------ ------ A 9050221 25.8% 19 C 5657659 16.1% 9 G 8731373 24.9% 11 T 11610889 33.1% 24 ``` For each quality symbol, `--fastq_chars` gives the ASCII value of the symbol (see `ascii(7)`), its relative frequency, and the number of times a *k*-mer of that symbol appears at the end of quality strings. The length of the *k*-mer can be set with the option `--fastq_tail` (4 by default). For example: ```text Char ASCII Freq Tails ---- ----- ------ ---------- '-' 45 0.7% 8 '.' 46 1.7% 9 '/' 47 2.8% 2997 '0' 48 2.4% 221 '1' 49 0.9% 0 '2' 50 0.4% 0 '3' 51 0.4% 0 '4' 52 0.2% 0 '5' 53 0.3% 0 '6' 54 0.0% 0 '9' 57 1.3% 12 ':' 58 0.7% 0 ';' 59 1.3% 1 '<' 60 0.6% 0 '=' 61 0.2% 0 '>' 62 0.3% 0 '?' 
63 0.6% 0 '@' 64 0.3% 0 'A' 65 1.9% 0 'B' 66 3.9% 91 'C' 67 2.2% 0 'D' 68 1.4% 0 'E' 69 2.9% 0 'F' 70 14.7% 24657 'G' 71 23.5% 5890 'H' 72 34.5% 9 'I' 73 0.0% 0 ``` # OPTIONS ## core options `--fastq_tail` *positive non-null integer* : Count the number of times a series of identical symbols of length *k* = *positive non-null integer*, a *k*-mer, appears at the end of quality strings. By default, *k* = 4. ## secondary options #(./fragments/option_bzip2_decompress.md) #(./fragments/option_gzip_decompress.md) #(./fragments/option_log.md) : A copy of the statistics computed by `--fastq_chars` is also recorded. #(./fragments/option_no_progress.md) #(./fragments/option_quiet.md) ## ignored options #(./fragments/option_threads_not_multithreaded.md) # EXAMPLES Read from *input.fastq.gz*, count series of 5 identical symbols at the end of quality strings (`--fastq_tail 5`), do not write to the standard error (`--quiet`), and write results to *output.log* (`--log`): ```sh vsearch \ --fastq_chars input.fastq.gz \ --fastq_tail 5 \ --quiet \ --log output.log ``` # SEE ALSO [`vsearch-fastq_stats(1)`](./commands/vsearch-fastq_stats.1.md), [`vsearch-fastq(5)`](../formats/vsearch-fastq.5.md) #(./fragments/footer.md) vsearch-2.30.1/man/commands/vsearch-fastq_stats.1.md000066400000000000000000000221701506776541700222760ustar00rootroot00000000000000% vsearch-fastq_stats(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-fastq_stats --- analyze fastq sequences and output detailed statistics # SYNOPSIS | **vsearch** **\-\-fastq_stats** _fastqfile_ \-\-log _outputfile_ \[_options_] # DESCRIPTION The vsearch command `--fastq_stats` analyzes fastq sequences and outputs detailed statistics, including projections of the impact different filtering thresholds would have. Results are written to `--log` *outputfile*.
The quality encoding of the input fastq and the range of accepted quality values can be specified with `--fastq_ascii`, `--fastq_qmin` and `--fastq_qmax`. The five different statistics tables reported by `--fastq_stats` are described below, with an example for each. ## Read length distribution Observed read lengths are sorted in decreasing order. The largest length value is marked with '>='. 1. L: read length 2. N: number of reads 3. Pct: fraction of reads with this length 4. AccPct: fraction of reads with this length or longer ```text Read length distribution L N Pct AccPct ------- ---------- ------- ------- >= 251 19795 69.2% 69.2% 250 6847 23.9% 93.2% 249 1626 5.7% 98.9% 248 260 0.9% 99.8% 247 24 0.1% 99.8% [...] 75 5 0.0% 100.0% 74 2 0.0% 100.0% 66 2 0.0% 100.0% 65 4 0.0% 100.0% 64 1 0.0% 100.0% ``` ## Quality score distribution Observed quality values are sorted in decreasing order. 1. ASCII: character encoding the quality score 2. Q: Phred quality score 3. Pe: probability of error associated with the quality score 4. N: number of bases with this quality score 5. Pct: fraction of bases with this quality score 6. AccPct: fraction of bases with this quality score or higher ```text Q score distribution ASCII Q Pe N Pct AccPct ----- --- ------- ---------- ------- ------- I 40 0.00010 2 0.0% 0.0% H 39 0.00013 1009552 14.1% 14.1% G 38 0.00016 1236861 17.3% 31.4% F 37 0.00020 1231882 17.2% 48.6% E 36 0.00025 330094 4.6% 53.2% D 35 0.00032 143134 2.0% 55.2% C 34 0.00040 250511 3.5% 58.7% B 33 0.00050 451696 6.3% 65.0% A 32 0.00063 273153 3.8% 68.8% @ 31 0.00079 171356 2.4% 71.2% ? 
30 0.00100 131957 1.8% 73.0% > 29 0.00126 59308 0.8% 73.9% = 28 0.00158 25174 0.4% 74.2% < 27 0.00200 44282 0.6% 74.8% ; 26 0.00251 204896 2.9% 77.7% : 25 0.00316 49600 0.7% 78.4% 9 24 0.00398 251427 3.5% 81.9% 6 21 0.00794 32 0.0% 81.9% 5 20 0.01000 11170 0.2% 82.0% 4 19 0.01259 9188 0.1% 82.2% 3 18 0.01585 34528 0.5% 82.7% 2 17 0.01995 25452 0.4% 83.0% 1 16 0.02512 100936 1.4% 84.4% 0 15 0.03162 96675 1.3% 85.8% / 14 0.03981 387620 5.4% 91.2% . 13 0.05012 162798 2.3% 93.5% - 12 0.06310 468864 6.5% 100.0% ``` ## Length vs. quality distribution Positions in reads are sorted in increasing order. 1. L: position in reads (starting from position 2) 2. PctRecs: fraction of reads with at least this length 3. AvgQ: average quality at this position 4. P(AvgQ): error probability corresponding to AvgQ 5. AvgP: average error probability at this position 6. AvgEE: average expected error over all reads up to this position 7. Rate: growth rate of AvgEE between the current and previous position 8. RatePct: Rate (as explained above) expressed as a percentage ```text L PctRecs AvgQ P(AvgQ) AvgP AvgEE Rate RatePct ----- ------- ---- ------- -------- ----- --------- -------- 2 100.0% 28.2 0.00153 0.005655 0.01 0.005053 0.505% 3 100.0% 30.1 0.00097 0.002893 0.01 0.004333 0.433% 4 100.0% 27.7 0.00171 0.005935 0.02 0.004734 0.473% 5 100.0% 28.8 0.00131 0.004416 0.02 0.004670 0.467% 6 100.0% 29.5 0.00112 0.005633 0.03 0.004830 0.483% [...] 247 99.8% 24.4 0.00360 0.019798 2.14 0.008668 0.867% 248 99.8% 24.0 0.00394 0.020644 2.16 0.008711 0.871% 249 98.9% 23.9 0.00407 0.020943 2.17 0.008716 0.872% 250 93.2% 23.8 0.00417 0.021239 2.22 0.008877 0.888% 251 69.2% 22.3 0.00589 0.023948 2.27 0.009048 0.905% ``` ## Effect of expected error and length filtering Positions in reads are sorted in decreasing order, starting with the first read length (L) with a cumulative expected error smaller than or equal to 1.0.
The next four columns indicate the number of reads that would be retained by the command [`vsearch-fastq_filter(1)`](./commands/vsearch-fastq_filter.1.md) if the reads were truncated at length L with the option `--fastq_trunclen`, and filtered to have a maximum expected error equal to or lesser than 1.0, 0.5, 0.25 or 0.1 with the option `--fastq_maxee`. The last four columns indicate the fraction of reads that would be retained by the command [`vsearch-fastq_filter(1)`](./commands/vsearch-fastq_filter.1.md) using the same length L and maximum expected error parameters. 1. L: read length 2. number of reads if truncating at L and filtering at EE =< 1.0 3. number of reads if truncating at L and filtering at EE =< 0.5 4. number of reads if truncating at L and filtering at EE =< 0.25 5. number of reads if truncating at L and filtering at EE =< 0.1 6. fraction of reads if truncating at L and filtering at EE =< 1.0 7. fraction of reads if truncating at L and filtering at EE =< 0.5 8. fraction of reads if truncating at L and filtering at EE =< 0.25 9. fraction of reads if truncating at L and filtering at EE =< 0.1 ```text L 1.0000 0.5000 0.2500 0.1000 1.0000 0.5000 0.2500 0.1000 ----- ------- ------- ------- ------- ------- ------- ------- ------- 251 6250 3022 1305 354 21.86% 10.57% 4.56% 1.24% 250 8894 4467 2080 637 31.10% 15.62% 7.27% 2.23% 249 9700 4930 2291 693 33.92% 17.24% 8.01% 2.42% 248 9854 5046 2348 712 34.46% 17.65% 8.21% 2.49% 247 9962 5127 2390 727 34.84% 17.93% 8.36% 2.54% [...] 
5 28595 28595 28595 28157 100.00% 100.00% 100.00% 98.47% 4 28595 28595 28595 28461 100.00% 100.00% 100.00% 99.53% 3 28595 28595 28595 28595 100.00% 100.00% 100.00% 100.00% 2 28595 28595 28595 28595 100.00% 100.00% 100.00% 100.00% 1 28595 28595 28595 28595 100.00% 100.00% 100.00% 100.00% ``` ## Effect of minimum quality and length filtering Positions in reads are sorted in decreasing order, starting with the largest read length (Len), and limited to the top *n* lengths, where *n* = floor(largest read length / 2) + 1. The next four columns indicate the fraction of reads that would be retained by the command [`vsearch-fastq_filter(1)`](./commands/vsearch-fastq_filter.1.md) if the reads were truncated at length Len with the option `--fastq_trunclen`, or at the first position with a quality Q equal to or lesser than 5, 10, 15 or 20 with the option `--fastq_truncqual`. 1. Len: read length 2. fraction of reads if truncating at Len or at first Q =< 5 3. fraction of reads if truncating at Len or at first Q =< 10 4. fraction of reads if truncating at Len or at first Q =< 15 5. fraction of reads if truncating at Len or at first Q =< 20 ```text Truncate at first Q Len Q=5 Q=10 Q=15 Q=20 ----- ------ ------ ------ ------ 251 69.2% 69.2% 0.8% 0.6% 250 93.2% 93.2% 1.6% 1.1% 249 98.9% 98.9% 1.7% 1.2% 248 99.8% 99.8% 1.8% 1.2% 247 99.8% 99.8% 1.8% 1.2% [...] 
129 99.9% 99.9% 30.3% 10.3% 128 99.9% 99.9% 30.7% 10.4% 127 99.9% 99.9% 31.1% 10.5% 126 99.9% 99.9% 31.4% 10.5% 125 99.9% 99.9% 32.1% 10.7% ``` # OPTIONS ## mandatory options #(./fragments/option_log.md) ## core options #(./fragments/option_fastq_ascii.md) #(./fragments/option_fastq_qmax.md) #(./fragments/option_fastq_qmin.md) ## secondary options #(./fragments/option_bzip2_decompress.md) #(./fragments/option_gzip_decompress.md) #(./fragments/option_no_progress.md) #(./fragments/option_quiet.md) ## ignored options #(./fragments/option_threads_not_multithreaded.md) # EXAMPLES Read from *input.fastq.gz*, do not write to the standard error (`--quiet`), and write results to *output.log* (`--log`): ```sh vsearch \ --fastq_stats input.fastq.gz \ --quiet \ --log output.log ``` # SEE ALSO [`vsearch-fastq_chars(1)`](./commands/vsearch-fastq_chars.1.md), [`vsearch-fastq(5)`](../formats/vsearch-fastq.5.md) #(./fragments/footer.md) vsearch-2.30.1/man/commands/vsearch-orient.1.md000066400000000000000000000067711506776541700212530ustar00rootroot00000000000000% vsearch-orient(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-orient --- use a reference database to orient fasta or fastq sequences # SYNOPSIS | **vsearch** **\-\-orient** _fastxfile_ \-\-db (_fastxfile_ | _udbfile_) (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-tabbedout) _outputfile_ \[_options_] # DESCRIPTION The vsearch command `--orient` detects if input sequences are in the same orientation as sequences in the reference database specified with the `--db` option. Detection is based on the number of common *words* (*k*-mers) of length 12. Each reference sequence is decomposed into a set of words. The forward (+) and reverse-complement (-) strands of each input sequence are also decomposed into sets of words. If a strand shares at least 4 times as many words with a reference sequence as the other strand, then it is chosen.
Otherwise, the input sequence is marked as undetermined (?). # OPTIONS ## mandatory options `--db` *filename* : Read the reference database from the given fasta, fastq, or UDB *filename*. Only UDB files created with a `--wordlength` of 12 are accepted (see [`vsearch-makeudb_usearch(1)`](./vsearch-makeudb_usearch.1.md) for more details). At least one of `--fastaout`, `--fastqout`, `--notmatched`, `--tabbedout` must also be specified. ## core options `--fastaout` *filename* : Write the correctly oriented sequences to *filename*, in fasta format. `--fastqout` *filename* : Write the correctly oriented sequences to *filename*, in fastq format (requires a fastq input). `--notmatched` *filename* : Write the sequences with undetermined direction to *filename*, in their original format. `--tabbedout` *filename* : Write the results to a four-column tab-delimited file with the specified *filename*. Columns are: 1. query label 2. direction (+, -, or ?) 3. number of matching words on the forward strand 4.
number of matching words on the reverse complementary strand ## secondary options #(./fragments/option_bzip2_decompress.md) #(./fragments/option_fasta_width.md) #(./fragments/option_dbmask.md) #(./fragments/option_gzip_decompress.md) #(./fragments/option_label_suffix.md) #(./fragments/option_lengthout.md) #(./fragments/option_log.md) #(./fragments/option_no_progress.md) #(./fragments/option_notrunclabels.md) #(./fragments/option_qmask.md) #(./fragments/option_quiet.md) #(./fragments/option_relabel.md) #(./fragments/option_relabel_keep.md) #(./fragments/option_relabel_md5.md) #(./fragments/option_relabel_self.md) #(./fragments/option_relabel_sha1.md) #(./fragments/option_sample.md) #(./fragments/option_sizein.md) #(./fragments/option_sizeout.md) #(./fragments/option_xee.md) #(./fragments/option_xlength.md) #(./fragments/option_xsize.md) ## ignored options #(./fragments/option_threads_not_multithreaded.md) # EXAMPLES Use the sequences in *db.fastq* to orient the sequences in *query.fastq*. Write the results to *query_oriented.fasta*, in fasta format: ```sh vsearch \ --orient query.fastq \ --db db.fastq \ --fastaout query_oriented.fasta ``` Search for mis-oriented sequences in a set of reference sequences (*db.fasta*) by comparing it to itself.
Write the results to the table *db_outliers.table*: ```sh vsearch \ --orient db.fasta \ --db db.fasta \ --tabbedout db_outliers.table ``` # SEE ALSO [`vsearch-makeudb_usearch(1)`](./vsearch-makeudb_usearch.1.md), [`vsearch-udb(5)`](./vsearch-udb.5.md) #(./fragments/footer.md) vsearch-2.30.1/man/commands/vsearch-shuffle.1.md000066400000000000000000000050411506776541700213740ustar00rootroot00000000000000% vsearch-shuffle(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./fragments/date.md) # NAME vsearch \-\-shuffle --- randomize the order of fasta or fastq entries # SYNOPSIS | **vsearch** **\-\-shuffle** _fastxfile_ \-\-output _filename_ \[_options_] # DESCRIPTION The vsearch command `--shuffle` (pseudo-)randomizes the order of fasta or fastq entries (*fastxfile*), and writes to `--output` *filename*, in fasta format. To illustrate: ```text >s1 >s3 AAAA TTTT >s2 --- shuffle --> >s1 CCCC AAAA >s3 >s2 TTTT CCCC ``` The number of entries written to *filename* can be limited with the option `--topn`. For reproducibility, the seed for the pseudo-random generator can be set with the option `--randseed`. 
# OPTIONS ## mandatory options #(./fragments/option_output.md) ## core options #(./fragments/option_randseed.md) #(./fragments/option_topn.md) ## secondary options #(./fragments/option_bzip2_decompress.md) #(./fragments/option_fasta_width.md) #(./fragments/option_fastq_ascii.md) #(./fragments/option_gzip_decompress.md) #(./fragments/option_label_suffix.md) #(./fragments/option_lengthout.md) #(./fragments/option_log.md) #(./fragments/option_maxseqlength.md) #(./fragments/option_minseqlength_1.md) #(./fragments/option_no_progress.md) #(./fragments/option_notrunclabels.md) #(./fragments/option_quiet.md) #(./fragments/option_relabel.md) #(./fragments/option_relabel_keep.md) #(./fragments/option_relabel_md5.md) #(./fragments/option_relabel_self.md) #(./fragments/option_relabel_sha1.md) #(./fragments/option_sample.md) #(./fragments/option_sizein.md) #(./fragments/option_sizeout.md) #(./fragments/option_xee.md) #(./fragments/option_xlength.md) #(./fragments/option_xsize.md) ## ignored options #(./fragments/option_fastq_qmax_ignored.md) #(./fragments/option_fastq_qmin_ignored.md) #(./fragments/option_threads_not_multithreaded.md) # EXAMPLES Read from *input.fasta*, do not write to the standard error (`--quiet`), use a pseudo-random seed (`--randseed` is not set), write results to *output.fasta* (`--output`), and record shuffling parameters in *output.log* (`--log`): ```sh vsearch \ --shuffle input.fasta \ --quiet \ --log output.log \ --output output.fasta ``` # SEE ALSO Inverse operation (sort): [`vsearch-sortbylength(1)`](./commands/vsearch-sortbylength.1.md), [`vsearch-sortbysize(1)`](../formats/vsearch-sortbysize.1.md) #(./fragments/footer.md) 
vsearch-2.30.1/man/formats/000077500000000000000000000000001506776541700155005ustar00rootroot00000000000000vsearch-2.30.1/man/formats/fragments/000077500000000000000000000000001506776541700174665ustar00rootroot00000000000000vsearch-2.30.1/man/formats/fragments/format_sequence.md000066400000000000000000000006721506776541700231750ustar00rootroot00000000000000The *sequence* is defined as a string of IUPAC symbols ('ACGTURYSWKMDBHVN' and 'acgturyswkmdbhvn'), starting after the end of the header line and ending before the next header line, or the file's end. vsearch silently ignores ASCII characters 9 to 13, and exits with an error message if ASCII characters 0 to 8, 14 to 31, '.' or '-' are present. All other ASCII or non-ASCII characters are stripped and complained about in a warning message. vsearch-2.30.1/man/formats/vsearch-fasta.5.md000066400000000000000000000025061506776541700207170ustar00rootroot00000000000000% vsearch-fasta(5) version 2.30.0 | vsearch file formats % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(../commands/fragments/date.md) # NAME fasta --- a text-based format for representing nucleotide sequences # DESCRIPTION In fasta files, each entry is made of a _header_ and a _sequence_. The header is defined as the string comprised between the initial '>' symbol and the first space, tabulation, or new line symbol, unless the `--notrunclabels` option is in effect, in which case the entire line is included. The *header* should contain printable ASCII characters (33-126). The program will terminate with a fatal error if there are unprintable ASCII characters (see `ascii(7)`). A warning will be issued if non-ASCII characters (128-255) are encountered. When using the option `--sizein`, if the header matches the patterns `>[;]size=integer;identifier`, `>identifier;size=integer;identifier`, or `>identifier;size=integer[;]`, vsearch will interpret *integer* as the number of occurrences (or abundance) of the sequence in the study. 
That abundance information is used or created during chimera detection, clustering, dereplication, sorting and searching. #(./fragments/format_sequence.md) # SEE ALSO [`vsearch-fastq(5)`](./vsearch-fastq.5.md), [`vsearch-udb(5)`](./vsearch-udb.5.md) #(../commands/fragments/footer.md) vsearch-2.30.1/man/formats/vsearch-fastq.5.md000066400000000000000000000041531506776541700207370ustar00rootroot00000000000000% vsearch-fastq(5) version 2.30.0 | vsearch file formats % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(../commands/fragments/date.md) # NAME fastq --- a text-based format for representing nucleotide sequences and their corresponding quality scores # DESCRIPTION In fastq files, each entry is made of *sequence header* starting with a symbol '@', a nucleotidic *sequence*, a *quality header* starting with a symbol '+', and a *quality string* of ASCII characters (offset 33 or 64), each one encoding the quality value of the corresponding position in the nucleotidic sequence. The *sequence header* is defined as the string comprised between the initial '@' symbol and the first space, tabulation, or new line symbol, unless the `--notrunclabels` option is in effect, in which case the entire line is included. The sequence header should contain printable ASCII characters (33-126). The program will terminate with a fatal error if there are unprintable ASCII characters (see `ascii(7)`). A warning will be issued if non-ASCII characters (128-255) are encountered. If the sequence header contains patterns such as `[@;]size=integer[;]` or `[@;]ee=float[;]`, vsearch can interpret these annotations and use them for chimera detection, clustering, dereplication, filtering and sorting. #(./fragments/format_sequence.md) The *quality header* is defined as the string comprised between the initial '+' symbol and the first space, tabulation, or new line symbol. 
The *quality string* is a string of ASCII characters, starting after the end of the quality header line and ending before the next header line, or the file's end. The range of valid ASCII characters can extend from '!' to '~' when the offset is 33, and from '@' to '~' when the offset is 64. vsearch silently ignores ASCII characters 9 to 13, and exits with an error message if ASCII characters 0 to 8, 14 to 31, ‘.’ or ‘-’ are present. All other ASCII or non-ASCII characters are stripped and complained about in a warning message. # SEE ALSO [`vsearch-fasta(5)`](./vsearch-fasta.5.md), [`vsearch-udb(5)`](./vsearch-udb.5.md) #(../commands/fragments/footer.md) vsearch-2.30.1/man/formats/vsearch-sff.5.md000066400000000000000000000264641506776541700204100ustar00rootroot00000000000000% vsearch-sff(5) version 2.30.0 | vsearch file formats % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(../commands/fragments/date.md) # NAME sff --- a binary file format used to encode pyrosequencing results # DESCRIPTION The standard flowgram format (sff), was designed by 454 Life Sciences, the Whitehead Institute for Biomedical Research, and the Sanger Institute. It was used to store reads produced by the Roche 454 sequencing platforms, and early versions of the Ion Torrent PGM sequencing platforms. Original NCBI documentation available at: If unavailable, it can be recovered from . We provide this modified version for preservation purpose. The proposed SFF file format (version 1) is a container file for storing one or many 454 reads. 454 reads differ from standard sequencing reads in that the 454 data does not provide individual base measurements from which basecalls can be derived. Instead, it provides measurements that estimate the length of the next homopolymer stretch in the sequence (i.e., in "AAATGG", "AAA" is a 3-mer stretch of A's, "T" is a 1-mer stretch of T's and "GG" is a 2-mer stretch of G's). 
A basecalled sequence is then derived by converting each estimate into a homopolymer stretch of that length and concatenating the homopolymers. The file format consists of different sections: a *common header* section occurring once in the file, then for each read stored in the file, a *read header* section and a *read data* section. An optional *index* section can also be present after the *common header* section, or anywhere among the reads, or at the end of the file. The data in each section consists of a combination of numeric and character data, where the specific fields for each section are defined below. The sections adhere to the following rules: - The standard Unix types `uint8_t`, `uint16_t`, `uint32_t` and `uint64_t` are used to define 1, 2, 4 and 8-byte numeric values. - All multi-byte numeric values are stored using big endian byte-order (same as the Standard Chromatogram Format, or SCF, file format). - All character fields use single-byte ASCII characters. - Each section definition ends with an `eight_byte_padding` field, which consists of 0 to 7 bytes of padding, so that the length of each section is divisible by 8 (and hence the next section is aligned on an 8-byte boundary). ## Common Header Section The *common header* section consists of the following fields: ```text magic_number uint32_t version char[4] index_offset uint64_t index_length uint32_t number_of_reads uint32_t header_length uint16_t key_length uint16_t number_of_flows_per_read uint16_t flowgram_format_code uint8_t flow_chars char[number_of_flows_per_read] key_sequence char[key_length] eight_byte_padding uint8_t[*] ``` where the following properties are true for these fields: - The `magic_number` field value is 0x2E736666, the `uint32_t` encoding of the string ".sff" - The version number corresponding to this proposal is 0001, or the byte array "\0\0\0\1". - The `index_offset` and `index_length` fields are the offset and length of an optional index of the reads in the SFF file. 
If no index is included in the file, both fields must be 0. - The `number_of_reads` field should be set to the number of reads stored in the file. - The `header_length` field should be the total number of bytes required by this set of header fields, and should be equal to "31 + `number_of_flows_per_read` + `key_length`" rounded up to the next value divisible by 8. - The `key_length` and `key_sequence` fields should be set to the length and nucleotide bases of the key sequence used for these reads. Note: The `key_sequence` field is not null-terminated. - The `number_of_flows_per_read` should be set to the number of flows for each of the reads in the file. - The `flowgram_format_code` should be set to the format used to encode each of the flowgram values for each read. - Note: Currently, only one flowgram format has been adopted, so this value should be set to 1. - The flowgram format code 1 stores each value as a `uint16_t`, where the floating point flowgram value is encoded as "(int) round(value * 100.0)", and decoded as "(storedvalue * 1.0 / 100.0)". - The `flow_chars` should be set to the array of nucleotide bases ('A', 'C', 'G' or 'T') that correspond to the nucleotides used for each flow of each read. The length of the array should equal `number_of_flows_per_read`. - Note: The `flow_chars` field is not null-terminated. - If any `eight_byte_padding` bytes exist in the section, they should have a byte value of 0. ## Read Header Section The rest of the file contains the information about the reads, namely `number_of_reads` entries consisting of *read header* and *read data* sections.
The *read header* section consists of the following fields: ```text read_header_length uint16_t name_length uint16_t number_of_bases uint32_t clip_qual_left uint16_t clip_qual_right uint16_t clip_adapter_left uint16_t clip_adapter_right uint16_t name char[name_length] eight_byte_padding uint8_t[*] ``` where these fields have the following properties: - `read_header_length` should be set to the length of the read header for this read, and should be equal to "16 + name_length" rounded up to the next value divisible by 8. - The `name_length` and `name` fields should be set to the length and string of the read's accession or name. - Note: The name field is not null-terminated. - The `number_of_bases` should be set to the number of bases called for this read. - The `clip_qual_left` and `clip_adapter_left` fields should be set to the position of the first base after the clipping point, for quality and/or an adapter sequence, at the beginning of the read. If only a combined clipping position is computed, it should be stored in `clip_qual_left`. - The position values use 1-based indexing, so the first base is at position 1. - If a clipping value is not computed, the field should be set to 0. - Thus, the first base of the insert is: ``` max( 1, max(`clip_qual_left`, `clip_adapter_left`) ) ``` - The `clip_qual_right` and `clip_adapter_right` fields should be set to the position of the last base before the clipping point, for quality and/or an adapter sequence, at the end of the read. If only a combined clipping position is computed, it should be stored in `clip_qual_right`. - The position values use 1-based indexing. - If a clipping value is not computed, the field should be set to 0. - Thus, the last base of the insert is: ``` min( (`clip_qual_right` == 0 ? `number_of_bases` : `clip_qual_right`), (`clip_adapter_right` == 0 ?
`number_of_bases` : `clip_adapter_right`) ) ``` ## Read Data Section The read data section consists of the following fields: ```text flowgram_values uint*_t[number_of_flows] flow_index_per_base uint8_t[number_of_bases] bases char[number_of_bases] quality_scores uint8_t[number_of_bases] eight_byte_padding uint8_t[*] ``` where the fields have the following properties: - The `flowgram_values` field contains the homopolymer stretch estimates for each flow of the read. The number of bytes used for each value depends on the common header `flowgram_format_code` value (where the current value uses a `uint16_t` for each value). - The `flow_index_per_base` field contains the flow positions for each base in the called sequence (i.e., for each base, the position in the flowgram whose estimate resulted in that base being called). - These values are "incremental" values, meaning that the stored position is the offset from the previous flow index in the field. - All position values (prior to their incremental encoding) use 1-based indexing, so the first flow is flow 1. - The `bases` field contains the basecalled nucleotide sequence. - The `quality_scores` field contains the quality scores for each of the bases in the sequence, where the values use the standard -log10 probability scale. ## Index Section (optional) If an index is included in the file, the `index_offset` and `index_length` values in the *common header* should point to the section of the file containing the index. Note that the *index* section can be placed after the *common header* section, or anywhere among the reads, or at the end of the file. To support different indexing methods, the index section should begin with the following two fields: ```text index_magic_number uint32_t index_version char[4] ``` followed by the actual `index data` (*n* bytes) and a `eight_byte_padding` field, so that the (total?) length of the index section is divisible by 8. 
The format of the rest of the index data is specific to the indexing method used. Currently, there are no officially supported indexing formats. The `index_length` given in the common header should include the bytes of these fields and the padding (unclear!). ## Computing Lengths and Scanning the File The length of each read's section will be different, because of different length accession numbers and different length nucleotide sequences. However, the various flow, name and bases lengths given in the common and read headers can be used to scan the file, accessing each read's information or skipping read sections in the file. The following pseudocode gives an example method for scanning the file and accessing each read's data: - Open the file and/or reset the file pointer position to the first byte of the file. - Read the first 31 bytes of the file, confirm the `magic_number` value and `version`, then extract the `number_of_reads`, `number_of_flows_per_read`, `flowgram_format_code`, `header_length`, `key_length`, `index_offset` and `index_length` values. - Convert the `flowgram_format_code` into a `flowgram_bytes_per_flow` value (currently with format_code 1, this value is 2 bytes). - If the `flow_chars` and `key_sequence` information is required, read the next "`header_length` - 31" bytes, then extract that information. Otherwise, set the file pointer position to byte `header_length`. - While the file contains more bytes, do the following: - If the file pointer position equals `index_offset`, either read or skip `index_length` bytes in the file, processing the index if read. - Otherwise, - Read 16 bytes and extract the `read_header_length`, `name_length` and `number_of_bases` values. - Read the next "`read_header_length - 16`" bytes to read the name. - At this point, a test of the `name` field can be performed, to determine whether to read or skip this entry.
- Compute the `read_data_length` as: `number_of_flows * flowgram_bytes_per_flow + 3 * number_of_bases`, rounded up to the next value divisible by 8. - Either read or skip `read_data_length` bytes in the file, processing the read data if the section is read. # SEE ALSO [`vsearch-fasta(5)`](./vsearch-fasta.5.md), [`vsearch-fastq(5)`](./vsearch-fastq.5.md) #(../commands/fragments/footer.md) vsearch-2.30.1/man/formats/vsearch-udb.5.md000066400000000000000000000010131506776541700203630ustar00rootroot00000000000000% vsearch-udb(5) version 2.30.0 | vsearch file formats % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(../commands/fragments/date.md) # NAME udb --- a binary format containing fasta sequences and a k-mer index for those sequences # DESCRIPTION Hosting a detailled description of the UDB format is important. - magic number? # EXAMPLES (show how to build udb files?) # SEE ALSO [`vsearch-fasta(5)`](./vsearch-fasta.5.md), [`vsearch-fastq(5)`](./vsearch-fastq.5.md) #(../commands/fragments/footer.md) vsearch-2.30.1/man/index.1.md000066400000000000000000000067071506776541700156270ustar00rootroot00000000000000% vsearch(1) version 2.30.0 | vsearch manual % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(./commands/fragments/date.md) # NAME vsearch --- a versatile open-source tool for metabarcoding and metagenomics # SYNOPSIS | **vsearch** \ \[_file_] \[_options_] (see below for a list of available commands) WARNING: this is a pre-release of the future revised and updated vsearch documentation. Users can already refer to this content for the commands listed below, but be aware that most vsearch commands are not yet featured. 
# DESCRIPTION vsearch is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pairwise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metabarcoding, metagenomics, genomics, and population genetics. # VSEARCH COMMANDS Each command is described in a dedicated manpage. For example, type `man vsearch-fasta2fastq` to read about the `--fasta2fastq` command. Manpages referring to commands all belong to section 1 (executable programs). Some vsearch manpages belong to section 5 (file formats and conventions). For example, type `man 5 vsearch-fastq` or `man 'vsearch-fastq(5)'` to read about the fastq format as seen by vsearch. ## FASTA/FASTQ/SFF file processing: **[`vsearch-fasta2fastq(1)`](./commands/vsearch-fasta2fastq.1.md)** : Convert a fasta file to a fastq file with fake quality scores. **[`vsearch-fastq_chars(1)`](./commands/vsearch-fastq_chars.1.md)** : Analyze fastq files to identify the quality encoding and the range of quality score values used. **[`vsearch-fastq_stats(1)`](./commands/vsearch-fastq_stats.1.md)** : Analyze fastq files and output detailed statistics: read length distribution, quality score distribution, length vs quality distribution, effect of expected error and length filtering, effect of quality score and length filtering. # FILE FORMATS **[`vsearch-fasta(5)`](./formats/vsearch-fasta.5.md)** : Specify the fasta format, as used by vsearch. **[`vsearch-fastq(5)`](./formats/vsearch-fastq.5.md)** : Specify the fastq format, as used by vsearch. (also SFF, and UDB).
# SEE ALSO [swarm](https://github.com/torognes/swarm), [swipe](https://github.com/torognes/swipe), [usearch](https://github.com/rcedgar/usearch12) #(./commands/fragments/footer.md) # VERSION HISTORY (inject history here) vsearch-2.30.1/man/misc/000077500000000000000000000000001506776541700147605ustar00rootroot00000000000000vsearch-2.30.1/man/misc/vsearch-expected_error.7.md000066400000000000000000000006221506776541700221120ustar00rootroot00000000000000% vsearch-expected_error(7) version 2.30.0 | vsearch file formats % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(../commands/fragments/date.md) # NAME expected error --- a summary of the quality of a fastq sequence # DESCRIPTION - how to compute? - interpretation - properties # EXAMPLES (is this section needed?) # SEE ALSO (nothing for now) #(../commands/fragments/footer.md) vsearch-2.30.1/man/misc/vsearch-pairwise_alignment_parameters.7.md000066400000000000000000000114431506776541700252070ustar00rootroot00000000000000% vsearch-pairwise_alignment_parameters(7) version 2.30.0 | vsearch file formats % Torbjørn Rognes, Tomás Flouri, and Frédéric Mahé #(../commands/fragments/date.md) # NAME pairwise alignment parameters --- a description of the pairwise alignment model implemented in vsearch # DESCRIPTION vsearch implements an extremely fast Needleman-Wunsch algorithm (REFERENCE). There are many ways to align two sequences. This algorithm finds the pairwise alignment with the best score. (explain the difference with Smith-Waterman?) Along the pairwise alignment, each aligned position contributes to the score, by being either a *match*, a *mismatch*, or a *gap*. TODO: inject "reward and penalty system" ## match and mismatches When aligning sequences, identical symbols will receive a positive match score (default +2, see option `--match`). Note that T and U are considered identical, regardless of their case. 
If two symbols are not identical, their alignment results in a negative mismatch score (default -4, see option `--mismatch`). Aligning a pair of symbols where at least one of them is an ambiguous symbol (BDHKMNRSVWY) will always result in a score of zero. Alignment of two identical ambiguous symbols (for example, R vs R) also receives a score of zero. Once the optimal pairwise alignment has been found: When computing the amount of similarity by counting matches and mismatches **after** alignment, ambiguous nucleotide symbols will count as matching to other symbols if they have at least one of the nucleotides (ACGTU) they may represent in common. For example: W will match A and T, but also any of MRVHDN. When showing alignments (for example with the output option `--alnout`) matches involving ambiguous symbols will be shown with a plus character (+) between them while exact matches between non-ambiguous symbols will be shown with a vertical bar character (|). Note that the `--iddef` choice has no effect on the score and selection of the optimal pairwise alignment. The similarity is computed afterwards. ## gaps Gaps are further refined into *gap openings* (see option `--gapopen`) or *gap extensions* (see option `--gapext`). ### gap openings Contrary to matches/mismatches, gaps are asymmetrical, so a gap opening can occur in six different contexts: in the query (Q) or in the target (T) sequence; inside the sequence (I), or at the left (L) or right (R) extremity. Sequence symbols (Q and T) can be combined with location symbols (L, I, and R), and numerical values to declare penalties for all possible contexts: aQL/bQI/cQR/dTL/eTI/fTR, where abcdef are zero or positive integers, and '/' is used as a separator. `--gapopen` *2QL/20QI/2QR/2TL/20TI/2TR* : Set the six gap opening penalties using a penalty of 20 for opening internal gaps and a penalty of 2 for opening terminal gaps, in both query and target sequences. This is the default.
To simplify declarations, the symbol (E) can be used to treat both extremities (L and R) equally, and the symbols Q and T can be omitted to treat query and target sequences equally. `--gapopen` *20I/2E* : Set the six gap opening penalties using a penalty of 20 for opening internal gaps and a penalty of 2 for opening terminal gaps, in both query and target sequences. This is the default. If only a numerical value is given, without any sequence or location symbol, then the penalty applies to all gap openings. For example: `--gapopen` *20* : Set the six gap opening penalties using a penalty of 20 for all gap openings, internal or terminal, in both query and target sequences. To forbid gap-opening, an infinite penalty value can be declared with the symbol '\*'. `--gapopen` *\*I/2E* : Set the gap opening penalties to an infinite value for internal gap openings, in both query and target sequences. To use vsearch as a semi-global aligner, a null-penalty can be applied to the left (L) or right (R) gaps. vsearch always initializes the six gap opening penalties using the default parameters (20I/2E). The user is then free to declare only the values he/she wants to modify. The string is scanned from left to right, accepted symbols are (0123456789*/LIREQT), and later values override previous values. Please note that vsearch, in contrast to usearch, only allows integer gap penalties. Because the lowest gap penalties are 0.5 by default in usearch, all default scores and gap penalties in vsearch have been doubled to maintain equivalent penalties and to produce identical alignments. ### gap extensions Gap extensions follow the same penalty declaration system as gap openings. `--gapext` *2I/1E* : Set the six gap extending penalties using a penalty of 2 for extending internal gaps and a penalty of 1 for extending terminal gaps, in both query and target sequences. This is the default. # EXAMPLES (is this section needed?) 
# SEE ALSO (nothing for now) #(../commands/fragments/footer.md) vsearch-2.30.1/man/scripts/000077500000000000000000000000001506776541700155145ustar00rootroot00000000000000vsearch-2.30.1/man/scripts/generate_manpages.sh000066400000000000000000000012451506776541700215170ustar00rootroot00000000000000#!/bin/bash ## assume script is launched from vsearch/man/ ## assume any internal link is relative to the md file itself (important) build_markdown_file() { perl -ne \ 's/^#\((.+)\).*/`cat "$1"`/e;print' "${1}" } convert_markdown_to_groff() { pandoc - --standalone --to man } generate_manpage() { build_markdown_file "${1}" | \ convert_markdown_to_groff } # create folder mkdir -p manpages for raw_md in ./{commands,formats,misc}/vsearch*.md ; do FOLDER="$(dirname "${raw_md}")" FILENAME="$(basename "${raw_md}")" (cd "${FOLDER}" || exit 1 generate_manpage "${FILENAME}" > "../manpages/${FILENAME/\.md/}" ) done exit 0 vsearch-2.30.1/man/scripts/generate_online_documentation.sh000066400000000000000000000030361506776541700241410ustar00rootroot00000000000000#!/bin/bash ## assume script is launched from vsearch/man/ ## assume any internal link is relative to the md file itself (important) ## check dependencies for dependency in pandoc perl ; do which "${dependency}" > /dev/null || \ { >&2 echo "Error: missing ${dependency}" ; exit 1 ; } done build_markdown_file() { perl -ne \ 's/^#\((.+)\).*/`cat "$1"`/e;print' "${1}" } convert_markdown_to_github_markdown() { pandoc - --to gfm } generate_github_markdown() { ## Failed tests: # no replacement # sed 's/\\\-\\\-/\-\-/g' # sed 's/\\\-\\\-/\\-\\-/g' # sed 's/\\\-\\\-/\\\-\\\-/g' # sed 's/\\\-\\\-/\\\\-\\\\-/g' # sed 's/\\\-\\\-/\\\\\-\\\\\-/g' # sed 's/\\\-\\\-/\\\\\\-\\\\\\-/g' -> '\\\-' renders '\-' # sed 's/\\\-\\\-/\\\\\\\\-\\\\\\\\-/g' -> renders \\-\\- build_markdown_file "${1}" | sed 's/\\\-\\\-/\\\\\\\\\-\\\\\\\\\-/g' | \ convert_markdown_to_github_markdown } # create folder mkdir -p ../docs/{commands,formats,misc} # test: maybe 
the config file needs to be placed at the root of the documentation? ln ../_config.yml ../docs/ 2> /dev/null # future: use vsearch.1.md as the starting page (index.html) generate_github_markdown ./index.1.md > "../docs/index.md" # mirror the organization of manpages for raw_md in ./{commands,formats,misc}/vsearch*.md ; do FOLDER="$(dirname "${raw_md}")" FILENAME="$(basename "${raw_md}")" (cd "${FOLDER}" || exit 1 generate_github_markdown "${FILENAME}" > "../../docs/${FOLDER}/${FILENAME}" ) done exit 0 vsearch-2.30.1/man/scripts/include_markdown_files.sh000066400000000000000000000003711506776541700225600ustar00rootroot00000000000000#!/usr/bin/env bash # modified from https://stackoverflow.com/a/18517316 perl -ne 's/^#\((.+)\).*/`cat "$1"`/e;print' "${1}" # perl -ne 's/^#\\((.+)\\).*/cat \"\\$1\"/e;print' "$@" # perl -ne 's#^!\[\[(.+?)\]\].*#`'$0' "$1"`#e;print' "$@" exit 0 vsearch-2.30.1/man/scripts/visualize_manpage.sh000066400000000000000000000001761506776541700215570ustar00rootroot00000000000000#!/bin/bash bash ./include_markdown_files.sh "${1}" | \ pandoc - --standalone --to man | \ /usr/bin/man -l - exit 0 vsearch-2.30.1/man/vsearch.1000066400000000000000000006350331506776541700155540ustar00rootroot00000000000000.\" import www macros (URL, TAG, MTO) .mso www.tmac .\" ============================================================================ .TH vsearch 1 "October 3, 2025" "version 2.30.1" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pairwise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. 
.\" ============================================================================ .SH SYNOPSIS .\" left justified, ragged right .ad l Chimera detection: .RS \fBvsearch\fR (\-\-uchime_denovo | \-\-uchime2_denovo | \-\-uchime3_denovo) \fIfastafile\fR (\-\-chimeras | \-\-nonchimeras | \-\-uchimealns | \-\-uchimeout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-uchime_ref \fIfastafile\fR (\-\-chimeras | \-\-nonchimeras | \-\-uchimealns | \-\-uchimeout) \fIoutputfile\fR \-\-db \fIfastafile\fR [\fIoptions\fR] .PP .RE Clustering: .RS \fBvsearch\fR (\-\-cluster_fast | \-\-cluster_size | \-\-cluster_smallmem | \-\-cluster_unoise) \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-centroids | \-\-clusters | \-\-mothur_shared_out | \-\-msaout | \-\-otutabout | \-\-profile | \-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR] .PP .RE Dereplication and rereplication: .RS \fBvsearch\fR \-\-fastx_uniques (\fIfastafile\fR | \fIfastqfile\fR) (\-\-fastaout | \-\-fastqout | \-\-tabbedout | \-\-uc) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR (\-\-derep_fulllength | \-\-derep_id | \-\-derep_prefix) \fIfastafile\fR (\-\-output | \-\-uc) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-derep_smallmem (\fIfastafile\fR | \fIfastqfile\fR) \-\-fastaout \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-rereplicate \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP .RE Extraction of sequences: .RS \fBvsearch\fR \-\-fastx_getseq \fIfastafile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-notmatchedfq) \fIoutputfile\fR \-\-label \fIlabel\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_getseqs \fIfastafile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-notmatchedfq) \fIoutputfile\fR (\-\-label \fIlabel\fR | \-\-labels \fIlabelfile\fR | \-\-label_word \fIlabel\fR | \-\-label_words \fIlabelfile\fR) [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_getsubseq \fIfastafile\fR (\-\-fastaout | \-\-fastqout |
\-\-notmatched | \-\-notmatchedfq) \fIoutputfile\fR \-\-label \fIlabel\fR [\-\-subseq_start \fIposition\fR] [\-\-subseq_end \fIposition\fR] [\fIoptions\fR] .PP .RE FASTA/FASTQ/SFF file processing: .RS \fBvsearch\fR \-\-fasta2fastq \fIfastafile\fR \-\-fastqout \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_chars \fIfastqfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_convert \fIfastqfile\fR \-\-fastqout \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR (\-\-fastq_eestats | \-\-fastq_eestats2) \fIfastqfile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_filter \fIfastqfile\fR [\-\-reverse \fIfastqfile\fR] (\-\-fastaout | \-\-fastaout_discarded | \-\-fastqout | \-\-fastqout_discarded | \-\-fastaout_rev | \-\-fastaout_discarded_rev | \-\-fastqout_rev | \-\-fastqout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_join \fIfastqfile\fR \-\-reverse \fIfastqfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_mergepairs \fIfastqfile\fR \-\-reverse \fIfastqfile\fR (\-\-fastaout | \-\-fastqout | \-\-fastaout_notmerged_fwd | \-\-fastaout_notmerged_rev | \-\-fastqout_notmerged_fwd | \-\-fastqout_notmerged_rev | \-\-eetabbedout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_stats \fIfastqfile\fR [\-\-log \fIlogfile\fR] [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_filter \fIinputfile\fR [\-\-reverse \fIinputfile\fR] (\-\-fastaout | \-\-fastaout_discarded | \-\-fastqout | \-\-fastqout_discarded | \-\-fastaout_rev | \-\-fastaout_discarded_rev | \-\-fastqout_rev | \-\-fastqout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastx_revcomp \fIinputfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-sff_convert \fIsff-file\fR \-\-fastqout \fIoutputfile\fR [\fIoptions\fR] .PP .RE Masking: .RS \fBvsearch\fR \-\-fastx_mask \fIfastxfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR
[\fIoptions\fR] .PP \fBvsearch\fR \-\-maskfasta \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP .RE Orienting: .RS \fBvsearch\fR \-\-orient \fIfastxfile\fR \-\-db \fIfastxfile\fR (\-\-fastaout | \-\-fastqout | \-\-notmatched | \-\-tabbedout) \fIoutputfile\fR [\fIoptions\fR] .PP .RE Pairwise alignment: .RS \fBvsearch\fR \-\-allpairs_global \fIfastafile\fR (\-\-alnout | \-\-blast6out | \-\-matched | \-\-notmatched | \-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR (\-\-acceptall | \-\-id \fIreal\fR) [\fIoptions\fR] .PP .RE Restriction site cutting: .RS \fBvsearch\fR \-\-cut \fIfastafile\fR \-\-cut_pattern \fIpattern\fR (\-\-fastaout | \-\-fastaout_rev | \-\-fastaout_discarded | \-\-fastaout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP .RE Searching: .RS \fBvsearch\fR \-\-search_exact \fIfastafile\fR \-\-db \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out | \-\-otutabout | \-\-samout | \-\-uc | \-\-userout | \-\-lcaout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-usearch_global \fIfastafile\fR \-\-db (\fIfastafile\fR | \fIudbfile\fR) (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out | \-\-otutabout | \-\-samout | \-\-uc | \-\-userout | \-\-lcaout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR] .PP .RE Shuffling and sorting: .RS \fBvsearch\fR (\-\-shuffle | \-\-sortbylength | \-\-sortbysize) \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP .RE Subsampling: .RS \fBvsearch\fR \-\-fastx_subsample \fIfastafile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR (\-\-sample_pct \fIreal\fR | \-\-sample_size \fIpositive integer\fR) [\fIoptions\fR] .PP .RE Taxonomic classification: .RS \fBvsearch\fR \-\-sintax \fIfastafile\fR \-\-db (\fIfastafile\fR | \fIudbfile\fR) \-\-tabbedout \fIoutputfile\fR [\-\-sintax_cutoff \fIreal\fR] [\fIoptions\fR] .PP .RE UDB database handling: .RS \fBvsearch\fR \-\-makeudb_usearch \fIfastafile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP 
\fBvsearch\fR \-\-udb2fasta \fIudbfile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR (\-\-udbinfo | \-\-udbstats) \fIudbfile\fR [\fIoptions\fR] .PP .RE .\" left and right justified (default) .ad b .\" ============================================================================ .SH DESCRIPTION Environmental or clinical molecular diversity studies generate large volumes of amplicons (e.g. SSU-rRNA sequences) that need to be checked for chimeras, dereplicated, masked, sorted, searched, clustered or compared to reference sequences. The aim of \fBvsearch\fR is to offer an all-in-one open source tool to perform these tasks, using optimized algorithm implementations and harvesting the full potential of modern computers, thus providing fast and accurate data processing. .PP Comparing nucleotide sequences is at the core of \fBvsearch\fR. To speed up comparisons, \fBvsearch\fR implements an extremely fast Needleman-Wunsch algorithm, making use of the Streaming SIMD Extensions (SSE2) of post-2003 x86-64 CPUs. If SSE2 instructions are not available, \fBvsearch\fR exits with an error message. On Power8 CPUs it will use AltiVec/VSX/VMX instructions, and on ARMv8 CPUs it will use Neon instructions. On other systems it can use the SIMD Everywhere (simde) library, if available. Memory usage increases rapidly with sequence length: for example comparing two sequences of length 1 kb requires 8 MB of memory per thread, and comparing two 10 kb sequences requires 800 MB of memory per thread. For comparisons involving sequences with a length product greater than 25 million (for example two sequences of length 5 kb), \fBvsearch\fR uses a slower alignment method described by Hirschberg (1975) and Myers and Miller (1988), with much smaller memory requirements. .\" ---------------------------------------------------------------------------- .SS Input \fBvsearch\fR accepts as input fasta or fastq files containing one or several nucleotidic entries.
In fasta files, each entry is made of a header and a sequence. The header is defined as the string comprised between the initial '>' symbol and the first space, tab or the end of the line, unless the \-\-notrunclabels option is in effect, in which case the entire line is included. The header should contain printable ascii characters (33-126). The program will terminate with a fatal error if there are unprintable ascii characters. A warning will be issued if non-ascii characters (128-255) are encountered. .PP If the header matches the pattern '>[;]size=\fIinteger\fR;label', the pattern '>label;size=\fIinteger\fR;label', or the pattern '>label;size=\fIinteger\fR[;]', \fBvsearch\fR will interpret \fIinteger\fR as the number of occurrences (or abundance) of the sequence in the study. That abundance information is used or created during chimera detection, clustering, dereplication, sorting and searching. .PP The sequence is defined as a string of IUPAC symbols (ACGTURYSWKMDBHVN), starting after the end of the identifier line and ending before the next identifier line, or the file end. \fBvsearch\fR silently ignores ascii characters 9 to 13, and exits with an error message if ascii characters 0 to 8, 14 to 31, '.' or '-' are present. All other ascii or non-ascii characters are stripped and complained about in a warning message. .PP In fastq files, each entry is made of a sequence header starting with a symbol '@', a nucleotidic sequence (same rules as for fasta sequences), a quality header starting with a symbol '+' and a string of ASCII characters (offset 33 or 64), each one encoding the quality value of the corresponding position in the nucleotidic sequence. .PP \fBvsearch\fR operations are case insensitive, except when soft masking is activated. DUST masking is automatically applied during chimera detection, clustering, masking, pairwise alignment and searching.
Soft masking is specified with the options '\-\-dbmask soft' (for searching and chimera detection with a reference) or '\-\-qmask soft' (for searching, \fIde novo\fR chimera detection, clustering and masking). When using soft masking, lower case letters indicate masked symbols, while upper case letters indicate regular symbols. Masked symbols are never included in the unique index words used for sequence comparisons, otherwise they are treated as normal symbols. .PP When comparing sequences during chimera detection, dereplication, searching and clustering, T and U are considered identical, regardless of their case. When aligning sequences, identical symbols will receive a positive match score (default +2). If two symbols are not identical, their alignment results in a negative mismatch score (default -4). Aligning a pair of symbols where at least one of them is an ambiguous symbol (BDHKMNRSVWY) will always result in a score of zero. Alignment of two identical ambiguous symbols (for example, R vs R) also receives a score of zero. When computing the amount of similarity by counting matches and mismatches after alignment, ambiguous nucleotide symbols will count as matching to other symbols if they have at least one of the nucleotides (ACGTU) they may represent in common. For example: W will match A and T, but also any of MRVHDN. When showing alignments (for example with the \-\-alnout option) matches involving ambiguous symbols will be shown with a plus character (+) between them while exact matches between non-ambiguous symbols will be shown with a vertical bar character (|). .PP \fBvsearch\fR can read data from standard files and write to standard files, but it can also read from pipes and write to pipes! For example, multiple fasta files can be piped into \fBvsearch\fR for dereplication.
To do so, file names can be replaced with: .RS .IP - 2 the symbol '-', representing '/dev/stdin' for input files or '/dev/stdout' for output files (with an exception for '\-\-db \-', see * below), .IP - a named pipe created with the command mkfifo, .IP - a process substitution '<(command)' as input or '>(command)' as output. .IP * \-\-db \- is not accepted, to prevent potential concurrent reads from stdin. A workaround for advanced users is to call '\-\-db /dev/stdin' directly. .RE .PP \fBvsearch\fR can automatically read compressed gzip or bzip2 files if the appropriate libraries are present during the compilation. \fBvsearch\fR can also read pipes streaming compressed gzip or bzip2 data if the options \-\-gzip_decompress or \-\-bzip2_decompress are selected. When reading from a pipe, the progress indicator is not updated. .\" ---------------------------------------------------------------------------- .SS Options \fBvsearch\fR recognizes a large number of command-line commands and options. For easier navigation, options are grouped below by theme (chimera detection, clustering, dereplication and rereplication, FASTA/FASTQ file processing, masking, pairwise alignment, searching, shuffling, sorting, and subsampling). We start with the general options that apply to all themes. Options start with a double dash (\-\-). A single dash (\-) may also be used, except on NetBSD systems. Option names may be shortened as long as they are not ambiguous (e.g. \-\-derep_f). .RE .PP .\" ---------------------------------------------------------------------------- .TAG help-and-version-commands Help and version commands: .PP .RS .TAG help .TAG h .TP 9 .B \-\-help \-h Display help text with brief information about all commands and options. .TAG version .TAG v .TP .B \-\-version \-v Output version information and a citation for the VSEARCH publication. Show the status of the support for gzip- and bzip2-compressed input files. 
.RE .PP .\" ---------------------------------------------------------------------------- .TAG general-options General options: .RS .TAG bzip2_decompress .TP 9 .B \-\-bzip2_decompress When reading from a pipe streaming bzip2-compressed data, decompress the data. This option is not needed when reading from a standard bzip2-compressed file. .TAG fasta_width .TP .BI \-\-fasta_width\~ "positive integer" Fasta files produced by \fBvsearch\fR are wrapped (sequences are written on lines of \fIinteger\fR nucleotides, 80 by default). Set the value to zero to eliminate the wrapping. .TAG gzip_decompress .TP .B \-\-gzip_decompress When reading from a pipe streaming gzip-compressed data, decompress the data. This option is not needed when reading from a standard gzip-compressed file. .TAG label_suffix .TP .BI \-\-label_suffix\~ string When writing FASTA or FASTQ files, add the suffix \fIstring\fR to sequence headers. .TAG log .TP .BI \-\-log \0filename Write messages to the specified log file. Information written includes program version, amount of memory available, number of cores and command line options, and if need be, informational messages, warnings and fatal errors. The start and finish times are also recorded as well as the elapsed time and the maximum amount of memory consumed. The different \fBvsearch\fR commands can also write additional information to the log file. .TAG maxseqlength .TP .BI \-\-maxseqlength\~ "positive integer" All \fBvsearch\fR operations discard sequences longer than \fIinteger\fR (50,000 nucleotides by default). .TAG minseqlength .TP .BI \-\-minseqlength\~ "positive integer" All \fBvsearch\fR operations discard sequences shorter than \fIinteger\fR: 1 nucleotide by default for sorting or shuffling, 32 nucleotides for clustering and dereplication as well as the commands \-\-makeudb_usearch, \-\-sintax, and \-\-usearch_global. 
.\" note: minseqlength can be set to zero (keep empty entries) .TAG no_progress .TP .B \-\-no_progress Do not show the gradually increasing progress indicator. .TAG notrunclabels .TP .B \-\-notrunclabels Do not truncate sequence labels at first space or tab, but use the full header in output files. Turned off by default for all commands except the sintax command. .TAG quiet .TP .B \-\-quiet Suppress all messages to stdout and stderr except for warnings and fatal error messages. .TAG sample .TP .BI \-\-sample\~ string When writing FASTA or FASTQ files, add the given sample identifier \fIstring\fR to sequence headers. For instance, if the given string is ABC, the text ";sample=ABC" will be added to the header. Note that \fIstring\fR will be truncated at the first ';' or blank character. Other characters (alphabetical, numerical and punctuations) are accepted. .TAG threads .TP .BI \-\-threads\~ "positive integer" Number of computation threads to use (1 to 1024). The number of threads should be less than or equal to the number of available CPU cores. The default is to use all available resources and to launch one thread per core. The following commands are multi-threaded: allpairs_global, cluster_fast, cluster_size, cluster_smallmem, cluster_unoise, fastq_mergepairs, fastx_mask, maskfasta, search_exact, sintax, uchime_ref, and usearch_global. Only one thread is used for the other commands. .RE .PP .\" ---------------------------------------------------------------------------- .TAG chimera-detection-options Chimera detection options: .PP .RS Chimera detection is based on a scoring function controlled by five options (\-\-dn, \-\-mindiffs, \-\-mindiv, \-\-minh, \-\-xn). Sequences are first sorted by decreasing abundance, if available, and compared on their \fIplus\fR strand only (case insensitive). .PP Input sequences are masked as specified with the \-\-qmask and \-\-hardmask options.
Masking of the database for reference based chimera detection is specified with the \-\-dbmask option. .PP In \fIde novo\fR mode, input fasta file must present abundance annotations (i.e. a pattern [;]size=\fIinteger\fR[;] in the fasta header). Input order matters for chimera detection, so we recommend sorting sequences by decreasing abundance (default of \-\-derep_fulllength command). If your sequence set needs to be sorted, please see the \-\-sortbysize command in the sorting section. .PP .TAG abskew .TP 9 .BI \-\-abskew \0real When using \-\-uchime_denovo, the abundance skew is used to distinguish in a three-way alignment which sequence is the chimera and which are the parents. The assumption is that chimeras appear later in the PCR amplification process and are therefore less abundant than their parents. For \-\-uchime3_denovo the default value is 16.0. For the other commands, the default value is 2.0, which means that the parents should be at least 2 times more abundant than their chimera. Any positive value equal to or greater than 1.0 can be used. .TAG alignwidth .TP .BI \-\-alignwidth\~ "positive integer" When using \-\-uchimealns, set the width of the three-way alignments (80 nucleotides by default). Set to zero to eliminate wrapping. .TAG borderline .TP .BI \-\-borderline \0filename Output borderline chimeric sequences to \fIfilename\fR, in fasta format. Borderline chimeric sequences are sequences that have a high enough score but which are not sufficiently different from their closest parent. .TAG chimeras .TP .BI \-\-chimeras \0filename Output chimeric sequences to \fIfilename\fR, in fasta format. Output order may vary when using multiple threads. .TAG db .TP .BI \-\-db \0filename When using \-\-uchime_ref, detect chimeras using the reference sequences contained in \fIfilename\fR. Reference sequences are assumed to be chimera-free. Chimeras cannot be detected if their parents, or sufficiently close relatives, are not present in the database.
The file name must refer to a FASTA file or to a UDB file. If a UDB file is used, it should be created using the \-\-makeudb_usearch command with the \-\-dbmask dust option. .TAG dn .TP .BI \-\-dn\~ "strictly positive real number" pseudo-count prior on the number of no votes, corresponding to the parameter \fIn\fR in the chimera scoring function (default value is 1.4). Increasing \-\-dn reduces the likelihood of tagging a sequence as a chimera (less false positives, but also more false negatives). .TAG fasta_score .TP .B \-\-fasta_score Add the chimera score to the headers in the fasta output files for chimeras, non-chimeras and borderline sequences, using the format ';uchime_denovo=\fIfloat\fR;'. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG mindiffs .TP .BI \-\-mindiffs\~ "positive integer" Minimum number of differences per segment (default value is 3). The parameter is ignored with \-\-uchime2_denovo and \-\-uchime3_denovo. .TAG mindiv .TP .BI \-\-mindiv \0real Minimum divergence from closest parent (default value is 0.8). The parameter is ignored with \-\-uchime2_denovo and \-\-uchime3_denovo. .TAG minh .TP .BI \-\-minh \0real Minimum score (\fIh\fR). Increasing this value tends to reduce the number of false positives and to decrease sensitivity. Default value is 0.28, and values ranging from 0.0 to 1.0 included are accepted. The parameter is ignored with \-\-uchime2_denovo and \-\-uchime3_denovo. .TAG nonchimeras .TP .BI \-\-nonchimeras \0filename Output non-chimeric sequences to \fIfilename\fR, in fasta format. Output order may vary when using multiple threads. .TAG relabel .TP .BI \-\-relabel \0string Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3, etc.) to construct the new headers. Use \-\-sizeout to conserve the abundance annotations. 
.TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequences using the MD5 message digest algorithm applied to each sequence. Former sequence headers are discarded. The sequence is converted to upper case and each 'U' is replaced by a 'T' before computation of the digest. The MD5 digest is a cryptographic hash function designed to minimize the probability that two different inputs give the same output, even for very similar, but non-identical inputs. Still, there is a very small, but non-zero, probability that two different inputs give the same digest (i.e. a collision). MD5 generates a 128-bit (16-byte) digest that is represented by 16 hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the abundance annotations. .\" The probablity of collision for two sequences is 1/2^128 .TAG relabel_self .TP .B \-\-relabel_self Relabel sequences using each sequence itself as a label. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequences using the SHA1 message digest algorithm applied to each sequence. It is similar to the \-\-relabel_md5 option but uses the SHA1 algorithm instead of the MD5 algorithm. SHA1 generates a 160-bit (20-byte) digest that is represented by 20 hexadecimal numbers (40 symbols). The probability of a collision (two non-identical sequences resulting in the same digest) is smaller for the SHA1 algorithm than it is for the MD5 algorithm. .\" The probablity of collision for two sequences is 1/2^160 .TAG self .TP .B \-\-self When using \-\-uchime_ref, ignore a reference sequence when its label matches the label of the query sequence (useful to estimate false-positive rate in reference sequences). .\" I am not sure the statement above is true. 
.TAG selfid .TP .B \-\-selfid When using \-\-uchime_ref, ignore a reference sequence when its nucleotide sequence is strictly identical to the nucleotidic sequence of the query. .TP .B \-\-sizein In \fIde novo\fR mode, abundance annotations (pattern '[>;]size=\fIinteger\fR[;]') present in sequence headers are taken into account by default (\-\-sizein is always implied). This option is ignored by \-\-uchime_ref. .TP .TAG sizeout .B \-\-sizeout When relabelling, add abundance annotations to fasta headers (using the format ';size=\fIinteger\fR;'). .TAG uchime_denovo .TP .BI \-\-uchime_denovo \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR, without external references (i.e. \fIde novo\fR). Automatically sort the sequences in \fIfilename\fR by decreasing abundance beforehand (see the sorting section for details). Multithreading is not supported. .TAG uchime2_denovo .TP .BI \-\-uchime2_denovo \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR, using the UCHIME2 algorithm. This algorithm is designed for denoised amplicons (see \-\-cluster_unoise). Automatically sort the sequences in \fIfilename\fR by decreasing abundance beforehand (see the sorting section for details). Multithreading is not supported. .TAG uchime3_denovo .TP .BI \-\-uchime3_denovo \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR, using the UCHIME2 algorithm. The only difference from \-\-uchime2_denovo is that the default minimum abundance skew (\-\-abskew) is set to 16.0 rather than 2.0. .TAG uchime_ref .TP .BI \-\-uchime_ref \0filename Detect chimeras present in the fasta-formatted \fIfilename\fR by comparing them with reference sequences (option \-\-db). Multithreading is supported. .TAG uchimealns .TP .BI \-\-uchimealns \0filename Write the three-way global alignments (parentA, parentB, chimera) to \fIfilename\fR using a human-readable format. Use \-\-alignwidth to modify alignment length. 
Output order may vary when using multiple threads. All sequences are converted to upper case before alignment. Lower case letters indicate disagreement in the alignment. .TAG uchimeout .TP .BI \-\-uchimeout \0filename Write chimera detection results to \fIfilename\fR using an 18-field, tab\-separated uchime\-like format. Use \-\-uchimeout5 to use a format compatible with usearch v5 and earlier versions. Rows output order may vary when using multiple threads. .RS .RS .nr step 1 1 .IP \n[step]. 4 score: higher score means a more likely chimeric alignment. .IP \n+[step]. Q: query sequence label. .IP \n+[step]. A: parent A sequence label. .IP \n+[step]. B: parent B sequence label. .IP \n+[step]. T: top parent sequence label (i.e. parent most similar to the query). That field is removed when using \-\-uchimeout5. .IP \n+[step]. idQM: percentage of similarity of query (Q) and model (M) constructed as a part of parent A and a part of parent B. .IP \n+[step]. idQA: percentage of similarity of query (Q) and parent A. .IP \n+[step]. idQB: percentage of similarity of query (Q) and parent B. .IP \n+[step]. idAB: percentage of similarity of parent A and parent B. .IP \n+[step]. idQT: percentage of similarity of query (Q) and top parent (T). .IP \n+[step]. LY: yes votes in the left part of the model. .IP \n+[step]. LN: no votes in the left part of the model. .IP \n+[step]. LA: abstain votes in the left part of the model. .IP \n+[step]. RY: yes votes in the right part of the model. .IP \n+[step]. RN: no votes in the right part of the model. .IP \n+[step]. RA: abstain votes in the right part of the model. .IP \n+[step]. div: divergence, defined as (idQM - idQT). .IP \n+[step]. YN: query is chimeric (Y), or not (N), or is a borderline case (?).
.RE .RE .TAG uchimeout5 .TP .B \-\-uchimeout5 When using \-\-uchimeout, write chimera detection results using a 17\-field, tab\-separated uchime\-like format (drop the 5th field of \-\-uchimeout), compatible with usearch version 5 and earlier versions. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xn .TP .BI \-\-xn\~ "strictly positive real number" weight of no votes, corresponding to the parameter \fIbeta\fR in the scoring function (default value is 8.0). Increasing \-\-xn reduces the likelihood of tagging a sequence as a chimera (less false positives, but also more false negatives). .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG clustering-options Clustering options: .RS .PP \fBvsearch\fR implements a single-pass, greedy centroid-based clustering algorithm, similar to the algorithms implemented in usearch, DNAclust and sumaclust for example. Important parameters are the global clustering threshold (\-\-id) and the pairwise identity definition (\-\-iddef). .PP Input sequences are masked as specified with the \-\-qmask and \-\-hardmask options. .TAG biomout .TP 9 .BI \-\-biomout \0filename Generate an OTU table in the biom version 1.0 JSON file format as specified at .URL https://biom-format.org/documentation/format_versions/biom-1.0.html "(link)" . The format describes how to store a sparse matrix containing the abundances of the OTUs in the different samples. This format is much more efficient than the classic and mothur OTU table formats available with the \-\-otutabout and \-\-mothur_shared_out options, respectively, and is recommended at least for large tables. The OTUs are represented by the cluster centroids. Taxonomy information will be included for the OTUs if available. 
Sample identifiers will be extracted from the headers of all sequences in the input file. If the header contains ';sample=abc123;' or ';barcodelabel=abc123;' or a similar string somewhere, then the given sample identifier (here 'abc123') will be used. The semicolon is not mandatory at the beginning or end of the header. The sample identifier may contain any printable character except semicolons. If no such sample label is found, the identifier in the initial part of the header will be used, but only letters, digits and underscores are allowed. OTU identifiers will be extracted from the headers of the cluster centroid sequences. If the header contains ';otu=def789;' or a similar string somewhere, then the given OTU identifier (here 'def789') will be used. The semicolon is not mandatory at the beginning or end of the header. The OTU identifier may contain any printable character except semicolons. If no such OTU label is found, the identifier in the initial part of the header will be used, and all characters except semicolons are allowed. Alternatively, OTU identifiers can be generated using the relabelling options (\-\-relabel, \-\-relabel_self, \-\-relabel_sha1, or \-\-relabel_md5). Taxonomy information, if present, will also be extracted from the headers of the centroid sequences. If the header contains ';tax=Homo_sapiens;' or a similar string somewhere, then the given taxonomy information (here 'Homo_sapiens') will be used. The semicolon is not mandatory at the beginning or end of the header. The taxonomy information may contain any printable character except semicolons. If an OTU table in the biom version 2.1 HDF5 file format is required, the biom utility may be used as described at .URL https://biom-format.org/documentation/biom_conversion.html "(link)" . .TAG centroids .TP .BI \-\-centroids \0filename Output cluster centroid sequences to \fIfilename\fR, in fasta format. The centroid is the sequence that seeded the cluster (i.e. 
the first sequence of the cluster). .TAG clusterout_id .TP .BI \-\-clusterout_id Add cluster identifier information to the output files when using the \-\-centroids, \-\-consout and \-\-profile options. .TAG clusterout_sort .TP .BI \-\-clusterout_sort Sort some output files by decreasing abundance instead of input order. It applies to the \-\-consout, \-\-msaout, \-\-profile, \-\-centroids, and \-\-uc options. For \-\-uc, the sorting applies only to the centroid information part (the C lines). .TAG cluster_fast .TP .BI \-\-cluster_fast \0filename Clusterize the fasta sequences in \fIfilename\fR, automatically sort by decreasing sequence length beforehand. .TAG cluster_size .TP .BI \-\-cluster_size \0filename Clusterize the fasta sequences in \fIfilename\fR, automatically sort by decreasing sequence abundance beforehand. .TAG cluster_smallmem .TP .BI \-\-cluster_smallmem \0filename Clusterize the fasta sequences in \fIfilename\fR without automatically modifying their order beforehand. Sequences are expected to be sorted by decreasing sequence length, unless \-\-usersort is used. .TAG cluster_unoise .TP .BI \-\-cluster_unoise \0filename Perform denoising of the fasta sequences in \fIfilename\fR according to the UNOISE version 3 algorithm by Robert Edgar, but without the \fIde novo\fR chimera removal step, which may be performed afterwards with \-\-uchime3_denovo. The options \-\-minsize (default 8) and \-\-unoise_alpha (default 2.0) may be specified. In this algorithm, clustering of sequences depends on both the sequence distance and the abundance ratio. The abundance ratio (skew) is the abundance of a new sequence divided by the abundance of the centroid sequence. This skew must not be larger than beta if the sequences should be clustered together. Beta is calculated as 2 raised to the power of minus 1 minus alpha times the sequence distance. The sequence distance used is the number of mismatches in the alignment, ignoring gaps.
This means that the abundance must be exponentially lower as the distance increases from the centroid for a new sequence to be included in the cluster. Nearer sequences with higher abundances will form their own new clusters. .TAG clusters .TP .BI \-\-clusters \0string Output each cluster to a separate fasta file using the prefix \fIstring\fR and a ticker (0, 1, 2, etc.) to construct the path and filenames. .TAG consout .TP .BI \-\-consout \0filename Output cluster consensus sequences to \fIfilename\fR. For each cluster, a center-star multiple sequence alignment is computed with the centroid as the center, using a fast algorithm (not accurate when using low pairwise identity thresholds). A consensus sequence is constructed by taking the majority symbol (nucleotide or gap) from each column of the alignment. Columns containing a majority of gaps are skipped, except for terminal gaps. If the \-\-sizein option is specified, sequence abundances will be taken into account. .TAG cons_truncate .TP .B \-\-cons_truncate This command is ignored. A warning is issued. .\" .TP .\" .B \-\-cons_truncate .\" when using the \-\-consout option to build consensus sequences, .\" do not ignore terminal gaps. That option skips terminal columns .\" if they contain a majority of gaps, yielding shorter consensus .\" sequences than when using \-\-consout alone. .TP .BI \-\-gapext \0string Set penalties for a gap extension. See \-\-gapopen for a complete description of the penalty declaration system. The default is to initialize the six gap extending penalties using a penalty of 2 for extending internal gaps and a penalty of 1 for extending terminal gaps, in both query and target sequences (i.e. 2I/1E). .TP .BI \-\-gapopen \0string Set penalties for a gap opening. A gap opening can occur in six different contexts: in the query (Q) or in the target (T) sequence, at the left (L) or right (R) extremity of the sequence, or inside the sequence (I). 
Sequence symbols (Q and T) can be combined with location symbols (L, I, and R), and numerical values to declare penalties for all possible contexts: aQL/bQI/cQR/dTL/eTI/fTR, where abcdef are zero or positive integers, and '/' is used as a separator. .br To simplify declarations, the location symbols (L, I, and R) can be combined, the symbol (E) can be used to treat both extremities (L and R) equally, and the symbols Q and T can be omitted to treat query and target sequences equally. For instance, the default is to declare a penalty of 20 for opening internal gaps and a penalty of 2 for opening terminal gaps (left or right), in both query and target sequences (i.e. 20I/2E). If only a numerical value is given, without any sequence or location symbol, then the penalty applies to all gap openings. To forbid gap-opening, an infinite penalty value can be declared with the symbol '*'. To use \fBvsearch\fR as a semi-global aligner, a null-penalty can be applied to the left (L) or right (R) gaps. .br \fBvsearch\fR always initializes the six gap opening penalties using the default parameters (20I/2E). The user is then free to declare only the values he/she wants to modify. The \fIstring\fR is scanned from left to right, accepted symbols are (0123456789/LIREQT*), and later values override previous values. .br Please note that \fBvsearch\fR, in contrast to usearch, only allows integer gap penalties. Because the lowest gap penalties are 0.5 by default in usearch, all default scores and gap penalties in \fBvsearch\fR have been doubled to maintain equivalent penalties and to produce identical alignments. .TAG id .TP .BI \-\-id \0real Do not add the target to the cluster if the pairwise identity with the centroid is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). The pairwise identity is defined as the number of (matching columns) / (alignment length - terminal gaps). That definition can be modified by \-\-iddef. 
.TAG iddef .TP .BI \-\-iddef\~ "0|1|2|3|4" Change the pairwise identity definition used in \-\-id. Values accepted are: .RS .RS .nr step 0 1 .IP \n[step]. 4 CD-HIT definition: (matching columns) / (shortest sequence length). .IP \n+[step]. edit distance: (matching columns) / (alignment length). .IP \n+[step]. edit distance excluding terminal gaps (same as \-\-id). .IP \n+[step]. Marine Biological Lab definition counting each gap opening (internal or terminal) as a single mismatch, whether or not the gap was extended: 1.0 - [(mismatches + gap openings)/(longest sequence length)] .IP \n+[step]. BLAST definition, equivalent to \-\-iddef 1 in a context of global pairwise alignment. .RE .RE .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TP .BI \-\-match\~ "integer" Score assigned to a match (i.e. identical nucleotides) in the pairwise alignment. The default value is 2. .TAG minsize .TP .BI \-\-minsize\~ "positive integer" Specify the minimum abundance of sequences for denoising using \-\-cluster_unoise. The default is 8. .TP .BI \-\-mismatch\~ "integer" Score assigned to a mismatch (i.e. different nucleotides) in the pairwise alignment. The default value is -4. .TAG msaout .TP .BI \-\-msaout \0filename Output a multiple sequence alignment and a consensus sequence for each cluster to \fIfilename\fR, in fasta format. Be warned that vsearch computes center star multiple sequence alignments using a fast method whose accuracy can decrease significantly when using low pairwise identity thresholds. The consensus sequence is constructed by taking the majority symbol (nucleotide or gap) from each column of the alignment. Columns containing a majority of gaps are skipped, except for terminal gaps. If the \-\-sizein option is specified, sequence abundances will be taken into account when computing the consensus. 
.TAG mothur_shared_out .TP .BI \-\-mothur_shared_out \0filename Output an OTU table in the mothur 'shared' tab-separated plain text format as described at .URL https://www.mothur.org/wiki/Shared_file (link) . The format describes how a matrix containing the abundances of the OTUs in the different samples is stored. The first line will start with the strings 'label', 'group' and 'numOtus' and is followed by a list of all OTU identifiers. The following lines, one for each sample, start with the string 'vsearch' followed by the sample identifier, the total number of OTUs, and a list of abundances for each OTU in that sample, in the order given on the first line. The OTU and sample identifiers are extracted from the FASTA headers of the sequences. The OTUs are represented by the cluster centroids. See the \-\-biomout option for further details. .TAG otutabout .TP .BI \-\-otutabout \0filename Output an OTU table in the classic tab-separated plain text format as a matrix containing the abundances of the OTUs in the different samples. The first line will start with the string '#OTU ID' and is followed by a tab-separated list of all sample identifiers. The following lines, one for each OTU, start with the OTU identifier and are followed by a tab-separated list of abundances for that OTU in each sample, in the order given on the first line. The OTU and sample identifiers are extracted from the FASTA headers of the sequences (see the \-\-sample option). The OTUs are represented by the cluster centroids. An extra column is added to the right of the table if taxonomy information is available for at least one of the OTUs. This column will be labelled 'taxonomy' and each row will then contain the taxonomy information extracted for that OTU. See the \-\-biomout option for further details. .TAG profile .TP .BI \-\-profile \0filename Output a sequence profile to a text file with the frequency of each nucleotide in each position in the multiple alignment for each cluster.
There is a FASTA-like header line for each cluster, followed by the profile information in a tab-separated format. The eight columns are: position (0-based), consensus nucleotide, number of As, number of Cs, number of Gs, number of Ts or Us, number of gap symbols, and finally the total number of ambiguous nucleotide symbols (B, D, H, K, M, N, R, S, Y, V or W). All numbers are integers. If the \-\-sizein option is specified, sequence abundances will be taken into account. .TAG qmask .TP .BI \-\-qmask\~ "none|dust|soft" Mask regions in sequences using the \fIdust\fR or the \fIsoft\fR methods, or do not mask (\fInone\fR). Warning, when using \fIsoft\fR masking, clustering becomes case sensitive. The default is to mask using \fIdust\fR. .TAG qsegout .TP .BI \-\-qsegout \0filename Write the aligned part of each query sequence to \fIfilename\fR in FASTA format. .TAG relabel .TP .BI \-\-relabel \0string Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. .TAG relabel_self .TP .B \-\-relabel_self Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequence identifiers in the output files produced by \-\-consout, \-\-profile and \-\-centroids options. Please see the description of the same option under Chimera detection for details. 
.TAG sizein .TP .B \-\-sizein Take into account the abundance annotations present in the input fasta file (search for the pattern '[>;]size=\fIinteger\fR[;]' in sequence headers). .TAG sizeorder .TP .B \-\-sizeorder When an amplicon is close to 2 or more centroids, both within the distance specified with the \-\-id option, resolve the ambiguity by clustering it with the centroid having the highest abundance, not necessarily the closest one. The option only has effect when the value specified with \-\-maxaccepts is higher than one. The \-\-sizeorder option turns on what is sometimes referred to as abundance-based greedy clustering (AGC), in contrast to the default distance-based greedy clustering (DGC). .TAG sizeout .TP .B \-\-sizeout Add abundance annotations to the output fasta files (add the pattern ';size=\fIinteger\fR;' to sequence headers). If \-\-sizein is specified, abundance annotations are reported to output files, and each cluster centroid receives a new abundance value corresponding to the total abundance of the amplicons included in the cluster (\-\-centroids option). If \-\-sizein is not specified, input abundances are set to 1 for amplicons, and to the number of amplicons per cluster for centroids. .TAG strand .TP .BI \-\-strand\~ "plus|both" When comparing sequences with the cluster seed, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. .TAG tsegout .TP .BI \-\-tsegout \0filename Write the aligned part of each target sequence to \fIfilename\fR in FASTA format. .TAG uc .TP .BI \-\-uc \0filename Output clustering results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns and 3 different types of entries (S, H or C). Each fasta sequence in the input file can be either a cluster centroid (S) or a hit (H) assigned to a cluster. Cluster records (C) summarize information (size, centroid label) for each cluster. In the context of clustering, the option \-\-uc_allhits has no effect on the \-\-uc output.
Column content varies with the type of entry (S, H or C): .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type: S, H, or C. .IP \n+[step]. Cluster number (zero-based). .IP \n+[step]. Centroid length (S), query length (H), or cluster size (C). .IP \n+[step]. Percentage of similarity with the centroid sequence (H), or set to '*' (S, C). .IP \n+[step]. Match orientation + or - (H), or set to '*' (S, C). .IP \n+[step]. Not used, always set to '*' (S, C) or to zero (H). .IP \n+[step]. Not used, always set to '*' (S, C) or to zero (H). .IP \n+[step]. set to '*' (S, C) or, for H, compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence (ignoring terminal gaps). .IP \n+[step]. Label of the query sequence (H), or of the centroid sequence (S, C). .IP \n+[step]. Label of the centroid sequence (H), or set to '*' (S, C). .RE .RE .TAG unoise_alpha .TP .BI \-\-unoise_alpha\~ real Specify the alpha parameter to the \-\-cluster_unoise command. The default is 2.0. .TAG usersort .TP .B \-\-usersort When using \-\-cluster_smallmem, allow any sequence input order, not just a decreasing length ordering. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .TP .B ... 
Most searching options as well as score filtering, gap penalties and masking also apply to clustering (see the Searching section for definitions): \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-matched, \-\-notmatched, \-\-maxaccepts, \-\-maxrejects, \-\-samout, \-\-userout, \-\-userfields .RE .PP .\" ---------------------------------------------------------------------------- .TAG dereplication-and-rereplication-options Dereplication and rereplication options: .PP .RS VSEARCH can dereplicate sequences with the commands \-\-derep_fulllength, \-\-derep_id, \-\-derep_smallmem, \-\-derep_prefix and \-\-fastx_uniques. The \-\-derep_fulllength command is deprecated and is replaced by the new \-\-fastx_uniques command that can also handle FASTQ files in addition to FASTA files. The \-\-derep_fulllength, \-\-derep_smallmem, and \-\-fastx_uniques commands require strictly identical sequences of the same length, but ignore upper/lower case and treat T and U as identical symbols. The \-\-derep_id command requires both identical sequences and identical headers/labels. The \-\-derep_prefix command will group sequences with a common prefix and does not require them to be equally long. The \-\-derep_smallmem command uses a much smaller amount of memory when dereplicating than the other commands, and may be a bit slower and cannot read the input from a pipe. It takes both FASTA and FASTQ files as input but only writes FASTA output to the file specified with the \-\-fastaout option. The \-\-fastx_uniques command can write FASTQ output (specified with \-\-fastqout) or FASTA output (specified with \-\-fastaout) as well as a special tab-separated column text format (with \-\-tabbedout). The other commands can write FASTA output to the file specified with the \-\-output option. All dereplication commands, except \-\-derep_smallmem, can write output to a special UCLUST-like file specified with the \-\-uc option.
The \-\-rereplicate command can duplicate sequences in the input file according to the abundance of each input sequence. Other valid options are \-\-fastq_ascii, \-\-fastq_asciiout, \-\-fastq_qmax, \-\-fastq_qmaxout, \-\-fastq_qmin, \-\-fastq_qminout, \-\-fastq_qout_max, \-\-lengthout, \-\-maxuniquesize, \-\-minuniquesize, \-\-relabel, \-\-relabel_keep, \-\-relabel_md5, \-\-relabel_self, \-\-relabel_sha1, \-\-sizein, \-\-sizeout, \-\-strand, \-\-topn, \-\-xlength, and \-\-xsize. .PP .TAG derep_fulllength .TP 9 .BI \-\-derep_fulllength \0filename Merge strictly identical sequences contained in \fIfilename\fR. Identical sequences are defined as having the same length and the same string of nucleotides (case insensitive, T and U are considered the same). See the options \-\-sizein and \-\-sizeout to take into account and compute abundance values. This command does not support multithreading. .TAG derep_id .TP .BI \-\-derep_id \0filename Merge strictly identical sequences contained in \fIfilename\fR, as with the \-\-derep_fulllength command, but the sequence labels (identifiers) on the header line need to be identical too. .TAG derep_smallmem .TP .BI \-\-derep_smallmem \0filename Merge strictly identical sequences contained in \fIfilename\fR, as with the \-\-derep_fulllength command, but using much less memory. The output is written to a FASTA file specified with the \-\-fastaout option. The output is written in the order that the sequences first appear in the input, and not in descending abundance order as with the other dereplication commands. It can read, but not write FASTQ files. This command cannot read from a pipe, it must be a proper file, as it is read twice. Dereplication is performed with a 128 bit hash function and it is not verified that grouped sequences are identical, however the probability that two different sequences are grouped in a dataset of one billion unique sequences is approximately 1e-21. Memory footprint is appr. 
24 bytes times the number of unique sequences. Multithreading and the options \-\-topn, \-\-uc, or \-\-tabbedout are not supported. .TAG derep_prefix .TP .BI \-\-derep_prefix \0filename Merge sequences with identical prefixes contained in \fIfilename\fR. A short sequence identical to an initial segment (prefix) of another sequence is considered a replicate of the longer sequence. If a sequence is identical to the prefix of two or more longer sequences, it is clustered with the shortest of them. If they are equally long, it is clustered with the most abundant. Remaining ties are solved using sequence headers and sequence input order. Sequence comparisons are case insensitive, and T and U are considered identical. This command does not support multithreading. .TAG fastaout .TP .BI \-\-fastaout \0filename Write the dereplicated sequences to \fIfilename\fR, in fasta format and sorted by decreasing abundance. Identical sequences receive the header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fasta header using the pattern ';size=\fIinteger\fR;'. This option is only valid for \-\-fastx_uniques and \-\-derep_smallmem. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the dereplicated sequences to \fIfilename\fR, in fastq format and sorted by decreasing abundance. Identical sequences receive the header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fastq header using the pattern ';size=\fIinteger\fR;'. This option is only valid for \-\-fastx_uniques. .TAG fastq_ascii .TP .BI \-\-fastq_ascii\~ "positive integer" Define the ASCII character number used as the basis for the FASTQ quality score. The default is 33, which is used by the Sanger / Illumina 1.8+ FASTQ format (phred+33).
The value 64 is used by the Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 and 64 are valid arguments. .TAG fastq_asciiout .TP .BI \-\-fastq_asciiout\~ "positive integer" When using \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, define the ASCII character number used as the basis for the FASTQ quality score when writing FASTQ output files. The default is 33. Only 33 and 64 are valid arguments. .TAG fastq_qmax .TP .BI \-\-fastq_qmax\~ "positive integer" Specify the maximum quality score accepted when reading FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. .TAG fastq_qmaxout .TP .BI \-\-fastq_qmaxout\~ "positive integer" Specify the maximum quality score used when writing FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use a maximum quality score of 40. .TAG fastq_qmin .TP .BI \-\-fastq_qmin\~ "positive integer" Specify the minimum quality score accepted for FASTQ files. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. .TAG fastq_qminout .TP .BI \-\-fastq_qminout\~ "positive integer" Specify the minimum quality score used when writing FASTQ files. The default is 0, which is usual for Sanger/Illumina 1.8+ files. Older versions of the format may use scores between -5 and 2. .TAG fastq_qout_max .TP .BI \-\-fastq_qout_max For \-\-fastx_uniques, indicate that the new quality scores computed when dereplicating FASTQ files should be equal to the maximum (best) of the input quality scores for each position (corresponding to the lowest error probability). The default is to output a quality score corresponding to the average of the error probabilities for each position. .TAG fastx_uniques .TP .BI \-\-fastx_uniques \0filename Merge strictly identical sequences contained in FASTA or FASTQ file \fIfilename\fR. 
Identical sequences are defined as having the same length and the same string of nucleotides (case insensitive, T and U are considered the same). See the options \-\-sizein and \-\-sizeout to take into account and compute abundance values. This command does not support multithreading. By default, the quality scores in FASTQ output files will correspond to the average error probability of the nucleotides in each position. If the \-\-fastq_qout_max option is given, the quality score will be the highest (best) quality score observed in each position. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA and FASTQ format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG maxuniquesize .TP .BI \-\-maxuniquesize\~ "positive integer" Discard sequences with a post-dereplication abundance value greater than \fIinteger\fR. .TAG minuniquesize .TP .BI \-\-minuniquesize\~ "positive integer" Discard sequences with a post-dereplication abundance value smaller than \fIinteger\fR. .TAG output .TP .BI \-\-output \0filename Write the dereplicated sequences to \fIfilename\fR, in fasta format and sorted by decreasing abundance. Identical sequences receive the header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fasta header using the pattern ';size=\fIinteger\fR;'. This option is not allowed for \-\-fastx_uniques or \-\-derep_smallmem. .TP .TAG relabel .BI \-\-relabel \0string Please see the description of the same option under Chimera detection for details. .TP .TAG relabel_keep .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TP .TAG relabel_md5 .B \-\-relabel_md5 Please see the description of the same option under Chimera detection for details. .TP .TAG relabel_self .B \-\-relabel_self Please see the description of the same option under Chimera detection for details.
.TP .TAG relabel_sha1 .B \-\-relabel_sha1 Please see the description of the same option under Chimera detection for details. .TP .TAG rereplicate .BI \-\-rereplicate \0filename Duplicate each sequence the number of times indicated by the abundance of each sequence in the specified file (option \-\-sizein is always implied). The sequence labels are identical for the same sequence, unless \-\-relabel, \-\-relabel_self, \-\-relabel_sha1 or \-\-relabel_md5 is used to create unique labels. Output is written to the file specified with the \-\-output option, in FASTA format. The output file does not contain abundance information unless \-\-sizeout is specified, in which case an abundance of 1 is used. .TAG sizein .TP .B \-\-sizein Take into account the abundance annotations present in the input fasta file (search for the pattern '[>;]size=\fIinteger\fR[;]' in sequence headers). That option is active by default when rereplicating. .TAG sizeout .TP .B \-\-sizeout Add abundance annotations to the output fasta file (add the pattern ';size=\fIinteger\fR;' to sequence headers). If \-\-sizein is specified, each unique sequence receives a new abundance value corresponding to its total abundance (sum of the abundances of its occurrences). If \-\-sizein is not specified, input abundances are set to 1, and each unique sequence receives a new abundance value corresponding to its number of occurrences in the input file. .TAG strand .TP .BI \-\-strand\~ "plus|both" When searching for strictly identical sequences, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. .TAG tabbedout .TP .BI \-\-tabbedout \0filename Output clustering info to the specified tab-separated text file with 6 columns and a row for each input sequence. Column 1 contains the original label/header of the sequence. Column 2 contains the label of the output sequence which is equal to the label/header of the first sequence in each cluster, but potentially relabelled. 
Column 3 contains the cluster number, starting from 0. Column 4 contains the sequence number within each cluster, starting at 0. Column 5 contains the number of sequences in the cluster. Column 6 contains the original label/header of the first sequence in the cluster before any potential relabelling. This option is only valid for the \-\-fastx_uniques command. .TAG topn .TP .BI \-\-topn\~ "positive integer" Output only the top \fIinteger\fR sequences (i.e. the most abundant). .TAG uc .TP .BI \-\-uc \0filename Output full-length or prefix-dereplication results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns and 3 different types of entries (S, H or C). Each fasta sequence in the input file can be either a cluster centroid (S) or a hit (H) assigned to a cluster. Cluster records (C) summarize information (size, centroid label) for each cluster. In the context of dereplication, the option \-\-uc_allhits has no effect on the \-\-uc output. Column content varies with the type of entry (S, H or C): .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type: S, H, or C. .IP \n+[step]. Cluster number (zero-based). .IP \n+[step]. Sequence length (S, H), or cluster size (C). .IP \n+[step]. Percentage of similarity with the centroid sequence (H), or set to '*' (S, C). .IP \n+[step]. Match orientation + or - (H), or set to '*' (S, C). .IP \n+[step]. Not used, always set to '*' (S, C) or 0 (H). .IP \n+[step]. Not used, always set to '*' (S, C) or 0 (H). .IP \n+[step]. Not used, always set to '*'. .IP \n+[step]. Label of the query sequence (H), or of the centroid sequence (S, C). .IP \n+[step]. Label of the centroid sequence (H), or set to '*' (S, C). .RE .RE .RE .PP .RS .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file.
.RE .PP .\" ---------------------------------------------------------------------------- .TAG extraction-options Extraction options: .RS .PP Sequences with headers matching certain criteria can be extracted from FASTA and FASTQ files using the \-\-fastx_getseq, \-\-fastx_getseqs and \-\-fastx_getsubseq commands. .PP The \-\-fastx_getseq command requires the header to match a label specified with the \-\-label option. If the \-\-label_substr_match option is given, the label may be a substring located anywhere in the header, otherwise the entire header must match the label. These matches are not case-sensitive. The headers in the input file are truncated at the first space or tab character unless the \-\-notrunclabels option is given. The matching sequences will be written to the files specified with the \-\-fastaout and \-\-fastqout options, in FASTA and FASTQ format, respectively. Sequences that do not match are written to the files specified with the \-\-notmatched and \-\-notmatchedfq options, respectively. .PP The \-\-fastx_getsubseq command is similar to the \-\-fastx_getseq command, but will extract a subsequence of the matching sequences. The start position is specified with the \-\-subseq_start option and the end position is specified with the \-\-subseq_end option. The positions are 1-based, meaning that the first symbol of the sequence is at position 1. If the start or end position option is not specified, the default is to start at the first position and end at the last position in the sequence. .PP The \-\-fastx_getseqs command is similar to the \-\-fastx_getseq command but allows more flexibility in specifying the label(s) to be matched. A single label may be specified using the \-\-label option as described above. Alternatively, a file containing a list of labels to be matched may be specified with the \-\-labels option. The file must be a plain text file with one label on each line. 
The \-\-label_word and \-\-label_words options may be used to specify either a single word or a file containing a list of words, respectively, to be matched. Words are defined as character sequences delimited either by a character that is not alpha-numeric (A-Z, a-z, or 0-9) or by the beginning or end of the header. Word matching is case-sensitive. The \-\-label_field option will limit the matching of words to a certain field in the header. .PP .TAG fastaout .TP 9 .BI \-\-fastaout \0filename Write the extracted sequences in FASTA format to the file with the given name. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the extracted sequences in FASTQ format to the file with the given name. This option is illegal if the input is in FASTA format. .TAG fastx_getseq .TP .BI \-\-fastx_getseq \0filename Extract sequences from the given FASTA or FASTQ file. Specify a label to match using the \-\-label option. Output files are specified with the \-\-fastaout, \-\-fastqout, \-\-notmatched and \-\-notmatchedfq options. .TAG fastx_getseqs .TP .BI \-\-fastx_getseqs \0filename Extract sequences from the given FASTA or FASTQ file. Specify the label or labels to match using one of the following options: \-\-label, \-\-labels, \-\-label_word, or \-\-label_words. Output files are specified with the \-\-fastaout, \-\-fastqout, \-\-notmatched and \-\-notmatchedfq options. .TAG fastx_getsubseq .TP .BI \-\-fastx_getsubseq \0filename Extract a certain part of some of the sequences in the given FASTA or FASTQ file. Specify labels to match using the \-\-label option. Specify the subsequence range to be extracted with the \-\-subseq_start and \-\-subseq_end options. Output files are specified with the \-\-fastaout, \-\-fastqout, \-\-notmatched and \-\-notmatchedfq options. .TAG label .TP .BI \-\-label \0string Specify the label to match in the sequence header. Unless the \-\-label_substr_match option is given, the label must match the entire header. The comparison is not case-sensitive. 
.TAG label_field .TP .BI \-\-label_field \0string Specify a field name to be used when matching using the \-\-label_word or \-\-label_words option. The field name is a string like "abc" that must precede the word to be matched with an equals sign (=) in between. The field must be delimited by semicolons or the beginning or end of the header. The following header will match the label 123 in the field abc: "seq1;abc=123". .TAG label_substr_match .TP .BI \-\-label_substr_match The labels specified with the \-\-label or the \-\-labels option may match anywhere in the header if this option is given. Otherwise a label needs to match the entire header. .TAG label_word .TP .BI \-\-label_word \0string Specify a word to match in the sequence header. Words are defined as strings delimited by either the start or end of the header or by any symbol that is not a letter (A-Z, a-z) or digit (0-9). The comparison is case-sensitive. .TAG label_words .TP .BI \-\-label_words \0filename Specify a file containing words to be matched against the sequence headers. The plain text file must contain one word on each line. Words are defined as strings delimited by either the start or end of the header or by any symbol that is not a letter (A-Z, a-z) or digit (0-9). The comparison is case-sensitive. .TAG labels .TP .BI \-\-labels \0filename Specify a file containing labels to be matched against the sequence headers. The plain text file must contain one label on each line. Unless the \-\-label_substr_match option is given, a label must match the entire header. The comparison is not case-sensitive. .TAG notmatched .TP .BI \-\-notmatched \0filename Write the sequences that were not extracted to the file with the given name, in FASTA format. .TAG notmatchedfq .TP .BI \-\-notmatchedfq \0filename Write the sequences that were not extracted to the file with the given name, in FASTQ format. This option is illegal if the input is in FASTA format. 
.TAG subseq_end .TP .BI \-\-subseq_end\~ "positive integer" Specify the end position in the sequences when extracting subsequences using the \-\-fastx_getsubseq command. Positions are 1-based, so the sequences start at position 1. The default is to end at the end of the sequence if this option is not specified. .TAG subseq_start .TP .BI \-\-subseq_start\~ "positive integer" Specify the starting position in the sequences when extracting subsequences using the \-\-fastx_getsubseq command. Positions are 1-based, so the sequences start at position 1. The default is to start at the beginning of the sequence (position 1), if this option is not specified. .RE .PP .\" ---------------------------------------------------------------------------- .TAG fasta-fastq-file-processing-options FASTA/FASTQ/SFF file processing options: .RS .PP Analyse, trim, filter, convert, merge, join or reverse complement sequences in FASTA, FASTQ or SFF files. The \-\-fastq_chars command can be used to analyse FASTQ files to identify the quality encoding and the range of quality score values used. To convert between different FASTQ file variants, use the \-\-fastq_convert command. Statistical analysis of the quality and length of the sequences in a FASTQ file may be performed with the \-\-fastq_stats, \-\-fastq_eestats, and \-\-fastq_eestats2 commands. Sequences may be trimmed, filtered and converted by the \-\-fastq_filter or \-\-fastx_filter commands. The \-\-sff_convert command can be used to convert SFF files to FASTQ, while the \-\-fasta2fastq command will convert a FASTA file to a FASTQ file with fake quality scores. Paired-end reads can be merged using the \-\-fastq_mergepairs command or joined with the \-\-fastq_join command. The \-\-fastx_revcomp command will reverse-complement sequences.
.PP .TAG eeout .TP 9 .B \-\-eeout When using \-\-fastq_filter, \-\-fastx_filter or \-\-fastq_mergepairs, include the number of expected errors (ee) in the sequence header of FASTQ and FASTA output files. This option is a synonym of the \-\-fastq_eeout option. Use the \-\-xee option to remove this information from headers. .TAG eetabbedout .TP .BI \-\-eetabbedout \0filename When specified with the \-\-fastq_mergepairs command, write statistics with expected errors of each merged read to the given file. The file is a tab-separated file with four columns: The number of expected errors in the forward read, the number of expected errors in the reverse read, the number of observed errors in the forward read, and the number of observed errors in the reverse read. The observed numbers of errors are the numbers of differences in the overlap region of the merged sequence relative to each of the reads in the pair. .TAG fasta2fastq .TP .BI \-\-fasta2fastq \0filename Add a fake nucleotide quality score to the sequences in the given FASTA file and write them to the FASTQ file specified with the \-\-fastqout option. The quality score may be adjusted using the \-\-fastq_qmaxout option (default 41). The \-\-fastq_asciiout option may be used to adjust the FASTQ output quality ASCII base character (default 33). .TAG fastaout .TP .BI \-\-fastaout \0filename When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, write to the given FASTA-formatted file the sequences passing the filter, or the merged sequences. .TAG fastaout_rev .TP .BI \-\-fastaout_rev \0filename When using \-\-fastq_filter or \-\-fastx_filter, write to the given FASTA-formatted file the reverse reads passing the filter. .TAG fastaout_notmerged_fwd .TP .BI \-\-fastaout_notmerged_fwd \0filename When using \-\-fastq_mergepairs, write forward reads not merged to the specified FASTA file.
.TAG fastaout_notmerged_rev .TP .BI \-\-fastaout_notmerged_rev \0filename When using \-\-fastq_mergepairs, write reverse reads not merged to the specified FASTA file. .TAG fastaout_discarded .TP .BI \-\-fastaout_discarded \0filename Write sequences that do not pass the filter of the \-\-fastq_filter or \-\-fastx_filter command to the given FASTA-formatted file. .TAG fastaout_discarded_rev .TP .BI \-\-fastaout_discarded_rev \0filename Write reverse reads that do not pass the filter of the \-\-fastq_filter or \-\-fastx_filter command to the given FASTA-formatted file. .TAG fastq_allowmergestagger .TP .B \-\-fastq_allowmergestagger When using \-\-fastq_mergepairs, allow merging of staggered read pairs. Staggered pairs are pairs where the 3' end of the reverse read has an overhang to the left of the 5' end of the forward read. This situation can occur when a very short fragment is sequenced. The 3' overhang of the reverse read is not included in the merged sequence. The opposite option is the \-\-fastq_nostagger option. The default is to discard staggered pairs. .TAG fastq_ascii .TP .BI \-\-fastq_ascii\~ "positive integer" Define the ASCII character number used as the basis for the FASTQ quality score. The default is 33, which is used by the Sanger / Illumina 1.8+ FASTQ format (phred+33). The value 64 is used by the Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 and 64 are valid arguments. .TAG fastq_asciiout .TP .BI \-\-fastq_asciiout\~ "positive integer" When using \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, define the ASCII character number used as the basis for the FASTQ quality score when writing FASTQ output files. The default is 33. Only 33 and 64 are valid arguments. .TAG fastq_chars .TP .BI \-\-fastq_chars \0filename Summarize the composition of sequence and quality strings contained in the input FASTQ file. 
For each sequence symbol, \-\-fastq_chars gives the number of occurrences of the symbol, its relative frequency and the length of the longest run of that symbol. For each character present in the quality strings, \-\-fastq_chars gives the ASCII value of the character, its relative frequency, and the number of times a \fIk\fR-mer of that character appears at the end of quality strings. The length of the \fIk\fR-mer can be set using \-\-fastq_tail (4 by default). The command \-\-fastq_chars tries to automatically detect the quality encoding (Solexa, Illumina 1.3+, Illumina 1.5+ or Illumina 1.8+/Sanger) by analyzing the range of observed quality score values. In case of success, \-\-fastq_chars suggests values for the \-\-fastq_ascii (33 or 64), \-\-fastq_qmin and \-\-fastq_qmax options to be used with the other commands that require a FASTQ input file. .TAG fastq_convert .TP .BI \-\-fastq_convert \0filename Convert between the different variants of the FASTQ file format. The quality encoding of the input file must be specified with the \-\-fastq_ascii option (either 33 or 64, the default is 33), and the output quality encoding must be specified with the \-\-fastq_asciiout option (default 33). The minimum and maximum output quality scores may be limited using the \-\-fastq_qminout and \-\-fastq_qmaxout options. The output file is specified with the \-\-fastqout option. .TAG fastq_eeout .TP .B \-\-fastq_eeout When using \-\-fastq_filter, \-\-fastx_filter or \-\-fastq_mergepairs, include the number of expected errors (ee) in the sequence header of FASTQ and FASTA files. This option is a synonym of the \-\-eeout option. Use the \-\-xee option to remove this information from headers. .TAG fastq_eestats .TP .BI \-\-fastq_eestats \0filename Analyze a FASTQ file and report statistics on the distributions of quality scores, error probabilities and expected accumulated errors. 
The report, a table of 21 tab-separated columns, is written to the file specified with the \-\-output option. The first column corresponds to the position in the reads (Pos). The second and third columns correspond to the number of reads (Reads) and percentage of reads (PctRecs) that include this position. The remaining columns include information about the distribution of quality scores in this position (Q), error probabilities in this position (Pe), and finally the expected number of accumulated errors from the beginning of the reads and until the current position (EE). For each of the Q, Pe and EE distributions, the following statistics are included: minimum value (Min), lower quartile (Low), median (Med), mean (Mean), upper quartile (Hi), and maximum value (Max). The quality encoding and the range of quality values may be specified with \-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax. .TAG fastq_eestats2 .TP .BI \-\-fastq_eestats2 \0filename Analyze the specified FASTQ file and report statistics on the number of sequences that would be retained at a combination of selected cutoffs for length truncation and maximum expected errors, that could potentially be used as arguments to the \-\-fastq_trunclen and \-\-fastq_maxee options to the \-\-fastq_filter command. The result, a table of two or more columns, is written to the file specified with the \-\-output option. There is a line for each length truncation cutoff. The first column on each line contains the selected truncation length, while the following columns contain the number of sequences and, in parentheses, the percentage of sequences that would be retained at the selected EE levels. The truncation length cutoffs may be specified with the \-\-length_cutoffs option, which requires a list of three comma-separated integers indicating the shortest cutoff, the longest cutoff, and the increment between cutoffs.
The longest cutoff may be specified with a star (*) which indicates that the limit is equal to the longest sequence in the input file. The default setting is "50,*,50" meaning that truncation lengths of 50, 100, 150 and so on up to the longest sequence length should be used. The maximum expected error (EE) cutoffs may be specified with the \-\-ee_cutoffs option which requires a comma-separated list of floating point numbers as its argument. The default setting is "0.5,1.0,2.0" which indicates that expected error levels of 0.5, 1.0 and 2.0 should be used. .TAG fastq_filter .TP .BI \-\-fastq_filter \0filename Trim and/or filter sequences in the given FASTQ file. Similar to the \-\-fastx_filter command, but works only on FASTQ files. See \-\-fastx_filter for details. .TAG fastq_join .TP .BI \-\-fastq_join\0 filename Join paired-end sequence reads into one sequence and add a gap between them using a padding sequence. The sequences are not merged as with the \-\-fastq_mergepairs command, but simply joined with a gap. The forward reads are specified as the argument to this option and the reverse reads are specified with the \-\-reverse option. The resulting sequences consist of the forward read, the padding sequence and the reverse complement of the reverse read. The padding sequence is specified with the \-\-join_padgap option and the padding quality is specified with the \-\-join_padgapq option. The default padding sequence string is NNNNNNNN and the default padding quality string is IIIIIIII, corresponding to a base quality score of 40 (a very high quality score with error probability 0.0001). The joined sequences are output to the file(s) specified with the \-\-fastaout or \-\-fastqout options. .TAG fastq_maxdiffs .TP .BI \-\-fastq_maxdiffs\~ "positive integer" When using \-\-fastq_mergepairs, specify the maximum number of non-matching nucleotides allowed in the overlap region. That option has a strong influence on the merging success rate. The default value is 10.
.TAG fastq_maxdiffpct .TP .BI \-\-fastq_maxdiffpct\~ real When using \-\-fastq_mergepairs, specify the maximum percentage of non-matching nucleotides allowed in the overlap region. The default value is 100.0%. There are other more sophisticated rules in the merging algorithm that will discard read pairs with a high fraction of mismatches. .TAG fastq_maxee .TP .BI \-\-fastq_maxee\~ real When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard sequences with an expected error greater than the specified number (value ranging from 0.0 to infinity). For a given sequence, the expected error is the sum of error probabilities for all the positions in the sequence. Since error probabilities can be small but not null, the expected error is always greater than zero, and at most equal to the length of the sequence when all positions in the sequence have an error probability of 1.0. Using the expected error as the \fIlambda\fR parameter in the Poisson distribution, it is possible to compute the probability of observing \fIk\fR errors. For instance, a read with an expected error of 1.0 has: .RS .IP - 2 36.8% chance of having zero error, .IP - 36.8% chance of having one error, .IP - 18.4% chance of having two errors, .IP - 6.1% chance of having three errors, .IP - 1.5% chance of having four errors, .IP - 0.3% chance of having five errors, .IP - etc. .RE .PP .TAG fastq_maxee_rate .TP .BI \-\-fastq_maxee_rate\~ real When using \-\-fastq_filter or \-\-fastx_filter, discard sequences with an average expected error greater than the specified number (value ranging from 0.0 to 1.0 included). For a given sequence, the average expected error is the sum of error probabilities for all the positions in the sequence, divided by the length of the sequence. .TAG fastq_maxlen .TP .BI \-\-fastq_maxlen\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard sequences with more than the specified number of bases. 
.TAG fastq_maxmergelen .TP .BI \-\-fastq_maxmergelen\~ "positive integer" When using \-\-fastq_mergepairs, specify the maximum length of the merged sequence (default is 1,000,000). .TAG fastq_maxns .TP .BI \-\-fastq_maxns\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard sequences with more than the specified number of N's. .TAG fastq_mergepairs .TP .BI \-\-fastq_mergepairs\0 filename Merge paired-end sequence reads into one sequence. The forward reads are specified as the argument to this option and the reverse reads are specified with the \-\-reverse option. Reads with the same index/position in the forward and reverse files are considered to form a pair, even if their labels are different. Thus, forward and reverse reads \fBmust\fR appear in the same order and total number in both files. A warning is emitted if the forward and reverse files contain different numbers of reads. The merged sequences are written to the file(s) specified with the \-\-fastaout or \-\-fastqout options. The non-merged reads can be output to the files specified with the \-\-fastaout_notmerged_fwd, \-\-fastaout_notmerged_rev, \-\-fastqout_notmerged_fwd and \-\-fastqout_notmerged_rev options. Statistics may be output to the file specified with the \-\-eetabbedout option. Sequences are truncated as specified with the \-\-fastq_truncqual option to remove low-quality bases in the 3' end. Sequences shorter than specified with \-\-fastq_minlen (after truncation) are discarded (1 by default). Sequences with too many ambiguous bases (N's), as specified with the \-\-fastq_maxns are also discarded (no limit by default). Staggered reads are not merged unless the \-\-fastq_allowmergestagger option is specified. The minimum length of the overlap region between the reads may be specified with the \-\-fastq_minovlen option (at least 5, default 10). 
The overlap region may not include more mismatches than specified with the \-\-fastq_maxdiffs option (10 by default) or a higher percentage of mismatches than specified with the \-\-fastq_maxdiffpct option (100.0% by default), otherwise the read pair is discarded. Additional rules will avoid merging of reads that cannot be aligned reliably and unambiguously. The minimum and maximum length of the merged sequence may be specified with the \-\-fastq_minmergelen and \-\-fastq_maxmergelen options, respectively. The quality value limits for output files may be specified with the \-\-fastq_qminout and \-\-fastq_qmaxout options, but they apply only to the merged region. Other relevant options are: \-\-fastq_ascii, \-\-fastq_maxee, \-\-fastq_nostagger, \-\-fastq_qmax, \-\-fastq_qmin, and \-\-label_suffix. .TAG fastq_minlen .TP .BI \-\-fastq_minlen\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, discard input sequences with less than the specified number of bases (default 1). .TAG fastq_minmergelen .TP .BI \-\-fastq_minmergelen\~ "positive integer" When using \-\-fastq_mergepairs, specify the minimum length of the merged sequence. The default is 1. .TAG fastq_minovlen .TP .BI \-\-fastq_minovlen\~ "positive integer" When using \-\-fastq_mergepairs, specify the minimum overlap between the merged reads. The default is 10. Must be at least 5. .TAG fastq_minqual .TP .BI \-\-fastq_minqual\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, discard reads having any base with a quality score below the given value. The default is 0, which discards none. .TAG fastq_nostagger .TP .B \-\-fastq_nostagger When using \-\-fastq_mergepairs, forbid the merging of staggered read pairs. This is the default behaviour of \-\-fastq_mergepairs. To change that behaviour, see the \-\-fastq_allowmergestagger option. 
.TAG fastq_qmax .TP .BI \-\-fastq_qmax\~ "positive integer" Specify the maximum quality score accepted when reading FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. .TAG fastq_qmaxout .TP .BI \-\-fastq_qmaxout\~ "positive integer" When using \-\-fastq_mergepairs, \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, specify the maximum quality score used when writing FASTQ files. For the \-\-fasta2fastq command, the value specified here is the fake quality score used for the FASTQ output file. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use a maximum quality score of 40. The limit only applies to the merged region when using \-\-fastq_mergepairs. .TAG fastq_qmin .TP .BI \-\-fastq_qmin\~ "positive integer" Specify the minimum quality score accepted for FASTQ files. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. .TAG fastq_qminout .TP .BI \-\-fastq_qminout\~ "positive integer" When using \-\-fastq_mergepairs, \-\-fastq_convert or \-\-sff_convert, specify the minimum quality score used when writing FASTQ files. The default is 0, which is usual for Sanger/Illumina 1.8+ files. Older versions of the format may use scores between -5 and 2. The limit applies only to the merged region when using \-\-fastq_mergepairs. .TAG fastq_stats .TP .BI \-\-fastq_stats \0filename Analyze a FASTQ file and report the number of reads it contains. The quality encoding and the range of quality values may be specified with \-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax. That command requires the \-\-log option and outputs the following detailed statistics on read length, quality score, length vs. quality distributions, and length / quality filtering: .RS .TP Read length distribution: .RS .nr step 1 1 .IP \n[step]. 4 L: read length. .IP \n+[step]. N: number of reads. .IP \n+[step]. Pct: fraction of reads with this length.
.IP \n+[step]. AccPct: fraction of reads with this length or longer. .RE .TP Quality score distribution: .RS .nr step 1 1 .IP \n[step]. 4 ASCII: character encoding the quality score. .IP \n+[step]. Q: Phred quality score. .IP \n+[step]. Pe: probability of error associated with the quality score. .IP \n+[step]. N: number of bases with this quality score. .IP \n+[step]. Pct: fraction of bases with this quality score. .IP \n+[step]. AccPct: fraction of bases with this quality score or higher. .RE .TP Length vs. quality distribution: .RS .nr step 1 1 .IP \n[step]. 4 L: position in reads (starting from position 2). .IP \n+[step]. PctRecs: fraction of reads with at least this length. .IP \n+[step]. AvgQ: average quality score at this position. .IP \n+[step]. P(AvgQ): error probability corresponding to AvgQ. .IP \n+[step]. AvgP: average error probability at this position. .IP \n+[step]. AvgEE: average expected error over all reads up to this position. .IP \n+[step]. Rate: growth rate of AvgEE between this position and position - 1. .IP \n+[step]. RatePct: Rate (as explained above) expressed as a percentage. .RE .TP Effect of expected error and length filtering: .RS The first column indicates read lengths (\fIL\fR). The next four columns indicate the number of reads that would be retained by the \-\-fastq_filter command if the reads were truncated at length \fIL\fR (option \-\-fastq_trunclen \fIL\fR) and filtered to have a maximum expected error of 1.0, 0.5, 0.25 or 0.1 (with the option \-\-fastq_maxee \fIfloat\fR). The last four columns indicate the fraction of reads that would be retained by the \-\-fastq_filter command using the same length and maximum expected error parameters. .RE .TP Effect of minimum quality and length filtering: .RS The first column indicates read lengths (\fILen\fR).
The next four columns indicate the fraction of reads that would be retained by the \-\-fastq_filter command if the reads were truncated at length \fILen\fR (option \-\-fastq_trunclen \fILen\fR) or at the first position with a quality \fIQ\fR equal to or less than 5, 10, 15 or 20 (option \-\-fastq_truncqual \fIQ\fR). .RE .RE .TAG fastq_stripleft .TP .BI \-\-fastq_stripleft\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, strip the specified number of bases from the left end of the reads. If the length of the resulting read is null, then the read is discarded. .TAG fastq_stripright .TP .BI \-\-fastq_stripright\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, strip the specified number of bases from the right end of the reads. If the length of the resulting read is null, then the read is discarded. .TAG fastq_tail .TP .BI \-\-fastq_tail\~ "positive integer" When using \-\-fastq_chars, count the number of times a series of characters of length \fIk\fR appears at the end of quality strings. By default, \fIk\fR = 4. .TAG fastq_truncee .TP .BI \-\-fastq_truncee\~ real When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so that their total expected error is not higher than the specified value. .TAG fastq_truncee_rate .TP .BI \-\-fastq_truncee_rate\~ real When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so that their average expected error per base is not higher than the specified value. The truncation will happen at the first occurrence. The average expected error per base is calculated as the total expected number of errors divided by the length of the sequence after truncation. .TAG fastq_trunclen .TP .BI \-\-fastq_trunclen\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences to the specified length. Shorter sequences are discarded.
.TAG fastq_trunclen_keep .TP .BI \-\-fastq_trunclen_keep\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences to the specified length. Shorter sequences are not discarded. .TAG fastq_truncqual .TP .BI \-\-fastq_truncqual\~ "positive integer" When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, truncate sequences starting from the first base with the specified base quality score value or lower. .TAG fastqout .TP .BI \-\-fastqout \0filename When using \-\-fastq_filter, \-\-fastq_mergepairs, \-\-fastx_filter or \-\-fasta2fastq, write to the given FASTQ-formatted file the sequences passing the filter, or the merged or converted sequences. .TAG fastqout_rev .TP .BI \-\-fastqout_rev \0filename When using \-\-fastq_filter or \-\-fastx_filter, write to the given FASTQ-formatted file the reverse reads passing the filter. .TAG fastqout_discarded .TP .BI \-\-fastqout_discarded \0filename When using \-\-fastq_filter or \-\-fastx_filter, write sequences that do not pass the filter to the given FASTQ-formatted file. .TAG fastqout_discarded_rev .TP .BI \-\-fastqout_discarded_rev \0filename When using \-\-fastq_filter or \-\-fastx_filter, write reverse reads that do not pass the filter to the given FASTQ-formatted file. .TAG fastqout_notmerged_fwd .TP .BI \-\-fastqout_notmerged_fwd \0filename When using \-\-fastq_mergepairs, write forward reads not merged to the specified FASTQ file. .TAG fastqout_notmerged_rev .TP .BI \-\-fastqout_notmerged_rev \0filename When using \-\-fastq_mergepairs, write reverse reads not merged to the specified FASTQ file. .TAG fastx_filter .TP .BI \-\-fastx_filter \0filename Trim and/or filter the sequences in the given FASTA or FASTQ file and output the remaining sequences to the FASTQ file specified with the \-\-fastqout option and/or to the FASTA file specified with the \-\-fastaout option. 
Discarded sequences are written to the files specified with the \-\-fastaout_discarded and \-\-fastqout_discarded options. The input format (FASTA or FASTQ) is automatically detected. If the input consists of paired sequences, an input file with reverse reads may be specified with the \-\-reverse option, and corresponding output will be written to the files specified with the \-\-fastqout_rev, \-\-fastaout_rev, \-\-fastqout_discarded_rev, and \-\-fastaout_discarded_rev options. Output can not be written to FASTQ files if the input is in FASTA format. The sequences are first trimmed and then filtered based on the remaining bases. Sequences may be trimmed using the options \-\-fastq_stripleft, \-\-fastq_stripright, \-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_trunclen, \-\-fastq_trunclen_keep and \-\-fastq_truncqual. The sequences may be filtered using the options \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxlen, \-\-fastq_maxns, \-\-fastq_minlen (default 1), \-\-fastq_minqual, \-\-fastq_trunclen, \-\-maxsize, and \-\-minsize. Sequences not satisfying the requirements are discarded. For pairs of sequences, both sequences in a pair must satisfy the requirements, otherwise both are discarded. If no shortening or filtering options are given, all sequences are written to the output files, possibly after conversion from FASTQ to FASTA format. The \-\-relabel option may be used to relabel the output sequences. The \-\-eeout option may be used to output the expected number of errors in each sequence. After all sequences have been processed, the number of kept and discarded sequences will be shown, as well as how many of the kept sequences were trimmed. 
When the input is in FASTA format, the following options are not accepted because quality scores are not available: \-\-eeout, \-\-fastq_ascii, \-\-fastq_eeout, \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_minqual, \-\-fastqout, \-\-fastq_qmax, \-\-fastq_qmin, \-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_truncqual, \-\-fastqout_discarded, \-\-fastqout_discarded_rev, \-\-fastqout_rev. .TAG fastx_revcomp .TP .BI \-\-fastx_revcomp \0filename Reverse-complement the sequences in the given FASTA or FASTQ file to a file specified with the \-\-fastaout and/or \-\-fastqout options. If the input file is in FASTA format, the output can not be written back to a FASTQ file due to missing base quality scores. .TAG join_padgap .TP .BI \-\-join_padgap\~ string When running \-\-fastq_join, use the \fIstring\fR as a sequence padding string. The default is NNNNNNNN (8 N's). Option accepts an empty string, or any other combination of ASCII characters. .TAG join_padgapq .TP .BI \-\-join_padgapq\~ string When running \-\-fastq_join, use the \fIstring\fR as a quality padding string. The default is a string of I's equal in length to the sequence padding string. The letter I corresponds to a base quality score of 40 indicating a very high quality base with error probability of 0.0001. Option accepts an empty string, or any other combination of ASCII characters. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA or FASTQ format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG maxsize .TP .BI \-\-maxsize\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, discard sequences with an abundance higher than the specified value. .TAG minsize .TP .BI \-\-minsize\~ "positive integer" When using \-\-fastq_filter or \-\-fastx_filter, discard sequences with an abundance lower than the specified value.
.TAG output .TP .BI \-\-output \0filename When using \-\-fastq_eestats or \-\-fastq_eestats2, write tabulated results to \fIfilename\fR. See \-\-fastq_eestats's and \-\-fastq_eestats2's documentation for a complete description of the table. .TAG relabel_keep .TP .B \-\-relabel_keep When using \-\-relabel, keep the old identifier in the header after a space. .TAG relabel .TP .BI \-\-relabel \0string Please see the description of the same option under Chimera detection for details. .TAG relabel_md5 .TP .BI \-\-relabel_md5 Please see the description of the same option under Chimera detection for details. .TAG relabel_self .TP .BI \-\-relabel_self Please see the description of the same option under Chimera detection for details. .TAG relabel_sha1 .TP .BI \-\-relabel_sha1 Please see the description of the same option under Chimera detection for details. .TAG reverse .TP .BI \-\-reverse \0filename When using \-\-fastq_filter, \-\-fastx_filter, \-\-fastq_mergepairs or \-\-fastq_join, specify the FASTQ file containing the reverse reads. .TAG sff_convert .TP .BI \-\-sff_convert \0filename Convert the given SFF file to FASTQ. The FASTQ output file is specified with the \-\-fastqout option. The sequence may be clipped as specified in the SFF file if the option \-\-sff_clip is specified, otherwise no clipping occurs. Bases that would have been clipped are converted to lower case, while the rest is in upper case. The output quality encoding may be specified with the \-\-fastq_asciiout option (default 33). The minimum and maximum output quality scores may be limited using the \-\-fastq_qminout and \-\-fastq_qmaxout options. .TAG sff_clip .TP .BI \-\-sff_clip Specifies that the sequences converted by the \-\-sff_convert command should be clipped in both ends as indicated in the SFF file. By default no clipping is performed. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences.
This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .TAG xee .TP .B \-\-xee Strip information about expected errors (ee) from the output file headers. This information is added by the \-\-fastq_eeout and \-\-eeout options. .RE .PP .\" ---------------------------------------------------------------------------- .TAG masking-options Masking options: .RS .PP An input sequence can be composed of lower- or uppercase letters. When soft masking is specified, lower case letters are treated as symbols that should be masked. Otherwise the case of the input sequences is ignored. .PP DUST masking is performed automatically by the commands for chimera detection (uchime_denovo, uchime_ref), clustering (cluster_fast, cluster_smallmem, cluster_size), masking (maskfasta, fastx_mask), pairwise alignment (allpairs_global) and searching (search_exact, usearch_global). .PP Masking is usually specified with the \-\-qmask option, while the \-\-dbmask option is used for the database sequences specified with the \-\-db option with the \-\-usearch_global, \-\-search_exact and \-\-uchime_ref commands. .PP The argument to the \-\-qmask and \-\-dbmask option may be none, soft or dust. If the argument is none, then no masking is performed. If the argument is soft the lower case symbols are masked. Finally, if the argument is dust, the sequence is masked using the DUST algorithm by Tatusov and Lipman to mask low-complexity regions. .PP If the \-\-hardmask option is specified, all masked regions are converted to N's, otherwise masked regions are indicated by lower case letters. .PP If any sequence is masked, the masked version of the sequence (with lower case letters or N's) is used in all output files. Otherwise the sequence is unmodified.
The exception is the sequences in the output file specified with the \-\-uchimealns option, where the input sequences are converted to upper case first and lower case letters indicate disagreement between the aligned sequences. .PP The \-\-qmask option (or \-\-dbmask for database sequences) may be combined with the \-\-hardmask option. The results of using the none, dust or soft argument to \-\-qmask or \-\-dbmask are presented below, assuming each input sequence contains both lower and uppercase symbols. .PP Results if the \-\-hardmask option is off (default): .RS .TP 9 .B none: no masking, all symbols used, no change .TP .B dust: masked symbols lowercased, rest uppercased .TP .B soft: lowercase symbols masked, no case changes .RE .PP Results if the \-\-hardmask option is on: .RS .TP 9 .B none: no masking, all symbols used, no change .TP .B dust: masked symbols changed to Ns, rest unchanged .TP .B soft: lowercase symbols masked and changed to Ns .RE .PP When a sequence region is masked, words in the region are not included in the indices used in the heuristic search algorithm. In all other aspects, the region is treated as other regions. .PP Regions in sequences that are hardmasked (with N's) have a zero alignment score and do not contribute to an alignment. .RE .PP .RS .TAG fastaout .TP 9 .BI \-\-fastaout \0filename Write the masked sequences to \fIfilename\fR, in fasta format. Applies only to the \-\-fastx_mask command. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the masked sequences to \fIfilename\fR, in fastq format. Applies only to the \-\-fastx_mask command. .TAG fastx_mask .TP .BI \-\-fastx_mask \0filename Mask regions in sequences contained in the specified fasta or fastq file. The default is to mask using DUST (use \-\-qmask to modify that behaviour). The output files are specified with the \-\-fastaout and \-\-fastqout options. 
The minimum and maximum percentage of unmasked residues may be specified with the \-\-min_unmasked_pct and \-\-max_unmasked_pct options, respectively. .TAG hardmask .TP .B \-\-hardmask Symbols in masked regions are replaced by N's. The default is to replace the masked regions by lower case letters. .TAG maskfasta .TP .BI \-\-maskfasta \0filename Mask regions in sequences contained in the fasta file \fIfilename\fR. The default is to mask using \fIdust\fR (use \-\-qmask to modify that behaviour). The output file is specified with the \-\-output option. This command is deprecated, please use \-\-fastx_mask instead. .TAG max_unmasked_pct .TP .BI \-\-max_unmasked_pct \0real Discard sequences with more than the specified maximum percentage of unmasked residues. Works only with \-\-fastx_mask. .TAG min_unmasked_pct .TP .BI \-\-min_unmasked_pct \0real Discard sequences with less than the specified minimum percentage of unmasked residues. Works only with \-\-fastx_mask. .TAG output .TP .BI \-\-output \0filename Write the masked sequences to \fIfilename\fR, in fasta format. Applies only to the \-\-maskfasta command. .TAG qmask .TP .BI \-\-qmask\~ "none|dust|soft" If the argument is dust, mask regions in sequences using the \fIDUST\fR algorithm that detects simple repeats and low-complexity regions. This is the default for chimera detection, clustering, masking, pairwise alignment, and searching. If the argument is soft, mask the lower case letters in the input sequence. If the argument is none, do not mask. .RE .PP .\" ---------------------------------------------------------------------------- .TAG orienting-options Orienting options: .RS .PP The \-\-orient command can be used to orient the sequences in a given file in either the forward or the reverse complementary direction based on a reference database specified with the \-\-db option. The two strands of each input sequence are compared to the reference database using nucleotide words.
If one of the strands shares many more words with at least one sequence in the database than the other, that strand is chosen. The correctly oriented sequences may be written to a FASTA file specified with the \-\-fastaout, and to a FASTQ file specified with the \-\-fastqout option (as long as the input was also in FASTQ format). If the result is uncertain, because the number of matching words is too similar, the original sequence is written to the file specified with the \-\-notmatched option. The results may also be written to a tab-delimited text file specified with the \-\-tabbedout option. This file will contain the query label, the direction (+, - or ?), the number of matching words on the forward strand, and the number of matching words on the reverse complementary strand. By default, a word length of 12 is used for this command. The word length may be adjusted using the \-\-wordlength option. There must be at least 4 times as many matches on one strand as on the other for a strand to be selected. In addition to the common options, the following options may also be specified for this command: \-\-dbmask, \-\-qmask, \-\-relabel, \-\-relabel_keep, \-\-relabel_md5, \-\-relabel_self, \-\-relabel_sha1, \-\-sizein, and \-\-sizeout. .PP .TAG db .TP 9 .BI \-\-db \0filename Read the reference database from the given file. It may be in FASTA, FASTQ or UDB format. If a UDB file is used it should have been created with a wordlength of 12. .TAG fastaout .TP .BI \-\-fastaout \0filename Write the correctly oriented sequences to \fIfilename\fR, in fasta format. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the correctly oriented sequences to \fIfilename\fR, in fastq format. .TAG notmatched .TP .BI \-\-notmatched \0filename Write the sequences with undetermined direction to \fIfilename\fR, in the original format. .TAG orient .TP .BI \-\-orient \0filename Orient the sequences in the given file.
.TAG tabbedout .TP .BI \-\-tabbedout \0filename Write the results to a tab-delimited text file with the specified \fIfilename\fR. This file will contain the query label, the direction (+, - or ?), the number of matching words on the forward strand, and the number of matching words on the reverse complementary strand. .RE .PP .\" ---------------------------------------------------------------------------- .TAG pairwise-alignment-options Pairwise alignment options: .RS .PP The results of the n * (n-1) / 2 pairwise alignments are written to the result files specified with \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-matched, \-\-notmatched, \-\-qsegout, \-\-samout, \-\-tsegout, \-\-uc or \-\-userout (see Searching section below). Specify either the \-\-acceptall option to output all pairwise alignments, or specify an identity level with \-\-id to discard weak alignments. Most other accept/reject options (see Searching options below) may also be used. Sequences are aligned on their \fIplus\fR strand only. Masking is performed as usual and specified with \-\-qmask and \-\-hardmask. .TAG acceptall .TP 9 .B \-\-acceptall Write the results of all alignments to output files. This option overrides all other accept/reject options (including \-\-id). .TAG allpairs_global .TP .BI \-\-allpairs_global \0filename Perform optimal global pairwise alignments of the fasta sequences contained in \fIfilename\fR. Each sequence is compared to all sequences that come after it in the file, resulting in a total of n * (n-1) / 2 pairwise alignments, where n is the total number of sequences. This command is multi-threaded. .TAG id .TP .BI \-\-id \0real Reject the sequence match if the pairwise identity is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). .TAG threads .TP .BI \-\-threads\~ "positive integer" Number of computation threads to use (1 to 1024). The number of threads should be less than or equal to the number of available CPU cores.
The default is to use all available resources and to launch one thread per logical core. .TAG uc .TP .BI \-\-uc \0filename Output pairwise alignment results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns. Each sequence is compared to all other sequences, and all hits (\-\-acceptall) or only some hits (\-\-id \fIfloat\fR) are reported, with one pairwise comparison per line: .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type, always set to 'H'. .IP \n+[step]. Ordinal number of the target sequence (based on input order, starting from zero). .IP \n+[step]. Sequence length. .IP \n+[step]. Percentage of similarity with the target sequence. .IP \n+[step]. Match orientation, always set to '+'. .IP \n+[step]. Not used, always set to zero. .IP \n+[step]. Not used, always set to zero. .IP \n+[step]. Compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence (ignoring terminal gaps). .IP \n+[step]. Label of the query sequence. .IP \n+[step]. Label of the target sequence. .RE .RE .RE .PP .\" ---------------------------------------------------------------------------- .TAG restriction-site-cutting-options Restriction site cutting options: .RS .PP The input sequences in the file specified with the \-\-cut command are cut into fragments at all restriction sites matching the pattern given with the \-\-cut_pattern option. The fragments on the forward strand are written to the file specified with the \-\-fastaout file and the fragments on the reverse strand are written to the file specified with the \-\-fastaout_rev option. Input sequences that do not match are written to the file specified with the option \-\-fastaout_discarded, and their reverse complement are also written to the file specified with the \-\-fastaout_discarded_rev option. 
The relabel options (\-\-relabel, \-\-relabel_self, \-\-relabel_keep, \-\-relabel_md5, and \-\-relabel_sha1) may be used to relabel the output sequences. .TAG cut .TP 9 .BI \-\-cut \0filename Specify the input file with sequences in FASTA format. .TAG cut_pattern .TP .BI \-\-cut_pattern \0string Specify the restriction site cutting pattern and positions. The pattern is a string of lower- or uppercase letters specifying the nucleotides that must match, and may include ambiguous nucleotide symbols. The special characters "^" (circumflex) and "_" (underscore) are used to indicate the cutting position on the forward and reverse strand, respectively. For example, the pattern "G^AATT_C" is the pattern for the EcoRI restriction site. For such palindromic patterns (identical to their reverse complement) the command will output all possible fragments on both strands. For non-palindromic sites, it may be necessary to run the command also on the reverse complemented input sequences. Exactly one cutting site on each strand must be indicated. .TAG fastaout .TP .BI \-\-fastaout \0filename Specify the output file for the resulting fragments on the forward strand. .TAG fastaout_rev .TP .BI \-\-fastaout_rev \0filename Specify the output file for the resulting fragments on the reverse strand. .TAG fastaout_discarded .TP .BI \-\-fastaout_discarded \0filename Specify the output file for the non-matching sequences. .TAG fastaout_discarded_rev .TP .BI \-\-fastaout_discarded_rev \0filename Specify the output file for the non-matching sequences, reverse complemented. .RE .PP .\" ---------------------------------------------------------------------------- .TAG searching-options Searching options: .RS .TAG alnout .TP 9 .BI \-\-alnout \0filename Write pairwise global alignments to \fIfilename\fR using a human-readable format. Use \-\-rowlen to modify alignment length. Output order may vary when using multiple threads.
.TAG biomout .TP .BI \-\-biomout \0filename Write search results to an OTU table in the biom version 1.0 file format. The query file contains the samples, while the database file contains the OTUs. Sample and OTU identifiers are extracted from the header of these sequences. See the \-\-biomout option in the Clustering section for further details. .TAG blast6out .TP .BI \-\-blast6out \0filename Write search results to \fIfilename\fR using a blast-like tab-separated format of twelve fields (listed below), with one line per query-target matching (or lack of matching if \-\-output_no_hits is used). Warning, vsearch uses global pairwise alignments, not blast's seed-and-extend algorithm. Therefore, some common blast output values (alignment start and end, evalue, bit score) are reported differently. Output order may vary when using multiple threads. A similar output can be obtained with \-\-userout \fIfilename\fR and \-\-userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits. A complete list and description is available in the section 'Userfields' of this manual. .RS .RS .nr step 1 1 .IP \n[step]. 4 \fIquery\fR: query label. .IP \n+[step]. \fItarget\fR: target (database sequence) label. The field is set to '*' if there is no alignment. .IP \n+[step]. \fIid\fR: percentage of identity (real value ranging from 0.0 to 100.0). The percentage identity is defined as 100 * (matching columns) / (alignment length - terminal gaps). See fields id0 to id4 for other definitions. .IP \n+[step]. \fIalnlen\fR: length of the query-target alignment (number of columns). The field is set to 0 if there is no alignment. .IP \n+[step]. \fImism\fR: number of mismatches in the alignment (zero or positive integer value). .IP \n+[step]. \fIopens\fR: number of columns containing a gap opening (zero or positive integer value, excluding terminal gaps). .IP \n+[step]. \fIqlo\fR: first nucleotide of the query aligned with the target.
Always equal to 1 if there is an alignment, 0 otherwise (see \fIqilo\fR to ignore initial gaps). .IP \n+[step]. \fIqhi\fR: last nucleotide of the query aligned with the target. Always equal to the length of the pairwise alignment, 0 otherwise (see \fIqihi\fR to ignore terminal gaps). .IP \n+[step]. \fItlo\fR: first nucleotide of the target aligned with the query. Always equal to 1 if there is an alignment, 0 otherwise (see \fItilo\fR to ignore initial gaps). .IP \n+[step]. \fIthi\fR: last nucleotide of the target aligned with the query. Always equal to the length of the pairwise alignment, 0 otherwise (see \fItihi\fR to ignore terminal gaps). .IP \n+[step]. \fIevalue\fR: expectancy-value (not computed for nucleotide alignments). Always set to -1. .IP \n+[step]. \fIbits\fR: bit score (not computed for nucleotide alignments). Always set to 0. .RE .RE .TAG db .TP .BI \-\-db \0filename Compare query sequences (specified with \-\-usearch_global) to the target sequences contained in \fIfilename\fR in FASTA or FASTQ format, using global pairwise alignment. Alternatively, the name of a preformatted UDB database created using the makeudb_usearch command (see below) may be specified. .TAG dbmask .TP .BI \-\-dbmask\~ "none|dust|soft" Mask regions in the target database sequences using the dust method or the soft method, or do not mask (none). Warning, when using soft masking search commands become case sensitive. The default is to mask using dust. .TAG dbmatched .TP .BI \-\-dbmatched \0filename Write database target sequences matching at least one query sequence to \fIfilename\fR, in fasta format. If the option \-\-sizeout is used, the number of queries that matched each target sequence is indicated using the pattern ";size=\fIinteger\fR;". .TAG dbnotmatched .TP .BI \-\-dbnotmatched \0filename Write database target sequences not matching query sequences to \fIfilename\fR, in fasta format. 
.TAG fastapairs .TP .BI \-\-fastapairs \0filename Write pairwise alignments of query and target sequences to \fIfilename\fR, in fasta format. .TAG fulldp .TP .B \-\-fulldp Dummy option for compatibility with usearch. To maximize search sensitivity, \fBvsearch\fR uses an 8-way 16-bit SIMD vectorized full dynamic programming algorithm (Needleman-Wunsch), whether or not \-\-fulldp is specified. .TAG gapext .TP .BI \-\-gapext \0string Set penalties for a gap extension. See \-\-gapopen for a complete description of the penalty declaration system. The default is to initialize the six gap extending penalties using a penalty of 2 for extending internal gaps and a penalty of 1 for extending terminal gaps, in both query and target sequences (i.e. 2I/1E). .TAG gapopen .TP .BI \-\-gapopen \0string Set penalties for a gap opening. A gap opening can occur in six different contexts: in the query (Q) or in the target (T) sequence, at the left (L) or right (R) extremity of the sequence, or inside the sequence (I). Sequence symbols (Q and T) can be combined with location symbols (L, I, and R), and numerical values to declare penalties for all possible contexts: aQL/bQI/cQR/dTL/eTI/fTR, where abcdef are zero or positive integers, and '/' is used as a separator. .br To simplify declarations, the location symbols (L, I, and R) can be combined, the symbol (E) can be used to treat both extremities (L and R) equally, and the symbols Q and T can be omitted to treat query and target sequences equally. For instance, the default is to declare a penalty of 20 for opening internal gaps and a penalty of 2 for opening terminal gaps (left or right), in both query and target sequences (i.e. 20I/2E). If only a numerical value is given, without any sequence or location symbol, then the penalty applies to all gap openings. To forbid gap-opening, an infinite penalty value can be declared with the symbol '*'.
To use \fBvsearch\fR as a semi-global aligner, a null-penalty can be applied to the left (L) or right (R) gaps. .br \fBvsearch\fR always initializes the six gap opening penalties using the default parameters (20I/2E). The user is then free to declare only the values he/she wants to modify. The \fIstring\fR is scanned from left to right, accepted symbols are (0123456789/LIREQT*), and later values override previous values. .br Please note that \fBvsearch\fR, in contrast to usearch, only allows integer gap penalties. Because the lowest gap penalties are 0.5 by default in usearch, all default scores and gap penalties in \fBvsearch\fR have been doubled to maintain equivalent penalties and to produce identical alignments. .TAG hardmask .TP .B \-\-hardmask Mask sequence regions by replacing them with Ns instead of setting them to lower case as is the default. For more information, please see the Masking section. .TAG id .TP .BI \-\-id \0real Reject the sequence match if the pairwise identity is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). The search process sorts target sequences by decreasing number of \fIk\fR-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. That efficient pre-filtering also prevents pairwise alignments with very short, or with weakly matching targets, as there needs to be by default at least 12 shared \fIk\fR-mers to start the pairwise alignment, and at least one out of every 16 \fIk\fR-mers from the query needs to match the target (see options \-\-wordlength and \-\-minwordmatches to change that behaviour). Consequently, using values lower than \-\-id 0.5 is not likely to capture more weakly matching targets. The pairwise identity is by default defined as the number of (matching columns) / (alignment length - terminal gaps). That definition can be modified by \-\-iddef. .TAG iddef .TP .BI \-\-iddef\~ "0|1|2|3|4" Change the pairwise identity definition used in \-\-id. 
Values accepted are: .RS .RS .nr step 0 1 .IP \n[step]. 4 CD-HIT definition: (matching columns) / (shortest sequence length). .IP \n+[step]. edit distance: (matching columns) / (alignment length). .IP \n+[step]. edit distance excluding terminal gaps (default definition for \-\-id). .IP \n+[step]. Marine Biological Lab definition counting each gap opening (internal or terminal) as a single mismatch, whether or not the gap was extended: 1.0 - [(mismatches + gap openings)/(longest sequence length)] .IP \n+[step]. BLAST definition, equivalent to \-\-iddef 1 for global pairwise alignments. .RE .PP The option \-\-userfields accepts the fields id0 to id4, in addition to the field id, to report the pairwise identity values corresponding to the different definitions. .RE .TAG idprefix .TP .BI \-\-idprefix\~ "positive integer" Reject the sequence match if the first \fIinteger\fR nucleotides of the target do not match the query. .TAG idsuffix .TP .BI \-\-idsuffix\~ "positive integer" Reject the sequence match if the last \fIinteger\fR nucleotides of the target do not match the query. .TAG lca_cutoff .TP .BI \-\-lca_cutoff \0real Adjust the fraction of matching hits required for the last common ancestor (LCA) output with the \-\-lcaout option during searches. The default value is 1.0 which requires all hits to match at each taxonomic rank for that rank to be included. If a lower cutoff value is used, e.g. 0.95, a small fraction of non-matching hits are allowed while that rank will still be reported. The argument to this option must be larger than 0.5, but not larger than 1.0. .TAG lcaout .TP .BI \-\-lcaout \0filename Output last common ancestor (LCA) information about the hits of each query to a text file in a tab-separated format. The first column contains the query id, while the second column contains the taxonomic information. The headers of the sequences in the database must contain taxonomic information in the same format as used with the \-\-sintax command, e.g. 
"tax=k:Archaea,p:Euryarchaeota,c:Halobacteria". Only the initial parts of the taxonomy that are common to a large fraction of the hits of each query will be output. It is necessary to set the \-\-maxaccepts option to a value different from 1 for this information to be useful. The \-\-top_hits_only option may also be useful. The fraction of matching hits required may be adjusted by the \-\-lca_cutoff option (default 1.0). .TAG leftjust .TP .B \-\-leftjust Reject the sequence match if the pairwise alignment begins with gaps. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG match .TP .BI \-\-match\~ "integer" Score assigned to a match (i.e. identical nucleotides) in the pairwise alignment. The default value is 2. .TAG matched .TP .BI \-\-matched \0filename Write query sequences matching database target sequences to \fIfilename\fR, in fasta format. .TAG maxaccepts .TP .BI \-\-maxaccepts\~ "positive integer" Maximum number of matching target sequences to accept before stopping the search for a given query. The default value is 1. This option works in tandem with \-\-maxrejects. The search process sorts target sequences by decreasing number of \fIk\fR-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. After pairwise alignments, if the first target sequence passes the acceptance criteria, it is accepted as best hit and the search process stops for that query. If \-\-maxaccepts is set to a higher value, more matching targets are accepted. If \-\-maxaccepts and \-\-maxrejects are both set to 0, the complete database is searched. See \-\-maxhits option for a control on the number of hits reported per query when search is done on both strands.
.TAG maxdiffs .TP .BI \-\-maxdiffs\~ "positive integer" Reject the sequence match if the alignment contains more than \fIinteger\fR substitutions, insertions or deletions. .TAG maxgaps .TP .BI \-\-maxgaps\~ "positive integer" Reject the sequence match if the alignment contains more than \fIinteger\fR insertions or deletions. .TAG maxhits .TP .BI \-\-maxhits\~ "non-negative integer" Maximum number of hits to show once the search is terminated for a given query (hits are sorted by decreasing identity). When searching only on the plus strand (default situation, see \-\-strand), the number of matching targets (\-\-maxaccepts) and the number of hits (\-\-maxhits) are the same. However, when searching on both strands, there could be two hits per target (one per strand): \-\-maxhits then controls the overall number of reported hits per query. Unlimited by default or if the argument is zero. This option applies to \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-samout, \-\-uc, or \-\-userout output files. .TAG maxid .TP .BI \-\-maxid \0real Reject the sequence match if the percentage of identity between the two sequences is greater than \fIreal\fR. .TAG maxqsize .TP .BI \-\-maxqsize\~ "positive integer" Reject query sequences with an abundance greater than \fIinteger\fR. .TAG maxqt .TP .BI \-\-maxqt \0real Reject if the query/target sequence length ratio is greater than \fIreal\fR. .TAG maxrejects .TP .BI \-\-maxrejects\~ "positive integer" Maximum number of non-matching target sequences to consider before stopping the search for a given query. The default value is 32. This option works in tandem with \-\-maxaccepts. The search process sorts target sequences by decreasing number of \fIk\fR-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. After pairwise alignments, if none of the first 32 examined target sequences pass the acceptance criteria, the search process stops for that query (no hit).
If \-\-maxrejects is set to a higher value, more target sequences are considered. If \-\-maxaccepts and \-\-maxrejects are both set to 0, the complete database is searched. .TAG maxsizeratio .TP .BI \-\-maxsizeratio \0real Reject if the query/target abundance ratio is greater than \fIreal\fR. .TAG maxsl .TP .BI \-\-maxsl \0real Reject if the shorter/longer sequence length ratio is greater than \fIreal\fR. .TAG maxsubs .TP .BI \-\-maxsubs\~ "positive integer" Reject the sequence match if the pairwise alignment contains more than \fIinteger\fR substitutions. .TAG mid .TP .BI \-\-mid \0real Reject the sequence match if the percentage of identity is lower than \fIreal\fR (ignoring all gaps, internal and terminal). .TAG mincols .TP .BI \-\-mincols\~ "positive integer" Reject the sequence match if the alignment length is shorter than \fIinteger\fR. .TAG minqt .TP .BI \-\-minqt \0real Reject if the query/target sequence length ratio is lower than \fIreal\fR. .TAG minsizeratio .TP .BI \-\-minsizeratio \0real Reject if the query/target abundance ratio is lower than \fIreal\fR. .TAG minsl .TP .BI \-\-minsl \0real Reject if the shorter/longer sequence length ratio is lower than \fIreal\fR. .TAG mintsize .TP .BI \-\-mintsize\~ "positive integer" Reject target sequences with an abundance lower than \fIinteger\fR. .TAG minwordmatches .TP .BI \-\-minwordmatches\~ "non-negative integer" Minimum number of \fIk\fR-mers or word matches required for a sequence to be considered further. Default value is 12 for the default word length 8. For word lengths 3-15, the default minimum word matches are 18, 17, 16, 15, 14, 12, 11, 10, 9, 8, 7, 5 and 3, respectively. If the query sequence has fewer unique words than the number specified, all words in the query must match. If the argument is 0, no word matches are required. .TAG mismatch .TP .BI \-\-mismatch\~ "integer" Score assigned to a mismatch (i.e. different nucleotides) in the pairwise alignment. The default value is -4. 
.TAG mothur_shared_out .TP .BI \-\-mothur_shared_out \0filename Write search results to an OTU table in the mothur 'shared' tab-separated plain text file format. The query file contains the samples, while the database file contains the OTUs. Sample and OTU identifiers are extracted from the header of these sequences. See the \-\-otutabout option in the Clustering section for further details. .TAG notmatched .TP .BI \-\-notmatched \0filename Write query sequences not matching database target sequences to \fIfilename\fR, in fasta format. .TAG otutabout .TP .BI \-\-otutabout \0filename Write search results to an OTU table in the classic tab-separated plain text format. The query file contains the samples, while the database file contains the OTUs. Sample and OTU identifiers are extracted from the header of these sequences (\-\-sample option). See the \-\-mothur_shared_out option in the Clustering section for further details. .TAG output_no_hits .TP .B \-\-output_no_hits Write both matching and non-matching queries to \-\-alnout, \-\-blast6out, \-\-samout or \-\-userout output files. Non-matching queries are labelled 'No hits' in \-\-alnout files. .TAG pattern .TP .B \-\-pattern \fIstring\fR This option is ignored. It is provided for compatibility with usearch. .TAG qmask .TP .BI \-\-qmask\~ "none|dust|soft" Mask regions in the query sequences using the dust or the soft algorithms, or do not mask (none). Warning, when using soft masking search commands become case sensitive. The default is to mask using \fIdust\fR. .TAG qsegout .TP .BI \-\-qsegout \0filename Write the aligned part of each query sequence to \fIfilename\fR in FASTA format. .TAG query_cov .TP .BI \-\-query_cov \0real Reject if the fraction of the query aligned to the target sequence is lower than \fIreal\fR (value ranging from 0.0 to 1.0 included). The query coverage is computed as (matches + mismatches) / query sequence length. Internal or terminal gaps are not taken into account. 
.TAG rightjust .TP .B \-\-rightjust Reject the sequence match if the pairwise alignment ends with gaps. .TAG rowlen .TP .BI \-\-rowlen\~ "positive integer" Width of alignment lines in \-\-alnout output. The default value is 64. Set to 0 to eliminate wrapping. .TAG samheader .TP .B \-\-samheader Include header lines to the SAM file when \-\-samout is specified. The header includes lines starting with @HD, @SQ and @PG, but no @RG lines (see .URL https://github.com/samtools/hts-specs (link) ). By default no header line is written. .TAG samout .TP .BI \-\-samout \0filename Write alignment results to \fIfilename\fR using the SAM format (a tab-separated text file). When using the \-\-samheader option, the SAM file starts with header lines. Each non-header line is a SAM record, which represents either a query-target alignment or the absence of match for a query (output order may vary when using multiple threads). Each record contains 11 mandatory fields and optional fields (see .URL https://github.com/samtools/hts-specs (link) for a complete description of the format): .RS .RS .nr step 1 1 .IP \n[step]. 4 query sequence label. .IP \n+[step]. combination of bitwise flags. Possible values are: 0 (top hit), 4 (no hit), 16 (reverse-complemented hit), 256 (secondary hit, i.e. all hits except the top hit). .IP \n+[step]. target sequence label. .IP \n+[step]. first position of a target aligned with the query (always 1 for global pairwise alignments, 0 if there is no match). .IP \n+[step]. mapping quality (ignored, always set to '*'). .IP \n+[step]. CIGAR string (set to '*' if there is no match). .IP \n+[step]. name of the target sequence matching with the next read of the query (for mate reads only, ignored and always set to '*'). .IP \n+[step]. position of the primary alignment of the next read of the query (for mate reads only, ignored and always set to 0). .IP \n+[step]. target sequence length (for multi-segment targets, ignored and always set to 0). .IP \n+[step]. 
query sequence (complete, not only the segment aligned to the target as usearch does). .IP \n+[step]. quality string (ignored, always set to '*'). .RE .TP Optional fields for query-target matches (number and order of fields may vary): .RS .nr step 12 1 .IP \n[step]. 4 AS:i:? alignment score (i.e. percentage of identity). .IP \n+[step]. XN:i:? next best alignment score (always set to 0). .IP \n+[step]. XM:i:? number of mismatches. .IP \n+[step]. XO:i:? number of gap openings (excluding terminal gaps). .IP \n+[step]. XG:i:? number of gap extensions (excluding terminal gaps). .IP \n+[step]. NM:i:? edit distance to the target (sum of XM and XG). .IP \n+[step]. MD:Z:? string for mismatching positions. .IP \n+[step]. YT:Z:UU string representing the alignment type. .RE .RE .TAG search_exact .TP .BI \-\-search_exact \0filename Search for exact full-length matches to the query sequences contained in \fIfilename\fR in the fasta database of target sequences (\-\-db). Only 100% exact matches are reported and this command is much faster than \-\-usearch_global. The \-\-id, \-\-maxaccepts and \-\-maxrejects options are ignored, but the rest of the searching options may be specified. .TAG self .TP .B \-\-self Reject the sequence match if the query and target labels are identical. .TAG selfid .TP .B \-\-selfid Reject the sequence match if the query and target sequences are strictly identical. .TAG sizeout .TP .B \-\-sizeout Add abundance annotations to the output of the option \-\-dbmatched (using the pattern ';size=\fIinteger\fR;'), to report the number of queries that matched each target. .TAG strand .TP .BI \-\-strand\~ "plus|both" When searching for similar sequences, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. .TAG target_cov .TP .BI \-\-target_cov \0real Reject the sequence match if the fraction of the target sequence aligned to the query sequence is lower than \fIreal\fR. 
The target coverage is computed as (matches + mismatches) / target sequence length. Internal or terminal gaps are not taken into account. .TAG top_hits_only .TP .B \-\-top_hits_only Only the top hits with an equally high percentage of identity between the query and database sequence sets are written to the output specified with the options \-\-lcaout, \-\-alnout, \-\-samout, \-\-userout, \-\-blast6out, \-\-uc, \-\-fastapairs, \-\-matched or \-\-notmatched (but not \-\-dbmatched and \-\-dbnotmatched). For each query, the top hit is the one presenting the highest percentage of identity (see the \-\-iddef option to change the way identity is measured). For a given query, if several top hits present exactly the same percentage of identity, the number of matching targets reported is controlled by the \-\-maxaccepts value (1 by default), and the number of hits is controlled by the \-\-maxhits option. .TAG tsegout .TP .BI \-\-tsegout \0filename Write the aligned part of each target sequence to \fIfilename\fR in FASTA format. .TAG uc .TP .BI \-\-uc \0filename Output searching results in \fIfilename\fR using a tab-separated uclust-like format with 10 columns. When using the \-\-search_exact command, the table layout is the same as with the \-\-allpairs_global command. When using the \-\-usearch_global command, the table presents two different types of entries: hit (H) or no hit (N). Each query sequence is compared to all other sequences, and the best hit (\-\-maxaccepts 1) or several hits (\-\-maxaccepts > 1 and \-\-uc_allhits) are reported (H). Output order may vary when using multiple threads. Column content varies with the type of entry (H or N): .RS .RS .nr step 1 1 .IP \n[step]. 4 Record type: H, or N ('hit' or 'no hit'). .IP \n+[step]. Ordinal number of the target sequence (based on input order, starting from zero). Set to '*' for N. .IP \n+[step]. Sequence length. Set to '*' for N. .IP \n+[step]. Percentage of similarity with the target sequence. Set to '*' for N.
.IP \n+[step]. Match orientation + or -. Set to '.' for N. .IP \n+[step]. Not used, always set to zero for H, or '*' for N. .IP \n+[step]. Not used, always set to zero for H, or '*' for N. .IP \n+[step]. Compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence (ignoring terminal gaps). Set to '*' for N. .IP \n+[step]. Label of the query sequence. .IP \n+[step]. Label of the target centroid sequence. Set to '*' for N. .RE .RE .TAG uc_allhits .TP .B \-\-uc_allhits With the commands \-\-search_exact and \-\-usearch_global, when using the \-\-uc option, show all hits, not just the top hit for each query. .TAG usearch_global .TP .BI \-\-usearch_global \0filename Compare target sequences (\-\-db) to the query sequences contained in \fIfilename\fR in FASTA or FASTQ format, using global pairwise alignment. .TAG userfields .TP .BI \-\-userfields \0string When using \-\-userout, select and order the fields written to the output file. Fields are separated by '+' (e.g. query+target+id). See the 'Userfields' section for a complete list of fields. .TAG userout .TP .BI \-\-userout \0filename Write user-defined tab-separated output to \fIfilename\fR. Select the fields with the option \-\-userfields. Output order may vary when using multiple threads. If \-\-userfields is empty or not present, \fIfilename\fR is empty. .TAG weak_id .TP .BI \-\-weak_id \0real Show hits with percentage of identity of at least \fIreal\fR, without terminating the search. A normal search stops as soon as enough hits are found (as defined by \-\-maxaccepts, \-\-maxrejects, and \-\-id). As \-\-weak_id reports weak hits that are not deducted from \-\-maxaccepts (but count towards \-\-maxrejects), high \-\-id values can be used, hence preserving both speed and sensitivity.
Logically, \fIreal\fR must be smaller than the value indicated by \-\-id. .TAG wordlength .TP .BI \-\-wordlength\~ "positive integer" Length of words (i.e. \fIk\fR-mers) for database indexing. The range of possible values goes from 3 to 15, but values near 8 or 9 are generally recommended. Longer words may reduce the sensitivity/recall for weak similarities, but can increase precision. On the other hand, shorter words may increase sensitivity or recall, but may reduce precision. Computation time generally increases with shorter words and decreases with longer words, but it increases again for very long words. Memory requirements for a part of the index increase with a factor of 4 each time word length increases by one nucleotide, and this generally becomes significant for long words (12 or more). The default value is 8. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .RE .PP .\" ---------------------------------------------------------------------------- .TAG shuffling-options Shuffling options: .RS Fasta entries in the input file are outputted in a pseudo-random order. .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG output .TP 9 .BI \-\-output \0filename Write the shuffled sequences to \fIfilename\fR, in fasta format. .TAG randseed .TP .BI \-\-randseed\~ "integer" When shuffling sequence order, use \fIinteger\fR as seed. A given seed always produces the same output order (useful for replicability). Set to 0 to use a pseudo-random seed (default behaviour). .TAG relabel .TP .BI \-\-relabel \0string Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3, etc.) to construct the new headers. Use \-\-sizeout to conserve the abundance annotations. 
.TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequences using the MD5 message digest algorithm applied to each sequence. Former sequence headers are discarded. The sequence is converted to upper case and U is replaced by T before the digest is computed. The MD5 digest is a cryptographic hash function designed to minimize the probability that two different inputs give the same output, even for very similar, but non-identical inputs. Still, there is always a very small, but non-zero probability that two different inputs give the same result. The MD5 digest generates a 128-bit (16-byte) digest that is represented by 16 hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_self .TP .B \-\-relabel_self Relabel sequences using the sequence itself as the label. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequences using the SHA1 message digest algorithm applied to each sequence. It is similar to the \-\-relabel_md5 option but uses the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest generates a 160-bit (20-byte) result that is represented by 20 hexadecimal numbers (40 symbols). The probability of a collision (two non-identical sequences having the same digest) is smaller for the SHA1 algorithm than it is for the MD5 algorithm. Use \-\-sizeout to conserve the abundance annotations. .TAG sizeout .TP .B \-\-sizeout When using \-\-relabel, \-\-relabel_self, \-\-relabel_md5 or \-\-relabel_sha1, preserve and report abundance annotations to the output fasta file (using the pattern ';size=\fIinteger\fR;'). .TAG shuffle .TP .BI \-\-shuffle \0filename Pseudo-randomly shuffle the order of sequences contained in \fIfilename\fR. .TAG topn .TP .BI \-\-topn\~ "positive integer" Output only the first \fIinteger\fR sequences after pseudo-random reordering. 
.TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG sorting-options Sorting options: .RS Fasta entries are sorted by decreasing abundance (\-\-sortbysize) or sequence length (\-\-sortbylength). To obtain a stable sorting order, ties are sorted by decreasing abundance (if present) and label increasing alpha-numerical order (\-\-sortbylength), or just by label increasing alpha-numerical order (\-\-sortbysize). Label sorting assumes that all sequences have unique labels. The same applies to the automatic sorting performed during chimera checking (\-\-uchime_denovo), dereplication (\-\-derep_fulllength), and clustering (\-\-cluster_fast and \-\-cluster_size). .PP .TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG maxsize .TP 9 .BI \-\-maxsize\~ "positive integer" When using \-\-sortbysize, discard sequences with an abundance value greater than \fIinteger\fR. .TAG minsize .TP .BI \-\-minsize\~ "positive integer" When using \-\-sortbysize, discard sequences with an abundance value smaller than \fIinteger\fR. .TAG output .TP .BI \-\-output \0filename Write the sorted sequences to \fIfilename\fR, in fasta format. .TAG relabel .TP .BI \-\-relabel \0string Please see the description of the same option under Chimera detection for details. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .BI \-\-relabel_md5 Please see the description of the same option under Chimera detection for details. 
.TAG relabel_self .TP .BI \-\-relabel_self Please see the description of the same option under Chimera detection for details. .TAG relabel_sha1 .TP .BI \-\-relabel_sha1 Please see the description of the same option under Chimera detection for details. .TAG sizeout .TP .B \-\-sizeout When using \-\-relabel, report abundance annotations to the output fasta file (using the pattern ';size=\fIinteger\fR;'). .TAG sortbylength .TP .BI \-\-sortbylength \0filename Sort by decreasing length the sequences contained in \fIfilename\fR. See the general options \-\-minseqlength and \-\-maxseqlength to eliminate short and long sequences. .TAG sortbysize .TP .BI \-\-sortbysize \0filename Sort by decreasing abundance the sequences contained in \fIfilename\fR (missing abundance values are assumed to be ';size=1'). See the options \-\-minsize and \-\-maxsize to eliminate rare and dominant sequences. .TAG topn .TP .BI \-\-topn\~ "positive integer" Output only the top \fIinteger\fR sequences (i.e. the longest or the most abundant). .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG subsampling-options Subsampling options: .RS Subsampling randomly extracts a certain number or a certain percentage of the sequences in the input file. If the \-\-sizein option is in effect, the abundances of the input sequences are taken into account and the sampling is performed as if the input sequences were rereplicated, subsampled and dereplicated before being written to the output file. The extraction is performed as a random sampling with a uniform distribution among the input sequences and is performed without replacement. 
The input file is specified with the \-\-fastx_subsample option, the output files are specified with the \-\-fastaout and \-\-fastqout options and the amount of sequences to be sampled is specified with the \-\-sample_pct or \-\-sample_size options. The sequences not sampled may be written to files specified with the options \-\-fastaout_discarded and \-\-fastqout_discarded. The \-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax options are also available. .PP .TAG fastaout .TP 9 .BI \-\-fastaout \0filename Write the sampled sequences to \fIfilename\fR, in fasta format. .TAG fastaout_discarded .TP .BI \-\-fastaout_discarded \0filename Write the sequences not sampled to \fIfilename\fR, in fasta format. .TAG fastq_ascii .TP .BI \-\-fastq_ascii\~ "positive integer" Define the ASCII character number used as the basis for the FASTQ quality score. The default is 33, which is used by the Sanger / Illumina 1.8+ FASTQ format (phred+33). The value 64 is used by the Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 and 64 are valid arguments. .TAG fastq_qmax .TP .BI \-\-fastq_qmax\~ "positive integer" Specify the maximum quality score accepted when reading FASTQ files. The default is 41, which is usual for recent Sanger/Illumina 1.8+ files. .TAG fastq_qmin .TP .BI \-\-fastq_qmin\~ "positive integer" Specify the minimum quality score accepted for FASTQ files. The default is 0, which is usual for recent Sanger/Illumina 1.8+ files. Older formats may use scores between -5 and 2. .TAG fastqout .TP .BI \-\-fastqout \0filename Write the sampled sequences to \fIfilename\fR, in fastq format. Requires input in fastq format. .TAG fastqout_discarded .TP .BI \-\-fastqout_discarded \0filename Write the sequences not sampled to \fIfilename\fR, in fastq format. Requires input in fastq format. .TAG fastx_subsample .TP .BI \-\-fastx_subsample \0filename Perform subsampling from the sequences in the specified input file that is in FASTA or FASTQ format. 
.TAG lengthout .TP .B \-\-lengthout Write sequence length information to the output files in FASTA format by adding a ";length=\fIinteger\fR" attribute in the header. .TAG randseed .TP .BI \-\-randseed\~ "positive integer" When subsampling, use \fIinteger\fR as a seed for the pseudo-random generator. A given seed always produces the same output, which is useful for replicability. Set to 0 to use a pseudo-random seed (default behaviour). .TAG relabel .TP .BI \-\-relabel \0string Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3, etc.) to construct the new headers. Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_keep .TP .B \-\-relabel_keep When relabelling, keep the old identifier in the header after a space. .TAG relabel_md5 .TP .B \-\-relabel_md5 Relabel sequences using the MD5 message digest algorithm applied to each sequence. Former sequence headers are discarded. The sequence is converted to upper case and U is replaced by T before the digest is computed. The MD5 digest is a cryptographic hash function designed to minimize the probability that two different inputs give the same output, even for very similar, but non-identical inputs. Still, there is always a very small, but non-zero probability that two different inputs give the same result. The MD5 digest generates a 128-bit (16-byte) digest that is represented by 16 hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the abundance annotations. .TAG relabel_self .TP .B \-\-relabel_self Relabel sequences using the sequence itself as the label. .TAG relabel_sha1 .TP .B \-\-relabel_sha1 Relabel sequences using the SHA1 message digest algorithm applied to each sequence. It is similar to the \-\-relabel_md5 option but uses the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest generates a 160-bit (20-byte) result that is represented by 20 hexadecimal numbers (40 symbols). 
The probability of a collision (two non-identical sequences having the same digest) is smaller for the SHA1 algorithm than it is for the MD5 algorithm. Use \-\-sizeout to conserve the abundance annotations. .TAG sample_pct .TP .BI \-\-sample_pct\~ "real" Subsample the given percentage of the input sequences. Accepted values range from 0.0 to 100.0. .TAG sample_size .TP .BI \-\-sample_size\~ "positive integer" Extract the given number of sequences. .TAG sizein .TP .B \-\-sizein Take the abundance information of the input file into account, otherwise the abundance of each sequence is considered to be 1. .TAG sizeout .TP .B \-\-sizeout Write abundance information to the output file. .TAG xlength .TP .B \-\-xlength Strip header attribute ";length=\fIinteger\fR" from input sequences. This attribute is added to output sequences by the \-\-lengthout option. .TAG xsize .TP .B \-\-xsize Strip abundance information from the headers when writing the output file. .RE .PP .\" ---------------------------------------------------------------------------- .TAG taxonomic-classification-options Taxonomic classification options: .RS The vsearch command \-\-sintax will classify the input sequences according to the Sintax algorithm as described by Robert Edgar (2016) in SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences, BioRxiv, 074161. Preprint. doi: 10.1101/074161 .URL https://doi.org/10.1101/074161 (link) .PP The name of the fasta file containing the input sequences to be classified is given as an argument to the \-\-sintax command. The reference sequence database is specified with the \-\-db option. The results are written in a tab delimited text file whose name is specified with the \-\-tabbedout option. The \-\-sintax_cutoff option may be used to set a minimum level of bootstrap support for the taxonomic ranks to be reported. The \-\-randseed option may be included to specify a seed for initialisation of the random number generator used by the algorithm. 
Please note that when using multiple threads, the \-\-randseed option may not work as intended, because sequences may be processed in a random order by different threads. To ensure the same results each time, use a single thread (\-\-threads 1) in combination with a fixed random seed specified with \-\-randseed. .PP Multithreading is supported. Databases in UDB files are supported. The strand option may be specified. .PP The reference database must contain taxonomic information in the header of each sequence in the form of a string starting with ";tax=" and followed by a comma-separated list of up to nine taxonomic identifiers. Each taxonomic identifier must start with an indication of the rank by one of the letters d (for domain), k (kingdom), p (phylum), c (class), o (order), f (family), g (genus), s (species), or t (strain). The letter is followed by a colon (:) and the name of that rank. Commas and semicolons are not allowed in the name of the rank. Non-ascii characters should be avoided in the names. .PP Example: >X80725_S000004313;\:tax=d:Bacteria,\:p:Proteobacteria,\:c:Gammaproteobacteria,\:o:Enterobacteriales,\:f:Enterobacteriaceae,\:g:Escherichia/Shigella,\:s:Escherichia_coli,\:t:str._K-12_substr._MG1655 .PP The option \-\-notrunclabels is turned on by default for this command, allowing spaces in the taxonomic identifiers. .PP If two sequences in the reference database have equally many kmer matches with the query, the shortest sequence will be chosen by default. If they are equally long, the sequence appearing first in the database will be chosen. If the recommended option \-\-sintax_random is specified, sequences with an equal number of kmer matches will instead be chosen by a random draw. .PP .TAG db .TP 9 .BI \-\-db \0filename Read the reference sequences from \fIfilename\fR, in FASTA, FASTQ or UDB format. These sequences need to be annotated with taxonomy. 
.TAG randseed .TP .BI \-\-randseed\~ "positive integer" Use \fIinteger\fR as seed for the random number generator used in the Sintax algorithm. A given seed always produces the same output order (useful for replicability). Set to 0 to use a pseudo-random seed (default behaviour). Does not work correctly with multiple threads; please use \-\-threads 1 to ensure correct behaviour. .TAG sintax .TP .BI \-\-sintax \0filename Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format. .TAG sintax_cutoff .TP .BI \-\-sintax_cutoff\~ "real" Specify a minimum level of bootstrap support for the taxonomic ranks that will be included in column 4 of the output file. For instance 0.9, corresponding to 90%. .TAG sintax_random .TP .B \-\-sintax_random Break ties between sequences with equally many kmer matches by a random draw. This option is recommended and may be made the default in the future. .TAG tabbedout .TP .BI \-\-tabbedout \0filename Write the results to \fIfilename\fR, in a tab-separated text format. Column 1 contains the query label. Column 2 contains the predicted taxonomy in the same format as for the reference data, with bootstrap support indicated in parentheses after each rank. Column 3 contains the strand. If the \-\-sintax_cutoff option is used, the predicted taxonomy will be repeated in column 4 while omitting the bootstrap values and including only the ranks with support at or above the threshold. .RE .PP .\" ---------------------------------------------------------------------------- .TAG udb-options UDB options: .RS Databases to be used with the \-\-usearch_global or \-\-sintax commands may be prepared from FASTA files and stored to a binary UDB formatted file in order to speed up searching. This may be worthwhile when searching a large database repeatedly. The sequences are indexed and stored in a way that can be quickly loaded into memory. The commands and options below can be used to create and inspect UDB files. 
.PP .TAG dbmask .TP 9 .BI \-\-dbmask\~ "none|dust|soft" Specify the sequence masking method used with the \-\-makeudb_usearch command, either none, dust or soft. No masking is performed when none is specified. When dust is specified, the DUST algorithm will be used for masking low complexity regions (short repeats and skewed composition). Lower case letters in the input file will be masked when soft is specified (soft masking). .TAG hardmask .TP .B \-\-hardmask Mask sequences by replacing letters with N for the \-\-makeudb_usearch command. The default is to use lower case letters (soft masking). .TAG makeudb_usearch .TP .BI \-\-makeudb_usearch \0filename Create a UDB database file from the FASTA-formatted sequences in the file with the given \fIfilename\fR. The UDB database is written to the file specified with the \-\-output option. .TAG output .TP .BI \-\-output \0filename Specify the \fIfilename\fR of a UDB or FASTA output file for the \-\-makeudb_usearch or the \-\-udb2fasta command, respectively. .TAG udb2fasta .TP .BI \-\-udb2fasta \0filename Read the UDB database in the file with the given \fIfilename\fR and output the sequences in FASTA format in the file specified by the \-\-output option. .TAG udbinfo .TP .BI \-\-udbinfo \0filename Show information about the UDB database in the file with the given \fIfilename\fR. .TAG udbstats .TP .BI \-\-udbstats \0filename Report statistics about the indexed words in the UDB database in the file with the given \fIfilename\fR. .TAG wordlength .TP .BI \-\-wordlength\~ "positive integer" Specify the length of the words to be used when creating the UDB database index using the \-\-makeudb_usearch command. Valid numbers range from 3 to 15. The default is 8. .RE .PP .\" ---------------------------------------------------------------------------- .TAG userfields Userfields (fields accepted by the \-\-userfields option): .RS .TP 9 .B aln Print a string of M (match/mismatch, i.e. not a gap), D (delete, i.e. 
a gap in the query) and I (insert, i.e. a gap in the target) representing the pairwise alignment. Empty field if there is no alignment. .TP .B alnlen Print the length of the query-target alignment (number of columns). The field is set to 0 if there is no alignment. .TP .B bits Bit score (not computed for nucleotide alignments). Always set to 0. .TP .B caln Compact representation of the pairwise alignment using the CIGAR format (Compact Idiosyncratic Gapped Alignment Report): M (match/mismatch), D (deletion) and I (insertion). The equal sign '=' indicates that the query is identical to the centroid sequence (ignoring terminal gaps). Empty field if there is no alignment. .TP .B evalue E-value (not computed for nucleotide alignments). Always set to -1. .TP .B exts Number of columns containing a gap extension (zero or positive integer value). .TP .B gaps Number of columns containing a gap (zero or positive integer value, excluding terminal gaps). .TP .B id The percentage of identity, according to the identity definition specified by the \-\-iddef option. Equal to id0, id1, id2, id3 or id4 below. By default the same as id2. .TP .B id0 CD-HIT definition of the percentage of identity (real value ranging from 0.0 to 100.0) using the length of the shortest sequence in the pairwise alignment as denominator: 100 * (matching columns) / (shortest sequence length). .TP .B id1 The percentage of identity (real value ranging from 0.0 to 100.0) is defined as the edit distance: 100 * (matching columns) / (alignment length). .TP .B id2 The percentage of identity (real value ranging from 0.0 to 100.0) is defined as the edit distance, excluding terminal gaps. 
.TP .B id3 Marine Biological Lab definition of the percentage of identity (real value ranging from 0.0 to 100.0), counting each gap opening (internal or terminal) as a single mismatch, whether or not the gap was extended, and using the length of the longest sequence in the pairwise alignment as denominator: 100 * (1.0 - [(mismatches + gaps) / (longest sequence length)]). .TP .B id4 BLAST definition of the percentage of identity (real value ranging from 0.0 to 100.0), equivalent to \-\-iddef 1 in a context of global pairwise alignment. The field id4 is always equal to the field id1. .TP .B ids Number of matches in the alignment (zero or positive integer value). .TP .B mism Number of mismatches in the alignment (zero or positive integer value). .TP .B opens Number of columns containing a gap opening (zero or positive integer value, excluding terminal gaps). .TP .B pairs Number of columns containing only nucleotides. That value corresponds to the length of the alignment minus the gap-containing columns (zero or positive integer value). .TP .B pctgaps Number of columns containing gaps expressed as a percentage of the alignment length (real value ranging from 0.0 to 100.0). .TP .B pctpv Percentage of positive columns. When working with nucleotide sequences, this is equivalent to the percentage of matches (real value ranging from 0.0 to 100.0). .TP .B pv Number of positive columns. When working with nucleotide sequences, this is equivalent to the number of matches (zero or positive integer value). .TP .B qcov Fraction of the query sequence that is aligned with the target sequence (real value ranging from 0.0 to 100.0). The query coverage is computed as 100.0 * (matches + mismatches) / query sequence length. Internal or terminal gaps are not taken into account. The field is set to 0.0 if there is no alignment. .TP .B qframe Query frame (-3 to +3). That field only concerns coding sequences and is not computed by \fBvsearch\fR. Always set to +0. 
.TP .B qhi Last nucleotide of the query aligned with the target. Always equal to the length of the pairwise alignment, 0 otherwise (see \fIqihi\fR to ignore terminal gaps). .TP .B qihi Last nucleotide of the query aligned with the target (ignoring terminal gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B qilo First nucleotide of the query aligned with the target (ignoring initial gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B ql Query sequence length (positive integer value). The field is set to 0 if there is no alignment. .TP .B qlo First nucleotide of the query aligned with the target. Always equal to 1 if there is an alignment, 0 otherwise (see \fIqilo\fR to ignore initial gaps). .TP .B qrow Print the sequence of the query segment as seen in the pairwise alignment (i.e. with gap insertions if need be). Empty field if there is no alignment. .TP .B qs Query segment length. Always equal to query sequence length. .\" The meaning of that field is not clear to us. .TP .B qstrand Query strand orientation (+ or - for nucleotide sequences). Empty field if there is no alignment. .TP .B query Query label. .TP .B raw Raw alignment score (negative, null or positive integer value). The score is the sum of match rewards minus mismatch penalties, gap openings and gap extensions. The field is set to 0 if there is no alignment. .TP .B target Target label. The field is set to '*' if there is no alignment. .TP .B tcov Fraction of the target sequence that is aligned with the query sequence (real value ranging from 0.0 to 100.0). The target coverage is computed as 100.0 * (matches + mismatches) / target sequence length. Internal or terminal gaps are not taken into account. The field is set to 0.0 if there is no alignment. .TP .B tframe Target frame (-3 to +3). That field only concerns coding sequences and is not computed by \fBvsearch\fR. Always set to +0. 
.TP .B thi Last nucleotide of the target aligned with the query. Always equal to the length of the pairwise alignment, 0 otherwise (see \fItihi\fR to ignore terminal gaps). .TP .B tihi Last nucleotide of the target aligned with the query (ignoring terminal gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B tilo First nucleotide of the target aligned with the query (ignoring initial gaps). Nucleotide numbering starts from 1. The field is set to 0 if there is no alignment. .TP .B tl Target sequence length (positive integer value). The field is set to 0 if there is no alignment. .TP .B tlo First nucleotide of the target aligned with the query. Always equal to 1 if there is an alignment, 0 otherwise (see \fItilo\fR to ignore initial gaps). .TP .B trow Print the sequence of the target segment as seen in the pairwise alignment (i.e. with gap insertions if need be). Empty field if there is no alignment. .TP .B ts Target segment length. Always equal to target sequence length. The field is set to 0 if there is no alignment. .TP .B tstrand Target strand orientation (+ or - for nucleotide sequences). Always set to '+', so reverse strand matches have tstrand '+' and qstrand '\-'. Empty field if there is no alignment. .RE .PP .\" ============================================================================ .SH DELIBERATE CHANGES If you are a usearch user, our objective is to make you feel at home. That's why \fBvsearch\fR was designed to behave like usearch, to some extent. Like any complex software, usearch is not free from quirks and inconsistencies. We decided not to reproduce some of them, and for complete transparency, to document here the deliberate changes we made. .PP During a search with usearch, when using the options \-\-blast6out and \-\-output_no_hits, for queries with no match the number of fields reported is 13, where it should be 12. This is corrected in \fBvsearch\fR. 
.PP The field raw of the \-\-userfields option is not informative in usearch. This is corrected in \fBvsearch\fR. .PP The fields qlo, qhi, tlo, thi now have counterparts (qilo, qihi, tilo, tihi) reporting alignment coordinates ignoring terminal gaps. .PP In usearch, when using the option \-\-output_no_hits, queries that receive no match are reported in \-\-blast6out file, but not in the alignment output file. This is corrected in \fBvsearch\fR. .PP \fBvsearch\fR introduces a new \-\-cluster_size command that sorts sequences by decreasing abundance before clustering. .PP \fBvsearch\fR reintroduces \-\-iddef alternative pairwise identity definitions that were removed from usearch. .PP \fBvsearch\fR extends the \-\-topn option to sorting commands. .PP \fBvsearch\fR extends the \-\-sizein option to dereplication (\-\-derep_fulllength) and clustering (\-\-cluster_fast). .PP \fBvsearch\fR treats T and U as identical nucleotides during dereplication. .PP \fBvsearch\fR sorting is stabilized by using sequence abundances or sequence labels as secondary or tertiary keys. .PP \fBvsearch\fR by default uses the DUST algorithm for masking low-complexity regions. Masking behaviour is also slightly changed to be more consistent. .PP .\" ============================================================================ .SH NOVELTIES \fBvsearch\fR introduces new commands and new options not present in usearch 7. They are described in the 'Options' section of this manual. 
Here is a short list: .RS .IP - 2 uchime2_denovo, uchime3_denovo, alignwidth, borderline, fasta_score (chimera checking) .IP - cluster_size, cluster_unoise, clusterout_id, clusterout_sort, profile (clustering) .IP - fasta_width, gzip_decompress, bzip2_decompress (general option) .IP - iddef (clustering, pairwise alignment, searching) .IP - maxuniquesize (dereplication) .IP - relabel_md5, relabel_self and relabel_sha1 (chimera detection, dereplication, FASTQ processing, shuffling, sorting) .IP - shuffle (shuffling) .IP - fastq_eestats, fastq_eestats2, fastq_maxlen, fastq_truncee (FASTQ processing) .IP - fastaout_discarded, fastqout_discarded (subsampling) .IP - rereplicate (dereplication/rereplication) .RE .PP .\" ============================================================================ .SH EXAMPLES .PP Align all sequences in a database with each other and output all pairwise alignments: .PP .RS \fBvsearch\fR \-\-allpairs_global \fIdatabase.fas\fR \-\-alnout \fIresults.aln\fR \-\-acceptall .RE .PP Check for the presence of chimeras (\fIde novo\fR); parents should be at least 1.5 times more abundant than chimeras. 
Output non-chimeric sequences in fasta format (no wrapping): .PP .RS \fBvsearch\fR \-\-uchime_denovo \fIqueries.fas\fR \-\-abskew 1.5 \-\-nonchimeras \fIresults.fas\fR \-\-fasta_width 0 .RE .PP Cluster with a 97% similarity threshold, collect cluster centroids, and write cluster descriptions using a uclust-like format: .PP .RS \fBvsearch\fR \-\-cluster_fast \fIqueries.fas\fR \-\-id 0.97 \-\-centroids \fIcentroids.fas\fR \-\-uc \fIclusters.uc\fR .RE .PP Dereplicate the sequences contained in \fIqueries.fas\fR, take into account the abundance information already present, write unwrapped fasta sequences to \fIqueries_unique.fas\fR with the new abundance information, discard all sequences with an abundance of 1: .PP .RS \fBvsearch\fR \-\-derep_fulllength \fIqueries.fas\fR \-\-sizein \-\-fasta_width 0 \-\-sizeout \-\-output \fIqueries_unique.fas\fR \-\-minuniquesize 2 .RE .PP Mask simple repeats and low complexity regions in the input fasta file with the DUST algorithm (masked regions are lowercased), and write the results to the output file: .PP .RS \fBvsearch\fR \-\-maskfasta \fIqueries.fas\fR \-\-qmask dust \-\-output \fIqueries_masked.fas\fR .RE .PP Search queries in a reference database, with an 80%-similarity threshold, take terminal gaps into account when calculating pairwise similarities, output pairwise alignments: .PP .RS \fBvsearch\fR \-\-usearch_global \fIqueries.fas\fR \-\-db \fIreferences.fas\fR \-\-id 0.8 \-\-iddef 1 \-\-alnout \fIresults.aln\fR .RE .PP Search a sequence dataset against itself (ignore self hits), get all matches with at least 60% similarity, and collect results in a blast-like tab-separated format. 
Accept an unlimited number of hits (\-\-maxaccepts 0), and compare each query to all other sequences, including unlikely candidates (\-\-maxrejects 0): .PP .RS \fBvsearch\fR \-\-usearch_global \fIqueries.fas\fR \-\-db \fIqueries.fas\fR \-\-self \-\-id 0.6 \-\-blast6out \fIresults.blast6\fR \-\-maxaccepts 0 \-\-maxrejects 0 .RE .PP Shuffle the input fasta file (change the order of sequences) in a repeatable fashion (fixed seed), and write unwrapped fasta sequences to the output file: .PP .RS \fBvsearch\fR \-\-shuffle \fIqueries.fas\fR \-\-output \fIqueries_shuffled.fas\fR \-\-randseed 13 \-\-fasta_width 0 .RE .PP Sort by decreasing abundance the sequences contained in \fIqueries.fas\fR (using the 'size=\fIinteger\fR' information), relabel the sequences while preserving the abundance information (with \-\-sizeout), keep only sequences with an abundance equal to or greater than 2: .PP .RS \fBvsearch\fR \-\-sortbysize \fIqueries.fas\fR \-\-output \fIqueries_sorted.fas\fR \-\-relabel sampleA_ \-\-sizeout \-\-minsize 2 .RE .PP .\" .\" ============================================================================ .SH AUTHORS Implementation and documentation by Torbjørn Rognes, Frédéric Mahé and Tomás Flouri. .PP .\" ============================================================================ .SH CITATION .PP Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. \fIPeerJ\fR 4:e2584 doi: 10.7717/peerj.2584 .URL https://doi.org/10.7717/peerj.2584 (link) .PP .\" ============================================================================ .SH REPORTING BUGS Submit suggestions and bug-reports at .URL https://github.com/torognes/vsearch/issues (link) , send a pull request on .URL https://github.com/torognes/vsearch (link) , or compose a friendly or curmudgeonly e-mail to Torbjørn Rognes .MTO torognes@ifi.uio.no (link) . 
.PP .\" ============================================================================ .SH AVAILABILITY Source code and binaries are available at <https://github.com/torognes/vsearch>. .PP .\" ============================================================================ .SH COPYRIGHT Copyright (C) 2014-2025, Torbjørn Rognes, Frédéric Mahé and Tomás Flouri .PP All rights reserved. .PP Contact: Torbjørn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway .PP This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. .PP \fBGNU General Public License version 3\fR .PP This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. .PP This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. .PP You should have received a copy of the GNU General Public License along with this program. If not, see .URL https://www.gnu.org/licenses/ (link) . .PP .PP \fBThe BSD 2-Clause License\fR .PP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: .PP 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. .PP 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
.PP THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .PP We would like to thank the authors of the following projects for making their source code available: .RS .IP - 2 \fBvsearch\fR includes code from Google's CityHash project by Geoff Pike and Jyrki Alakuijala, providing some excellent hash functions available under a MIT license. .IP - \fBvsearch\fR includes code derived from Tatusov and Lipman's DUST program that is in the public domain. .IP - \fBvsearch\fR includes public domain code written by Alexander Peslyak for the MD5 message digest algorithm. .IP - \fBvsearch\fR includes public domain code written by Steve Reid and others for the SHA1 message digest algorithm. .IP - \fBvsearch\fR binaries may include code from the zlib library, copyright Jean-Loup Gailly and Mark Adler. .IP - \fBvsearch\fR binaries may include code from the bzip2 library, copyright Julian R. Seward. .RE .PP .\" ============================================================================ .SH SEE ALSO \fBswipe\fR, an extremely fast pairwise local (Smith-Waterman) database search tool by Torbjørn Rognes, available at .URL https://github.com/torognes/swipe "(link)" . 
.PP \fBswarm\fR, a fast and accurate amplicon clustering method by Frédéric Mahé and Torbjørn Rognes, available at .URL https://github.com/torognes/swarm "(link)" . .PP .\" ============================================================================ .SH VERSION HISTORY New features and important modifications of \fBvsearch\fR (short lived or minor bug releases may not be mentioned): .TP .BR v1.0.0\~ "released November 28th, 2014" First public release. .TP .BR v1.0.1\~ "released December 1st, 2014" Bug fixes (sortbysize, semicolon after size annotation in headers) and minor changes (labels as secondary sort key for most sorts, treat T and U as identical for dereplication, only output size in \-\-dbmatched file if \-\-sizeout specified). .TP .BR v1.0.2\~ "released December 6th, 2014" Bug fixes (ssse3/sse4.1 requirement, memory leak). .TP .BR v1.0.3\~ "released December 6th, 2014" Bug fix (now writes help to stdout instead of stderr). .TP .BR v1.0.4\~ "released December 8th, 2014" Added \-\-allpairs_global option. Reduce memory requirements slightly and eliminate memory leaks. .TP .BR v1.0.5\~ "released December 9th, 2014" Fixes a minor bug with \-\-allpairs_global and \-\-acceptall options. .TP .BR v1.0.6\~ "released December 14th, 2014" Fixes a memory allocation bug in chimera detection (\-\-uchime_ref option). .TP .BR v1.0.7\~ "released December 19th, 2014" Fixes a bug in the output from chimera detection with the \-\-uchimeout option. 
.TP .BR v1.0.8\~ "released January 22nd, 2015" Introduces several changes and bug fixes: .RS .IP - 2 a new linear memory aligner for alignment of sequences longer than 5,000 nucleotides, .IP - a new \-\-cluster_size command that sorts sequences by decreasing abundance before clustering, .IP - meaning of userfields qlo, qhi, tlo, thi changed for compatibility with usearch, .IP - new userfields qilo, qihi, tilo, tihi give alignment coordinates ignoring terminal gaps, .IP - in \-\-uc output files, a perfect alignment is indicated with a '=' sign, .IP - the option \-\-cluster_fast now sorts sequences by decreasing length, then by decreasing abundance and finally by sequence identifier, .IP - default \-\-maxseqlength value set to 50,000 nucleotides, .IP - fix for bug in alignment in rare cases, .IP - fix for lack of detection of under- or overflow in SIMD aligner. .RE .TP .BR v1.0.9\~ "released January 22nd, 2015" Fixes a bug in the function sorting sequences by decreasing abundance (\-\-sortbysize). .TP .BR v1.0.10\~ "released January 23rd, 2015" Fixes a bug where the \-\-sizein option was ignored and always treated as on, affecting clustering and dereplication commands. .TP .BR v1.0.11\~ "released February 5th, 2015" Introduces the possibility to output results in SAM format (for clustering, pairwise alignment and searching). .TP .BR v1.0.12\~ "released February 6th, 2015" Temporarily fixes a problem with long headers in FASTA files. .TP .BR v1.0.13\~ "released February 17th, 2015" Fix a memory allocation problem when computing multiple sequence alignments with the \-\-msaout and \-\-consout options, as well as a memory leak. Also increased line buffer for reading FASTA files to 4MB. .TP .BR v1.0.14\~ "released February 17th, 2015" Fix a bug where the multiple alignment and consensus sequence computed after clustering ignored the strand of the sequences. Also decreased size of line buffer for reading FASTA files to 1MB again due to excessive stack memory usage. 
.TP .BR v1.0.15\~ "released February 18th, 2015" Fix bug in calculation of identity metric between sequences when using the MBL definition (\-\-iddef 3). .TP .BR v1.0.16\~ "released February 19th, 2015" Integrated patches from Debian for increased compatibility with various architectures. .TP .BR v1.1.0\~ "released February 20th, 2015" Added the \-\-quiet option to suppress all output to stdout and stderr except for warnings and fatal errors. Added the \-\-log option to write messages to a log file. .TP .BR v1.1.1\~ "released February 20th, 2015" Added info about \-\-log and \-\-quiet options to help text. .TP .BR v1.1.2\~ "released March 18th, 2015" Fix bug with large datasets. Fix format of help info. .TP .BR v1.1.3\~ "released March 18th, 2015" Fix more bugs with large datasets. .TP .BR v1.2.0-1.2.19\~ "released July 6th to September 8th, 2015" Several new commands and options added. Bugs fixed. Documentation updated. .TP .BR v1.3.0\~ "released September 9th, 2015" Changed to autotools build system. .TP .BR v1.3.1\~ "released September 14th, 2015" Several new commands and options. Bug fixes. .TP .BR v1.3.2\~ "released September 15th, 2015" Fixed memory leaks. Added '-h' shortcut for help. Removed extra 'v' in version number. .TP .BR v1.3.3\~ "released September 15th, 2015" Fixed bug in hexadecimal digits of MD5 and SHA1 digests. Added \-\-samheader option. .TP .BR v1.3.4\~ "released September 16th, 2015" Fixed compilation problems with zlib and bzip2lib. .TP .BR v1.3.5\~ "released September 17th, 2015" Minor configuration/makefile changes to compile to native CPU and simplify makefile. .TP .BR v1.4.0\~ "released September 25th, 2015" Added \-\-sizeorder option. .TP .BR v1.4.1\~ "released September 29th, 2015" Inserted public domain MD5 and SHA1 code to eliminate dependency on crypto and openssl libraries and their licensing issues. .TP .BR v1.4.2\~ "released October 2nd, 2015" Dynamic loading of libraries for reading gzip and bzip2 compressed files if available. 
Circumvention of missing gzoffset function in zlib 1.2.3 and earlier. .TP .BR v1.4.3\~ "released October 3rd, 2015" Fix a bug with determining amount of memory on some versions of Apple OS X. .TP .BR v1.4.4\~ "released October 3rd, 2015" Remove debug message. .TP .BR v1.4.5\~ "released October 6th, 2015" Fix memory allocation bug when reading long FASTA sequences. .TP .BR v1.4.6\~ "released October 6th, 2015" Fix subtle bug in SIMD alignment code that reduced accuracy. .TP .BR v1.4.7\~ "released October 7th, 2015" Fixes a problem with searching for or clustering sequences with repeats. In this new version, vsearch looks at all words occurring at least once in the sequences in the initial step. Previously only words occurring exactly once were considered. In addition, vsearch now requires at least 10 words to be shared by the sequences, previously only 6 were required. If the query contains less than 10 words, all words must be present for a match. This change seems to lead to slightly reduced recall, but somewhat increased precision, ending up with slightly improved overall accuracy. .TP .BR v1.5.0\~ "released October 7th, 2015" This version introduces the new option \-\-minwordmatches that allows the user to specify the minimum number of matching unique words before a sequence is considered further. New default values for different word lengths are also set. The minimum word length is increased to 7. .TP .BR v1.6.0\~ "released October 9th, 2015" This version adds the relabeling options (\-\-relabel, \-\-relabel_md5 and \-\-relabel_sha1) to the shuffle command. It also adds the \-\-xsize option to the clustering, dereplication, shuffling and sorting commands. .TP .BR v1.6.1\~ "released October 14th, 2015" Fix bugs and update manual and help text regarding relabelling. Add all relabelling options to the subsampling command. Add the \-\-xsize option to chimera detection, dereplication and fastq filtering commands. Refactoring of code. 
.TP .BR v1.7.0\~ "released October 14th, 2015" Add \-\-relabel_keep option. .TP .BR v1.8.0\~ "released October 19th, 2015" Added \-\-search_exact, \-\-fastx_mask and \-\-fastq_convert commands. Changed most commands to read FASTQ input files as well as FASTA files. Modified \-\-fastx_revcomp and \-\-fastx_subsample to write FASTQ files. .TP .BR v1.8.1\~ "released November 2nd, 2015" Fixes for compatibility with QIIME and older OS X versions. .TP .BR v1.9.0\~ "released November 12th, 2015" Added the \-\-fastq_mergepairs command and associated options. This command has not been tested well yet. Included additional files to avoid dependency of autoconf for compilation. Fixed an error where identifiers in fasta headers were not truncated at tabs, just spaces. Fixed a bug in detection of the file format (FASTA/FASTQ) of a gzip compressed input file. .TP .BR v1.9.1\~ "released November 13th, 2015" Fixed memory leak and a bug in score computation in \-\-fastq_mergepairs, and improved speed. .TP .BR v1.9.2\~ "released November 17th, 2015" Fixed a bug in the computation of some values with \-\-fastq_stats. .TP .BR v1.9.3\~ "released November 19th, 2015" Workaround for missing x86intrin.h with old compilers. .TP .BR v1.9.4\~ "released December 3rd, 2015" Fixed incrementation of counter when relabeling dereplicated sequences. .TP .BR v1.9.5\~ "released December 3rd, 2015" Fixed bug resulting in inferior chimera detection performance. .TP .BR v1.9.6\~ "released January 8th, 2016" Fixed bug in aligned sequences produced with \-\-fastapairs and \-\-userout (qrow, trow) options. .TP .BR v1.9.7\~ "released January 12th, 2016" Masking behaviour is changed somewhat to keep the letter case of the input sequences unchanged when no masking is performed. Masking is now performed also during chimera detection. Documentation updated. .TP .BR v1.9.8\~ "released January 22nd, 2016" Fixed bug causing segfault when chimera detection is performed on extremely short sequences.
.TP .BR v1.9.9\~ "released January 22nd, 2016" Adjusted default minimum number of word matches during searches for improved performance. .TP .BR v1.9.10\~ "released January 25th, 2016" Fixed bug related to masking and lower case database sequences. .TP .BR v1.10.0\~ "released February 11th, 2016" Parallelized and improved merging of paired-end reads and adjusted some defaults. Removed progress indicator when stderr is not a terminal. Added \-\-fasta_score option to report chimera scores in FASTA files. Added \-\-rereplicate and \-\-fastq_eestats commands. Fixed typos. Added relabelling to files produced with \-\-consout and \-\-profile options. .TP .BR v1.10.1\~ "released February 23rd, 2016" Fixed a bug affecting the \-\-fastq_mergepairs command causing FASTQ headers to be truncated at first space (despite the bug fix release 1.9.0 of November 12th, 2015). Full headers are now included in the output (no matter if \-\-notrunclabels is in effect or not). .TP .BR v1.10.2\~ "released March 18th, 2016" Fixed a bug causing a segmentation fault when running \-\-usearch_global with an empty query sequence. Also fixed a bug causing imperfect alignments to be reported with an alignment string of '=' in uc output files. Fixed typos in man file. Fixed fasta/fastq processing code regarding presence or absence of compression library header files. .TP .BR v1.11.1\~ "released April 13th, 2016" Added strand information in UC file for \-\-derep_fulllength and \-\-derep_prefix. Added expected errors (ee) to header of FASTA files specified with \-\-fastaout and \-\-fastaout_discarded when \-\-eeout or \-\-fastq_eeout option is in effect for fastq_filter and fastq_mergepairs. The options \-\-eeout and \-\-fastq_eeout are now equivalent. .TP .BR v1.11.2\~ "released June 21st, 2016" Two bugs were fixed. The first issue was related to the \-\-query_cov option that used a different coverage definition than the qcov userfield. 
The coverage is now defined as the fraction of the whole query sequence length that is aligned with matching or mismatching residues in the target. All gaps are ignored. The other issue was related to the consensus sequences produced during clustering when only N's were present in some positions. Previously these would be converted to A's in the consensus. The behaviour is changed so that N's are produced in the consensus, and it should now be more compatible with usearch. .TP .BR v2.0.0\~ "released June 24th, 2016" This major new version supports reading from pipes. Two new options are added: \-\-gzip_decompress and \-\-bzip2_decompress. One of these options must be specified if reading compressed input from a pipe, but are not required when reading from ordinary files. The vsearch header that was previously written to stdout is now written to stderr. This enables piping of results for further processing. The file name '\-' now represents standard input (/dev/stdin) or standard output (/dev/stdout) when reading or writing files, respectively. Code for reading FASTA and FASTQ files has been refactored. .TP .BR v2.0.1\~ "released June 30th, 2016" Avoid segmentation fault when masking very long sequences. .TP .BR v2.0.2\~ "released July 5th, 2016" Avoid warnings when compiling with GCC 6. .TP .BR v2.0.3\~ "released August 2nd, 2016" Fixed bad compiler options resulting in Illegal instruction errors when running precompiled binaries. .TP .BR v2.0.4\~ "released September 1st, 2016" Improved error message for bad FASTQ quality values. Improved manual. .TP .BR v2.0.5\~ "released September 9th, 2016" Add options \-\-fastaout_discarded and \-\-fastqout_discarded to output discarded sequences from subsampling to separate files. Updated manual. .TP .BR v2.1.0\~ "released September 16th, 2016" New command: \-\-fastx_filter. New options: \-\-fastq_maxlen, \-\-fastq_truncee. Allow \-\-minwordmatches down to 3.
.TP .BR v2.1.1\~ "released September 23rd, 2016" Fixed bugs in output to UC-files. Improved help text and manual. .TP .BR v2.1.2\~ "released September 28th, 2016" Fixed incorrect abundance output from fastx_filter and fastq_filter when relabelling. .TP .BR v2.2.0\~ "released October 7th, 2016" Added OTU table generation options \-\-biomout, \-\-mothur_shared_out and \-\-otutabout to the clustering and searching commands. .TP .BR v2.3.0\~ "released October 10th, 2016" Allowed zero-length sequences in FASTA and FASTQ files. Added \-\-fastq_trunclen_keep option. Fixed bug with output of OTU tables to pipes. .TP .BR v2.3.1\~ "released November 16th, 2016" Fixed bug where \-\-minwordmatches 0 was interpreted as the default minimum word matches for the given word length instead of zero. When used in combination with \-\-maxaccepts 0 and \-\-maxrejects 0 it will allow complete bypass of kmer-based heuristics. .TP .BR v2.3.2\~ "released November 18th, 2016" Fixed bug where vsearch reported the ordinal number of the target sequence instead of the cluster number in column 2 on H-lines in the uc output file after clustering. For search and alignment commands both usearch and vsearch report the target sequence number here. .TP .BR v2.3.3\~ "released December 5th, 2016" A minor speed improvement. .TP .BR v2.3.4\~ "released December 9th, 2016" Fixed bug in output of sequence profiles and updated documentation. .TP .BR v2.4.0\~ "released February 8th, 2017" Added support for Linux on Power8 systems (ppc64le) and Windows on x86_64. Improved detection of pipes when reading FASTA and FASTQ files. Corrected option for specifying output from fastq_eestats command in help text. .TP .BR v2.4.1\~ "released March 1st, 2017" Fixed an overflow bug in fastq_stats and fastq_eestats affecting analysis of very large FASTQ files. Fixed maximum memory usage reporting on Windows.
.TP .BR v2.4.2\~ "released March 10th, 2017" Default value for fastq_minovlen increased to 16 in accordance with help text and for compatibility with usearch. Minor changes for improved accuracy of paired-end read merging. .TP .BR v2.4.3\~ "released April 6th, 2017" Fixed bug with progress bar for shuffling. Fixed missing N-lines in UC files with usearch_global, search_exact and allpairs_global when the output_no_hits option was not specified. .TP .BR v2.4.4\~ "released August 28th, 2017" Fixed a few minor bugs, improved error messages and updated documentation. .TP .BR v2.5.0\~ "released October 5th, 2017" Support for UDB database files. New commands: fastq_stripright, fastq_eestats2, makeudb_usearch, udb2fasta, udbinfo, and udbstats. New general option: no_progress. New options minsize and maxsize to fastx_filter. Minor bug fixes, error message improvements and documentation updates. .TP .BR v2.5.1\~ "released October 25th, 2017" Fixed bug with bad default value of 1 instead of 32 for minseqlength when using the makeudb_usearch command. .TP .BR v2.5.2\~ "released October 30th, 2017" Fixed bug where '-' as an argument to the fastq_eestats2 option was treated literally instead of equivalent to stdin. .TP .BR v2.6.0\~ "released November 10th, 2017" Rewritten paired-end reads merger with improved accuracy. Decreased default value for fastq_minovlen option from 16 to 10. The default value for the fastq_maxdiffs option is increased from 5 to 10. There are now other more important restrictions that will avoid merging reads that cannot be reliably aligned. .TP .BR v2.6.1\~ "released December 8th, 2017" Improved parallelisation of paired end reads merging. .TP .BR v2.6.2\~ "released December 18th, 2017" Fixed option xsize that was partially inactive for commands uchime_denovo, uchime_ref, and fastx_filter.
.TP .BR v2.7.0\~ "released February 13th, 2018" Added commands cluster_unoise, uchime2_denovo and uchime3_denovo contributed by Davide Albanese based on Robert Edgar's papers. Refactored fasta and fastq print functions as well as code for extraction of abundance and other attributes from the headers. .TP .BR v2.7.1\~ "released February 16th, 2018" Fix several bugs on Windows related to large files, use of "-" as a file name to mean stdin or stdout, alignment errors, missed kmers and corrupted UDB files. Added documentation of UDB-related commands. .TP .BR v2.7.2\~ "released April 20th, 2018" Added the sintax command for taxonomic classification. Fixed a bug with incorrect FASTA headers of consensus sequences after clustering. .TP .BR v2.8.0\~ "released April 24th, 2018" Added the fastq_maxdiffpct option to the fastq_mergepairs command. .TP .BR v2.8.1\~ "released June 22nd, 2018" Fixes for compilation warnings with GCC 8. .TP .BR v2.8.2\~ "released August 21st, 2018" Fix for wrong placement of semicolons in header lines in some cases when using the sizeout or xsize options. Reduced memory requirements for full-length dereplication in cases with many duplicate sequences. Improved wording of fastq_mergepairs report. Updated manual regarding use of sizein and sizeout with dereplication. Changed a compiler option. .TP .BR v2.8.3\~ "released August 31st, 2018" Fix for segmentation fault for \-\-derep_fulllength with \-\-uc. .TP .BR v2.8.4\~ "released September 3rd, 2018" Further reduce memory requirements for dereplication when not using the uc option. Fix output during subsampling when quiet or log options are in effect. .TP .BR v2.8.5\~ "released September 26th, 2018" Fixed a bug in fastq_eestats2 that caused the values for large lengths to be much too high when the input sequences had varying lengths. 
.TP .BR v2.8.6\~ "released October 9th, 2018" Fixed a bug introduced in version 2.8.2 that caused derep_fulllength to include the full FASTA header in its output instead of stopping at the first space (unless the notrunclabels option is in effect). .TP .BR v2.9.0\~ "released October 10th, 2018" Added the fastq_join command. .TP .BR v2.9.1\~ "released October 29th, 2018" Changed compiler options that select the target cpu and tuning to allow the software to run on any 64-bit x86 system, while tuning for more modern variants. Avoid illegal instruction error on some architectures. Update documentation of rereplicate command. .TP .BR v2.10.0\~ "released December 6th, 2018" Added the sff_convert command to convert SFF files to FASTQ. Added some additional option argument checks. Fixed segmentation fault bug after some fatal errors when a log file was specified. .TP .BR v2.10.1\~ "released December 7th, 2018" Improved sff_convert command. It will now read several variants of the SFF format. It is also able to read from a pipe. Warnings are given if there are minor problems. Error messages have been improved. Minor speed and memory usage improvements. .TP .BR v2.10.2\~ "released December 10th, 2018" Fixed bug in sintax with reversed order of domain and kingdom. .TP .BR v2.10.3\~ "released December 19th, 2018" Ported to Linux on ARMv8 (aarch64). Fixed compilation warning with gcc version 8.1.0 and 8.2.0. .TP .BR v2.10.4\~ "released January 4th, 2019" Fixed serious bug in x86_64 SIMD alignment code introduced in version 2.10.3. Added link to BioConda in README. Fixed bug in fastq_stats with sequence length 1. Fixed use of equals symbol in UC files for identical sequences with cluster_fast. .TP .BR v2.11.0\~ "released February 13th, 2019" Added ability to trim and filter paired-end reads using the reverse option with the fastx_filter and fastq_filter commands. Added \-\-xee option to remove ee attributes from FASTA headers.
Minor invisible improvement to the progress indicator. .TP .BR v2.11.1\~ "released February 28th, 2019" Minor change to the handling of the weak_id and id options when using cluster_unoise. .TP .BR v2.12.0\~ "released March 19th, 2019" Take sequence abundance into account when computing consensus sequences or profiles after clustering. Warn when rereplicating sequences without abundance info. Guess offset 33 in more cases with fastq_chars. Stricter checking of option arguments and option combinations. .TP .BR v2.13.0\~ "released April 11th, 2019" Added the \-\-fastx_getseq, \-\-fastx_getseqs and \-\-fastx_getsubseq commands to extract sequences from a FASTA or FASTQ file based on their labels. Improved handling of ambiguous nucleotide symbols. Corrected behaviour of \-\-uchime_ref command with options \-\-self and \-\-selfid. Strict detection of illegal options for each command. .TP .BR v2.13.1\~ "released April 26th, 2019" Minor changes to the allowed options for each command. All commands now allow the log, quiet and threads options. If more than 1 thread is specified for commands that are not multi-threaded, a warning will be issued. Minor changes to the manual. .TP .BR v2.13.2\~ "released April 30th, 2019" Fixed bug related to improper handling of newlines on Windows. Allowed option strand plus to uchime_ref for compatibility. .TP .BR v2.13.3\~ "released April 30th, 2019" Fixed bug in FASTQ parsing introduced in version 2.13.2. .TP .BR v2.13.4\~ "released May 10th, 2019" Added information about support for gzip- and bzip2-compressed input files to the output of the version command. Adapted source code for compilation on FreeBSD and NetBSD systems. .TP .BR v2.13.5\~ "released July 2nd, 2019" Added cut command to fragment sequences at restriction sites. Silenced output from the fastq_stats command if quiet option was given. Updated manual. .TP .BR v2.13.6\~ "released July 2nd, 2019" Added info about cut command to output of help command.
.TP .BR v2.13.7\~ "released September 2nd, 2019" Fixed bug in consensus sequence introduced in version 2.13.0. .TP .BR v2.14.0\~ "released September 11th, 2019" Added relabel_self option. Made fasta_width, sizein, sizeout and relabelling options valid for certain commands. .TP .BR v2.14.1\~ "released September 18th, 2019" Fixed bug with sequences written to file specified with fastaout_rev for commands fastx_filter and fastq_filter. .TP .BR v2.14.2\~ "released January 28th, 2020" Fixed some issues with the cut, fastx_revcomp, fastq_convert, fastq_mergepairs, and makeudb_usearch commands. Updated manual. .TP .BR v2.15.0\~ "released June 19th, 2020" Update manual and documentation. Turn on notrunclabels option for sintax command by default. Change maxhits 0 to mean unlimited hits, like the default. Allow non-ascii characters in headers, with a warning. Sort centroids and uc too when clusterout_sort specified. Add cluster id to centroids output when clusterout_id specified. Improve error messages when parsing FASTQ files. Add missing fastq_qminout option and fix label_suffix option for fastq_mergepairs. Add derep_id command that dereplicates based on both label and sequence. Remove compilation warnings. .TP .BR v2.15.1\~ "released October 28th, 2020" Fix for dereplication when including reverse complement sequences and headers. Make some extra checks when loading compression libraries and add more diagnostic output about them to the output of the version command. Report an error when fastx_filter is used with FASTA input and options that require FASTQ input. Update manual. .TP .BR v2.15.2\~ "released January 26th, 2021" No real functional changes, but some code and compilation changes. Compiles successfully on macOS running on Apple Silicon (ARMv8). Binaries available. Code updated for C++11. Minor adaptations for Windows compatibility, including the use of the C++ standard library for regular expressions. Minor changes for compatibility with Power8. 
Switch to C++ header files. .TP .BR v2.16.0\~ "released March 22nd, 2021" This version adds the orient command. It also handles empty input files properly. Documentation has been updated. .TP .BR v2.17.0\~ "released March 29th, 2021" The fastq_mergepairs command has been changed. It now allows merging of sequences with overlaps as short as 5 bp if the \-\-fastq_minovlen option has been adjusted down from the default 10. In addition, much fewer pairs of reads should now be rejected with the reason 'multiple potential alignments' as the algorithm for detecting those has been changed. .TP .BR v2.17.1\~ "released June 14th, 2021" Modernized code. Minor changes to help info. .TP .BR v2.18.0\~ "released August 27th, 2021" Added the fasta2fastq command. Fixed search bug on ppc64le. Fixed bug with removal of size and ee info in uc files. Fixed compilation errors in some cases. Made some general code improvements. Updated manual. .TP .BR v2.19.0\~ "released December 21st, 2021" Added the lcaout and lca_cutoff options to enable the output of last common ancestor (LCA) information about hits when searching. The randseed option was added as a valid option to the sintax command. Code improvements. .TP .BR v2.20.0\~ "released January 10th, 2022" Added the fastx_uniques command and the fastq_qout_max option for dereplication of FASTQ files. Some code cleaning. .TP .BR v2.20.1\~ "released January 11th, 2022" Fixes a bug in fastq_mergepairs that caused an occasional hang at the end when using multiple threads. .TP .BR v2.21.0\~ "released January 12th, 2022" This version adds the sample, qsegout and tsegout options. It enables the use of UDB databases with uchime_ref. .TP .BR v2.21.1\~ "released January 18th, 2022" Fix a problem with dereplication of empty input files. Update Altivec code on ppc64le for improved compiler compatibility (vector->__vector).
.TP .BR v2.21.2\~ "released September 12th, 2022" Fix problems with the lcaout option when using maxaccepts above 1 and either lca_cutoff below 1 or with top_hits_only enabled. Update documentation. Update code to avoid compiler warnings. .TP .BR v2.22.0\~ "released September 19th, 2022" Add the derep_smallmem command for dereplication using little memory. .TP .BR v2.22.1\~ "released September 19th, 2022" Fix compiler warning. .TP .BR v2.23.0\~ "released July 7th, 2023" Update documentation. Add citation file. Modernize and improve code. Fix several minor bugs. Fix compilation with GCC 13. Print stats after fastq_mergepairs to log file instead of stderr. Handle sizein option correctly with dbmatched option for usearch_global. Allow maxseqlength option for makeudb_usearch. Fix memory allocation problem with chimera detection. Add lengthout and xlength options. Increase precision for eeout option. Add warning about sintax algorithm, random seed and multiple threads. Refactor chimera detection code. Add undocumented experimental long_chimeras_denovo command. Fix segfault with clustering. Add more references. .TP .BR v2.24.0\~ "released October 26th, 2023" Update documentation. Improve code. Allow up to 20 parents for the undocumented and experimental chimeras_denovo command. Fix compilation warnings for sha1.c. Compile for release (not debug) by default. .TP .BR v2.25.0\~ "released November 10th, 2023" Allow a given percentage of mismatches between chimeras and parents for the experimental chimeras_denovo command. .TP .BR v2.26.0\~ "released November 24th, 2023" Enable the maxseqlength and minseqlength options for the chimera detection commands. When the usearch_global or search_exact commands are used, OTU tables will include samples and OTUs with no matches. .TP .BR v2.26.1\~ "released November 25th, 2023" No real changes, but the previous version was released without proper updates to the source code. 
.TP .BR v2.27.0\~ "released January 19th, 2024" The usearch_global and search_exact commands now support FASTQ files as well as FASTA files as input. This version of vsearch includes clarifications and updates to the manual. Some code has been refactored. Generic Dockerfiles for major Linux distributions have been included. Some warnings from compilers and other tools have been eliminated. The release for Windows will also include DLL's for the two compression libraries. .TP .BR v2.27.1\~ "released April 6th, 2024" This version fixes the weak_id option and makes searches report weak hits in some cases. It also updates the names of the compression libraries to libz.so.1 and libbz2.so.1 on Linux to make them work on common Linux distributions without installing additional packages. README.md has been updated with information about compression libraries on Windows. .TP .BR v2.28.0\~ "released April 26th, 2024" The sintax command has been improved in several ways in this version of vsearch. Please note that several details of this algorithm are not clearly described in the preprint, and the implementation in vsearch differs from that in usearch. The former vsearch version did not always choose the most common taxonomic entity over the 100 bootstraps among the database sequences with the highest amount of word similarity to the query. Instead, if several sequences had an equal similarity with the query, the sequence encountered in the earliest bootstrap was chosen. The confidence level was calculated based on this sequence compared to the selected sequences from the other 99 bootstraps. This could lead to a suboptimal choice with a low confidence. In the new version, the most common of the sequences with the highest amount of word similarity across the 100 bootstraps will be selected, and ties will be broken randomly.
Another problem with the old implementation was that if several sequences had the same amount of word similarity, the shortest one in the reference database would be chosen, and if they were equally long, the earliest in the database file would be chosen. A new option called sintax_random has now been introduced. This option will randomly select one of the sequences with the highest number of shared words with the query, without considering their length or position. This avoids a bias towards shorter reference sequences. This option is strongly recommended and will probably soon be the default. Furthermore, a ninth taxonomic rank, strain (letter t), is now recognized. The speed of the sintax command has also been significantly improved at least in some cases. Run vsearch with the randseed option and 1 thread to ensure reproducibility of the random choices in the algorithm. .TP .BR v2.28.1\~ "released April 26th, 2024" Fix a segmentation fault that could occur with the blast6out and output_no_hits options. .TP .BR v2.29.0\~ "released September 26th, 2024" This version fixes seven bugs (see changelog below), adds initial support for RISC-V architectures, and improves code quality and code testing (1,210 new tests): .RS .IP - 2 add: experimental support for RISCV64 and other 64-bit little-endian architectures, thanks to Michael R. 
Crusoe and his fellow Debian developers (issue #566), .IP - add: official support for clang-19 and gcc 14, .IP - add: beta support for clang-20, .IP - remove: unused \-\-output option for command \-\-fastq_stats (issue #572), .IP - fix: bug in \-\-sintax when selecting the best lineage (only low confidence values below 0.5 were affected) (issue #573), .IP - fix: out-of-bounds error in \-\-fastq_stats when processing empty reads (issue #571), .IP - fix: bug in \-\-cut, patterns with multiple cutting sites were not detected (commit 4c4f9fa70f14b28d50185dbf322cf5727087e86a), .IP - fix: memory error (segmentation fault) when using \-\-derep_id and \-\-strand (issue #565), .IP - fix: \-\-fastq_join now obeys the \-\-quiet and \-\-log options (commit 87f968b09f17c17ebf8db00aebe86e89b13a3948), .IP - fix: \-\-fastq_join quality padding is now also set to Q40 when quality offset is 64 (commit be0bf9b48d782286c4ce38f0bf1a4c82bd230250), .IP - fix: (partial) \-\-fastq_join's handling of abundance annotations (commit f2bbcb421dc2f4dfa6603b9f31ec3e4598c1b591), .IP - improve: additional safeguards to validate input values and to make sure that they are within acceptable limits. Changes concern options \-\-abskew (commit a530dd8990f8a05cb25fc0b6a5da5a14d28fbedd) and \-\-fastq_maxdiffs (commit 4b254db7f120bfd49e86185ef3cd9070c236f940), .IP - improve: code quality (1.3k+ commits, 6k+ clang-tidy warnings eliminated), .IP - improve: documentation and help messages (issue #568), .IP - improve: complete refactoring and modernization of a subset of commands (\-\-sortbylength, \-\-sortbysize, \-\-shuffle, \-\-rereplicate, \-\-cut, \-\-fastq_join, \-\-fasta2fastq, \-\-fastq_chars), .IP - improve: code-coverage of our test-suite for the above-mentioned commands (1,210 new tests, 4,753 in total) .RE .TP .BR v2.29.1\~ "released October 24th, 2024" Fix a segmentation fault that could occur during alignment in version 2.29.0, for example with \-\-uchime_ref.
Some improvements to code and documentation. .TP .BR v2.29.2\~ "released December 20th, 2024" Fix a segmentation fault during clustering when the set of clusters is empty. Initial documentation in markdown format available on GitHub Pages. .TP .BR v2.29.3\~ "released February 3rd, 2025" This version is released in order to mitigate a bug that occurs when compiling the `align_simd.cc` file on x86_64 systems with the GNU C++ compiler version 9 or later with the `-O3` optimization option. It results in incorrect code that may cause bad alignments in some circumstances. We are investigating this issue further, but for now we recommend compiling with the `-O2` flag. The README.md file and the Dockerfiles have been updated to reflect this. The binaries released with this version will include this fix. .TP .BR v2.29.4\~ "released February 14th, 2025" Adjust the window size used for chimera detection down from 64 to 32. The window size was accidentally increased from 32 to 64 in version 2.23.0, leading to somewhat fewer chimeras being predicted. In addition, a compiler pragma has been included in align_simd.cc to further prevent the compiler from generating wrong code. .TP .BR v2.30.0\~ "released February 27th, 2025" Add options `\-\-n_mismatch`, `\-\-fastq_minqual`, and `\-\-fastq_truncee_rate`. The `\-\-n_mismatch` option will count N's as mismatches in alignments, which may be useful to get sensible alignments for sequences with lots of N's. By default N's are counted as matches. Both the scoring and the counting of matches are affected. The new `\-\-fastq_minqual` option for the `fastq_filter` and `fastx_filter` commands will discard sequences containing any base with a quality score below the given value. The new `\-\-fastq_truncee_rate` option for the same commands will truncate sequences at the first position where the number of expected errors per base is above the given value.
.TP .BR v2.30.1\~ "released October 3rd, 2025" This version incorporates many code improvements, more extensive testing, better documentation and some minor bug fixes. List of changes: .RS .IP - 2 fix: use-after-free introduced in commit de6c1d8 (Jun 13, 2024), .IP - fix: (harmless) out-of-bounds memory issue in \-\-derep_prefix (commit 8a0a508b), .IP - fix: (harmless) memory leak in \-\-fastx_getseqs \-\-label_field (commit a9c42713), .IP - fix: (harmless) memory leak when using option \-\-userfields (\-\-allpairs_global commit 03b95bcf; \-\-cluster_* commit 2fde5472; \-\-search_exact commit 45cd56d6; \-\-usearch_global commit d83bfee9), .IP - fix: (harmless) valgrind error, use of uninitialized values (commit 8bab2444), also eliminates a pesky compilation warning, .IP - change: passing a negative value to \-\-fastq_truncee_rate is now an error (commit a120f371), .IP - change: passing a negative value to \-\-fastq_minqual is now an error (commit ff5b0c99), .IP - change: passing a non-ASCII symbol to \-\-join_padgap or \-\-join_padgapq is now an error (commit a708f5b3), .IP - change: when using \-\-gapopen "*" to forbid gap opening, the penalty is now set to INT_MAX (rather than 1,000). 
This might change alignment results for users who relied on the old behavior (thanks to Denis Filloux, issue #602, commit 96e9cf9e), .IP - change: when using command \-\-chimeras_denovo, \-\-tabbedout or \-\-alnout can be the only output files specified (commit d51f0a456300a0ea69c035ba3de23c5ecf3da348), .IP - change: when using command \-\-chimeras_denovo, option \-\-lengthout is now accepted (commit 3f55cc6b), .IP - change: when using command \-\-chimeras_denovo, option \-\-xlength is now accepted (commit 0fd346cd), .IP - add: compilation option _GLIBCXX_DEBUG when compiling for debugging (commit 5cf4a6c1, option activates more runtime checks), .IP - add: official support for clang 20, .IP - add: initial support for clang 21, initial support for GCC 15, .IP - add: experimental support for clang 22, .IP - improve: more accurate line number when reporting illegal characters in fastq headers (commit 539084e9), .IP - improve: more accurate line number when reporting non-ASCII characters in fastq headers (commit 98a851ed), .IP - improve: remove checks for unneeded libraries during compilation (commit 249bb5d5276b1be6c9add60a538a16e1b9d0ebfb), .IP - improve: code quality (8,536 clang-tidy warnings eliminated), .IP - improve: documentation and help messages (issue #604), .IP - improve: complete refactoring and modernization of the command \-\-fastq_stats, .IP - improve: command \-\-fastq_stats is now up to twice as fast (tested on x86-64), .IP - improve: extensive test-suites for \-\-fastq_stats and \-\-sff_convert, .IP - improve: code coverage of our test-suite .RE .\" ============================================================================ .\" TODO: .\" .\" NOTES .\" visualize and output to pdf .\" man -l vsearch.1 .\" man -t ./doc/vsearch.1 | ps2pdf - > ./doc/vsearch_manual.pdf
vsearch-2.30.1/src/000077500000000000000000000000001506776541700140415ustar00rootroot00000000000000vsearch-2.30.1/src/Makefile.am000066400000000000000000000104441506776541700161000ustar00rootroot00000000000000bin_PROGRAMS = $(top_builddir)/bin/vsearch AM_CFLAGS = -Wall -Wextra -Wpedantic -Wno-ignored-attributes # Conditionally set profiling based on ENABLE_PROFILING if ENABLE_PROFILING AM_CFLAGS += -pg -O1 endif if TARGET_PPC AM_CFLAGS += -mcpu=powerpc64le -maltivec else if TARGET_AARCH64 AM_CFLAGS += -march=armv8-a+simd -mtune=generic else if TARGET_X86_64 AM_CFLAGS += -march=x86-64 -mtune=generic endif endif endif # Conditionally set NDEBUG based on ENABLE_DEBUG if ENABLE_DEBUG # next warnings to activate: # -fsanitize=undefined,address (run-time warnings; at first, try one at a time) # -Wconversion # -Wold-style-cast # -Wshadow # -Wsign-conversion # -Wunused-macros # -Wuseless-cast # GCC 14: -Wnrvo (Named Return Value Optimization) AM_CFLAGS += -UNDEBUG -D_GLIBCXX_DEBUG -Wcast-align -Wdate-time \ -Wdouble-promotion -Wduplicated-branches -Wduplicated-cond -Wfloat-equal \ -Wformat=1 -Wformat-overflow -Wlogical-op -Wnon-virtual-dtor -Wnull-dereference \ -Woverloaded-virtual -Wuninitialized \ -Wunsafe-loop-optimizations -Wunused -Wvla \ -Wno-ignored-attributes # -Wno-ignored-attributes: ignore warnings due to # std::array<__m128i> (__m128i is declared with # alignment attributes which are ignored by the # std::array template else AM_CFLAGS += -DNDEBUG endif AM_CXXFLAGS = $(AM_CFLAGS) -std=c++11 export MACOSX_DEPLOYMENT_TARGET=10.9 VSEARCHHEADERS=\ align_simd.h \ allpairs.h \ arch.h \ attributes.h \ bitmap.h \ chimera.h \ city.h \ cluster.h \ cpu.h \ cut.h \ db.h \ dbhash.h \ dbindex.h \ derep.h \ derep_prefix.h \ derep_smallmem.h \ dynlibs.h \ eestats.h \ fasta2fastq.h \ fasta.h \ fastq.h \ fastq_chars.h \ fastq_join.h \ fastq_mergepairs.h \ fastq_stats.h \ fastqops.h \ fastx.h \ filter.h \ getseq.h \ kmerhash.h \ linmemalign.h \ mask.h \ md5.h \ minheap.h \ msa.h \ 
orient.h \ otutable.h \ rereplicate.h \ results.h \ search.h \ searchcore.h \ search_exact.h \ sff_convert.h \ showalign.h \ sha1.h \ shuffle.h \ sintax.h \ sortbylength.h \ sortbysize.h \ subsample.h \ tax.h \ udb.h \ unique.h \ userfields.h \ util.h \ utils/check_output_filehandle.hpp \ utils/cigar.hpp \ utils/cigar_operations.hpp \ utils/compare_strings_nocase.hpp \ utils/fatal.hpp \ utils/kmer_hash_struct.hpp \ utils/maps.hpp \ utils/open_file.hpp \ utils/os_byteswap.hpp \ utils/progress.hpp \ utils/seqcmp.hpp \ utils/span.hpp \ utils/taxonomic_fields.h \ utils/xpthread.hpp \ vsearch.h \ xstring.h if TARGET_X86_64 libcpu_sse2_a_SOURCES = cpu.cc $(VSEARCHHEADERS) libcpu_sse2_a_CXXFLAGS = $(AM_CXXFLAGS) -msse2 libcpu_ssse3_a_SOURCES = cpu.cc $(VSEARCHHEADERS) libcpu_ssse3_a_CXXFLAGS = $(AM_CXXFLAGS) -mssse3 -DSSSE3 noinst_LIBRARIES = libcpu_sse2.a libcpu_ssse3.a libcityhash.a else libcpu_a_SOURCES = cpu.cc $(VSEARCHHEADERS) noinst_LIBRARIES = libcpu.a libcityhash.a endif libcityhash_a_SOURCES = city.cc city.h if TARGET_WIN libcityhash_a_CXXFLAGS = $(AM_CXXFLAGS) -Wno-sign-compare -D_MSC_VER __top_builddir__bin_vsearch_LDFLAGS = -static __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu_ssse3.a libcpu_sse2.a else libcityhash_a_CXXFLAGS = $(AM_CXXFLAGS) -Wno-sign-compare if TARGET_X86_64 __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu_ssse3.a libcpu_sse2.a else __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu.a endif endif __top_builddir__bin_vsearch_SOURCES = $(VSEARCHHEADERS) \ align_simd.cc \ allpairs.cc \ arch.cc \ attributes.cc \ bitmap.cc \ chimera.cc \ cluster.cc \ cut.cc \ db.cc \ dbhash.cc \ dbindex.cc \ derep.cc \ derep_prefix.cc \ derep_smallmem.cc \ dynlibs.cc \ eestats.cc \ fasta2fastq.cc \ fasta.cc \ fastq.cc \ fastq_chars.cc \ fastq_join.cc \ fastq_mergepairs.cc \ fastq_stats.cc \ fastqops.cc \ fastx.cc \ filter.cc \ getseq.cc \ kmerhash.cc \ linmemalign.cc \ mask.cc \ md5.c \ minheap.cc \ msa.cc \ orient.cc \ otutable.cc \ 
rereplicate.cc \ results.cc \ search.cc \ searchcore.cc \ search_exact.cc \ sff_convert.cc \ sha1.c \ showalign.cc \ shuffle.cc \ sintax.cc \ sortbylength.cc \ sortbysize.cc \ subsample.cc \ tax.cc \ udb.cc \ unique.cc \ userfields.cc \ util.cc \ utils/fatal.cpp \ utils/maps.cpp \ utils/check_output_filehandle.cpp \ utils/cigar.cpp \ utils/compare_strings_nocase.cpp \ utils/open_file.cpp \ utils/os_byteswap.cpp \ utils/xpthread.cpp \ utils/seqcmp.cpp \ vsearch.cc vsearch-2.30.1/src/align_simd.cc000066400000000000000000001704621506776541700164700ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "align_simd.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::min, std::max #include #include // int64_t, uint64_t #include // std::printf, std::snprintf #include // std::memcpy, std::memmove, std::memset, std::strcpy, std::strlen #include /* Using 16-bit signed values, from -32768 to +32767. match: positive mismatch: negative gap penalties: positive (open, extend, query/target, left/interior/right) optimal global alignment (NW) maximize score */ constexpr auto matrix_size = 16; constexpr auto CHANNELS = 8; constexpr auto CDEPTH = 4; constexpr auto maxseqlenproduct = 25000000LL; // anonymous namespace: limit visibility and usage to this translation unit namespace { constexpr std::array sym_nt_4bit = {{'-', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}}; } // end of anonymous namespace /* Due to memory usage, limit the product of the length of the sequences. 
If the product of the query length and any target sequence length is above the limit, the alignment will not be computed and a score of SHORT_MAX (+32,767) will be returned as the score. If an overflow occurs during alignment computation, a score of SHORT_MAX (+32,767) will also be returned. The limit is set to 5 000 * 5 000 = 25 000 000. This will allocate up to 200 MB per thread. It will align pairs of sequences less than 5000 nt long using the SIMD implementation, larger alignments will be performed with the linear memory aligner. */ static std::array, matrix_size> scorematrix {{}}; /* The macros below usually operate on 128-bit vectors of 8 signed short 16-bit integers. Additions and subtractions should be saturated. The shift operation should shift left by 2 bytes (one short int) and shift in zeros. The v_mask_gt operation should compare two vectors of signed shorts and return a 16-bit bitmask with pairs of 2 bits set for each element greater in the first than in the second argument. 
*/ #ifdef __PPC__ using VECTOR_SHORT = __vector signed short; constexpr __vector unsigned char perm_merge_long_low = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17}; constexpr __vector unsigned char perm_merge_long_high = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; #define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h} #define v_load(a) vec_ld(0, (VECTOR_SHORT *)(a)) #define v_store(a, b) vec_st((__vector unsigned char)(b), 0, \ (__vector unsigned char *)(a)) #define v_add(a, b) vec_adds((a), (b)) #define v_sub(a, b) vec_subs((a), (b)) #define v_sub_unsigned(a, b) ((VECTOR_SHORT) \ vec_subs((__vector unsigned short) (a), \ (__vector unsigned short) (b))) #define v_max(a, b) vec_max((a), (b)) #define v_min(a, b) vec_min((a), (b)) #define v_dup(a) vec_splat((VECTOR_SHORT){(short)(a), 0, 0, 0, 0, 0, 0, 0}, 0); #define v_zero vec_splat_s16(0) #define v_and(a, b) vec_and((a), (b)) #define v_xor(a, b) vec_xor((a), (b)) #define v_shift_left(a) vec_sld((a), v_zero, 2) #elif defined __aarch64__ using VECTOR_SHORT = int16x8_t; constexpr uint16x8_t neon_mask = {0x0003, 0x000c, 0x0030, 0x00c0, 0x0300, 0x0c00, 0x3000, 0xc000}; // warning: ISO C++ forbids compound-literals [-Wpedantic] (line below) (clang specific?) 
#define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h} #define v_load(a) vld1q_s16((const int16_t *)(a)) #define v_store(a, b) vst1q_s16((int16_t *)(a), (b)) #define v_merge_lo_16(a, b) vzip1q_s16((a),(b)) #define v_merge_hi_16(a, b) vzip2q_s16((a),(b)) #define v_merge_lo_32(a, b) vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b))) #define v_merge_hi_32(a, b) vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(a), vreinterpretq_s32_s16(b))) #define v_merge_lo_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(a)), vget_low_s64(vreinterpretq_s64_s16(b)))) #define v_merge_hi_64(a, b) vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s16(a)), vget_high_s64(vreinterpretq_s64_s16(b)))) #define v_add(a, b) vqaddq_s16((a), (b)) #define v_sub(a, b) vqsubq_s16((a), (b)) #define v_sub_unsigned(a, b) vreinterpretq_s16_u16(vqsubq_u16(vreinterpretq_u16_s16(a), vreinterpretq_u16_s16(b))) #define v_max(a, b) vmaxq_s16((a), (b)) #define v_min(a, b) vminq_s16((a), (b)) #define v_dup(a) vdupq_n_s16(a) #define v_zero v_dup(0) #define v_and(a, b) vandq_s16((a), (b)) #define v_xor(a, b) veorq_s16((a), (b)) #define v_shift_left(a) vextq_s16((v_zero), (a), 7) #define v_mask_gt(a, b) vaddvq_u16(vandq_u16((vcgtq_s16((a), (b))), neon_mask)) #elif defined(__x86_64__) || defined(SIMDE_VERSION) using VECTOR_SHORT = __m128i; #define v_init(a,b,c,d,e,f,g,h) _mm_set_epi16(h,g,f,e,d,c,b,a) #define v_load(a) _mm_load_si128((VECTOR_SHORT *)(a)) #define v_store(a, b) _mm_store_si128((VECTOR_SHORT *)(a), (b)) #define v_merge_lo_16(a, b) _mm_unpacklo_epi16((a),(b)) #define v_merge_hi_16(a, b) _mm_unpackhi_epi16((a),(b)) #define v_merge_lo_32(a, b) _mm_unpacklo_epi32((a),(b)) #define v_merge_hi_32(a, b) _mm_unpackhi_epi32((a),(b)) #define v_merge_lo_64(a, b) _mm_unpacklo_epi64((a),(b)) #define v_merge_hi_64(a, b) _mm_unpackhi_epi64((a),(b)) #define v_add(a, b) _mm_adds_epi16((a), (b)) #define v_sub(a, b) 
_mm_subs_epi16((a), (b))
#define v_sub_unsigned(a, b) _mm_subs_epu16((a), (b))
#define v_max(a, b) _mm_max_epi16((a), (b))
#define v_min(a, b) _mm_min_epi16((a), (b))
#define v_dup(a) _mm_set1_epi16(a)
#define v_zero v_dup(0)
#define v_and(a, b) _mm_and_si128((a), (b))
#define v_xor(a, b) _mm_xor_si128((a), (b))
#define v_shift_left(a) _mm_slli_si128((a), 2)
#define v_mask_gt(a, b) _mm_movemask_epi8(_mm_cmpgt_epi16((a), (b)))

#else

/* None of the PPC, aarch64 or x86-64/SIMDe branches matched: stop the build. */
#error Unknown Architecture

#endif


/*
  State record for the 16-bit SIMD aligner. Every member has a default
  initializer (null pointer or zero).

  NOTE(review): the roles suggested by the member names (score matrix,
  H/E array, database profile, query table, direction buffer, CIGAR
  buffer) are not established by the code visible here - confirm
  against the functions that allocate and use this struct.
  NOTE(review): the template arguments of 'std::array' appear to have
  been lost in this copy of the file.
*/
struct s16info_s
{
  std::array matrix {{}};
  VECTOR_SHORT * hearray = nullptr;
  VECTOR_SHORT * dprofile = nullptr;
  VECTOR_SHORT ** qtable = nullptr;
  unsigned short * dir = nullptr;
  char * qseq = nullptr;
  uint64_t diralloc = 0;

  char * cigar = nullptr;
  char * cigarend = nullptr;
  int64_t cigaralloc = 0;
  int opcount = 0;
  char op = '\0';

  int qlen = 0;
  int maxdlen = 0;

  /* twelve gap penalties: {open, extension} x {query, target}
     x {left, interior, right} */
  CELL penalty_gap_open_query_left = 0;
  CELL penalty_gap_open_target_left = 0;
  CELL penalty_gap_open_query_interior = 0;
  CELL penalty_gap_open_target_interior = 0;
  CELL penalty_gap_open_query_right = 0;
  CELL penalty_gap_open_target_right = 0;
  CELL penalty_gap_extension_query_left = 0;
  CELL penalty_gap_extension_target_left = 0;
  CELL penalty_gap_extension_query_interior = 0;
  CELL penalty_gap_extension_target_interior = 0;
  CELL penalty_gap_extension_query_right = 0;
  CELL penalty_gap_extension_target_right = 0;
};


/*
  Debug helper: print the eight 16-bit lanes of x to stdout as unsigned
  values, lane 7 first and lane 0 last, each in a field of width 6 and
  separated by single spaces. No trailing newline is written.
*/
auto _mm_print(VECTOR_SHORT const x) -> void
{
  auto * y = (unsigned short *) &x;
  for (int i = 0; i < 8; i++)
    {
      printf("%s%6d", (i > 0 ? " " : ""), y[7 - i]);
    }
}


/*
  Debug helper: same as _mm_print, but the lanes are read as signed
  16-bit values and printed in a field of width 2.
*/
auto _mm_print2(VECTOR_SHORT const x) -> void
{
  auto * y = (signed short *) &x;
  for (int i = 0; i < 8; i++)
    {
      printf("%s%2d", (i > 0 ? " " : ""), y[7 - i]);
    }
}


/*
  Debug helper: print a profile to stdout. After a "dprofile:" heading,
  one line is written per symbol i (matrix_size lines, labelled with
  sym_nt_4bit[i]); each line holds CDEPTH bracketed groups of CHANNELS
  values, read from
    dprofile[(CHANNELS * CDEPTH * i) + (CHANNELS * k) + j]
  with k the group index and j the index within the group.
*/
auto dprofile_dump16(CELL const * dprofile) -> void
{
  printf("\ndprofile:\n");
  for (int i = 0; i < matrix_size; i++)
    {
      printf("%c: ", sym_nt_4bit[i]);
      for (int k = 0; k < CDEPTH; k++)
        {
          printf("[");
          for (int j = 0; j < CHANNELS; j++)
            {
              printf(" %3d", dprofile[(CHANNELS * CDEPTH * i) + (CHANNELS * k) + j]);
            }
          printf("]");
        }
      printf("\n");
    }
}


/*
  Debug helper: print a matrix_size x matrix_size score matrix stored
  row by row. Each output line starts with the row index and the
  corresponding sym_nt_4bit symbol, followed by the scores of that row.
*/
auto dumpscorematrix(CELL const * score_matrix) -> void
{
  for (auto i = 0; i < matrix_size; ++i)
    {
      std::printf("%2d %c", i, sym_nt_4bit[i]);
      for (auto j = 0; j < matrix_size; ++j)
        {
          std::printf(" %2d", score_matrix[(matrix_size * i) + j]);
        }
      std::printf("\n");
    }
}


/*
  Fill dprofile_word from score_matrix_word for the symbols in dseq.

  dseq is read at indices (j * CHANNELS) + z, for j in [0, CDEPTH) and
  z in [0, CHANNELS). For each j, the CHANNELS symbol codes are shifted
  left by 4 to form offsets into score_matrix_word. Then, for i = 0 and
  i = 8, eight vectors are loaded (one per channel, from offset
  d[z] + i), interleaved through three merge stages (16-, 32- and
  64-bit), and the eight resulting vectors are stored at
    dprofile_word + (CDEPTH * CHANNELS * (i + r)) + (CHANNELS * j)
  for r in [0, 8).

  NOTE(review): on x86-64, v_load and v_store expand to
  _mm_load_si128 and _mm_store_si128, so both buffers presumably have
  to be 16-byte aligned - confirm at the allocation sites.
*/
auto dprofile_fill16(CELL * dprofile_word,
                     CELL * score_matrix_word,
                     BYTE const * dseq) -> void
{
#if 0
  dumpscorematrix(score_matrix_word);

  for (int j = 0; j < CDEPTH; j++)
    {
      for (int z = 0; z < CHANNELS; z++)
        std::fprintf(stderr, " [%c]", sym_nt_4bit[dseq[j * CHANNELS + z]]);
      std::fprintf(stderr, "\n");
    }
#endif

  for (int j = 0; j < CDEPTH; j++)
    {
      /* d[z]: symbol code of channel z at depth j, shifted left by 4;
         used below as an offset into score_matrix_word */
      std::array d {{}};
      for (int z = 0; z < CHANNELS; z++)
        {
          d[z] = dseq[(j * CHANNELS) + z] << 4U;
        }

      /* two passes (i = 0 and i = 8), each handling 8 values per load */
      for (int i = 0; i < matrix_size; i += 8)
        {

#ifdef __PPC__
          __vector signed short reg0;
          __vector signed short reg1;
          __vector signed short reg2;
          __vector signed short reg3;
          __vector signed short reg4;
          __vector signed short reg5;
          __vector signed short reg6;
          __vector signed short reg7;

          __vector signed int reg8;
          __vector signed int reg9;
          __vector signed int reg10;
          __vector signed int reg11;
          __vector signed int reg12;
          __vector signed int reg13;
          __vector signed int reg14;
          __vector signed int reg15;

          __vector signed long long reg16;
          __vector signed long long reg17;
          __vector signed long long reg18;
          __vector signed long long reg19;
          __vector signed long long reg20;
          __vector signed long long reg21;
          __vector signed long long reg22;
          __vector signed long long reg23;
          __vector signed long long reg24;
          __vector signed long long reg25;
          __vector signed long long reg26;
          __vector signed long long reg27;
          __vector signed long long
reg28; __vector signed long long reg29; __vector signed long long reg30; __vector signed long long reg31; #else VECTOR_SHORT reg0; VECTOR_SHORT reg1; VECTOR_SHORT reg2; VECTOR_SHORT reg3; VECTOR_SHORT reg4; VECTOR_SHORT reg5; VECTOR_SHORT reg6; VECTOR_SHORT reg7; VECTOR_SHORT reg8; VECTOR_SHORT reg9; VECTOR_SHORT reg10; VECTOR_SHORT reg11; VECTOR_SHORT reg12; VECTOR_SHORT reg13; VECTOR_SHORT reg14; VECTOR_SHORT reg15; VECTOR_SHORT reg16; VECTOR_SHORT reg17; VECTOR_SHORT reg18; VECTOR_SHORT reg19; VECTOR_SHORT reg20; VECTOR_SHORT reg21; VECTOR_SHORT reg22; VECTOR_SHORT reg23; VECTOR_SHORT reg24; VECTOR_SHORT reg25; VECTOR_SHORT reg26; VECTOR_SHORT reg27; VECTOR_SHORT reg28; VECTOR_SHORT reg29; VECTOR_SHORT reg30; VECTOR_SHORT reg31; #endif reg0 = v_load(score_matrix_word + d[0] + i); reg1 = v_load(score_matrix_word + d[1] + i); reg2 = v_load(score_matrix_word + d[2] + i); reg3 = v_load(score_matrix_word + d[3] + i); reg4 = v_load(score_matrix_word + d[4] + i); reg5 = v_load(score_matrix_word + d[5] + i); reg6 = v_load(score_matrix_word + d[6] + i); reg7 = v_load(score_matrix_word + d[7] + i); #ifdef __PPC__ reg8 = (__vector signed int) vec_mergeh(reg0, reg1); reg9 = (__vector signed int) vec_mergel(reg0, reg1); reg10 = (__vector signed int) vec_mergeh(reg2, reg3); reg11 = (__vector signed int) vec_mergel(reg2, reg3); reg12 = (__vector signed int) vec_mergeh(reg4, reg5); reg13 = (__vector signed int) vec_mergel(reg4, reg5); reg14 = (__vector signed int) vec_mergeh(reg6, reg7); reg15 = (__vector signed int) vec_mergel(reg6, reg7); reg16 = (__vector signed long long) vec_mergeh(reg8, reg10); reg17 = (__vector signed long long) vec_mergel(reg8, reg10); reg18 = (__vector signed long long) vec_mergeh(reg12, reg14); reg19 = (__vector signed long long) vec_mergel(reg12, reg14); reg20 = (__vector signed long long) vec_mergeh(reg9, reg11); reg21 = (__vector signed long long) vec_mergel(reg9, reg11); reg22 = (__vector signed long long) vec_mergeh(reg13, reg15); reg23 = 
(__vector signed long long) vec_mergel(reg13, reg15); reg24 = (__vector signed long long) vec_perm (reg16, reg18, perm_merge_long_low); reg25 = (__vector signed long long) vec_perm (reg16, reg18, perm_merge_long_high); reg26 = (__vector signed long long) vec_perm (reg17, reg19, perm_merge_long_low); reg27 = (__vector signed long long) vec_perm (reg17, reg19, perm_merge_long_high); reg28 = (__vector signed long long) vec_perm (reg20, reg22, perm_merge_long_low); reg29 = (__vector signed long long) vec_perm (reg20, reg22, perm_merge_long_high); reg30 = (__vector signed long long) vec_perm (reg21, reg23, perm_merge_long_low); reg31 = (__vector signed long long) vec_perm (reg21, reg23, perm_merge_long_high); #else reg8 = v_merge_lo_16(reg0, reg1); reg9 = v_merge_hi_16(reg0, reg1); reg10 = v_merge_lo_16(reg2, reg3); reg11 = v_merge_hi_16(reg2, reg3); reg12 = v_merge_lo_16(reg4, reg5); reg13 = v_merge_hi_16(reg4, reg5); reg14 = v_merge_lo_16(reg6, reg7); reg15 = v_merge_hi_16(reg6, reg7); reg16 = v_merge_lo_32(reg8, reg10); reg17 = v_merge_hi_32(reg8, reg10); reg18 = v_merge_lo_32(reg12, reg14); reg19 = v_merge_hi_32(reg12, reg14); reg20 = v_merge_lo_32(reg9, reg11); reg21 = v_merge_hi_32(reg9, reg11); reg22 = v_merge_lo_32(reg13, reg15); reg23 = v_merge_hi_32(reg13, reg15); reg24 = v_merge_lo_64(reg16, reg18); reg25 = v_merge_hi_64(reg16, reg18); reg26 = v_merge_lo_64(reg17, reg19); reg27 = v_merge_hi_64(reg17, reg19); reg28 = v_merge_lo_64(reg20, reg22); reg29 = v_merge_hi_64(reg20, reg22); reg30 = v_merge_lo_64(reg21, reg23); reg31 = v_merge_hi_64(reg21, reg23); #endif v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 0)) + (CHANNELS * j), reg24); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 1)) + (CHANNELS * j), reg25); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 2)) + (CHANNELS * j), reg26); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 3)) + (CHANNELS * j), reg27); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 4)) + (CHANNELS * j), reg28); 
v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 5)) + (CHANNELS * j), reg29); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 6)) + (CHANNELS * j), reg30); v_store(dprofile_word + (CDEPTH * CHANNELS * (i + 7)) + (CHANNELS * j), reg31); } } #if 0 dprofile_dump16(dprofile_word); #endif } /* The direction bits are set as follows: in DIR[0..1] if F>H initially (must go up) (4th pri) in DIR[2..3] if E>max(H,F) (must go left) (3rd pri) in DIR[4..5] if new F>H (must extend up) (2nd pri) in DIR[6..7] if new E>H (must extend left) (1st pri) no bits set: go diagonally */ /* On PPC the fifth parameter is a vector for the result in the lower 64 bits. On x86_64 the fifth parameter is the address to write the result to. */ #ifdef __PPC__ /* Handle differences between GNU and IBM compilers */ #ifdef __IBMCPP__ #define VECTORBYTEPERMUTE vec_bperm #else #define VECTORBYTEPERMUTE vec_vbpermq #endif /* The VSX vec_bperm instruction puts the 16 selected bits of the first source into bits 48-63 of the destination. 
*/ constexpr __vector unsigned char perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; #define ALIGNCORE(H, N, F, V, RES, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \ { \ __vector unsigned short W, X, Y, Z; \ __vector unsigned int WX, YZ; \ __vector short VV; \ VV = v_load(&V); \ H = v_add(H, VV); \ W = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(F, H), perm); \ H = v_max(H, F); \ X = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(E, H), perm); \ H = v_max(H, E); \ H_MIN = v_min(H_MIN, H); \ H_MAX = v_max(H_MAX, H); \ N = H; \ HF = v_sub(H, QR_t); \ F = v_sub(F, R_t); \ Y = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(F, HF), perm); \ F = v_max(F, HF); \ HE = v_sub(H, QR_q); \ E = v_sub(E, R_q); \ Z = (__vector unsigned short) VECTORBYTEPERMUTE \ ((__vector unsigned char) vec_cmpgt(E, HE), perm); \ E = v_max(E, HE); \ WX = (__vector unsigned int) vec_mergel(W, X); \ YZ = (__vector unsigned int) vec_mergel(Y, Z); \ RES = (__vector unsigned long long) vec_mergeh(WX, YZ); \ } #else /* x86_64 & aarch64 */ #define ALIGNCORE(H, N, F, V, PATH, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \ H = v_add(H, V); \ *((PATH)+0) = v_mask_gt(F, H); \ (H) = v_max(H, F); \ *((PATH)+1) = v_mask_gt(E, H); \ (H) = v_max(H, E); \ (H_MIN) = v_min(H_MIN, H); \ (H_MAX) = v_max(H_MAX, H); \ (N) = H; \ HF = v_sub(H, QR_t); \ (F) = v_sub(F, R_t); \ *((PATH)+2) = v_mask_gt(F, HF); \ (F) = v_max(F, HF); \ HE = v_sub(H, QR_q); \ E = v_sub(E, R_q); \ *((PATH)+3) = v_mask_gt(E, HE); \ E = v_max(E, HE); #endif auto aligncolumns_first(VECTOR_SHORT * Sm, VECTOR_SHORT * hep, VECTOR_SHORT ** qp, VECTOR_SHORT QR_q_i, VECTOR_SHORT R_q_i, VECTOR_SHORT QR_q_r, VECTOR_SHORT R_q_r, VECTOR_SHORT QR_t_0, VECTOR_SHORT R_t_0, VECTOR_SHORT QR_t_1, VECTOR_SHORT R_t_1, VECTOR_SHORT QR_t_2, VECTOR_SHORT R_t_2, VECTOR_SHORT QR_t_3, VECTOR_SHORT R_t_3, VECTOR_SHORT h0, VECTOR_SHORT h1, VECTOR_SHORT h2, 
VECTOR_SHORT h3, VECTOR_SHORT f0, VECTOR_SHORT f1, VECTOR_SHORT f2, VECTOR_SHORT f3, VECTOR_SHORT * _h_min, VECTOR_SHORT * _h_max, VECTOR_SHORT Mm, VECTOR_SHORT M_QR_t_left, VECTOR_SHORT M_R_t_left, VECTOR_SHORT M_QR_q_interior, VECTOR_SHORT M_QR_q_right, int64_t ql, unsigned short * dir) -> void { VECTOR_SHORT h4; VECTOR_SHORT h5; VECTOR_SHORT h6; VECTOR_SHORT h7; VECTOR_SHORT h8; VECTOR_SHORT E; VECTOR_SHORT HE; VECTOR_SHORT HF; VECTOR_SHORT const * vp = nullptr; VECTOR_SHORT h_min = v_zero; VECTOR_SHORT h_max = v_zero; #ifdef __PPC__ __vector unsigned long long RES1; __vector unsigned long long RES2; __vector unsigned long long RES; #endif int64_t i = 0; f0 = v_sub(f0, QR_t_0); f1 = v_sub(f1, QR_t_1); f2 = v_sub(f2, QR_t_2); f3 = v_sub(f3, QR_t_3); for (i = 0; i < ql - 1; i++) { vp = qp[i + 0]; h4 = hep[(2 * i) + 0]; E = hep[(2 * i) + 1]; /* Initialize selected h and e values for next/this round. First zero those cells where a new sequence starts by using an unsigned saturated subtraction of a huge value to set it to zero. Then use signed subtraction to obtain the correct value. 
*/ h4 = v_sub_unsigned(h4, Mm); h4 = v_sub(h4, M_QR_t_left); E = v_sub_unsigned(E, Mm); E = v_sub(E, M_QR_t_left); E = v_sub(E, M_QR_q_interior); M_QR_t_left = v_add(M_QR_t_left, M_R_t_left); #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+0, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+4, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+8, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; h0 = h4; h1 = h5; h2 = h6; h3 = h7; } /* the final round - using query gap penalties for right end */ vp = qp[i + 0]; E = hep[(2 * i) + 1]; E = v_sub_unsigned(E, Mm); E = v_sub(E, M_QR_t_left); E = v_sub(E, M_QR_q_right); #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16*i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir+16*i+ 4, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); 
ALIGNCORE(h2, h7, f2, vp[2], dir+16*i+ 8, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir+16*i+12, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; Sm[0] = h5; Sm[1] = h6; Sm[2] = h7; Sm[3] = h8; *_h_min = h_min; *_h_max = h_max; } auto aligncolumns_rest(VECTOR_SHORT * Sm, VECTOR_SHORT * hep, VECTOR_SHORT ** qp, VECTOR_SHORT QR_q_i, VECTOR_SHORT R_q_i, VECTOR_SHORT QR_q_r, VECTOR_SHORT R_q_r, VECTOR_SHORT QR_t_0, VECTOR_SHORT R_t_0, VECTOR_SHORT QR_t_1, VECTOR_SHORT R_t_1, VECTOR_SHORT QR_t_2, VECTOR_SHORT R_t_2, VECTOR_SHORT QR_t_3, VECTOR_SHORT R_t_3, VECTOR_SHORT h0, VECTOR_SHORT h1, VECTOR_SHORT h2, VECTOR_SHORT h3, VECTOR_SHORT f0, VECTOR_SHORT f1, VECTOR_SHORT f2, VECTOR_SHORT f3, VECTOR_SHORT * _h_min, VECTOR_SHORT * _h_max, int64_t ql, unsigned short * dir) -> void { VECTOR_SHORT h4; VECTOR_SHORT h5; VECTOR_SHORT h6; VECTOR_SHORT h7; VECTOR_SHORT h8; VECTOR_SHORT E; VECTOR_SHORT HE; VECTOR_SHORT HF; VECTOR_SHORT const * vp = nullptr; VECTOR_SHORT h_min = v_zero; VECTOR_SHORT h_max = v_zero; #ifdef __PPC__ __vector unsigned long long RES1; __vector unsigned long long RES2; __vector unsigned long long RES; #endif int64_t i = 0; f0 = v_sub(f0, QR_t_0); f1 = v_sub(f1, QR_t_1); f2 = v_sub(f2, QR_t_2); f3 = v_sub(f3, QR_t_3); for (i = 0; i < ql - 1; i++) { vp = qp[i + 0]; h4 = hep[(2 * i) + 0]; E = hep[(2 * i) + 1]; #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_i, R_q_i, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir+16*i+ 0, QR_q_i, R_q_i, QR_t_0, R_t_0, 
h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir + 16 * i + 4, QR_q_i, R_q_i, QR_t_1, R_t_1, h_min, h_max); ALIGNCORE(h2, h7, f2, vp[2], dir + 16 * i + 8, QR_q_i, R_q_i, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir + 16 * i + 12, QR_q_i, R_q_i, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; h0 = h4; h1 = h5; h2 = h6; h3 = h7; } /* the final round - using query gap penalties for right end */ vp = qp[i + 0]; E = hep[(2 * i) + 1]; #ifdef __PPC__ ALIGNCORE(h0, h5, f0, vp[0], RES1, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], RES2, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 0), RES); ALIGNCORE(h2, h7, f2, vp[2], RES1, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], RES2, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); RES = vec_perm(RES1, RES2, perm_merge_long_low); v_store((dir + 16 * i + 8), RES); #else ALIGNCORE(h0, h5, f0, vp[0], dir + 16 * i + 0, QR_q_r, R_q_r, QR_t_0, R_t_0, h_min, h_max); ALIGNCORE(h1, h6, f1, vp[1], dir + 16 * i + 4, QR_q_r, R_q_r, QR_t_1, R_t_1, h_min, h_max); ALIGNCORE(h2, h7, f2, vp[2], dir + 16 * i + 8, QR_q_r, R_q_r, QR_t_2, R_t_2, h_min, h_max); ALIGNCORE(h3, h8, f3, vp[3], dir + 16 * i + 12, QR_q_r, R_q_r, QR_t_3, R_t_3, h_min, h_max); #endif hep[(2 * i) + 0] = h8; hep[(2 * i) + 1] = E; Sm[0] = h5; Sm[1] = h6; Sm[2] = h7; Sm[3] = h8; *_h_min = h_min; *_h_max = h_max; } inline auto pushop(s16info_s * s, char const newop) -> void { if (newop == s->op) { ++s->opcount; return; } *--s->cigarend = s->op; if (s->opcount > 1) { static constexpr auto size = 11; std::array buffer {{}}; const auto length = std::snprintf(buffer.data(), size, "%d", s->opcount); s->cigarend -= length; std::memcpy(s->cigarend, buffer.data(), length); } s->op = newop; s->opcount = 1; } inline auto finishop(s16info_s * s) -> void { if ((s->op != 0) and (s->opcount != 0)) { *--s->cigarend = s->op; 
if (s->opcount > 1) { static constexpr auto size = 11; std::array buffer {{}}; const auto length = std::snprintf(buffer.data(), size, "%d", s->opcount); s->cigarend -= length; std::memcpy(s->cigarend, buffer.data(), length); } s->op = 0; s->opcount = 0; } } auto backtrack16(s16info_s * s, char * dseq, uint64_t dlen, uint64_t offset, uint64_t channel, unsigned short * paligned, unsigned short * pmatches, unsigned short * pmismatches, unsigned short * pgaps) -> void { unsigned short * dirbuffer = s->dir; uint64_t const dirbuffersize = s->qlen * s->maxdlen * 4; uint64_t const qlen = s->qlen; char const * qseq = s->qseq; uint64_t const maskup = 3ULL << (2 * channel + 0); uint64_t const maskleft = 3ULL << (2 * channel + 16); uint64_t const maskextup = 3ULL << (2 * channel + 32); uint64_t const maskextleft = 3ULL << (2 * channel + 48); #if 0 printf("Dumping backtracking array\n"); for (uint64_t i = 0; i < qlen; i++) { for (uint64_t j = 0; j < dlen; j++) { uint64_t d = *((uint64_t *) (dirbuffer + (offset + matrix_size * s->qlen * (j / 4) + matrix_size * i + 4 * (j & 3)) % dirbuffersize)); if (d & maskup) { if (d & maskleft) printf("+"); else printf("^"); } else if (d & maskleft) { printf("<"); } else { printf("\\"); } } printf("\n"); } printf("Dumping gap extension array\n"); for (uint64_t i = 0; i < qlen; i++) { for (uint64_t j = 0; j < dlen; j++) { uint64_t d = *((uint64_t *) (dirbuffer + (offset + matrix_size * s->qlen * (j / 4) + matrix_size * i + 4 * (j & 3)) % dirbuffersize)); if (d & maskextup) { if (d & maskextleft) printf("+"); else printf("^"); } else if (d & maskextleft) { printf("<"); } else { printf("\\"); } } printf("\n"); } #endif unsigned short aligned = 0; unsigned short matches = 0; unsigned short mismatches = 0; unsigned short gaps = 0; int64_t i = qlen - 1; int64_t j = dlen - 1; s->cigarend = s->cigar + s->qlen + s->maxdlen + 1; s->op = 0; s->opcount = 1; while ((i >= 0) and (j >= 0)) { ++aligned; // future refactoring: // d = *(block1) // block1 = 
dirbuffer + (block2 % dirbuffersize); // block2 = (offset + block3 + block4 + block5); // block3 = 16 * s->qlen * (j / 4); // block4 = 16 * i; // block5 = 4 * (j & 3); uint64_t const d = *((uint64_t *) (dirbuffer + ((offset + (matrix_size * s->qlen * (j / 4)) + (matrix_size * i) + (4 * (j & 3))) % dirbuffersize))); if ((s->op == 'I') and ((d & maskextleft) != 0U)) { --j; pushop(s, 'I'); } else if ((s->op == 'D') and ((d & maskextup) != 0U)) { --i; pushop(s, 'D'); } else if ((d & maskleft) != 0U) { if (s->op != 'I') { ++gaps; } --j; pushop(s, 'I'); } else if ((d & maskup) != 0U) { if (s->op != 'D') { ++gaps; } --i; pushop(s, 'D'); } else { if (is_equivalent_4bit(qseq[i], dseq[j])) { if (opt_n_mismatch and ((map_4bit(qseq[i]) == 15) or (map_4bit(dseq[j]) == 15))) { ++mismatches; } else { ++matches; } } else { ++mismatches; } --i; --j; pushop(s, 'M'); } } while (i >= 0) { ++aligned; if (s->op != 'D') { ++gaps; } --i; pushop(s, 'D'); } while (j >= 0) { ++aligned; if (s->op != 'I') { ++gaps; } --j; pushop(s, 'I'); } finishop(s); /* move cigar to beginning of allocated memory area */ int const cigarlen = s->cigar + s->qlen + s->maxdlen - s->cigarend; std::memmove(s->cigar, s->cigarend, cigarlen + 1); * paligned = aligned; * pmatches = matches; * pmismatches = mismatches; * pgaps = gaps; } auto search16_init(CELL score_match, CELL score_mismatch, CELL penalty_gap_open_query_left, CELL penalty_gap_open_target_left, CELL penalty_gap_open_query_interior, CELL penalty_gap_open_target_interior, CELL penalty_gap_open_query_right, CELL penalty_gap_open_target_right, CELL penalty_gap_extension_query_left, CELL penalty_gap_extension_target_left, CELL penalty_gap_extension_query_interior, CELL penalty_gap_extension_target_interior, CELL penalty_gap_extension_query_right, CELL penalty_gap_extension_target_right) -> struct s16info_s * { (void) score_match; (void) score_mismatch; /* prepare alloc of qtable, dprofile, hearray, dir */ auto * s = (struct s16info_s *) 
xmalloc(sizeof(struct s16info_s)); s->dprofile = (VECTOR_SHORT *) xmalloc(2 * 4 * 8 * 16); s->qlen = 0; s->qseq = nullptr; s->maxdlen = 0; s->dir = nullptr; s->diralloc = 0; s->hearray = nullptr; s->qtable = nullptr; s->cigar = nullptr; s->cigarend = nullptr; s->cigaralloc = 0; for (auto i = 0U; i < matrix_size; ++i) { for (auto j = 0U; j < matrix_size; ++j) { CELL value = 0; if (opt_n_mismatch and ((i == 15U) or (j == 15U))) { value = opt_mismatch; } else if (is_ambiguous_4bit(i) or is_ambiguous_4bit(j)) { value = 0; } else if (i == j) { value = opt_match; } else { value = opt_mismatch; } ((CELL *) (s->matrix.data()))[(matrix_size * i) + j] = value; scorematrix[i][j] = value; } } s->penalty_gap_open_query_left = penalty_gap_open_query_left; s->penalty_gap_open_query_interior = penalty_gap_open_query_interior; s->penalty_gap_open_query_right = penalty_gap_open_query_right; s->penalty_gap_open_target_left = penalty_gap_open_target_left; s->penalty_gap_open_target_interior = penalty_gap_open_target_interior; s->penalty_gap_open_target_right = penalty_gap_open_target_right; s->penalty_gap_extension_query_left = penalty_gap_extension_query_left; s->penalty_gap_extension_query_interior = penalty_gap_extension_query_interior; s->penalty_gap_extension_query_right = penalty_gap_extension_query_right; s->penalty_gap_extension_target_left = penalty_gap_extension_target_left; s->penalty_gap_extension_target_interior = penalty_gap_extension_target_interior; s->penalty_gap_extension_target_right = penalty_gap_extension_target_right; return s; } auto search16_exit(s16info_s * s) -> void { /* free mem for dprofile, hearray, dir, qtable */ if (s->dir != nullptr) { xfree(s->dir); } if (s->hearray != nullptr) { xfree(s->hearray); } if (s->dprofile != nullptr) { xfree(s->dprofile); } if (s->qtable != nullptr) { xfree(s->qtable); } if (s->cigar != nullptr) { xfree(s->cigar); } xfree(s); } auto search16_qprep(s16info_s * s, char * qseq, int qlen) -> void { s->qlen = qlen; s->qseq = 
qseq; if (s->hearray != nullptr) { xfree(s->hearray); } s->hearray = (VECTOR_SHORT *) xmalloc(2 * s->qlen * sizeof(VECTOR_SHORT)); std::memset(s->hearray, 0, 2 * s->qlen * sizeof(VECTOR_SHORT)); if (s->qtable != nullptr) { xfree(s->qtable); } s->qtable = (VECTOR_SHORT **) xmalloc(s->qlen * sizeof(VECTOR_SHORT*)); for (int i = 0; i < qlen; i++) { s->qtable[i] = s->dprofile + (4 * map_4bit(qseq[i])); } } auto compute_score_min(struct s16info_s const & alignment) -> short { auto const gap_penalty_max = std::max({ 0, alignment.penalty_gap_open_query_left + alignment.penalty_gap_extension_query_left, alignment.penalty_gap_open_query_interior + alignment.penalty_gap_extension_query_interior, alignment.penalty_gap_open_query_right + alignment.penalty_gap_extension_query_right, alignment.penalty_gap_open_target_left + alignment.penalty_gap_extension_target_left, alignment.penalty_gap_open_target_interior + alignment.penalty_gap_extension_target_interior, alignment.penalty_gap_open_target_right + alignment.penalty_gap_extension_target_right }); return static_cast(std::numeric_limits::min() + gap_penalty_max); } /* Turn off tree-partial-pre optimizations for the rest of the file. GNU C++ 9 or later generates incorrect code on x86_64 if turned on. 
*/ #ifdef __GNUC__ #ifndef __clang__ #pragma GCC optimize ("-fno-tree-partial-pre") #endif #endif auto search16(s16info_s * s, unsigned int sequences, unsigned int * seqnos, CELL * pscores, unsigned short * paligned, unsigned short * pmatches, unsigned short * pmismatches, unsigned short * pgaps, char ** pcigar) -> void { CELL ** q_start = (CELL **) s->qtable; CELL * dprofile = (CELL *) s->dprofile; CELL * hearray = (CELL *) s->hearray; uint64_t const qlen = s->qlen; if (qlen == 0) { for (auto cand_id = 0U; cand_id < sequences; cand_id++) { auto const seqno = seqnos[cand_id]; int64_t const length = db_getsequencelen(seqno); paligned[cand_id] = length; pmatches[cand_id] = 0; pmismatches[cand_id] = 0; pgaps[cand_id] = length; if (length == 0) { pscores[cand_id] = 0; } else { pscores[cand_id] = std::max(- s->penalty_gap_open_target_left - (length * s->penalty_gap_extension_target_left), - s->penalty_gap_open_target_right - (length * s->penalty_gap_extension_target_right)); } char * cigar = nullptr; if (length > 0) { auto const ret = xsprintf(&cigar, "%ldI", length); if ((ret < 2) or (cigar == nullptr)) { fatal("Unable to allocate enough memory."); } } else { cigar = (char *) xmalloc(1); cigar[0] = 0; } pcigar[cand_id] = cigar; } return; } /* find longest target sequence and reallocate direction buffer */ uint64_t maxdlen = 0; for (int64_t i = 0; i < sequences; i++) { uint64_t const dlen = db_getsequencelen(seqnos[i]); /* skip the very long sequences */ if ((int64_t) (s->qlen) * dlen <= maxseqlenproduct) { maxdlen = std::max(dlen, maxdlen); } } maxdlen = 4 * ((maxdlen + 3) / 4); s->maxdlen = maxdlen; uint64_t const dirbuffersize = s->qlen * s->maxdlen * 4; if (dirbuffersize > s->diralloc) { s->diralloc = dirbuffersize; if (s->dir != nullptr) { xfree(s->dir); } s->dir = (unsigned short*) xmalloc(dirbuffersize * sizeof(unsigned short)); } unsigned short * dirbuffer = s->dir; if (s->qlen + s->maxdlen + 1 > s->cigaralloc) { s->cigaralloc = s->qlen + s->maxdlen + 1; if 
(s->cigar != nullptr) { xfree(s->cigar); } s->cigar = (char *) xmalloc(s->cigaralloc); } VECTOR_SHORT M; VECTOR_SHORT T0; VECTOR_SHORT M_QR_target_left; VECTOR_SHORT M_R_target_left; VECTOR_SHORT M_QR_query_interior; VECTOR_SHORT M_QR_query_right; VECTOR_SHORT R_query_left; VECTOR_SHORT QR_query_interior; VECTOR_SHORT R_query_interior; VECTOR_SHORT QR_query_right; VECTOR_SHORT R_query_right; VECTOR_SHORT QR_target_left; VECTOR_SHORT R_target_left; VECTOR_SHORT QR_target_interior; VECTOR_SHORT R_target_interior; VECTOR_SHORT QR_target_right; VECTOR_SHORT R_target_right; std::array QR_target {{}}; std::array R_target {{}}; VECTOR_SHORT * hep = nullptr; VECTOR_SHORT ** qp = nullptr; std::array d_begin {{}}; std::array d_end {{}}; std::array d_offset {{}}; std::array d_address {{}}; std::array d_length {{}}; std::array seq_id {{}}; std::array overflow {{}}; std::array dseqalloc {{}}; std::array S {{}}; BYTE * dseq = (BYTE *) dseqalloc.data(); BYTE zero = 0; uint64_t next_id = 0; uint64_t done = 0; T0 = v_init(-1, 0, 0, 0, 0, 0, 0, 0); R_query_left = v_dup(s->penalty_gap_extension_query_left); QR_query_interior = v_dup((s->penalty_gap_open_query_interior + s->penalty_gap_extension_query_interior)); R_query_interior = v_dup(s->penalty_gap_extension_query_interior); QR_query_right = v_dup((s->penalty_gap_open_query_right + s->penalty_gap_extension_query_right)); R_query_right = v_dup(s->penalty_gap_extension_query_right); QR_target_left = v_dup((s->penalty_gap_open_target_left + s->penalty_gap_extension_target_left)); R_target_left = v_dup(s->penalty_gap_extension_target_left); QR_target_interior = v_dup((s->penalty_gap_open_target_interior + s->penalty_gap_extension_target_interior)); R_target_interior = v_dup(s->penalty_gap_extension_target_interior); QR_target_right = v_dup((s->penalty_gap_open_target_right + s->penalty_gap_extension_target_right)); R_target_right = v_dup(s->penalty_gap_extension_target_right); hep = (VECTOR_SHORT *) hearray; qp = (VECTOR_SHORT **) 
q_start; for (int c = 0; c < CHANNELS; c++) { d_begin[c] = &zero; d_end[c] = d_begin[c]; d_address[c] = nullptr; d_offset[c] = 0; d_length[c] = 0; seq_id[c] = -1; overflow[c] = false; } auto const score_min = compute_score_min(*s); auto const score_max = std::numeric_limits::max(); for (int i = 0; i < 4; i++) { S[i] = v_zero; dseqalloc[i] = v_zero; } VECTOR_SHORT H0 = v_zero; VECTOR_SHORT H1 = v_zero; VECTOR_SHORT H2 = v_zero; VECTOR_SHORT H3 = v_zero; VECTOR_SHORT F0 = v_zero; VECTOR_SHORT F1 = v_zero; VECTOR_SHORT F2 = v_zero; VECTOR_SHORT F3 = v_zero; bool easy = false; unsigned short * dir = dirbuffer; while (true) { if (easy) { /* fill all channels with symbols from the database sequences */ for (int c = 0; c < CHANNELS; c++) { for (int j = 0; j < CDEPTH; j++) { if (d_begin[c] < d_end[c]) { dseq[(CHANNELS * j) + c] = map_4bit(*(d_begin[c]++)); } else { dseq[(CHANNELS * j) + c] = 0; } } if (d_begin[c] == d_end[c]) { easy = false; } } dprofile_fill16(dprofile, (CELL*) s->matrix.data(), dseq); /* create vectors of gap penalties for target depending on whether any of the database sequences ended in these four columns */ if (easy) { for (unsigned int j = 0; j < CDEPTH; j++) { QR_target[j] = QR_target_interior; R_target[j] = R_target_interior; } } else { /* one or more sequences ended */ VECTOR_SHORT const QR_diff = v_sub(QR_target_right, QR_target_interior); VECTOR_SHORT const R_diff = v_sub(R_target_right, R_target_interior); for (unsigned int j = 0; j < CDEPTH; j++) { VECTOR_SHORT MM = v_zero; VECTOR_SHORT TT = T0; for (int c = 0; c < CHANNELS; c++) { if ((d_begin[c] == d_end[c]) and (j >= ((d_length[c] + 3) % 4))) { MM = v_xor(MM, TT); } TT = v_shift_left(TT); } QR_target[j] = v_add(QR_target_interior, v_and(QR_diff, MM)); R_target[j] = v_add(R_target_interior, v_and(R_diff, MM)); } } VECTOR_SHORT h_min; VECTOR_SHORT h_max; aligncolumns_rest(S.data(), hep, qp, QR_query_interior, R_query_interior, QR_query_right, R_query_right, QR_target[0], R_target[0], 
QR_target[1], R_target[1], QR_target[2], R_target[2], QR_target[3], R_target[3], H0, H1, H2, H3, F0, F1, F2, F3, & h_min, & h_max, qlen, dir); VECTOR_SHORT h_min_vector; VECTOR_SHORT h_max_vector; v_store(& h_min_vector, h_min); v_store(& h_max_vector, h_max); for (int c = 0; c < CHANNELS; c++) { if (not overflow[c]) { signed short const h_min_c = ((signed short *) (& h_min_vector))[c]; signed short const h_max_c = ((signed short *) (& h_max_vector))[c]; if ((h_min_c <= score_min) or (h_max_c >= score_max)) { overflow[c] = true; } } } } else { /* One or more sequences ended in the previous block. We have to switch over to a new sequence */ easy = true; M = v_zero; VECTOR_SHORT T = T0; for (int c = 0; c < CHANNELS; c++) { if (d_begin[c] < d_end[c]) { /* this channel has more sequence */ for (int j = 0; j < CDEPTH; j++) { if (d_begin[c] < d_end[c]) { dseq[(CHANNELS * j) + c] = map_4bit(*(d_begin[c]++)); } else { dseq[(CHANNELS * j) + c] = 0; } } if (d_begin[c] == d_end[c]) { easy = false; } } else { /* sequence in channel c ended. 
change of sequence */ M = v_xor(M, T); int64_t cand_id = seq_id[c]; if (cand_id >= 0) { /* save score */ char * dbseq = (char *) d_address[c]; int64_t const dbseqlen = d_length[c]; int64_t const z = (dbseqlen + 3) % 4; int64_t const score = ((CELL *) S.data())[(z * CHANNELS) + c]; if (overflow[c]) { pscores[cand_id] = std::numeric_limits::max(); paligned[cand_id] = 0; pmatches[cand_id] = 0; pmismatches[cand_id] = 0; pgaps[cand_id] = 0; pcigar[cand_id] = xstrdup(""); } else { pscores[cand_id] = score; backtrack16(s, dbseq, dbseqlen, d_offset[c], c, paligned + cand_id, pmatches + cand_id, pmismatches + cand_id, pgaps + cand_id); pcigar[cand_id] = (char *) xmalloc(std::strlen(s->cigar)+1); strcpy(pcigar[cand_id], s->cigar); } done++; } /* get next sequence of reasonable length */ int64_t length = 0; while ((length == 0) and (next_id < sequences)) { cand_id = next_id++; length = db_getsequencelen(seqnos[cand_id]); if ((length == 0) or (s->qlen * length > maxseqlenproduct)) { pscores[cand_id] = std::numeric_limits::max(); paligned[cand_id] = 0; pmatches[cand_id] = 0; pmismatches[cand_id] = 0; pgaps[cand_id] = 0; pcigar[cand_id] = xstrdup(""); length = 0; done++; } } if (length > 0) { seq_id[c] = cand_id; char * address = db_getsequence(seqnos[cand_id]); d_address[c] = (BYTE *) address; d_length[c] = length; d_begin[c] = (unsigned char *) address; d_end[c] = (unsigned char *) address + length; d_offset[c] = dir - dirbuffer; overflow[c] = false; ((CELL *) &H0)[c] = 0; ((CELL *) &H1)[c] = - s->penalty_gap_open_query_left - (1 * s->penalty_gap_extension_query_left); ((CELL *) &H2)[c] = - s->penalty_gap_open_query_left - (2 * s->penalty_gap_extension_query_left); ((CELL *) &H3)[c] = - s->penalty_gap_open_query_left - (3 * s->penalty_gap_extension_query_left); ((CELL *) &F0)[c] = - s->penalty_gap_open_query_left - (1 * s->penalty_gap_extension_query_left); ((CELL *) &F1)[c] = - s->penalty_gap_open_query_left - (2 * s->penalty_gap_extension_query_left); ((CELL *) &F2)[c] = - 
s->penalty_gap_open_query_left - (3 * s->penalty_gap_extension_query_left); ((CELL *) &F3)[c] = - s->penalty_gap_open_query_left - (4 * s->penalty_gap_extension_query_left); /* fill channel */ for (int j = 0; j < CDEPTH; j++) { if (d_begin[c] < d_end[c]) { dseq[(CHANNELS * j) + c] = map_4bit(*(d_begin[c]++)); } else { dseq[(CHANNELS * j) + c] = 0; } } if (d_begin[c] == d_end[c]) { easy = false; } } else { /* no more sequences, empty channel */ seq_id[c] = -1; d_address[c] = nullptr; d_begin[c] = &zero; d_end[c] = d_begin[c]; d_length[c] = 0; d_offset[c] = 0; for (int j = 0; j < CDEPTH; j++) { dseq[(CHANNELS * j) + c] = 0; } } } T = v_shift_left(T); } if (done == sequences) { break; } /* make masked versions of QR and R for gaps in target */ M_QR_target_left = v_and(M, QR_target_left); M_R_target_left = v_and(M, R_target_left); /* make masked versions of QR for gaps in query at target left end */ M_QR_query_interior = v_and(M, QR_query_interior); M_QR_query_right = v_and(M, QR_query_right); dprofile_fill16(dprofile, (CELL *) s->matrix.data(), dseq); /* create vectors of gap penalties for target depending on whether any of the database sequences ended in these four columns */ if (easy) { for (unsigned int j = 0; j < CDEPTH; j++) { QR_target[j] = QR_target_interior; R_target[j] = R_target_interior; } } else { /* one or more sequences ended */ VECTOR_SHORT const QR_diff = v_sub(QR_target_right, QR_target_interior); VECTOR_SHORT const R_diff = v_sub(R_target_right, R_target_interior); for (unsigned int j = 0; j < CDEPTH; j++) { VECTOR_SHORT MM = v_zero; VECTOR_SHORT TT = T0; for (int c = 0; c < CHANNELS; c++) { if ((d_begin[c] == d_end[c]) and (j >= ((d_length[c] + 3) % 4))) { MM = v_xor(MM, TT); } TT = v_shift_left(TT); } QR_target[j] = v_add(QR_target_interior, v_and(QR_diff, MM)); R_target[j] = v_add(R_target_interior, v_and(R_diff, MM)); } } VECTOR_SHORT h_min; VECTOR_SHORT h_max; aligncolumns_first(S.data(), hep, qp, QR_query_interior, R_query_interior, 
QR_query_right, R_query_right, QR_target[0], R_target[0], QR_target[1], R_target[1], QR_target[2], R_target[2], QR_target[3], R_target[3], H0, H1, H2, H3, F0, F1, F2, F3, & h_min, & h_max, M, M_QR_target_left, M_R_target_left, M_QR_query_interior, M_QR_query_right, qlen, dir); VECTOR_SHORT h_min_vector; VECTOR_SHORT h_max_vector; v_store(& h_min_vector, h_min); v_store(& h_max_vector, h_max); for (int c = 0; c < CHANNELS; c++) { if (not overflow[c]) { signed short const h_min_c = ((signed short *) (& h_min_vector))[c]; signed short const h_max_c = ((signed short *) (& h_max_vector))[c]; if ((h_min_c <= score_min) or (h_max_c >= score_max)) { overflow[c] = true; } } } } H0 = v_sub(H3, R_query_left); H1 = v_sub(H0, R_query_left); H2 = v_sub(H1, R_query_left); H3 = v_sub(H2, R_query_left); F0 = v_sub(F3, R_query_left); F1 = v_sub(F0, R_query_left); F2 = v_sub(F1, R_query_left); F3 = v_sub(F2, R_query_left); dir += 4 * 4 * s->qlen; if (dir >= dirbuffer + dirbuffersize) { dir -= dirbuffersize; } } } vsearch-2.30.1/src/align_simd.h000066400000000000000000000074161506776541700163300ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ using CELL = signed short; using WORD = unsigned short; using BYTE = unsigned char; struct s16info_s; auto search16_init(CELL score_match, CELL score_mismatch, CELL penalty_gap_open_query_left, CELL penalty_gap_open_target_left, CELL penalty_gap_open_query_interior, CELL penalty_gap_open_target_interior, CELL penalty_gap_open_query_right, CELL penalty_gap_open_target_right, CELL penalty_gap_extension_query_left, CELL penalty_gap_extension_target_left, CELL penalty_gap_extension_query_interior, CELL penalty_gap_extension_target_interior, CELL penalty_gap_extension_query_right, CELL penalty_gap_extension_target_right) -> struct s16info_s *; auto search16_exit(s16info_s * searchinfo) -> void; auto search16_qprep(s16info_s * searchinfo, char * qseq, int qlen) -> void; auto search16(s16info_s * searchinfo, unsigned int sequences, unsigned int * seqnos, CELL * pscores, unsigned short * paligned, unsigned short * pmatches, unsigned short * pmismatches, unsigned short * pgaps, char * * pcigar) -> void; vsearch-2.30.1/src/allpairs.cc000066400000000000000000000565061506776541700161730ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "align_simd.h" #include "linmemalign.h" #include "mask.h" #include "utils/fatal.hpp" #include "utils/xpthread.hpp" #include // std::min, std::max #include // int64_t #include // std::fprintf, std::FILE, std:fclose, std::size_t #include // std::qsort #include // std::strlen #include #include #include static pthread_t * pthread; /* global constants/data, no need for synchronization */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static int qmatches; static int queries; static int64_t progress = 0; static FILE * fp_alnout = nullptr; static FILE * fp_samout = nullptr; static FILE * fp_userout = nullptr; static FILE * fp_blast6out = nullptr; static FILE * fp_uc = nullptr; static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; static FILE * fp_qsegout = nullptr; static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; inline auto allpairs_hit_compare_typed(struct hit * lhs, struct hit * rhs) -> int { // high id, then low id // early target, then late target if (lhs->id > rhs->id) { return -1; } if (lhs->id < rhs->id) { return +1; } if (lhs->target < rhs->target) { return -1; } if (lhs->target > rhs->target) { return +1; } return 0; } auto allpairs_hit_compare(const void * lhs, const void * rhs) -> int { return allpairs_hit_compare_typed((struct hit *) lhs, (struct hit *) rhs); } auto allpairs_output_results(int hit_count, struct hit * hits, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc) -> void { /* show results */ auto const toreport = std::min(opt_maxhits, static_cast(hit_count)); if (fp_alnout != nullptr) { results_show_alnout(fp_alnout, hits, toreport, query_head, qsequence, qseqlen); } if (fp_samout != nullptr) { results_show_samout(fp_samout, hits, toreport, 
query_head, qsequence, qsequence_rc); } if (toreport != 0) { double const top_hit_id = hits[0].id; for (int t = 0; t < toreport; t++) { struct hit const * hp = hits + t; if ((opt_top_hits_only != 0) and (hp->id < top_hit_id)) { break; } if (fp_fastapairs != nullptr) { results_show_fastapairs_one(fp_fastapairs, hp, query_head, qsequence, qsequence_rc); } if (fp_qsegout != nullptr) { results_show_qsegout_one(fp_qsegout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout != nullptr) { results_show_tsegout_one(fp_tsegout, hp); } if (fp_uc != nullptr) { if ((t == 0) or (opt_uc_allhits != 0)) { results_show_uc_one(fp_uc, hp, query_head, qseqlen, hp->target); } } if (fp_userout != nullptr) { results_show_userout_one(fp_userout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, hp, query_head, qseqlen); } } } else { if (fp_uc != nullptr) { results_show_uc_one(fp_uc, nullptr, query_head, qseqlen, 0); } if (opt_output_no_hits != 0) { if (fp_userout != nullptr) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } } if (hit_count != 0) { ++count_matched; if (opt_matched != nullptr) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, std::strlen(query_head), 0, count_matched, -1.0, -1, -1, nullptr, 0.0); } } else { ++count_notmatched; if (opt_notmatched != nullptr) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, std::strlen(query_head), 0, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } } auto allpairs_thread_run(int64_t t) -> void { (void) t; struct searchinfo_s searchinfo; searchinfo.hits_v.resize(seqcount); searchinfo.hits = searchinfo.hits_v.data(); searchinfo.s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, 
opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); struct Scoring scoring; scoring.match = opt_match; scoring.mismatch = opt_mismatch; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_open_query_left = opt_gap_open_query_left; scoring.gap_open_target_left = opt_gap_open_target_left; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_open_target_interior = opt_gap_open_target_interior; scoring.gap_open_query_right = opt_gap_open_query_right; scoring.gap_open_target_right = opt_gap_open_target_right; scoring.gap_extension_query_left = opt_gap_extension_query_left; scoring.gap_extension_target_left = opt_gap_extension_target_left; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_extension_target_interior = opt_gap_extension_target_interior; scoring.gap_extension_query_right = opt_gap_extension_query_right; scoring.gap_extension_target_right = opt_gap_extension_target_right; LinearMemoryAligner lma(scoring); /* allocate memory for alignment results */ auto const maxhits = static_cast(seqcount); std::vector pseqnos(maxhits); std::vector pscores(maxhits); std::vector paligned(maxhits); std::vector pmatches(maxhits); std::vector pmismatches(maxhits); std::vector pgaps(maxhits); std::vector pcigar(maxhits); std::vector finalhits(maxhits); auto cont = true; while (cont) { xpthread_mutex_lock(&mutex_input); int const query_no = queries; if (query_no < seqcount) { ++queries; /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* init search info */ searchinfo.query_no = query_no; searchinfo.qsize = db_getabundance(query_no); searchinfo.query_head_len = db_getheaderlen(query_no); 
searchinfo.query_head = db_getheader(query_no); searchinfo.qseqlen = db_getsequencelen(query_no); searchinfo.qsequence = db_getsequence(query_no); searchinfo.rejects = 0; searchinfo.accepts = 0; searchinfo.hit_count = 0; for (int target = searchinfo.query_no + 1; target < seqcount; target++) { if ((opt_acceptall != 0) or search_acceptable_unaligned(searchinfo, target)) { pseqnos[searchinfo.hit_count] = target; ++searchinfo.hit_count; } } if (searchinfo.hit_count != 0) { /* perform alignments */ search16_qprep(searchinfo.s, searchinfo.qsequence, searchinfo.qseqlen); search16(searchinfo.s, searchinfo.hit_count, pseqnos.data(), pscores.data(), paligned.data(), pmatches.data(), pmismatches.data(), pgaps.data(), pcigar.data()); /* convert to hit structure */ for (int h = 0; h < searchinfo.hit_count; h++) { struct hit * hit = &searchinfo.hits_v[h]; unsigned int const target = pseqnos[h]; int64_t nwscore = pscores[h]; char * nwcigar {nullptr}; int64_t nwalignmentlength {0}; int64_t nwmatches {0}; int64_t nwmismatches {0}; int64_t nwgaps {0}; if (nwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ char * tseq = db_getsequence(target); int64_t const tseqlen = db_getsequencelen(target); if (pcigar[h] != nullptr) { xfree(pcigar[h]); } nwcigar = xstrdup(lma.align(searchinfo.qsequence, tseq, searchinfo.qseqlen, tseqlen)); lma.alignstats(nwcigar, searchinfo.qsequence, tseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps); } else { nwcigar = pcigar[h]; nwalignmentlength = paligned[h]; nwmatches = pmatches[h]; nwmismatches = pmismatches[h]; nwgaps = pgaps[h]; } hit->target = target; hit->strand = 0; hit->count = 0; hit->accepted = false; hit->rejected = false; hit->aligned = true; hit->weak = false; hit->nwscore = nwscore; hit->nwdiff = nwalignmentlength - nwmatches; hit->nwgaps = nwgaps; hit->nwindels = nwalignmentlength - nwmatches - nwmismatches; 
hit->nwalignmentlength = nwalignmentlength; hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) / nwalignmentlength; hit->nwalignment = nwcigar; hit->matches = nwalignmentlength - hit->nwdiff; hit->mismatches = hit->nwdiff - hit->nwindels; auto const dseqlen = static_cast(db_getsequencelen(target)); hit->shortest = std::min(searchinfo.qseqlen, dseqlen); hit->longest = std::max(searchinfo.qseqlen, dseqlen); /* trim alignment, compute numbers excluding terminal gaps */ align_trim(hit); /* test accept/reject criteria after alignment */ if ((opt_acceptall != 0) or search_acceptable_aligned(searchinfo, hit)) { finalhits[searchinfo.accepts] = *hit; ++searchinfo.accepts; } } /* sort hits */ qsort(finalhits.data(), searchinfo.accepts, sizeof(struct hit), allpairs_hit_compare); } /* lock mutex for update of global data and output */ xpthread_mutex_lock(&mutex_output); /* output results */ allpairs_output_results(searchinfo.accepts, finalhits.data(), searchinfo.query_head, searchinfo.qseqlen, searchinfo.qsequence, nullptr); /* update stats */ if (searchinfo.accepts != 0) { ++qmatches; } /* show progress */ progress += seqcount - query_no - 1; progress_update(progress); xpthread_mutex_unlock(&mutex_output); /* free memory for alignment strings */ for (int i = 0; i < searchinfo.hit_count; i++) { if (searchinfo.hits_v[i].aligned) { xfree(searchinfo.hits_v[i].nwalignment); } } } else { /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); cont = false; } } search16_exit(searchinfo.s); } auto allpairs_thread_worker(void * void_ptr) -> void * { auto const nth_thread = reinterpret_cast(void_ptr); allpairs_thread_run(nth_thread); return nullptr; } auto allpairs_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { xpthread_create(pthread 
+ t, &attr, allpairs_thread_worker, (void *) (int64_t) t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); } xpthread_attr_destroy(&attr); } auto allpairs_global(struct Parameters const & parameters, char * cmdline, char * progheader) -> void { /* open output files */ if (opt_alnout != nullptr) { fp_alnout = fopen_output(opt_alnout); if (fp_alnout == nullptr) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", parameters.command_line.c_str()); fprintf(fp_alnout, "%s\n", progheader); } if (opt_samout != nullptr) { fp_samout = fopen_output(opt_samout); if (fp_samout == nullptr) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout != nullptr) { fp_userout = fopen_output(opt_userout); if (fp_userout == nullptr) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out != nullptr) { fp_blast6out = fopen_output(opt_blast6out); if (fp_blast6out == nullptr) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_uc != nullptr) { fp_uc = fopen_output(opt_uc); if (fp_uc == nullptr) { fatal("Unable to open uc output file for writing"); } } if (opt_fastapairs != nullptr) { fp_fastapairs = fopen_output(opt_fastapairs); if (fp_fastapairs == nullptr) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout != nullptr) { fp_qsegout = fopen_output(opt_qsegout); if (fp_qsegout == nullptr) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout != nullptr) { fp_tsegout = fopen_output(opt_tsegout); if (fp_tsegout == nullptr) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched != nullptr) { fp_matched = fopen_output(opt_matched); if (fp_matched == nullptr) { fatal("Unable to open matched output file for writing"); } } if (opt_notmatched != nullptr) { fp_notmatched = fopen_output(opt_notmatched); if (fp_notmatched == nullptr) { 
fatal("Unable to open notmatched output file for writing"); } } db_read(parameters.opt_allpairs_global, 0); results_show_samheader(fp_samout, cmdline, parameters.opt_allpairs_global); if (parameters.opt_qmask == MASK_DUST) { dust_all(); } else if ((parameters.opt_qmask == MASK_SOFT) and parameters.opt_hardmask) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); /* prepare reading of queries */ qmatches = 0; queries = 0; std::vector pthread_v(parameters.opt_threads); pthread = pthread_v.data(); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); progress = 0; progress_init("Aligning", std::max(int64_t{0}, ((int64_t) seqcount) * ((int64_t) seqcount - 1)) / 2); // refactoring: issue with parenthesis? allpairs_thread_worker_run(); progress_done(); if (not parameters.opt_quiet) { fprintf(stderr, "Matching query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "Matching query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n\n"); } xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); // pthread_v not used after this point /* clean up, global */ db_free(); if (opt_matched != nullptr) { fclose(fp_matched); } if (opt_notmatched != nullptr) { fclose(fp_notmatched); } if (opt_fastapairs != nullptr) { fclose(fp_fastapairs); } if (opt_qsegout != nullptr) { fclose(fp_qsegout); } if (opt_tsegout != nullptr) { fclose(fp_tsegout); } if (fp_uc != nullptr) { fclose(fp_uc); } if (fp_blast6out != nullptr) { fclose(fp_blast6out); } if (fp_userout != nullptr) { fclose(fp_userout); } if (fp_alnout != nullptr) { fclose(fp_alnout); } if (fp_samout != nullptr) { fclose(fp_samout); } show_rusage(); if (fp_userout != nullptr) { clean_up(); 
// free userfields allocation } } vsearch-2.30.1/src/allpairs.h000066400000000000000000000050141506776541700160210ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void allpairs_global(struct Parameters const & parameters, char * cmdline, char * progheader); vsearch-2.30.1/src/arch.cc000066400000000000000000000203561506776541700152730ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "dynlibs.h" #include "utils/fatal.hpp" #include // std::max #include // std::FILE, std::size_t #include // uint64_t #include // std::realloc, std::free constexpr auto memalignment = 16; auto arch_get_memused() -> uint64_t { #ifdef _WIN32 PROCESS_MEMORY_COUNTERS pmc; GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(PROCESS_MEMORY_COUNTERS)); return pmc.PeakWorkingSetSize; #else struct rusage r_usage; getrusage(RUSAGE_SELF, & r_usage); # ifdef __APPLE__ /* Mac: ru_maxrss gives the size in bytes */ return r_usage.ru_maxrss; # else /* Linux: ru_maxrss gives the size in kilobytes */ return r_usage.ru_maxrss * 1024; # endif #endif } auto arch_get_memtotal() -> uint64_t { #ifdef _WIN32 MEMORYSTATUSEX ms; ms.dwLength = sizeof(MEMORYSTATUSEX); GlobalMemoryStatusEx(&ms); return ms.ullTotalPhys; #elif defined(__APPLE__) int mib [] = { CTL_HW, HW_MEMSIZE }; int64_t ram = 0; std::size_t length = sizeof(ram); if(sysctl(mib, 2, &ram, &length, NULL, 0) == -1) fatal("Cannot determine amount of RAM"); return ram; #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) int64_t const phys_pages = sysconf(_SC_PHYS_PAGES); int64_t const pagesize = sysconf(_SC_PAGESIZE); if ((phys_pages == -1) or (pagesize == -1)) { fatal("Cannot determine amount of RAM"); } return pagesize * phys_pages; #else struct sysinfo si; if (sysinfo(&si)) fatal("Cannot determine amount of RAM"); return si.totalram * si.mem_unit; #endif } auto arch_get_cores() -> long { #ifdef _WIN32 SYSTEM_INFO si; GetSystemInfo(&si); return si.dwNumberOfProcessors; #else return sysconf(_SC_NPROCESSORS_ONLN); #endif } auto arch_get_user_system_time(double * user_time, double * system_time) -> void { *user_time = 0; *system_time = 0; #ifdef _WIN32 HANDLE hProcess = GetCurrentProcess(); FILETIME ftCreation, ftExit, ftKernel, ftUser; ULARGE_INTEGER ul; GetProcessTimes(hProcess, &ftCreation, &ftExit, &ftKernel, &ftUser); ul.u.HighPart = ftUser.dwHighDateTime; ul.u.LowPart = ftUser.dwLowDateTime; 
*user_time = ul.QuadPart * 100.0e-9; ul.u.HighPart = ftKernel.dwHighDateTime; ul.u.LowPart = ftKernel.dwLowDateTime; *system_time = ul.QuadPart * 100.0e-9; #else struct rusage r_usage; getrusage(RUSAGE_SELF, & r_usage); * user_time = (r_usage.ru_utime.tv_sec * 1.0) + (r_usage.ru_utime.tv_usec * 1.0e-6); * system_time = (r_usage.ru_stime.tv_sec * 1.0) + (r_usage.ru_stime.tv_usec * 1.0e-6); #endif } auto arch_srandom() -> void { /* initialize pseudo-random number generator */ unsigned int seed = opt_randseed; if (seed == 0) { #ifdef _WIN32 srand(GetTickCount()); #else auto const fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { fatal("Unable to open /dev/urandom"); } if (read(fd, & seed, sizeof(seed)) < 0) { fatal("Unable to read from /dev/urandom"); } close(fd); srandom(seed); #endif } else { #ifdef _WIN32 srand(seed); #else srandom(seed); #endif } } auto arch_random() -> uint64_t { #ifdef _WIN32 return rand(); #else return random(); #endif } auto xmalloc(std::size_t size) -> void * { static constexpr auto minimal_allocation = std::size_t{1}; size = std::max(size, minimal_allocation); void * ptr = nullptr; #ifdef _WIN32 ptr = _aligned_malloc(size, memalignment); #else if (posix_memalign(&ptr, memalignment, size) != 0) { ptr = nullptr; } #endif if (ptr == nullptr) { fatal("Unable to allocate enough memory."); } return ptr; } auto xrealloc(void * ptr, std::size_t size) -> void * { static constexpr auto minimal_allocation = std::size_t{1}; size = std::max(size, minimal_allocation); #ifdef _WIN32 void * new_ptr = _aligned_realloc(ptr, size, memalignment); #else void * new_ptr = realloc(ptr, size); #endif if (new_ptr == nullptr) { fatal("Unable to reallocate enough memory."); } return new_ptr; } auto xfree(void * ptr) -> void { if (ptr != nullptr) { #ifdef _WIN32 _aligned_free(ptr); #else free(ptr); #endif } else { fatal("Trying to free a null pointer"); } } auto xfstat(int file_descriptor, xstat_t * buf) -> int { #ifdef _WIN32 return _fstat64(file_descriptor, buf); 
#else return fstat(file_descriptor, buf); // return zero if success #endif } auto xstat(const char * path, xstat_t * buf) -> int { #ifdef _WIN32 return _stat64(path, buf); #else return stat(path, buf); #endif } auto xlseek(int file_descriptor, uint64_t offset, int whence) -> uint64_t { #ifdef _WIN32 return _lseeki64(file_descriptor, offset, whence); #else return lseek(file_descriptor, offset, whence); // libC or linuxism: replace with std::fseek()? #endif } // refactoring: only used in fastx.cc auto xftello(std::FILE * stream) -> uint64_t { #ifdef _WIN32 return _ftelli64(stream); #else return ftello(stream); #endif } // refactoring: only used in udb.cc auto xopen_read(const char * path) -> int { #ifdef _WIN32 return _open(path, _O_RDONLY | _O_BINARY); #else return open(path, O_RDONLY); #endif } // refactoring: only used in udb.cc auto xopen_write(const char * path) -> int { #ifdef _WIN32 return _open(path, _O_WRONLY | _O_CREAT | _O_TRUNC | _O_BINARY, _S_IREAD | _S_IWRITE); #else return open(path, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); #endif } #ifdef _WIN32 auto arch_dlsym(HMODULE handle, const char * symbol) -> void_func_ptr { return reinterpret_cast(GetProcAddress(handle, symbol)); } #else auto arch_dlsym(void * handle, const char * symbol) -> void_func_ptr { return reinterpret_cast(dlsym(handle, symbol)); } #endif vsearch-2.30.1/src/arch.h000066400000000000000000000070531506776541700151340ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include // uint64_t #include // std::FILE, std::size_t #ifdef _WIN32 using xstat_t = struct __stat64; #else using xstat_t = struct stat; #endif auto arch_get_memused() -> uint64_t; auto arch_get_memtotal() -> uint64_t; auto arch_get_cores() -> long; auto arch_get_user_system_time(double * user_time, double * system_time) -> void; auto arch_srandom() -> void; auto arch_random() -> uint64_t; auto xmalloc(std::size_t size) -> void *; auto xrealloc(void * ptr, std::size_t size) -> void *; auto xfree(void * ptr) -> void; auto xfstat(int file_descriptor, xstat_t * buf) -> int; auto xstat(const char * path, xstat_t * buf) -> int; auto xlseek(int file_descriptor, uint64_t offset, int whence) -> uint64_t; auto xftello(std::FILE * stream) -> uint64_t; auto xopen_read(const char * path) -> int; auto xopen_write(const char * path) -> int; auto xstrcasestr(const char * haystack, const char * needle) -> const char *; typedef void (* void_func_ptr)(); #ifdef _WIN32 auto arch_dlsym(HMODULE handle, const char * symbol) -> void_func_ptr; #else auto arch_dlsym(void * handle, const char * symbol) -> void_func_ptr; #endif vsearch-2.30.1/src/attributes.cc000066400000000000000000000221421506776541700165370ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "utils/fatal.hpp" #include // std::swap #include #include // errno #include // int64_t #include // std::FILE, std::fprintf #include // std::strtoll #include // std::strlen, std::strstr, std::strspn constexpr auto n_expected_attributes = std::size_t{3}; // 3 attributes: size, ee, length auto header_find_attribute(char const * header, int const header_length, char const * attribute, int * start, int * end, bool const allow_decimal) -> bool { /* Identify the first occurence of the pattern (^|;)size=([0-9]+)(;|$) in the header string, where "size=" is the specified attribute. If allow_decimal is true, a dot (.) is allowed within the digits. */ const char * digit_chars = "0123456789"; const char * digit_chars_decimal = "0123456789."; if ((header == nullptr) or (attribute == nullptr)) { return false; } auto const attribute_length = static_cast(std::strlen(attribute)); auto offset = 0; while (offset < header_length - attribute_length) { auto const * first_occurence = std::strstr(header + offset, attribute); /* no match */ if (first_occurence == nullptr) { break; } offset = first_occurence - header; /* check for ';' in front */ if ((offset > 0) and (header[offset - 1] != ';')) { offset += attribute_length + 1; continue; } auto const digits = static_cast(std::strspn(header + offset + attribute_length, (allow_decimal ? 
digit_chars_decimal : digit_chars))); /* check for at least one digit */ if (digits == 0) { offset += attribute_length + 1; continue; } /* check for ';' after */ if ((offset + attribute_length + digits < header_length) and (header[offset + attribute_length + digits] != ';')) { offset += attribute_length + digits + 2; continue; } /* ok */ *start = offset; *end = offset + attribute_length + digits; return true; } return false; } auto header_get_size(char const * header, int const header_length) -> int64_t { /* read size/abundance annotation */ static constexpr auto length_of_attribute_name = 5; // "size=" -> 5 letters static constexpr auto decimal_base = 10; auto start = 0; auto end = 0; auto const attribute_is_present = header_find_attribute(header, header_length, "size=", &start, &end, false); if (not attribute_is_present) { return 0; // refactoring: return 1 by default? } char * next_character = nullptr; auto const abundance = std::strtoll(header + start + length_of_attribute_name, &next_character, decimal_base); auto const range_error = (errno == ERANGE); if (range_error) { fatal("Invalid (range error) abundance annotation in FASTA file header"); } if (abundance == 0) { fatal("Invalid (zero) abundance annotation in FASTA file header"); } return abundance; } auto look_for_attribute(char const * header, int const header_length, int & nth_attribute, std::array &attribute_start, std::array &attribute_end, char const * attribute_text, bool const strip_attribute) -> void { auto start = 0; auto end = 0; if (not strip_attribute) { return; } auto const attribute_is_present = header_find_attribute(header, header_length, attribute_text, & start, & end, false); if (not attribute_is_present) { return; } attribute_start[nth_attribute] = start; attribute_end[nth_attribute] = end; ++nth_attribute; } auto header_fprint_strip(FILE * output_handle, char const * header, int const header_length, bool const strip_size, bool const strip_ee, bool const strip_length) -> void { auto 
nth_attribute = 0; std::array attribute_start {{}}; std::array attribute_end {{}}; /* look for size attribute */ look_for_attribute(header, header_length, nth_attribute, attribute_start, attribute_end, "size=", strip_size); /* look for ee attribute */ look_for_attribute(header, header_length, nth_attribute, attribute_start, attribute_end, "ee=", strip_ee); /* look for length attribute */ look_for_attribute(header, header_length, nth_attribute, attribute_start, attribute_end, "length=", strip_length); /* sort */ auto last_swap = 0; auto limit = nth_attribute - 1; while (limit > 0) { for (auto i = 0; i < limit; ++i) { if (attribute_start[i] > attribute_start[i + 1]) { std::swap(attribute_start[i], attribute_start[i + 1]); std::swap(attribute_end[i], attribute_end[i + 1]); last_swap = i; } } limit = last_swap; } /* print */ if (nth_attribute == 0) { std::fprintf(output_handle, "%.*s", header_length, header); } else { auto prev_end = 0; for (auto i = 0; i < nth_attribute; ++i) { /* print part of header in front of this attribute */ if (attribute_start[i] > prev_end + 1) { std::fprintf(output_handle, "%.*s", attribute_start[i] - prev_end - 1, header + prev_end); } prev_end = attribute_end[i]; } /* print the rest, if any */ if (header_length > prev_end + 1) { std::fprintf(output_handle, "%.*s", header_length - prev_end, header + prev_end); } } } vsearch-2.30.1/src/attributes.h000066400000000000000000000055151506776541700164060ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Read the "size=" abundance annotation of a header. Returns 0 when the
   attribute is absent; the definition calls fatal() when the value is zero
   or out of range. */
auto header_get_size(char const * header, int header_length) -> int64_t;

/* Print a header to output_handle, omitting the "size=", "ee=" and
   "length=" attributes selected by the three strip_* flags. */
auto header_fprint_strip(std::FILE * output_handle,
                         char const * header,
                         int header_length,
                         bool strip_size,
                         bool strip_ee,
                         bool strip_length) -> void;
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "bitmap.h" #include // std::memset auto bitmap_init(unsigned int const size) -> struct bitmap_s * { constexpr auto divider = 8U; constexpr auto padding = divider - 1U; const auto minimal_size = (size + padding) / divider; auto * a_bitmap = static_cast(xmalloc(sizeof(struct bitmap_s))); a_bitmap->size = size; a_bitmap->bitmap = static_cast(xmalloc(minimal_size)); return a_bitmap; } auto bitmap_get(struct bitmap_s * a_bitmap, unsigned int const seed_value) -> unsigned char { constexpr auto mask_111 = 7U; constexpr auto divider = 3U; // divide by 8 return (a_bitmap->bitmap[seed_value >> divider] >> (seed_value & mask_111)) & 1U; } auto bitmap_reset_all(struct bitmap_s * a_bitmap) -> void { constexpr auto n_bits_in_a_byte = 8U; const auto size_in_bytes = (a_bitmap->size + n_bits_in_a_byte - 1) / n_bits_in_a_byte; std::memset(a_bitmap->bitmap, 0, size_in_bytes); } auto bitmap_set(struct bitmap_s * a_bitmap, unsigned int const seed_value) -> void { constexpr auto mask_111 = 7U; constexpr auto divider = 3U; // divide by 8 a_bitmap->bitmap[seed_value >> divider] |= 1U << (seed_value & mask_111); } auto bitmap_free(struct bitmap_s * a_bitmap) -> void { if (a_bitmap->bitmap != nullptr) { 
xfree(a_bitmap->bitmap); } xfree(a_bitmap); } vsearch-2.30.1/src/bitmap.h000066400000000000000000000056561506776541700155020ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
#ifndef BITMAP_H
#define BITMAP_H

/* A fixed-size set of bits, packed eight per byte. */
struct bitmap_s
{
  unsigned char * bitmap; /* the actual bitmap */
  unsigned int size; /* size in bits */
};

/* Allocate a bitmap for 'size' bits; bitmap_init does not itself clear
   the bits. */
auto bitmap_init(unsigned int size) -> struct bitmap_s *;

/* Return the bit (0 or 1) at position 'seed_value'; no bounds check. */
auto bitmap_get(struct bitmap_s * a_bitmap, unsigned int seed_value) -> unsigned char;

/* Set all bits to zero. */
auto bitmap_reset_all(struct bitmap_s * a_bitmap) -> void;

/* Set the bit at position 'seed_value' to one; no bounds check. */
auto bitmap_set(struct bitmap_s * a_bitmap, unsigned int seed_value) -> void;

/* Free the bit storage and the descriptor; a_bitmap itself must not be
   null (it is dereferenced). */
auto bitmap_free(struct bitmap_s * a_bitmap) -> void;

#endif // BITMAP_H
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "align_simd.h" #include "attributes.h" #include "chimera.h" #include "dbindex.h" #include "linmemalign.h" #include "mask.h" #include "minheap.h" #include "udb.h" #include "unique.h" #include "utils/cigar.hpp" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/span.hpp" #include "utils/xpthread.hpp" #include // std::copy, std::fill, std::fill_n, std::max, std::max_element, std::min, std::transform #include #include #include // std::tolower #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::qsort #include // std::FILE, std::fprintf, std::sscanf #include // std::strlen, std::strcpy #include // std::next #include #include // std::accumulate #include #include /* This code implements the method described in this paper: Robert C. Edgar, Brian J. Haas, Jose C. Clemente, Christopher Quince and Rob Knight (2011) UCHIME improves sensitivity and speed of chimera detection Bioinformatics, 27, 16, 2194-2200 https://doi.org/10.1093/bioinformatics/btr381 */ /* global constants/data, no need for synchronization */ static auto parts = 0; constexpr auto maxparts = 100; constexpr auto window = 32; constexpr auto few = 4; constexpr auto maxcandidates = few * maxparts; constexpr auto rejects = 16; constexpr auto chimera_id = 0.55; static int tophits; // all these static variables are on both sides of a pthread wall static pthread_attr_t attr; static pthread_t * pthread; static fastx_handle query_fasta_h; /* mutexes and global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static unsigned int seqno = 0; static uint64_t progress = 0; static int chimera_count = 0; static int nonchimera_count = 0; static int borderline_count = 0; static int total_count = 0; static int64_t chimera_abundance = 0; static int64_t nonchimera_abundance = 0; static int64_t borderline_abundance = 0; static int64_t total_abundance = 0; static std::FILE * fp_chimeras = nullptr; 
/* information for each query sequence to be checked */
/* Working buffers are grown on demand by realloc_arrays(). */
struct chimera_info_s
{
  int query_alloc = 0; /* the longest query sequence allocated memory for */
  int head_alloc = 0; /* the longest header allocated memory for */

  int query_no = 0;
  std::vector query_head; /* header text, printed with query_head_len */
  int query_head_len = 0;
  int query_size = 0;
  std::vector query_seq; /* resized to query length + 1 in realloc_arrays() */
  int query_len = 0;

  std::array si {{}}; /* each entry owns a qsequence_v buffer sized per query part */

  std::array cand_list {{}}; /* candidate parents; entries are passed to db_getsequence() */
  int cand_count = 0; /* number of valid entries in cand_list */

  struct s16info_s * s = nullptr;
  std::array snwscore {{}};
  std::array snwalignmentlength {{}};
  std::array snwmatches {{}};
  std::array snwmismatches {{}};
  std::array snwgaps {{}};

  std::array nwscore {{}};
  std::array nwalignmentlength {{}};
  std::array nwmatches {{}};
  std::array nwmismatches {{}};
  std::array nwgaps {{}};
  /* one CIGAR C-string per candidate, read with parse_cigar_string() */
  std::vector nwcigar = std::vector(maxcandidates); // this is a test

  int match_size = 0;
  /* match, insert and smooth are flat candidate x query-position tables,
     indexed as [candidate * query_len + qpos] */
  std::vector match; /* 1 where query and candidate symbols are compatible */
  std::vector insert; /* run length of an insertion in front of the position */
  std::vector smooth; /* match count over a sliding window (find_best_parents) */
  std::vector maxsmooth; /* per position, highest smooth value among candidates left */

  std::vector scan_p; /* scratch for scan_matches(): running score sums */
  std::vector scan_q; /* scratch for scan_matches(): maxima of scan_p to the right */

  int parents_found = 0;
  std::array best_parents {{}}; /* indices into cand_list */
  std::array best_start {{}}; /* first query position of each parent's region */
  std::array best_len {{}}; /* length of each parent's region */

  int best_target = 0;
  char * best_cigar = nullptr;

  std::vector maxi; // longest insertion per position
  std::vector> paln; /* one alignment string per parent */
  std::vector qaln; /* alignment string of the query */
  std::vector diffs; /* per-column diff symbol */
  std::vector votes;
  std::vector model; /* per-column parent letter ('A', 'B', ...) */
  std::vector ignore; /* columns left out of the voting (eval_parents) */

  double best_h = 0;
};
}; // anonymous namespace: limit visibility and usage to this translation unit namespace { } // end of anonymous namespace auto realloc_arrays(struct chimera_info_s * chimera_info) -> void { if (opt_chimeras_denovo != nullptr) { if (opt_chimeras_parts == 0) { parts = (chimera_info->query_len + maxparts - 1) / maxparts; // bug fix: std::max(expr, parts);? } else { parts = opt_chimeras_parts; } if (parts < 2) { parts = 2; } else if (parts > maxparts) { parts = maxparts; } } else { /* default for uchime, uchime2, and uchime3 */ parts = 4; } const int maxhlen = std::max(chimera_info->query_head_len, 1); if (maxhlen > chimera_info->head_alloc) { chimera_info->query_head.resize(maxhlen + 1); } chimera_info->head_alloc = std::max(chimera_info->head_alloc, maxhlen); /* realloc arrays based on query length */ const int maxqlen = std::max(chimera_info->query_len, 1); const int maxpartlen = (maxqlen + parts - 1) / parts; auto const max_2x2_size = static_cast(maxcandidates * maxqlen); if (maxqlen > chimera_info->query_alloc) { chimera_info->query_alloc = maxqlen; chimera_info->query_seq.resize(maxqlen + 1); for (auto & query_info: chimera_info->si) { query_info.qsequence_v.resize(maxpartlen + 1); query_info.qsequence = query_info.qsequence_v.data(); } chimera_info->maxi.resize(maxqlen + 1); chimera_info->maxsmooth.resize(maxqlen); chimera_info->match.resize(max_2x2_size); chimera_info->insert.resize(max_2x2_size); chimera_info->smooth.resize(max_2x2_size); chimera_info->scan_p.resize(maxqlen + 1); chimera_info->scan_q.resize(maxqlen + 1); const int maxalnlen = maxqlen + (2 * db_getlongestsequence()); chimera_info->paln.resize(maxparents); for (auto & a_parent_alignment : chimera_info->paln) { a_parent_alignment.resize(maxalnlen + 1); } chimera_info->qaln.resize(maxalnlen + 1); chimera_info->diffs.resize(maxalnlen + 1); chimera_info->votes.resize(maxalnlen + 1); chimera_info->model.resize(maxalnlen + 1); chimera_info->ignore.resize(maxalnlen + 1); } } auto reset_matches(struct 
chimera_info_s * a_chimera_info) -> void { // refactoring: initialization to zero? (useless), or reset to zero?? std::fill(a_chimera_info->match.begin(), a_chimera_info->match.end(), 0); std::fill(a_chimera_info->insert.begin(), a_chimera_info->insert.end(), 0); } auto find_matches(struct chimera_info_s * chimera_info) -> void { /* find the positions with matches for each potential parent */ /* also note the positions with inserts in front */ auto & qseq = chimera_info->query_seq; for (auto i = 0; i < chimera_info->cand_count; ++i) { auto const * tseq = db_getsequence(chimera_info->cand_list[i]); auto qpos = 0; auto tpos = 0; auto * cigar_start = chimera_info->nwcigar[i]; auto const cigar_length = std::strlen(cigar_start); auto const cigar_pairs = parse_cigar_string(Span{cigar_start, cigar_length}); for (auto const & a_pair: cigar_pairs) { auto const operation = a_pair.first; auto const runlength = a_pair.second; switch (operation) { case Operation::match: for (auto j = 0; j < runlength; ++j) { if ((map_4bit(qseq[qpos]) & map_4bit(tseq[tpos])) != 0U) { chimera_info->match[(i * chimera_info->query_len) + qpos] = 1; } ++qpos; ++tpos; } break; case Operation::insertion: chimera_info->insert[(i * chimera_info->query_len) + qpos] = runlength; tpos += runlength; break; case Operation::deletion: qpos += runlength; break; } } } } struct parents_info_s { int cand = -1; int start = -1; int len = 0; }; auto compare_positions(const void * a, const void * b) -> int { const int lhs = ((const parents_info_s *) a)->start; const int rhs = ((const parents_info_s *) b)->start; if (lhs < rhs) { return -1; } if (lhs > rhs) { return +1; } return 0; } auto scan_matches(struct chimera_info_s * ci, int const * matches, int const len, double const percentage, int * best_start, int * best_len) -> bool { /* Scan matches array of zeros and ones, and find the longest subsequence having a match fraction above or equal to the given percentage (e.g. 2%). 
Based on an idea of finding the longest positive sum substring: https://stackoverflow.com/questions/28356453/longest-positive-sum-substring If the percentage is 2%, matches are given a score of 2 and mismatches -98. */ auto const score_match = percentage; auto const score_mismatch = percentage - 100.0; auto & p = ci->scan_p; auto & q = ci->scan_q; p[0] = 0.0; for (auto i = 0; i < len; ++i) { p[i + 1] = p[i] + ((matches[i] != 0) ? score_match : score_mismatch); } q[len] = p[len]; for (auto i = len - 1; i >= 0; --i) { q[i] = std::max(q[i + 1], p[i]); } auto best_i = 0; auto best_d = -1; auto best_c = -1.0; auto i = 1; auto j = 1; while (j <= len) { auto const c = q[j] - p[i - 1]; if (c >= 0.0) { auto const d = j - i + 1; if (d > best_d) { best_i = i; best_d = d; best_c = c; } j += 1; } else { i += 1; } } if (best_c >= 0.0) { *best_start = best_i - 1; *best_len = best_d; return true; } return false; } auto find_best_parents_long(struct chimera_info_s * ci) -> int { /* Find parents with longest matching regions, without indels, allowing a given percentage of mismatches (specified with --chimeras_diff_pct), and excluding regions matched by previously identified parents. 
/*
  Select up to two parents among the candidates, using match counts
  smoothed over a sliding window of 'window' query positions.

  For each of the two rounds, the candidate holding the maximal smoothed
  score at the largest number of positions ("wins") is selected. Before
  the second round, matches located in windows won by the first parent
  are erased for all candidates.

  Stores the selected candidate indices in ci->best_parents[0..1]
  (-1 when not found). Returns 1 when two parents were selected,
  0 otherwise.
*/
auto find_best_parents(struct chimera_info_s * ci) -> int
{
  reset_matches(ci);
  find_matches(ci);

  std::array best_parent_cand {{}};

  for (int f = 0; f < 2; ++f)
    {
      best_parent_cand[f] = -1;
      ci->best_parents[f] = -1;
    }

  /* candidates already chosen as a parent in an earlier round */
  std::vector cand_selected(ci->cand_count, false);

  for (int f = 0; f < 2; ++f)
    {
      if (f > 0)
        {
          /* for all parents except the first */

          /* wipe out matches for all candidates in positions
             covered by the previous parent */

          /* smooth and maxsmooth still hold the values computed during
             the previous round */
          for (int qpos = window - 1; qpos < ci->query_len; ++qpos)
            {
              int const z = (best_parent_cand[f - 1] * ci->query_len) + qpos;
              if (ci->smooth[z] == ci->maxsmooth[qpos])
                {
                  /* erase the whole window ending at qpos */
                  for (int i = qpos + 1 - window; i <= qpos; ++i)
                    {
                      for (int j = 0; j < ci->cand_count; ++j)
                        {
                          ci->match[(j * ci->query_len) + i] = 0;
                        }
                    }
                }
            }
        }

      /* Compute smoothed score in a 32bp window for each candidate. */
      /* Record max smoothed score for each position among candidates left. */

      // refactoring: reset or initialize?
      std::fill(ci->maxsmooth.begin(), ci->maxsmooth.end(), 0);
      for (int i = 0; i < ci->cand_count; ++i)
        {
          if (not cand_selected[i])
            {
              /* running sum of matches over the last 'window' positions */
              int sum = 0;
              for (int qpos = 0; qpos < ci->query_len; ++qpos)
                {
                  int const z = (i * ci->query_len) + qpos;
                  sum += ci->match[z];
                  if (qpos >= window)
                    {
                      sum -= ci->match[z - window];
                    }
                  /* only positions ending a complete window get a score */
                  if (qpos >= window - 1)
                    {
                      ci->smooth[z] = sum;
                      ci->maxsmooth[qpos] = std::max(ci->smooth[z], ci->maxsmooth[qpos]);
                    }
                }
            }
        }

      /* find parent with the most wins */

      std::vector wins(ci->cand_count, 0);

      for (int qpos = window - 1; qpos < ci->query_len; ++qpos)
        {
          /* positions without any match in the window give no win */
          if (ci->maxsmooth[qpos] != 0)
            {
              for (int i = 0; i < ci->cand_count; ++i)
                {
                  if (not cand_selected[i])
                    {
                      int const z = (i * ci->query_len) + qpos;
                      if (ci->smooth[z] == ci->maxsmooth[qpos])
                        {
                          ++wins[i];
                        }
                    }
                }
            }
        }

      /* select best parent based on most wins */
      /* (ties go to the candidate with the lowest index) */

      int maxwins = 0;
      for (int i = 0; i < ci->cand_count; ++i)
        {
          int const w = wins[i];
          if (w > maxwins)
            {
              maxwins = w;
              best_parent_cand[f] = i;
            }
        }

      /* terminate loop if no parent found */

      if (best_parent_cand[f] < 0)
        {
          break;
        }

#if 0
      printf("Query %d: Best parent (%d) candidate: %d. Wins: %d\n", ci->query_no, f, best_parent_cand[f], maxwins);
#endif

      ci->best_parents[f] = best_parent_cand[f];
      cand_selected[best_parent_cand[f]] = true;
    }

  /* Check if at least 2 candidates selected */

  return static_cast((best_parent_cand[0] >= 0) and (best_parent_cand[1] >= 0));
}


/* Return the number of columns of the multiple alignment: the sum of
   all entries of maxi, added to query_len. */
auto find_total_alignment_length(struct chimera_info_s const * chimera_info) -> int
{
  // query_len, plus the sum of the longest insertion runs (I) for each position
  return std::accumulate(chimera_info->maxi.begin(), chimera_info->maxi.end(), chimera_info->query_len);
}


/* Fill maxi: for each query position, the longest insertion run found in
   front of that position in the CIGAR strings of the first parents_found
   best parents. */
auto fill_max_alignment_length(struct chimera_info_s * chimera_info) -> void
{
  /* find max insertions in front of each position in the query sequence */
  std::fill(chimera_info->maxi.begin(), chimera_info->maxi.end(), 0);

  auto const count = static_cast(chimera_info->parents_found);
  assert(count <= chimera_info->best_parents.size());
  auto const best_parents_view = Span{chimera_info->best_parents.data(), count};
  for (auto const best_parent : best_parents_view)
    {
      /* current position in the query */
      auto pos = 0LL;
      auto * cigar_start = chimera_info->nwcigar[best_parent];
      auto const cigar_length = std::strlen(cigar_start);
      auto const cigar_pairs = parse_cigar_string(Span{cigar_start, cigar_length});

      for (auto const & a_pair: cigar_pairs)
        {
          auto const operation = a_pair.first;
          auto const runlength = a_pair.second;
          switch (operation)
            {
            case Operation::match:
            case Operation::deletion:
              /* both operations move forward in the query */
              pos += runlength;
              break;

            case Operation::insertion:
              /* does not move in the query: keep the longest run seen */
              assert(runlength <= std::numeric_limits::max());
              chimera_info->maxi[pos] = std::max(static_cast(runlength), chimera_info->maxi[pos]);
              break;
            }
        }
    }
}
int tpos = 0; int alnpos = 0; auto * cigar_start = ci->nwcigar[cand]; auto const cigar_length = std::strlen(cigar_start); auto const cigar_pairs = parse_cigar_string(Span{cigar_start, cigar_length}); for (auto const & a_pair: cigar_pairs) { auto const operation = a_pair.first; auto const runlength = a_pair.second; switch (operation) { case Operation::insertion: for (int j = 0; j < ci->maxi[qpos]; ++j) { if (j < runlength) { alignment[alnpos] = map_uppercase(target_seq[tpos]); ++tpos; ++alnpos; } else { alignment[alnpos] = '-'; ++alnpos; } } is_inserted = true; break; case Operation::match: case Operation::deletion: for (int j = 0; j < runlength; ++j) { if (not is_inserted) { std::fill_n(&alignment[alnpos], ci->maxi[qpos], '-'); alnpos += ci->maxi[qpos]; } if (operation == Operation::match) { alignment[alnpos] = map_uppercase(target_seq[tpos]); ++tpos; ++alnpos; } else { alignment[alnpos] = '-'; ++alnpos; } ++qpos; is_inserted = false; } } } /* add any gaps at the end */ if (not is_inserted) { std::fill_n(&alignment[alnpos], ci->maxi[qpos], '-'); alnpos += ci->maxi[qpos]; } /* end of sequence string */ alignment[alnpos] = '\0'; } } auto fill_in_alignment_string_for_query(struct chimera_info_s * chimera_info) -> void { auto alnpos = 0; auto qpos = 0; for (auto const nucleotide: chimera_info->query_seq) { // add insertion (if any): auto const insert_length = chimera_info->maxi[qpos]; std::fill_n(&chimera_info->qaln[alnpos], insert_length, '-'); alnpos += insert_length; // add (mis-)matching position: chimera_info->qaln[alnpos] = map_uppercase(nucleotide); ++alnpos; ++qpos; } // add terminal gap (if any): auto const insert_length = chimera_info->maxi[chimera_info->query_len]; std::fill_n(&chimera_info->qaln[alnpos], insert_length, '-'); alnpos += insert_length; chimera_info->qaln[alnpos] = '\0'; } auto fill_in_model_string_for_query(struct chimera_info_s * chimera_info) -> void { int nth_parent = 0; auto alnpos = 0; for (int qpos = 0; qpos < chimera_info->query_len; 
++qpos) { if (qpos >= (chimera_info->best_start[nth_parent] + chimera_info->best_len[nth_parent])) { ++nth_parent; } // add insertion (if any): auto const insert_length = chimera_info->maxi[qpos]; std::fill_n(&chimera_info->model[alnpos], insert_length, 'A' + nth_parent); alnpos += insert_length; // add (mis-)matching position: chimera_info->model[alnpos] = 'A' + nth_parent; ++alnpos; } // add terminal gap (if any): auto const insert_length = chimera_info->maxi[chimera_info->query_len]; std::fill_n(&chimera_info->model[alnpos], insert_length, 'A' + nth_parent); alnpos += insert_length; chimera_info->model[alnpos] = '\0'; } auto count_matches_with_parents(struct chimera_info_s const * chimera_info, int const alignment_length) -> std::array { std::array matches {{}}; for (auto i = 0; i < alignment_length; ++i) { auto const qsym = map_4bit(chimera_info->qaln[i]); for (auto f = 0; f < chimera_info->parents_found; ++f) { auto const psym = map_4bit(chimera_info->paln[f][i]); if (qsym == psym) { ++matches[f]; } } } return matches; } auto compute_global_similarities_with_parents( std::array const & match_counts, int const alignment_length) -> std::array { std::array similarities {{}}; auto compute_percentage = [alignment_length](int const match_count) -> double { return 100.0 * match_count / alignment_length; }; std::transform(match_counts.begin(), match_counts.end(), similarities.begin(), compute_percentage); return similarities; } auto compute_diffs(struct chimera_info_s const * ci, std::vector const & psym, unsigned char const qsym) -> char { auto const all_defined = (qsym != 0U) and std::all_of(psym.begin(), psym.end(), [](unsigned char const symbol) -> bool{ return symbol != 0U; }); char diff = ' '; if (not all_defined) { return diff; } auto z = 0; for (auto f = 0; f < ci->parents_found; ++f) { if (psym[f] == qsym) { diff = 'A' + f; ++z; } } if (z > 1) { diff = ' '; } return diff; } auto eval_parents_long(struct chimera_info_s * ci) -> Status { /* always chimeric if 
called */ auto const status = Status::chimeric; fill_max_alignment_length(ci); auto const alnlen = find_total_alignment_length(ci); fill_alignment_parents(ci); fill_in_alignment_string_for_query(ci); fill_in_model_string_for_query(ci); std::vector psym; psym.reserve(maxparents); for (int i = 0; i < alnlen; ++i) { auto const qsym = map_4bit(ci->qaln[i]); for (int f = 0; f < ci->parents_found; ++f) { psym.emplace_back(map_4bit(ci->paln[f][i])); } /* lower case parent symbols that differ from query */ for (int f = 0; f < ci->parents_found; ++f) { if ((psym[f] != 0U) and (psym[f] != qsym)) { ci->paln[f][i] = std::tolower(ci->paln[f][i]); } } /* compute diffs */ ci->diffs[i] = compute_diffs(ci, psym, qsym); psym.clear(); } ci->diffs[alnlen] = '\0'; auto const match_QP = count_matches_with_parents(ci, alnlen); int const seqno_a = ci->cand_list[ci->best_parents[0]]; int const seqno_b = ci->cand_list[ci->best_parents[1]]; int const seqno_c = ci->parents_found > 2 ? ci->cand_list[ci->best_parents[2]] : -1; auto const QP = compute_global_similarities_with_parents(match_QP, alnlen); auto const QT = *std::max_element(QP.begin(), QP.end()); double const QA = QP[0]; double const QB = QP[1]; double const QC = ci->parents_found > 2 ? 
QP[2] : 0.00; double const QM = 100.00; double const divfrac = 100.00 * (QM - QT) / QT; // divergence of the model with the closest parent xpthread_mutex_lock(&mutex_output); if ((opt_alnout != nullptr) and (status == Status::chimeric)) { std::fprintf(fp_uchimealns, "\n"); std::fprintf(fp_uchimealns, "----------------------------------------" "--------------------------------\n"); std::fprintf(fp_uchimealns, "Query (%5d nt) ", ci->query_len); header_fprint_strip(fp_uchimealns, ci->query_head.data(), ci->query_head_len, opt_xsize, opt_xee, opt_xlength); assert(ci->parents_found <= 20); // 20 parents max ('A' to 'U') for (int f = 0; f < ci->parents_found; ++f) { int const seqno = ci->cand_list[ci->best_parents[f]]; std::fprintf(fp_uchimealns, "\nParent%c (%5" PRIu64 " nt) ", 'A' + f, db_getsequencelen(seqno)); header_fprint_strip(fp_uchimealns, db_getheader(seqno), db_getheaderlen(seqno), opt_xsize, opt_xee, opt_xlength); } std::fprintf(fp_uchimealns, "\n\n"); int const width = opt_alignwidth > 0 ? opt_alignwidth : alnlen; int qpos = 0; std::array ppos {{}}; int rest = alnlen; for (int i = 0; i < alnlen; i += width) { /* count non-gap symbols on current line */ int qnt = 0; std::array pnt {{}}; int const w = std::min(rest, width); for (int j = 0; j < w; ++j) { if (ci->qaln[i + j] != '-') { ++qnt; } for (int f = 0; f < ci->parents_found; ++f) { if (ci->paln[f][i + j] != '-') { ++pnt[f]; } } } fprintf(fp_uchimealns, "Q %5d %.*s %d\n", qpos + 1, w, &ci->qaln[i], qpos + qnt); for (int f = 0; f < ci->parents_found; ++f) { fprintf(fp_uchimealns, "%c %5d %.*s %d\n", 'A' + f, ppos[f] + 1, w, &ci->paln[f][i], ppos[f] + pnt[f]); } fprintf(fp_uchimealns, "Diffs %.*s\n", w, &ci->diffs[i]); fprintf(fp_uchimealns, "Model %.*s\n", w, &ci->model[i]); fprintf(fp_uchimealns, "\n"); rest -= width; qpos += qnt; for (int f = 0; f < ci->parents_found; ++f) { ppos[f] += pnt[f]; } } fprintf(fp_uchimealns, "Ids. QA %.2f%%, QB %.2f%%, QC %.2f%%, " "QT %.2f%%, QModel %.2f%%, Div. 
%+.2f%%\n", QA, QB, QC, QT, QM, divfrac); } if (opt_tabbedout != nullptr) { fprintf(fp_uchimeout, "%.4f\t", 99.9999); header_fprint_strip(fp_uchimeout, ci->query_head.data(), ci->query_head_len, opt_xsize, opt_xee, opt_xlength); fprintf(fp_uchimeout, "\t"); header_fprint_strip(fp_uchimeout, db_getheader(seqno_a), db_getheaderlen(seqno_a), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uchimeout, "\t"); header_fprint_strip(fp_uchimeout, db_getheader(seqno_b), db_getheaderlen(seqno_b), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uchimeout, "\t"); if (seqno_c >= 0) { header_fprint_strip(fp_uchimeout, db_getheader(seqno_c), db_getheaderlen(seqno_c), opt_xsize, opt_xee, opt_xlength); } else { fprintf(fp_uchimeout, "*"); } fprintf(fp_uchimeout, "\t"); fprintf(fp_uchimeout, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t" "%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%c\n", QM, QA, QB, QC, QT, 0, /* ignore, left yes */ 0, /* ignore, left no */ 0, /* ignore, left abstain */ 0, /* ignore, right yes */ 0, /* ignore, right no */ 0, /* ignore, right abstain */ 0.00, status == Status::chimeric ? 'Y' : (status == Status::low_score ? 
'N' : '?'));
    }

  /* end of the preceding function (its beginning lies above this span):
     release the output lock taken before printing and hand back the status */
  xpthread_mutex_unlock(&mutex_output);

  return status;
}


/*
  Score the query against its two best parents (two-parent model).

  Steps, in order:
    1. build the gapped query string ci->qaln from ci->maxi and ci->query_seq
    2. per alignment column: flag columns to ignore, lower-case parent
       symbols that differ from the query, and record a diff code in
       ci->diffs (' ' no information, 'A'/'B' query agrees with only that
       parent, 'N' parents agree but query differs, '?' all three differ)
    3. scan all possible breakpoints and keep the one with the highest
       score h (product of a left and a right score)
    4. if a breakpoint was found: fill ci->votes and ci->model, compute
       identities, classify the query, and write the alignment / tabbed
       report while holding mutex_output

  Returns Status::no_alignment when no breakpoint qualifies (best_h is
  still negative), otherwise Status::low_score, Status::suspicious or
  Status::chimeric. ci->best_h is always set (0.0 when nothing was found).
*/
auto eval_parents(struct chimera_info_s * ci) -> Status
{
  auto status = Status::no_alignment;

  /* this evaluation always works with exactly two parents */
  ci->parents_found = 2;

  fill_max_alignment_length(ci);
  auto const alnlen = find_total_alignment_length(ci);

  fill_alignment_parents(ci);

  /* fill in alignment string for query */

  char * q = ci->qaln.data();
  int qpos = 0;
  for (int i = 0; i < ci->query_len; ++i)
    {
      /* ci->maxi[i] gap symbols are written before query position i */
      for (int j = 0; j < ci->maxi[i]; ++j)
        {
          *q = '-';
          ++q;
        }
      *q = map_uppercase(ci->query_seq[qpos]);
      ++qpos;
      ++q;
    }
  /* trailing gaps after the last query symbol */
  for (int j = 0; j < ci->maxi[ci->query_len]; ++j)
    {
      *q = '-';
      ++q;
    }
  *q = 0;

  /* mark positions to ignore in voting */

  std::fill(ci->ignore.begin(), ci->ignore.end(), false);

  for (int i = 0; i < alnlen; ++i)
    {
      auto const qsym = map_4bit(ci->qaln[i]);
      auto const p1sym = map_4bit(ci->paln[0][i]);
      auto const p2sym = map_4bit(ci->paln[1][i]);

      /* ignore gap positions and those next to the gap */
      /* (a symbol code of 0 is what this code treats as a gap) */
      if ((qsym == 0U) or (p1sym == 0U) or (p2sym == 0U))
        {
          ci->ignore[i] = true;
          if (i > 0)
            {
              ci->ignore[i - 1] = true;
            }
          if (i < alnlen - 1)
            {
              ci->ignore[i + 1] = true;
            }
        }

      /* ignore ambiguous symbols */
      if (is_ambiguous_4bit(qsym) or
          is_ambiguous_4bit(p1sym) or
          is_ambiguous_4bit(p2sym))
        {
          ci->ignore[i] = true;
        }

      /* lower case parent symbols that differ from query */

      if ((p1sym != 0U) and (p1sym != qsym))
        {
          ci->paln[0][i] = std::tolower(ci->paln[0][i]);
        }

      if ((p2sym != 0U) and (p2sym != qsym))
        {
          ci->paln[1][i] = std::tolower(ci->paln[1][i]);
        }

      /* compute diffs */

      char diff = '\0';

      if ((qsym != 0U) and (p1sym != 0U) and (p2sym != 0U))
        {
          if (p1sym == p2sym)
            {
              if (qsym == p1sym)
                {
                  diff = ' ';
                }
              else
                {
                  diff = 'N';
                }
            }
          else
            {
              if (qsym == p1sym)
                {
                  diff = 'A';
                }
              else if (qsym == p2sym)
                {
                  diff = 'B';
                }
              else
                {
                  diff = '?';
                }
            }
        }
      else
        {
          /* columns containing a gap carry no information */
          diff = ' ';
        }

      ci->diffs[i] = diff;
    }

  ci->diffs[alnlen] = '\0';

  /* compute score */

  int sumA = 0;
  int sumB = 0;
  int sumN = 0;

  // refactoring: extract to function, use a struct to pass results
  // std::transform(ci->diffs.begin(),
  //                std::next(ci->diffs.begin(), alnlen),
  //                ci->ignore.begin(),
  //                [&sumA, &sumB, &sumN](char const diff, bool const is_ignored) -> void {
  //                  if (is_ignored) { return; }
  //                  if (diff == 'A') {
  //                    ++sumA;
  //                  }
  //                  else if (diff == 'B') {
  //                    ++sumB;
  //                  }
  //                  else if (diff != ' ') {
  //                    ++sumN;
  //                  }
  //                  return;
  //                }
  //                );
  /* totals over all non-ignored columns: A votes, B votes, other diffs */
  for (auto i = 0; i < alnlen; ++i)
    {
      if (ci->ignore[i])
        {
          continue;
        }

      auto const diff = ci->diffs[i];

      if (diff == 'A')
        {
          ++sumA;
        }
      else if (diff == 'B')
        {
          ++sumB;
        }
      else if (diff != ' ')
        {
          ++sumN;
        }
    }

  /* Before the scan every column is on the right-hand side of the
     breakpoint. For the model "A on the left, B on the right" an 'A' on
     the right is a no vote and a 'B' on the right is a yes vote. */
  int left_n = 0;
  int left_a = 0;
  int left_y = 0;
  int right_n = sumA;
  int right_a = sumN;
  int right_y = sumB;

  double best_h = -1;
  int best_i = -1;
  auto best_is_reverse = false;

  int best_left_y = 0;
  int best_right_y = 0;
  int best_left_n = 0;
  int best_right_n = 0;
  int best_left_a = 0;
  int best_right_a = 0;

  for (int i = 0; i < alnlen; ++i)
    {
      if (not ci->ignore[i])
        {
          char const diff = ci->diffs[i];
          if (diff != ' ')
            {
              /* move column i from the right-hand side to the left-hand side */
              if (diff == 'A')
                {
                  ++left_y;
                  --right_n;
                }
              else if (diff == 'B')
                {
                  ++left_n;
                  --right_y;
                }
              else
                {
                  ++left_a;
                  --right_a;
                }

              double left_h = 0;
              double right_h = 0;
              double h = 0;

              if ((left_y > left_n) and (right_y > right_n))
                {
                  left_h = left_y / (opt_xn * (left_n + opt_dn) + left_a);
                  right_h = right_y / (opt_xn * (right_n + opt_dn) + right_a);
                  h = left_h * right_h;

                  if (h > best_h)
                    {
                      best_is_reverse = false;
                      best_h = h;
                      best_i = i;
                      best_left_n = left_n;
                      best_left_y = left_y;
                      best_left_a = left_a;
                      best_right_n = right_n;
                      best_right_y = right_y;
                      best_right_a = right_a;
                    }
                }
              else if ((left_n > left_y) and (right_n > right_y))
                {
                  /* swap left/right and yes/no */

                  left_h = left_n / (opt_xn * (left_y + opt_dn) + left_a);
                  right_h = right_n / (opt_xn * (right_y + opt_dn) + right_a);
                  h = left_h * right_h;

                  if (h > best_h)
                    {
                      best_is_reverse = true;
                      best_h = h;
                      best_i = i;
                      best_left_n = left_y;
                      best_left_y = left_n;
                      best_left_a = left_a;
                      best_right_n = right_y;
                      best_right_y = right_n;
                      best_right_a = right_a;
                    }
                }
            }
        }
    }

  /* a negative best_h means no breakpoint qualified; store 0.0 instead */
  ci->best_h = best_h > 0 ? best_h : 0.0;

  if (best_h >= 0.0)
    {
      status = Status::low_score;

      /* flip A and B if necessary */

      if (best_is_reverse)
        {
          for (int i = 0; i < alnlen; ++i)
            {
              char const diff = ci->diffs[i];
              if (diff == 'A')
                {
                  ci->diffs[i] = 'B';
                }
              else if (diff == 'B')
                {
                  ci->diffs[i] = 'A';
                }
            }
        }

      /* fill in votes and model */

      for (int i = 0; i < alnlen; ++i)
        {
          /* model: parent A up to and including the breakpoint, B after it */
          char const m = i <= best_i ? 'A' : 'B';
          ci->model[i] = m;

          char v = ' ';
          if (not ci->ignore[i])
            {
              char const d = ci->diffs[i];

              if ((d == 'A') or (d == 'B'))
                {
                  if (d == m)
                    {
                      v = '+';
                    }
                  else
                    {
                      v = '!';
                    }
                }
              else if ((d == 'N') or (d == '?'))
                {
                  v = '0';
                }
            }
          ci->votes[i] = v;

          /* lower case diffs for no votes */
          if (v == '!')
            {
              ci->diffs[i] = std::tolower(ci->diffs[i]);
            }
        }

      /* fill in crossover region */

      for (int i = best_i + 1; i < alnlen; ++i)
        {
          if ((ci->diffs[i] == ' ') or (ci->diffs[i] == 'A'))
            {
              ci->model[i] = 'x';
            }
          else
            {
              break;
            }
        }

      ci->votes[alnlen] = 0;
      ci->model[alnlen] = 0;

      /* count matches */

      /* when the model was reversed, parent 1 plays the role of A */
      auto const index_a = best_is_reverse ? 1U : 0U;
      auto const index_b = best_is_reverse ? 0U : 1U;

      int match_QA = 0;
      int match_QB = 0;
      int match_AB = 0;
      int match_QM = 0;
      int cols = 0;

      for (auto i = 0; i < alnlen; i++)
        {
          if (not ci->ignore[i])
            {
              ++cols;

              auto const qsym = map_4bit(ci->qaln[i]);
              auto const asym = map_4bit(ci->paln[index_a][i]);
              auto const bsym = map_4bit(ci->paln[index_b][i]);
              auto const msym = (i <= best_i) ? asym : bsym;

              if (qsym == asym)
                {
                  ++match_QA;
                }

              if (qsym == bsym)
                {
                  ++match_QB;
                }

              if (asym == bsym)
                {
                  ++match_AB;
                }

              if (qsym == msym)
                {
                  ++match_QM;
                }
            }
        }

      int const seqno_a = ci->cand_list[ci->best_parents[index_a]];
      int const seqno_b = ci->cand_list[ci->best_parents[index_b]];

      /* percent identities over the non-ignored columns */
      double const QA = 100.0 * match_QA / cols;
      double const QB = 100.0 * match_QB / cols;
      double const AB = 100.0 * match_AB / cols;
      double const QT = std::max(QA, QB);
      double const QM = 100.0 * match_QM / cols;
      double const divdiff = QM - QT;
      double const divfrac = 100.0 * divdiff / QT;

      int const sumL = best_left_n + best_left_a + best_left_y;
      int const sumR = best_right_n + best_right_a + best_right_y;

      if ((opt_uchime2_denovo != nullptr) or (opt_uchime3_denovo != nullptr))
        {
          // fix -Wfloat-equal: if match_QM == cols, then QM == 100.0
          /* chimeric only if the model matches the query perfectly while
             the closest parent does not */
          if ((match_QM == cols) and (QT < 100.0))
            {
              status = Status::chimeric;
            }
        }
      else if (best_h >= opt_minh)
        {
          status = Status::suspicious;
          if ((divdiff >= opt_mindiv) and
              (sumL >= opt_mindiffs) and
              (sumR >= opt_mindiffs))
            {
              status = Status::chimeric;
            }
        }

      /* print alignment */

      xpthread_mutex_lock(&mutex_output);

      if ((opt_uchimealns != nullptr) and (status == Status::chimeric))
        {
          fprintf(fp_uchimealns, "\n");
          fprintf(fp_uchimealns, "----------------------------------------"
                  "--------------------------------\n");
          fprintf(fp_uchimealns, "Query (%5d nt) ",
                  ci->query_len);
          header_fprint_strip(fp_uchimealns,
                              ci->query_head.data(),
                              ci->query_head_len,
                              opt_xsize,
                              opt_xee,
                              opt_xlength);

          fprintf(fp_uchimealns, "\nParentA (%5" PRIu64 " nt) ",
                  db_getsequencelen(seqno_a));
          header_fprint_strip(fp_uchimealns,
                              db_getheader(seqno_a),
                              db_getheaderlen(seqno_a),
                              opt_xsize,
                              opt_xee,
                              opt_xlength);

          fprintf(fp_uchimealns, "\nParentB (%5" PRIu64 " nt) ",
                  db_getsequencelen(seqno_b));
          header_fprint_strip(fp_uchimealns,
                              db_getheader(seqno_b),
                              db_getheaderlen(seqno_b),
                              opt_xsize,
                              opt_xee,
                              opt_xlength);
          fprintf(fp_uchimealns, "\n\n");

          /* a non-positive opt_alignwidth prints the alignment on one line */
          auto const width = opt_alignwidth > 0 ? opt_alignwidth : alnlen;
          qpos = 0;
          auto p1pos = 0;
          auto p2pos = 0;
          auto rest = alnlen;

          for (auto i = 0; i < alnlen; i += width)
            {
              /* count non-gap symbols on current line */

              auto qnt = 0;
              auto p1nt = 0;
              auto p2nt = 0;

              auto const w = std::min(rest, width);

              for (auto j = 0; j < w; ++j)
                {
                  if (ci->qaln[i + j] != '-')
                    {
                      ++qnt;
                    }
                  if (ci->paln[0][i + j] != '-')
                    {
                      ++p1nt;
                    }
                  if (ci->paln[1][i + j] != '-')
                    {
                      ++p2nt;
                    }
                }

              /* rows are labelled A, Q, B; with a reversed model the
                 second parent is printed as A */
              if (not best_is_reverse)
                {
                  fprintf(fp_uchimealns, "A %5d %.*s %d\n",
                          p1pos + 1, w, &ci->paln[0][i], p1pos + p1nt);
                  fprintf(fp_uchimealns, "Q %5d %.*s %d\n",
                          qpos + 1, w, &ci->qaln[i], qpos + qnt);
                  fprintf(fp_uchimealns, "B %5d %.*s %d\n",
                          p2pos + 1, w, &ci->paln[1][i], p2pos + p2nt);
                }
              else
                {
                  fprintf(fp_uchimealns, "A %5d %.*s %d\n",
                          p2pos + 1, w, &ci->paln[1][i], p2pos + p2nt);
                  fprintf(fp_uchimealns, "Q %5d %.*s %d\n",
                          qpos + 1, w, &ci->qaln[i], qpos + qnt);
                  fprintf(fp_uchimealns, "B %5d %.*s %d\n",
                          p1pos + 1, w, &ci->paln[0][i], p1pos + p1nt);
                }

              fprintf(fp_uchimealns, "Diffs %.*s\n", w, &ci->diffs[i]);
              fprintf(fp_uchimealns, "Votes %.*s\n", w, &ci->votes[i]);
              fprintf(fp_uchimealns, "Model %.*s\n", w, &ci->model[i]);
              fprintf(fp_uchimealns, "\n");

              qpos += qnt;
              p1pos += p1nt;
              p2pos += p2nt;
              rest -= width;
            }

          fprintf(fp_uchimealns, "Ids. QA %.1f%%, QB %.1f%%, AB %.1f%%, "
                  "QModel %.1f%%, Div. %+.1f%%\n",
                  QA, QB, AB, QM, divfrac);

          fprintf(fp_uchimealns, "Diffs Left %d: N %d, A %d, Y %d (%.1f%%); "
                  "Right %d: N %d, A %d, Y %d (%.1f%%), Score %.4f\n",
                  sumL, best_left_n, best_left_a, best_left_y,
                  100.0 * best_left_y / sumL,
                  sumR, best_right_n, best_right_a, best_right_y,
                  100.0 * best_right_y / sumR,
                  best_h);
        }

      if (opt_uchimeout != nullptr)
        {
          /* one tab-separated record: score, query, parent A, parent B, ... */
          fprintf(fp_uchimeout, "%.4f\t", best_h);

          header_fprint_strip(fp_uchimeout,
                              ci->query_head.data(),
                              ci->query_head_len,
                              opt_xsize,
                              opt_xee,
                              opt_xlength);
          fprintf(fp_uchimeout, "\t");
          header_fprint_strip(fp_uchimeout,
                              db_getheader(seqno_a),
                              db_getheaderlen(seqno_a),
                              opt_xsize,
                              opt_xee,
                              opt_xlength);
          fprintf(fp_uchimeout, "\t");
          header_fprint_strip(fp_uchimeout,
                              db_getheader(seqno_b),
                              db_getheaderlen(seqno_b),
                              opt_xsize,
                              opt_xee,
                              opt_xlength);
          fprintf(fp_uchimeout, "\t");

          /* extra column (closest parent) unless opt_uchimeout5 is set */
          if (opt_uchimeout5 == 0)
            {
              if (QA >= QB)
                {
                  header_fprint_strip(fp_uchimeout,
                                      db_getheader(seqno_a),
                                      db_getheaderlen(seqno_a),
                                      opt_xsize,
                                      opt_xee,
                                      opt_xlength);
                }
              else
                {
                  header_fprint_strip(fp_uchimeout,
                                      db_getheader(seqno_b),
                                      db_getheaderlen(seqno_b),
                                      opt_xsize,
                                      opt_xee,
                                      opt_xlength);
                }
              fprintf(fp_uchimeout, "\t");
            }

          fprintf(fp_uchimeout,
                  "%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t"
                  "%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%c\n",
                  QM,
                  QA,
                  QB,
                  AB,
                  QT,
                  best_left_y,
                  best_left_n,
                  best_left_a,
                  best_right_y,
                  best_right_n,
                  best_right_a,
                  divdiff,
                  status == Status::chimeric ? 'Y' : (status == Status::low_score ?
'N' : '?')); } xpthread_mutex_unlock(&mutex_output); } return status; } auto query_init(struct searchinfo_s * search_info) -> void { static constexpr auto overflow_padding = 16U; // 16 * sizeof(short) = 32 bytes search_info->hits_v.resize(tophits); search_info->hits = search_info->hits_v.data(); search_info->kmers_v.reserve(db_getsequencecount() + overflow_padding); search_info->kmers_v.resize(db_getsequencecount()); search_info->kmers = search_info->kmers_v.data(); search_info->hit_count = 0; search_info->uh = unique_init(); search_info->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); search_info->m = minheap_init(tophits); } auto query_exit(struct searchinfo_s * search_info) -> void { search16_exit(search_info->s); unique_exit(search_info->uh); minheap_exit(search_info->m); search_info->qsequence = nullptr; search_info->hits = nullptr; search_info->kmers = nullptr; } auto partition_query(struct chimera_info_s * chimera_info) -> void { auto rest = chimera_info->query_len; auto * cursor = chimera_info->query_seq.data(); for (auto i = 0; i < parts; ++i) { auto const length = (rest + (parts - i - 1)) / (parts - i); auto & search_info = chimera_info->si[i]; search_info.query_no = chimera_info->query_no; search_info.strand = 0; search_info.qsize = chimera_info->query_size; search_info.query_head_len = chimera_info->query_head_len; search_info.query_head = chimera_info->query_head.data(); search_info.qseqlen = length; assert(static_cast(length) <= search_info.qsequence_v.size()); std::copy(cursor, std::next(cursor, length), search_info.qsequence_v.begin()); search_info.qsequence_v[length] = '\0'; rest -= length; cursor = 
std::next(cursor, length); } } auto chimera_thread_init(struct chimera_info_s * ci) -> void { for (int i = 0; i < maxparts; ++i) { query_init(&ci->si[i]); } ci->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); } auto chimera_thread_exit(struct chimera_info_s * ci) -> void { search16_exit(ci->s); for (auto & a_search_info : ci->si) { query_exit(&a_search_info); } } auto chimera_thread_core(struct chimera_info_s * ci) -> uint64_t { chimera_thread_init(ci); std::vector allhits_list(maxcandidates); struct Scoring scoring; scoring.match = opt_match; scoring.mismatch = opt_mismatch; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_open_query_left = opt_gap_open_query_left; scoring.gap_open_target_left = opt_gap_open_target_left; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_open_target_interior = opt_gap_open_target_interior; scoring.gap_open_query_right = opt_gap_open_query_right; scoring.gap_open_target_right = opt_gap_open_target_right; scoring.gap_extension_query_left = opt_gap_extension_query_left; scoring.gap_extension_target_left = opt_gap_extension_target_left; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_extension_target_interior = opt_gap_extension_target_interior; scoring.gap_extension_query_right = opt_gap_extension_query_right; scoring.gap_extension_target_right = opt_gap_extension_target_right; LinearMemoryAligner lma(scoring); while (true) { /* get next sequence */ xpthread_mutex_lock(&mutex_input); if (opt_uchime_ref != nullptr) { if 
(fasta_next(query_fasta_h, (opt_notrunclabels == 0), chrmap_no_change_vector.data())) { ci->query_head_len = fasta_get_header_length(query_fasta_h); ci->query_len = fasta_get_sequence_length(query_fasta_h); ci->query_no = fasta_get_seqno(query_fasta_h); ci->query_size = fasta_get_abundance(query_fasta_h); /* if necessary expand memory for arrays based on query length */ realloc_arrays(ci); /* copy the data locally (query seq, head) */ std::strcpy(ci->query_head.data(), fasta_get_header(query_fasta_h)); std::strcpy(ci->query_seq.data(), fasta_get_sequence(query_fasta_h)); } else { xpthread_mutex_unlock(&mutex_input); break; /* end while loop */ } } else { if (seqno < db_getsequencecount()) { ci->query_no = seqno; ci->query_head_len = db_getheaderlen(seqno); ci->query_len = db_getsequencelen(seqno); ci->query_size = db_getabundance(seqno); /* if necessary expand memory for arrays based on query length */ realloc_arrays(ci); std::strcpy(ci->query_head.data(), db_getheader(seqno)); std::strcpy(ci->query_seq.data(), db_getsequence(seqno)); } else { xpthread_mutex_unlock(&mutex_input); break; /* end while loop */ } } xpthread_mutex_unlock(&mutex_input); auto status = Status::no_parents; /* partition query */ partition_query(ci); /* perform searches and collect candidate parents */ ci->cand_count = 0; auto allhits_count = 0; if (ci->query_len >= parts) { std::vector hits; for (auto i = 0; i < parts; ++i) { search_onequery(&ci->si[i], opt_qmask); search_joinhits(&ci->si[i], nullptr, hits); for (auto const & hit : hits) { if (hit.accepted) { allhits_list[allhits_count] = hit; ++allhits_count; } } hits.clear(); } } for (auto i = 0; i < allhits_count; ++i) { unsigned int const target = allhits_list[i].target; /* skip duplicates */ auto k = 0; for (k = 0; k < ci->cand_count; ++k) { if (ci->cand_list[k] == target) { break; } } if (k == ci->cand_count) { ci->cand_list[ci->cand_count] = target; ++ci->cand_count; } /* deallocate cigar */ if (allhits_list[i].nwalignment != nullptr) 
{ xfree(allhits_list[i].nwalignment); allhits_list[i].nwalignment = nullptr; } } /* align full query to each candidate */ search16_qprep(ci->s, ci->query_seq.data(), ci->query_len); search16(ci->s, ci->cand_count, ci->cand_list.data(), ci->snwscore.data(), ci->snwalignmentlength.data(), ci->snwmatches.data(), ci->snwmismatches.data(), ci->snwgaps.data(), ci->nwcigar.data()); for (auto i = 0; i < ci->cand_count; ++i) { int64_t const target = ci->cand_list[i]; int64_t nwscore = ci->snwscore[i]; char * nwcigar = nullptr; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; if (nwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ auto * tseq = db_getsequence(target); int64_t const tseqlen = db_getsequencelen(target); if (ci->nwcigar[i] != nullptr) { xfree(ci->nwcigar[i]); } nwcigar = xstrdup(lma.align(ci->query_seq.data(), tseq, ci->query_len, tseqlen)); lma.alignstats(nwcigar, ci->query_seq.data(), tseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps); ci->nwcigar[i] = nwcigar; ci->nwscore[i] = nwscore; ci->nwalignmentlength[i] = nwalignmentlength; ci->nwmatches[i] = nwmatches; ci->nwmismatches[i] = nwmismatches; ci->nwgaps[i] = nwgaps; } else { ci->nwscore[i] = ci->snwscore[i]; ci->nwalignmentlength[i] = ci->snwalignmentlength[i]; ci->nwmatches[i] = ci->snwmatches[i]; ci->nwmismatches[i] = ci->snwmismatches[i]; ci->nwgaps[i] = ci->snwgaps[i]; } } /* find the best pair of parents, then compute score for them */ if (opt_chimeras_denovo != nullptr) { /* long high-quality reads */ if (find_best_parents_long(ci) != 0) { status = eval_parents_long(ci); } else { status = Status::no_parents; } } else { if (find_best_parents(ci) != 0) { status = eval_parents(ci); } else { status = Status::no_parents; } } /* output results */ xpthread_mutex_lock(&mutex_output); ++total_count; total_abundance += ci->query_size; if (status 
== Status::chimeric) { ++chimera_count; chimera_abundance += ci->query_size; if (opt_chimeras != nullptr) { fasta_print_general(fp_chimeras, nullptr, ci->query_seq.data(), ci->query_len, ci->query_head.data(), ci->query_head_len, ci->query_size, chimera_count, -1.0, -1, -1, opt_fasta_score ? ( (opt_uchime_ref != nullptr) ? "uchime_ref" : "uchime_denovo" ) : nullptr, ci->best_h); } } if (status == Status::suspicious) { ++borderline_count; borderline_abundance += ci->query_size; if (opt_borderline != nullptr) { fasta_print_general(fp_borderline, nullptr, ci->query_seq.data(), ci->query_len, ci->query_head.data(), ci->query_head_len, ci->query_size, borderline_count, -1.0, -1, -1, opt_fasta_score ? ( (opt_uchime_ref != nullptr) ? "uchime_ref" : "uchime_denovo" ) : nullptr, ci->best_h); } } if (status < Status::suspicious) { ++nonchimera_count; nonchimera_abundance += ci->query_size; /* output no parents, no chimeras */ if ((status < Status::low_score) and (opt_uchimeout != nullptr)) { fprintf(fp_uchimeout, "0.0000\t"); header_fprint_strip(fp_uchimeout, ci->query_head.data(), ci->query_head_len, opt_xsize, opt_xee, opt_xlength); if (opt_uchimeout5 != 0) { fprintf(fp_uchimeout, "\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN\n"); } else { fprintf(fp_uchimeout, "\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN\n"); } } if (opt_nonchimeras != nullptr) { fasta_print_general(fp_nonchimeras, nullptr, ci->query_seq.data(), ci->query_len, ci->query_head.data(), ci->query_head_len, ci->query_size, nonchimera_count, -1.0, -1, -1, opt_fasta_score ? ( (opt_uchime_ref != nullptr) ? 
"uchime_ref" : "uchime_denovo" ) : nullptr, ci->best_h); } } if (status < Status::suspicious) { /* uchime_denovo: add non-chimeras to db */ if ((opt_uchime_denovo != nullptr) or (opt_uchime2_denovo != nullptr) or (opt_uchime3_denovo != nullptr) or (opt_chimeras_denovo != nullptr)) { dbindex_addsequence(seqno, opt_qmask); } } for (auto i = 0; i < ci->cand_count; ++i) { if (ci->nwcigar[i] != nullptr) { xfree(ci->nwcigar[i]); } } if (opt_uchime_ref != nullptr) { progress = fasta_get_position(query_fasta_h); } else { progress += db_getsequencelen(seqno); } progress_update(progress); ++seqno; xpthread_mutex_unlock(&mutex_output); } chimera_thread_exit(ci); return 0; } auto chimera_thread_worker(void * vp) -> void * { return (void *) chimera_thread_core(cia + (int64_t) vp); } auto chimera_threads_run() -> void { xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* create worker threads */ for (int64_t t = 0; t < opt_threads; ++t) { xpthread_create(pthread + t, & attr, chimera_thread_worker, (void *) t); } /* finish worker threads */ for (int64_t t = 0; t < opt_threads; ++t) { xpthread_join(pthread[t], nullptr); } xpthread_attr_destroy(&attr); } auto open_chimera_file(std::FILE ** output_stream, char const * name) -> void { if (name != nullptr) { *output_stream = fopen_output(name); if (*output_stream == nullptr) { fatal("Unable to open file %s for writing", name); } } else { *output_stream = nullptr; } } auto close_chimera_file(std::FILE * output_stream) -> void { if (output_stream != nullptr) { std::fclose(output_stream); } } auto chimera(struct Parameters const & parameters) -> void { open_chimera_file(&fp_chimeras, opt_chimeras); open_chimera_file(&fp_nonchimeras, opt_nonchimeras); open_chimera_file(&fp_borderline, opt_borderline); if (parameters.opt_chimeras_denovo != nullptr) { open_chimera_file(&fp_uchimealns, opt_alnout); open_chimera_file(&fp_uchimeout, opt_tabbedout); } else { open_chimera_file(&fp_uchimealns, 
opt_uchimealns); open_chimera_file(&fp_uchimeout, opt_uchimeout); } /* override any options the user might have set */ opt_maxaccepts = few; opt_maxrejects = rejects; opt_id = chimera_id; if (parameters.opt_strand) { fatal("Only --strand plus is allowed with uchime_ref."); } if (parameters.opt_uchime_ref == nullptr) { opt_self = 1; opt_selfid = 1; opt_threads = 1; opt_maxsizeratio = 1.0 / opt_abskew; } tophits = opt_maxaccepts + opt_maxrejects; uint64_t progress_total = 0; chimera_count = 0; nonchimera_count = 0; progress = 0; seqno = 0; /* prepare threads */ std::vector pthread_v(opt_threads); pthread = pthread_v.data(); std::vector cia_v(opt_threads); cia = cia_v.data(); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); char const * denovo_dbname = nullptr; /* prepare queries / database */ if (parameters.opt_uchime_ref != nullptr) { /* check if the reference database may be an UDB file */ auto const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); } else { db_read(opt_db, 0); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) and (opt_hardmask != 0)) { hardmask_all(); } dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } query_fasta_h = fasta_open(parameters.opt_uchime_ref); progress_total = fasta_get_size(query_fasta_h); } else { if (parameters.opt_uchime_denovo != nullptr) { denovo_dbname = parameters.opt_uchime_denovo; } else if (parameters.opt_uchime2_denovo != nullptr) { denovo_dbname = parameters.opt_uchime2_denovo; } else if (parameters.opt_uchime3_denovo != nullptr) { denovo_dbname = parameters.opt_uchime3_denovo; } else if (parameters.opt_chimeras_denovo != nullptr) { denovo_dbname = parameters.opt_chimeras_denovo; } else { fatal("Internal error"); } db_read(denovo_dbname, 0); if (parameters.opt_qmask == MASK_DUST) { dust_all(); } else if ((parameters.opt_qmask == MASK_SOFT) and (opt_hardmask != 0)) 
{ hardmask_all(); } db_sortbyabundance(); dbindex_prepare(1, parameters.opt_qmask); progress_total = db_getnucleotidecount(); } if (parameters.opt_log != nullptr) { if ((parameters.opt_uchime_ref != nullptr) or (parameters.opt_uchime_denovo != nullptr)) { fprintf(fp_log, "%8.2f minh\n", opt_minh); } auto const is_a_uchime_command = (parameters.opt_uchime_ref != nullptr) or (parameters.opt_uchime_denovo != nullptr) or (parameters.opt_uchime2_denovo != nullptr) or (parameters.opt_uchime3_denovo != nullptr); if (is_a_uchime_command) { fprintf(fp_log, "%8.2f xn\n", opt_xn); fprintf(fp_log, "%8.2f dn\n", opt_dn); fprintf(fp_log, "%8.2f xa\n", 1.0); } if ((parameters.opt_uchime_ref != nullptr) or (parameters.opt_uchime_denovo != nullptr)) { fprintf(fp_log, "%8.2f mindiv\n", opt_mindiv); } fprintf(fp_log, "%8.2f id\n", opt_id); if (is_a_uchime_command) { fprintf(fp_log, "%8d maxp\n", 2); } fprintf(fp_log, "\n"); } progress_init("Detecting chimeras", progress_total); chimera_threads_run(); progress_done(); if (not parameters.opt_quiet) { if (total_count > 0) { if (parameters.opt_chimeras_denovo != nullptr) { fprintf(stderr, "Found %d (%.1f%%) chimeras and " "%d (%.1f%%) non-chimeras " "in %u unique sequences.\n", chimera_count, 100.0 * chimera_count / total_count, nonchimera_count, 100.0 * nonchimera_count / total_count, total_count); } else { fprintf(stderr, "Found %d (%.1f%%) chimeras, " "%d (%.1f%%) non-chimeras,\n" "and %d (%.1f%%) borderline sequences " "in %u unique sequences.\n", chimera_count, 100.0 * chimera_count / total_count, nonchimera_count, 100.0 * nonchimera_count / total_count, borderline_count, 100.0 * borderline_count / total_count, total_count); } } else { if (parameters.opt_chimeras_denovo != nullptr) { fprintf(stderr, "Found %d chimeras and " "%d non-chimeras " "in %u unique sequences.\n", chimera_count, nonchimera_count, total_count); } else { fprintf(stderr, "Found %d chimeras, " "%d non-chimeras,\n" "and %d borderline sequences " "in %u unique 
sequences.\n", chimera_count, nonchimera_count, borderline_count, total_count); } } if (total_abundance > 0) { if (parameters.opt_chimeras_denovo != nullptr) { fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " (%.1f%%) chimeras and " "%" PRId64 " (%.1f%%) non-chimeras " "in %" PRId64 " total sequences.\n", chimera_abundance, 100.0 * chimera_abundance / total_abundance, nonchimera_abundance, 100.0 * nonchimera_abundance / total_abundance, total_abundance); } else { fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " (%.1f%%) chimeras, " "%" PRId64 " (%.1f%%) non-chimeras,\n" "and %" PRId64 " (%.1f%%) borderline sequences " "in %" PRId64 " total sequences.\n", chimera_abundance, 100.0 * chimera_abundance / total_abundance, nonchimera_abundance, 100.0 * nonchimera_abundance / total_abundance, borderline_abundance, 100.0 * borderline_abundance / total_abundance, total_abundance); } } else { if (parameters.opt_chimeras_denovo != nullptr) { fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " chimeras, " "%" PRId64 " non-chimeras " "in %" PRId64 " total sequences.\n", chimera_abundance, nonchimera_abundance, total_abundance); } else { fprintf(stderr, "Taking abundance information into account, " "this corresponds to\n" "%" PRId64 " chimeras, " "%" PRId64 " non-chimeras,\n" "and %" PRId64 " borderline sequences " "in %" PRId64 " total sequences.\n", chimera_abundance, nonchimera_abundance, borderline_abundance, total_abundance); } } } if (parameters.opt_log != nullptr) { if (parameters.opt_uchime_ref != nullptr) { fprintf(fp_log, "%s", parameters.opt_uchime_ref); } else { fprintf(fp_log, "%s", denovo_dbname); } if (seqno > 0) { fprintf(fp_log, ": %d/%u chimeras (%.1f%%)\n", chimera_count, seqno, 100.0 * chimera_count / seqno); } else { fprintf(fp_log, ": %d/%u chimeras\n", chimera_count, seqno); } } if (parameters.opt_uchime_ref != 
nullptr) { fasta_close(query_fasta_h); } dbindex_free(); db_free(); xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); close_chimera_file(fp_borderline); close_chimera_file(fp_uchimeout); close_chimera_file(fp_uchimealns); close_chimera_file(fp_nonchimeras); close_chimera_file(fp_chimeras); show_rusage(); } vsearch-2.30.1/src/chimera.h000066400000000000000000000050441506776541700156250ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ constexpr auto maxparents = 20; /* max, could be fewer */ auto chimera(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/city.cc000066400000000000000000000353651506776541700153340ustar00rootroot00000000000000// Copyright (c) 2011 Google, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // CityHash, by Geoff Pike and Jyrki Alakuijala // // This file provides CityHash64() and related functions. // // It's probably possible to create even faster hash functions by // writing a program that systematically explores some of the space of // possible hash functions, by using SIMD instructions, or by // compromising on hash quality. #include "config.h" #include #include "utils/os_byteswap.hpp" #include // std::swap #include #include // int32_t, uint8_t, uint32_t, uint64_t #include // std::size_t #include // std::memcpy, std::memset #include // std::pair, std::make_pair // anonymous namespace: limit visibility and usage to this translation unit namespace { // Some primes between 2^63 and 2^64 for various uses. constexpr uint64_t k0 = 0xc3a5c85c97cb3127ULL; // uint128 seeds? constexpr uint64_t k1 = 0xb492b66fbe98f273ULL; // uint128 seeds? constexpr uint64_t k2 = 0x9ae16a3b2f90404fULL; // uint64 initialization value? 
// shift values constexpr auto eighteen = 18U; constexpr auto twentyone = 21U; constexpr auto twentyfive = 25U; constexpr auto twentyseven = 27U; constexpr auto thirty = 30U; constexpr auto thirtythree = 33U; constexpr auto thirtyfive = 35U; constexpr auto thirtyseven = 37U; constexpr auto fortytwo = 42U; constexpr auto fortythree = 43U; constexpr auto fortyfour = 44U; constexpr auto fortyseven = 47U; constexpr auto fortynine = 49U; constexpr auto fiftythree = 53U; constexpr auto sixtyfour = 64U; // refactoring: offset values (in bytes) // constexpr auto offset_8 = 8U; // constexpr auto offset_16 = 16U; // constexpr auto offset_24 = 24U; // constexpr auto offset_32 = 32U; // constexpr auto offset_40 = 40U; // constexpr auto offset_48 = 48U; // constexpr auto offset_56 = 56U; inline auto Uint128Low64(const uint128 & a_pair) -> uint64_t { return a_pair.first; } inline auto Uint128High64(const uint128 & a_pair) -> uint64_t { return a_pair.second; } auto unaligned_load64(const char * src) -> uint64_t { uint64_t result = 0; std::memcpy(&result, src, sizeof(result)); return result; } auto unaligned_load32(const char * src) -> uint32_t { uint32_t result = 0; std::memcpy(&result, src, sizeof(result)); return result; } // C++23 refactoring: // #include // constexpr auto uint32_in_expected_order(uint32_t x) noexcept -> uint32_t { // if constexpr (std::endian::native == std::endian::big) { // return std::byteswap(x); } // else { // return x; } // } constexpr auto uint32_in_expected_order(uint32_t src) noexcept -> uint32_t { #ifdef WORDS_BIGENDIAN return bswap_32(src); #else return src; #endif } constexpr auto uint64_in_expected_order(uint64_t src) noexcept -> uint64_t { #ifdef WORDS_BIGENDIAN return bswap_64(src); #else return src; #endif } auto Fetch64(const char * src) -> uint64_t { return uint64_in_expected_order(unaligned_load64(src)); } auto Fetch32(const char * src) -> uint32_t { return uint32_in_expected_order(unaligned_load32(src)); } // Bitwise right rotate. 
Normally this will compile to a single // instruction, especially if the shift is a manifest constant. // C++20 refactoring: std::rotr() auto Rotate(uint64_t const val, unsigned int const shift) -> uint64_t { assert(shift != sixtyfour); // shifting by 64 yields undefined results return shift == 0 ? val : ((val >> shift) | (val << (sixtyfour - shift))); } auto ShiftMix(uint64_t const val) -> uint64_t { return val ^ (val >> fortyseven); } // Hash 128 input bits down to 64 bits of output. // This is intended to be a reasonably good hash function. inline auto Hash128to64(const uint128 & a_pair) -> uint64_t { // Murmur-inspired hashing. static constexpr auto divider = 47U; static constexpr uint64_t kMul = 0x9ddfea08eb382d69ULL; auto reg_a = (Uint128Low64(a_pair) ^ Uint128High64(a_pair)) * kMul; reg_a ^= (reg_a >> divider); auto reg_b = (Uint128High64(a_pair) ^ reg_a) * kMul; reg_b ^= (reg_b >> divider); reg_b *= kMul; return reg_b; } auto HashLen16(uint64_t const first, uint64_t const second) -> uint64_t { return Hash128to64(uint128(first, second)); } auto HashLen16(uint64_t const first, uint64_t const second, uint64_t const mul) -> uint64_t { // Murmur-inspired hashing. 
auto reg_a = (first ^ second) * mul; reg_a ^= (reg_a >> fortyseven); auto reg_b = (second ^ reg_a) * mul; reg_b ^= (reg_b >> fortyseven); reg_b *= mul; return reg_b; } auto HashLen0to16(const char * seq, std::size_t const len) -> uint64_t { if (len >= 8) { const uint64_t mul = k2 + (len * 2); const uint64_t a = Fetch64(seq) + k2; const uint64_t b = Fetch64(seq + len - 8); const uint64_t c = (Rotate(b, thirtyseven) * mul) + a; const uint64_t d = (Rotate(a, twentyfive) + b) * mul; return HashLen16(c, d, mul); } if (len >= 4) { const uint64_t mul = k2 + (len * 2); const uint64_t a = Fetch32(seq); return HashLen16(len + (a << 3U), Fetch32(seq + len - 4), mul); } if (len > 0) { uint8_t const a = seq[0]; uint8_t const b = seq[len >> 1U]; uint8_t const c = seq[len - 1]; const uint32_t y = static_cast(a) + (static_cast(b) << 8U); const uint32_t z = len + (static_cast(c) << 2U); return ShiftMix((y * k2) ^ (z * k0)) * k2; } return k2; // initialization value for empty sequences } // This probably works well for 16-byte strings as well, but it may be overkill // in that case. auto HashLen17to32(const char * seq, std::size_t const len) -> uint64_t { const uint64_t mul = k2 + (len * 2); const uint64_t a = Fetch64(seq) * k1; const uint64_t b = Fetch64(seq + 8); const uint64_t c = Fetch64(seq + len - 8) * mul; const uint64_t d = Fetch64(seq + len - 16) * k2; return HashLen16(Rotate(a + b, fortythree) + Rotate(c, thirty) + d, a + Rotate(b + k2, eighteen) + c, mul); } // Return a 16-byte hash for 48 bytes. Quick and dirty. // Callers do best to use "random-looking" values for a and b. auto WeakHashLen32WithSeeds(uint64_t const w, uint64_t const x, uint64_t const y, uint64_t const z, uint64_t a, uint64_t b) -> std::pair { a += w; b = Rotate(b + a + z, twentyone); const uint64_t c = a; a += x; a += y; b += Rotate(a, fortyfour); return std::make_pair(a + z, b + c); } // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. 
auto WeakHashLen32WithSeeds(const char * seq, uint64_t a, uint64_t b) -> std::pair { return WeakHashLen32WithSeeds(Fetch64(seq), Fetch64(seq + 8), Fetch64(seq + 16), Fetch64(seq + 24), a, b); } // Return an 8-byte hash for 33 to 64 bytes. auto HashLen33to64(const char * seq, std::size_t len) -> uint64_t { const uint64_t mul = k2 + (len * 2); uint64_t a = Fetch64(seq) * k2; uint64_t b = Fetch64(seq + 8); const uint64_t c = Fetch64(seq + len - 24); const uint64_t d = Fetch64(seq + len - 32); const uint64_t e = Fetch64(seq + 16) * k2; const uint64_t f = Fetch64(seq + 24) * 9; const uint64_t g = Fetch64(seq + len - 8); const uint64_t h = Fetch64(seq + len - 16) * mul; const uint64_t u = Rotate(a + g, fortythree) + ((Rotate(b, thirty) + c) * 9); const uint64_t v = ((a + g) ^ d) + f + 1; const uint64_t w = bswap_64((u + v) * mul) + h; const uint64_t x = Rotate(e + f, fortytwo) + c; const uint64_t y = (bswap_64((v + w) * mul) + g) * mul; const uint64_t z = e + f + c; a = bswap_64(((x + z) * mul) + y) + b; b = ShiftMix(((z + a) * mul) + d + h) * mul; return b + x; } // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings // of any length representable in signed long. Based on City and Murmur. auto CityMurmur(const char * seq, std::size_t len, uint128 seed) -> uint128 { uint64_t a = Uint128Low64(seed); uint64_t b = Uint128High64(seed); uint64_t c = 0; uint64_t d = 0; signed long l = len - 16; if (l <= 0) { // len <= 16 a = ShiftMix(a * k1) * k1; c = (b * k1) + HashLen0to16(seq, len); d = ShiftMix(a + (len >= 8 ? 
Fetch64(seq) : c)); } else { // len > 16 c = HashLen16(Fetch64(seq + len - 8) + k1, a); d = HashLen16(b + len, c + Fetch64(seq + len - 16)); a += d; do { a ^= ShiftMix(Fetch64(seq) * k1) * k1; a *= k1; b ^= a; c ^= ShiftMix(Fetch64(seq + 8) * k1) * k1; c *= k1; d ^= c; seq += 16; l -= 16; } while (l > 0); } a = HashLen16(a, c); b = HashLen16(d, b); return uint128(a ^ b, HashLen16(b, a)); } constexpr auto likely(bool condition) noexcept -> bool { #if HAVE_BUILTIN_EXPECT static constexpr auto is_expected_to_be_true = 1L; // !!condition converts to a strict boolean value return __builtin_expect(!!condition, is_expected_to_be_true); #else return condition; #endif } auto CityHash128WithSeed(const char * seq, std::size_t len, uint128 seed) -> uint128 { if (len < 128) { return CityMurmur(seq, len, seed); } // We expect len >= 128 to be the common case. Keep 56 bytes of state: // v, w, x, y, and z. std::pair v; std::pair w; uint64_t x = Uint128Low64(seed); uint64_t y = Uint128High64(seed); uint64_t z = len * k1; v.first = (Rotate(y ^ k1, fortynine) * k1) + Fetch64(seq); v.second = (Rotate(v.first, fortytwo) * k1) + Fetch64(seq + 8); w.first = (Rotate(y + z, thirtyfive) * k1) + x; w.second = Rotate(x + Fetch64(seq + 88), fiftythree) * k1; // This is the same inner loop as CityHash64(), manually unrolled. 
do { x = Rotate(x + y + v.first + Fetch64(seq + 8), thirtyseven) * k1; y = Rotate(y + v.second + Fetch64(seq + 48), fortytwo) * k1; x ^= w.second; y += v.first + Fetch64(seq + 40); z = Rotate(z + w.first, thirtythree) * k1; v = WeakHashLen32WithSeeds(seq, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(seq + 32, z + w.second, y + Fetch64(seq + 16)); std::swap(z, x); seq += 64; x = Rotate(x + y + v.first + Fetch64(seq + 8), thirtyseven) * k1; y = Rotate(y + v.second + Fetch64(seq + 48), fortytwo) * k1; x ^= w.second; y += v.first + Fetch64(seq + 40); z = Rotate(z + w.first, thirtythree) * k1; v = WeakHashLen32WithSeeds(seq, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(seq + 32, z + w.second, y + Fetch64(seq + 16)); std::swap(z, x); seq += 64; len -= 128; } while (likely(len >= 128)); // hot path x += Rotate(v.first + z, fortynine) * k0; y = (y * k0) + Rotate(w.second, thirtyseven); z = (z * k0) + Rotate(w.first, twentyseven); w.first *= 9; v.first *= k0; // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. for (std::size_t tail_done = 0; tail_done < len; ) { tail_done += 32; y = (Rotate(x + y, fortytwo) * k0) + v.second; w.first += Fetch64(seq + len - tail_done + 16); x = (x * k0) + w.first; z += w.second + Fetch64(seq + len - tail_done); w.second += v.first; v = WeakHashLen32WithSeeds(seq + len - tail_done, v.first + z, v.second); v.first *= k0; } // At this point our 56 bytes of state should contain more than // enough information for a strong 128-bit hash. We use two // different 56-byte-to-8-byte hashes to get a 16-byte final result. 
x = HashLen16(x, v.first); y = HashLen16(y + z, w.first); return uint128(HashLen16(x + v.second, w.second) + y, HashLen16(x + w.second, y + v.second)); } } // end of anonymous namespace auto CityHash64(const char * seq, std::size_t len) -> uint64_t { if (len <= 16) { return HashLen0to16(seq, len); } if (len <= 32) { return HashLen17to32(seq, len); } if (len <= 64) { return HashLen33to64(seq, len); } // For strings over 64 bytes we hash the end first, and then as we // loop we keep 56 bytes of state: v, w, x, y, and z. uint64_t x = Fetch64(seq + len - 40); uint64_t y = Fetch64(seq + len - 16) + Fetch64(seq + len - 56); uint64_t z = HashLen16(Fetch64(seq + len - 48) + len, Fetch64(seq + len - 24)); auto v = WeakHashLen32WithSeeds(seq + len - 64, len, z); auto w = WeakHashLen32WithSeeds(seq + len - 32, y + k1, x); x = (x * k1) + Fetch64(seq); // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. len = (len - 1) & ~static_cast(63); do { x = Rotate(x + y + v.first + Fetch64(seq + 8), thirtyseven) * k1; y = Rotate(y + v.second + Fetch64(seq + 48), fortytwo) * k1; x ^= w.second; y += v.first + Fetch64(seq + 40); z = Rotate(z + w.first, thirtythree) * k1; v = WeakHashLen32WithSeeds(seq, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(seq + 32, z + w.second, y + Fetch64(seq + 16)); std::swap(z, x); seq += 64; len -= 64; } while (len != 0); return HashLen16(HashLen16(v.first, w.first) + (ShiftMix(y) * k1) + z, HashLen16(v.second, w.second) + x); } auto CityHash128(const char * seq, std::size_t len) -> uint128 { return len >= 16 ? CityHash128WithSeed(seq + 16, len - 16, uint128(Fetch64(seq), Fetch64(seq + 8) + k0)) : CityHash128WithSeed(seq, len, uint128(k0, k1)); } vsearch-2.30.1/src/city.h000066400000000000000000000071071506776541700151670ustar00rootroot00000000000000// Copyright (c) 2011 Google, Inc. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // CityHash, by Geoff Pike and Jyrki Alakuijala // // http://code.google.com/p/cityhash/ // // This file provides a few functions for hashing strings. All of them are // high-quality functions in the sense that they pass standard tests such // as Austin Appleby's SMHasher. They are also fast. // // For 64-bit x86 code, on short strings, we don't know of anything faster than // CityHash64 that is of comparable quality. We believe our nearest competitor // is Murmur3. For 64-bit x86 code, CityHash64 is an excellent choice for hash // tables and most other hashing (excluding cryptography). // // For 64-bit x86 code, on long strings (> 900), the picture is more complicated. // On many recent Intel CPUs, such as Nehalem, Westmere, Sandy Bridge, etc., // CityHashCrc128 appears to be faster than all competitors of comparable // quality. CityHash128 is also good but not quite as fast. 
We believe our // nearest competitor is Bob Jenkins' Spooky. We don't have great data for // other 64-bit CPUs, but for long strings we know that Spooky is slightly // faster than CityHash on some relatively recent AMD x86-64 CPUs, for example. // Note that CityHashCrc128 is declared in citycrc.h. // // For 32-bit x86 code, we don't know of anything faster than CityHash32 that // is of comparable quality. We believe our nearest competitor is Murmur3A. // (On 64-bit CPUs, it is typically faster to use the other CityHash variants.) // // Functions in the CityHash family are not suitable for cryptography. // // Please see CityHash's README file for more details on our performance // measurements and so on. // // WARNING: This code has been only lightly tested on big-endian platforms! // It is known to work well on little-endian platforms that have a small penalty // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. // It should work on all 32-bit and 64-bit platforms that allow unaligned reads; // bug reports are welcome. // // By the way, for some hash functions, given strings a and b, the hash // of a+b is easily derived from the hashes of a and b. This property // doesn't hold for any hash functions in this file. #ifndef CITY_HASH_H_ #define CITY_HASH_H_ #include // uint64_t #include // std::size_t #include // std::pair using uint128 = std::pair; // Hash functions for byte arrays auto CityHash64(const char * seq, std::size_t len) -> uint64_t; auto CityHash128(const char * seq, std::size_t len) -> uint128; #endif // CITY_HASH_H_ vsearch-2.30.1/src/cluster.cc000066400000000000000000001507001506776541700160340ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "align_simd.h" #include "attributes.h" #include "dbindex.h" #include "linmemalign.h" #include "mask.h" #include "minheap.h" #include "msa.h" #include "otutable.h" #include "unique.h" #include "utils/fatal.hpp" #include "utils/xpthread.hpp" #include // std::count, std::minmax_element, std::max_element, std::min #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::qsort #include // std::strcpy, std::strlen #include #include #include // std::get #include static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ struct clusterinfo_s { int seqno; int clusterno; char * cigar; int strand; }; using clusterinfo_t = struct clusterinfo_s; static clusterinfo_t * clusterinfo = nullptr; static int clusters = 0; static int count_matched = 0; static int count_notmatched = 0; static int64_t * cluster_abundance; static std::FILE * fp_centroids = nullptr; static std::FILE * fp_uc = nullptr; static std::FILE * fp_alnout = nullptr; static std::FILE * fp_samout = nullptr; static std::FILE * fp_userout = nullptr; static std::FILE * fp_blast6out = nullptr; static std::FILE * fp_fastapairs = nullptr; static std::FILE * fp_matched = nullptr; static std::FILE * fp_notmatched = nullptr; static std::FILE * fp_otutabout = nullptr; static std::FILE * fp_mothur_shared_out = nullptr; static std::FILE * fp_biomout = 
nullptr; static std::FILE * fp_qsegout = nullptr; static std::FILE * fp_tsegout = nullptr; static pthread_attr_t attr; static struct searchinfo_s * si_plus; static struct searchinfo_s * si_minus; struct thread_info_s { pthread_t thread; pthread_mutex_t mutex; pthread_cond_t cond; int work; int query_first; int query_count; }; using thread_info_t = struct thread_info_s; static thread_info_t * ti; inline auto compare_byclusterno(const void * a, const void * b) -> int { auto * lhs = (clusterinfo_t *) a; auto * rhs = (clusterinfo_t *) b; if (lhs->clusterno < rhs->clusterno) { return -1; } if (lhs->clusterno > rhs->clusterno) { return +1; } if (lhs->seqno < rhs->seqno) { return -1; } if (lhs->seqno > rhs->seqno) { return +1; } return 0; } inline auto compare_byclusterabundance(const void * a, const void * b) -> int { auto * lhs = (clusterinfo_t *) a; auto * rhs = (clusterinfo_t *) b; if (cluster_abundance[lhs->clusterno] > cluster_abundance[rhs->clusterno]) { return -1; } if (cluster_abundance[lhs->clusterno] < cluster_abundance[rhs->clusterno]) { return +1; } if (lhs->clusterno < rhs->clusterno) { return -1; } if (lhs->clusterno > rhs->clusterno) { return +1; } if (lhs->seqno < rhs->seqno) { return -1; } if (lhs->seqno > rhs->seqno) { return +1; } return 0; } inline auto cluster_query_core(struct searchinfo_s * si) -> void { /* the main core function for clustering */ /* get sequence etc */ const int seqno = si->query_no; si->query_head_len = db_getheaderlen(seqno); si->query_head = db_getheader(seqno); si->qsize = db_getabundance(seqno); si->qseqlen = db_getsequencelen(seqno); if (si->strand != 0) { reverse_complement(si->qsequence, db_getsequence(seqno), si->qseqlen); } else { std::strcpy(si->qsequence, db_getsequence(seqno)); } /* perform search */ search_onequery(si, opt_qmask); } inline auto cluster_worker(int64_t t) -> void { /* wrapper for the main threaded core function for clustering */ for (int q = 0; q < ti[t].query_count; q++) { cluster_query_core(si_plus + 
ti[t].query_first + q); if (opt_strand > 1) { cluster_query_core(si_minus + ti[t].query_first + q); } } } auto threads_worker(void * vp) -> void * { auto t = (int64_t) vp; thread_info_s * tip = ti + t; xpthread_mutex_lock(&tip->mutex); /* loop until signalled to quit */ while (tip->work >= 0) { /* wait for work available */ if (tip->work == 0) { xpthread_cond_wait(&tip->cond, &tip->mutex); } if (tip->work > 0) { cluster_worker(t); tip->work = 0; xpthread_cond_signal(&tip->cond); } } xpthread_mutex_unlock(&tip->mutex); return nullptr; } auto threads_wakeup(int queries) -> void { int const threads = queries > opt_threads ? opt_threads : queries; int queries_rest = queries; int threads_rest = threads; int query_next = 0; /* tell the threads that there is work to do */ for (int t = 0; t < threads; t++) { thread_info_t * tip = ti + t; tip->query_first = query_next; tip->query_count = (queries_rest + threads_rest - 1) / threads_rest; queries_rest -= tip->query_count; query_next += tip->query_count; --threads_rest; xpthread_mutex_lock(&tip->mutex); tip->work = 1; xpthread_cond_signal(&tip->cond); xpthread_mutex_unlock(&tip->mutex); } /* wait for theads to finish their work */ for (int t = 0; t < threads; t++) { thread_info_t * tip = ti + t; xpthread_mutex_lock(&tip->mutex); while (tip->work > 0) { xpthread_cond_wait(&tip->cond, &tip->mutex); } xpthread_mutex_unlock(&tip->mutex); } } auto threads_init() -> void { xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* allocate memory for thread info */ ti = (thread_info_t *) xmalloc(opt_threads * sizeof(thread_info_t)); /* init and create worker threads */ for (int t = 0; t < opt_threads; t++) { thread_info_t * tip = ti + t; tip->work = 0; xpthread_mutex_init(&tip->mutex, nullptr); xpthread_cond_init(&tip->cond, nullptr); xpthread_create(&tip->thread, &attr, threads_worker, (void *) (int64_t) t); } } auto threads_exit() -> void { /* finish and clean up worker threads */ for (int t = 0; t 
< opt_threads; t++) { struct thread_info_s * tip = ti + t; /* tell worker to quit */ xpthread_mutex_lock(&tip->mutex); tip->work = -1; xpthread_cond_signal(&tip->cond); xpthread_mutex_unlock(&tip->mutex); /* wait for worker to quit */ xpthread_join(tip->thread, nullptr); xpthread_cond_destroy(&tip->cond); xpthread_mutex_destroy(&tip->mutex); } xfree(ti); xpthread_attr_destroy(&attr); } auto cluster_query_init(struct searchinfo_s * si) -> void { /* initialisation of data for one thread; run once for each thread */ /* thread specific initialiation */ si->qsize = 1; si->nw = nullptr; si->hit_count = 0; /* allocate memory for sequence */ si->seq_alloc = db_getlongestsequence() + 1; si->qsequence = (char *) xmalloc(si->seq_alloc); si->kmers = (count_t *) xmalloc((seqcount * sizeof(count_t)) + 32); si->hits = (struct hit *) xmalloc(sizeof(struct hit) * tophits); si->uh = unique_init(); si->m = minheap_init(tophits); si->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); } auto cluster_query_exit(struct searchinfo_s * si) -> void { /* clean up after thread execution; called once per thread */ search16_exit(si->s); unique_exit(si->uh); minheap_exit(si->m); if (si->qsequence != nullptr) { xfree(si->qsequence); } if (si->hits != nullptr) { xfree(si->hits); } if (si->kmers != nullptr) { xfree(si->kmers); } } auto relabel_otu(int clusterno, char * sequence, int seqlen) -> char * { char * label = nullptr; if (opt_relabel != nullptr) { int const size = std::strlen(opt_relabel) + 21; label = (char *) xmalloc(size); snprintf(label, size, "%s%d", opt_relabel, clusterno + 1); } else if (opt_relabel_self) { int const size = seqlen + 1; label 
= (char *) xmalloc(size); snprintf(label, size, "%.*s", seqlen, sequence); } else if (opt_relabel_sha1) { label = (char *) xmalloc(len_hex_dig_sha1); get_hex_seq_digest_sha1(label, sequence, seqlen); } else if (opt_relabel_md5) { label = (char *) xmalloc(len_hex_dig_md5); get_hex_seq_digest_md5(label, sequence, seqlen); } return label; } auto cluster_core_results_hit(struct hit * best, int clusterno, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { ++count_matched; if ((opt_otutabout != nullptr) or (opt_mothur_shared_out != nullptr) or (opt_biomout != nullptr)) { if ((opt_relabel != nullptr) or opt_relabel_self or opt_relabel_sha1 or opt_relabel_md5) { char * label = relabel_otu(clusterno, db_getsequence(best->target), db_getsequencelen(best->target)); otutable_add(query_head, label, qsize); xfree(label); } else { otutable_add(query_head, db_getheader(best->target), qsize); } } if (fp_uc != nullptr) { results_show_uc_one(fp_uc, best, query_head, qseqlen, clusterno); } if (fp_alnout != nullptr) { results_show_alnout(fp_alnout, best, 1, query_head, qsequence, qseqlen); } if (fp_samout != nullptr) { results_show_samout(fp_samout, best, 1, query_head, qsequence, qsequence_rc); } if (fp_fastapairs != nullptr) { results_show_fastapairs_one(fp_fastapairs, best, query_head, qsequence, qsequence_rc); } if (fp_qsegout != nullptr) { results_show_qsegout_one(fp_qsegout, best, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout != nullptr) { results_show_tsegout_one(fp_tsegout, best); } if (fp_userout != nullptr) { results_show_userout_one(fp_userout, best, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, best, query_head, qseqlen); } if (opt_matched != nullptr) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, std::strlen(query_head), qsize, count_matched, -1.0, -1, -1, nullptr, 0.0); } } auto cluster_core_results_nohit(int 
clusterno, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { ++count_notmatched; if ((opt_otutabout != nullptr) or (opt_mothur_shared_out != nullptr) or (opt_biomout != nullptr)) { if ((opt_relabel != nullptr) or opt_relabel_self or opt_relabel_sha1 or opt_relabel_md5) { char * label = relabel_otu(clusterno, qsequence, qseqlen); otutable_add(query_head, label, qsize); xfree(label); } else { otutable_add(query_head, query_head, qsize); } } if (opt_uc != nullptr) { fprintf(fp_uc, "S\t%d\t%d\t*\t*\t*\t*\t*\t", clusters, qseqlen); header_fprint_strip(fp_uc, query_head, std::strlen(query_head), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uc, "\t*\n"); } if (opt_output_no_hits != 0) { if (fp_userout != nullptr) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } if (opt_notmatched != nullptr) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, std::strlen(query_head), qsize, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } auto compare_kmersample(const void * a, const void * b) -> int { unsigned int const lhs = * (unsigned int *) a; unsigned int const rhs = * (unsigned int *) b; if (lhs < rhs) { return -1; } if (lhs > rhs) { return +1; } return 0; } auto cluster_core_parallel() -> void { /* create threads and set them in stand-by mode */ threads_init(); constexpr static int queries_per_thread = 1; const int max_queries = queries_per_thread * opt_threads; /* allocate memory for the search information for each query; and initialize it */ si_plus = (struct searchinfo_s *) xmalloc(max_queries * sizeof(struct searchinfo_s)); if (opt_strand > 1) { si_minus = (struct searchinfo_s *) xmalloc(max_queries * sizeof(struct searchinfo_s)); } for (int i = 0; i < max_queries; i++) { cluster_query_init(si_plus + i); si_plus[i].strand = 0; if (opt_strand > 1) { 
cluster_query_init(si_minus + i); si_minus[i].strand = 1; } } std::vector extra_list(max_queries); struct Scoring scoring; scoring.match = opt_match; scoring.mismatch = opt_mismatch; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_open_query_left = opt_gap_open_query_left; scoring.gap_open_target_left = opt_gap_open_target_left; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_open_target_interior = opt_gap_open_target_interior; scoring.gap_open_query_right = opt_gap_open_query_right; scoring.gap_open_target_right = opt_gap_open_target_right; scoring.gap_extension_query_left = opt_gap_extension_query_left; scoring.gap_extension_target_left = opt_gap_extension_target_left; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_extension_target_interior = opt_gap_extension_target_interior; scoring.gap_extension_query_right = opt_gap_extension_query_right; scoring.gap_extension_target_right = opt_gap_extension_target_right; LinearMemoryAligner lma(scoring); auto lastlength = std::numeric_limits::max(); int seqno = 0; int64_t sum_nucleotides = 0; progress_init("Clustering", db_getnucleotidecount()); while (seqno < seqcount) { /* prepare work for the threads in sia[i] */ /* read query sequences into the search info (si) for each thread */ int queries = 0; for (int i = 0; i < max_queries; i++) { if (seqno < seqcount) { int const length = db_getsequencelen(seqno); #if 1 if ((opt_cluster_smallmem != nullptr) and (opt_usersort == 0) and (length > lastlength)) { fatal("Sequences not sorted by length and --usersort not specified."); } #endif lastlength = length; si_plus[i].query_no = seqno; si_plus[i].strand = 0; if (opt_strand > 1) { si_minus[i].query_no = seqno; si_minus[i].strand = 1; } ++queries; ++seqno; } } /* perform work in threads */ threads_wakeup(queries); /* analyse results */ int extra_count = 0; for (int i = 
0; i < queries; i++) { struct searchinfo_s * si_p = si_plus + i; struct searchinfo_s * si_m = opt_strand > 1 ? si_minus + i : nullptr; for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? si_m : si_p; int added = 0; if (extra_count != 0) { /* Check if there is a hit with one of the non-matching extra sequences just analysed in this round */ for (int j = 0; j < extra_count; j++) { struct searchinfo_s const * sic = si_plus + extra_list[j]; /* find the number of shared unique kmers */ auto const shared = unique_count_shared(*si->uh, opt_wordlength, sic->kmersamplecount, sic->kmersample); /* check if min number of shared kmers is satisfied */ if (search_enough_kmers(*si, shared)) { unsigned int const length = sic->qseqlen; /* Go through the list of hits and see if the current match is better than any on the list in terms of more shared kmers (or shorter length if equal no of kmers). Determine insertion point (x). */ int x = si->hit_count; while ((x > 0) and ((si->hits[x - 1].count < shared) or ((si->hits[x - 1].count == shared) and (db_getsequencelen(si->hits[x - 1].target) > length)))) { --x; } if (x < opt_maxaccepts + opt_maxrejects - 1) { /* insert into list at position x */ /* trash bottom element if no more space */ if (si->hit_count >= opt_maxaccepts + opt_maxrejects - 1) { if (si->hits[si->hit_count-1].aligned) { xfree(si->hits[si->hit_count - 1].nwalignment); } --si->hit_count; } /* move the rest down */ for (int z = si->hit_count; z > x; z--) { si->hits[z] = si->hits[z - 1]; } /* init new hit */ struct hit * hit = si->hits + x; ++si->hit_count; hit->target = sic->query_no; hit->strand = si->strand; hit->count = shared; hit->accepted = false; hit->rejected = false; hit->aligned = false; hit->weak = false; hit->nwalignment = nullptr; ++added; } } } } /* now go through the hits and determine final status of each */ if (added != 0) { si->rejects = 0; si->accepts = 0; /* set all statuses to undetermined */ for (int t = 0; t < si->hit_count; 
t++) { si->hits[t].accepted = false; si->hits[t].rejected = false; } for (int t = 0; (si->accepts < opt_maxaccepts) and (si->rejects < opt_maxrejects) and (t < si->hit_count); ++t) { struct hit * hit = si->hits + t; if (not hit->aligned) { /* Test accept/reject criteria before alignment */ unsigned int const target = hit->target; if (search_acceptable_unaligned(*si, target)) { /* perform vectorized alignment */ /* but only using 1 sequence ! */ unsigned int nwtarget = target; int64_t nwscore = 0; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; char * nwcigar = nullptr; /* short variants for simd aligner */ CELL snwscore = 0; unsigned short snwalignmentlength = 0; unsigned short snwmatches = 0; unsigned short snwmismatches = 0; unsigned short snwgaps = 0; search16(si->s, 1, & nwtarget, & snwscore, & snwalignmentlength, & snwmatches, & snwmismatches, & snwgaps, & nwcigar); int64_t const tseqlen = db_getsequencelen(target); if (snwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ char * tseq = db_getsequence(target); if (nwcigar != nullptr) { xfree(nwcigar); } nwcigar = xstrdup(lma.align(si->qsequence, tseq, si->qseqlen, tseqlen)); lma.alignstats(nwcigar, si->qsequence, tseq, & nwscore, & nwalignmentlength, & nwmatches, & nwmismatches, & nwgaps); } else { nwscore = snwscore; nwalignmentlength = snwalignmentlength; nwmatches = snwmatches; nwmismatches = snwmismatches; nwgaps = snwgaps; } int64_t const nwdiff = nwalignmentlength - nwmatches; int64_t const nwindels = nwdiff - nwmismatches; hit->aligned = true; hit->nwalignment = nwcigar; hit->nwscore = nwscore; hit->nwdiff = nwdiff; hit->nwgaps = nwgaps; hit->nwindels = nwindels; hit->nwalignmentlength = nwalignmentlength; hit->matches = nwmatches; hit->mismatches = nwmismatches; hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) / nwalignmentlength; hit->shortest = 
std::min(si->qseqlen, static_cast(tseqlen)); hit->longest = std::max(si->qseqlen, static_cast(tseqlen)); /* trim alignment and compute numbers excluding terminal gaps */ align_trim(hit); } else { /* rejection without alignment */ hit->rejected = true; ++si->rejects; } } if (not hit->rejected) { /* test accept/reject criteria after alignment */ if (search_acceptable_aligned(*si, hit)) { ++si->accepts; } else { ++si->rejects; } } } /* delete all undetermined hits */ int new_hit_count = si->hit_count; for (int t = si->hit_count - 1; t >= 0; t--) { struct hit const * hit = si->hits + t; if (not hit->accepted and not hit->rejected) { new_hit_count = t; if (hit->aligned) { xfree(hit->nwalignment); } } } si->hit_count = new_hit_count; } } /* find best hit */ struct hit * best = nullptr; if (opt_sizeorder) { best = search_findbest2_bysize(si_p, si_m); } else { best = search_findbest2_byid(si_p, si_m); } int const myseqno = si_p->query_no; if (best != nullptr) { /* a hit was found, cluster current sequence with hit */ int const target = best->target; /* output intermediate results to uc etc */ cluster_core_results_hit(best, clusterinfo[target].clusterno, si_p->query_head, si_p->qseqlen, si_p->qsequence, (best->strand != 0) ? 
si_m->qsequence : nullptr, si_p->qsize); /* update cluster info about this sequence */ clusterinfo[myseqno].seqno = myseqno; clusterinfo[myseqno].clusterno = clusterinfo[target].clusterno; clusterinfo[myseqno].cigar = best->nwalignment; clusterinfo[myseqno].strand = best->strand; best->nwalignment = nullptr; } else { /* no hit found; add it to the list of extra sequences that must be considered by the coming queries in this round */ extra_list[extra_count] = i; ++extra_count; /* update cluster info about this sequence */ clusterinfo[myseqno].seqno = myseqno; clusterinfo[myseqno].clusterno = clusters; clusterinfo[myseqno].cigar = nullptr; clusterinfo[myseqno].strand = 0; /* add current sequence to database */ dbindex_addsequence(myseqno, opt_qmask); /* output intermediate results to uc etc */ cluster_core_results_nohit(clusters, si_p->query_head, si_p->qseqlen, si_p->qsequence, nullptr, si_p->qsize); ++clusters; } /* free alignments */ for (int s = 0; s < opt_strand; s++) { struct searchinfo_s const * si = (s != 0) ? si_m : si_p; for (int j = 0; j < si->hit_count; j++) { if (si->hits[j].aligned) { if (si->hits[j].nwalignment != nullptr) { xfree(si->hits[j].nwalignment); } } } } sum_nucleotides += si_p->qseqlen; } progress_update(sum_nucleotides); } progress_done(); /* clean up search info */ for (int i = 0; i < max_queries; i++) { cluster_query_exit(si_plus + i); if (opt_strand > 1) { cluster_query_exit(si_minus + i); } } // extra_list no used after that point xfree(si_plus); if (opt_strand > 1) { xfree(si_minus); } /* terminate threads and clean up */ threads_exit(); } auto cluster_core_serial() -> void { std::array si_p {{}}; // refactoring: direct initialization? 
std::array si_m {{}}; cluster_query_init(si_p.data()); if (opt_strand > 1) { cluster_query_init(si_m.data()); } auto lastlength = std::numeric_limits::max(); progress_init("Clustering", seqcount); for (int seqno = 0; seqno < seqcount; seqno++) { int const length = db_getsequencelen(seqno); #if 1 if ((opt_cluster_smallmem != nullptr) and (opt_usersort == 0) and (length > lastlength)) { fatal("Sequences not sorted by length and --usersort not specified."); } #endif lastlength = length; si_p[0].query_no = seqno; si_p[0].strand = 0; cluster_query_core(si_p.data()); if (opt_strand > 1) { si_m[0].query_no = seqno; si_m[0].strand = 1; cluster_query_core(si_m.data()); } struct hit * best = nullptr; if (opt_sizeorder) { best = search_findbest2_bysize(si_p.data(), si_m.data()); } else { best = search_findbest2_byid(si_p.data(), si_m.data()); } if (best != nullptr) { int const target = best->target; cluster_core_results_hit(best, clusterinfo[target].clusterno, si_p[0].query_head, si_p[0].qseqlen, si_p[0].qsequence, (best->strand != 0) ? si_m[0].qsequence : nullptr, si_p[0].qsize); clusterinfo[seqno].seqno = seqno; clusterinfo[seqno].clusterno = clusterinfo[target].clusterno; clusterinfo[seqno].cigar = best->nwalignment; clusterinfo[seqno].strand = best->strand; best->nwalignment = nullptr; } else { clusterinfo[seqno].seqno = seqno; clusterinfo[seqno].clusterno = clusters; clusterinfo[seqno].cigar = nullptr; clusterinfo[seqno].strand = 0; dbindex_addsequence(seqno, opt_qmask); cluster_core_results_nohit(clusters, si_p[0].query_head, si_p[0].qseqlen, si_p[0].qsequence, nullptr, si_p[0].qsize); ++clusters; } /* free alignments */ for (int s = 0; s < opt_strand; s++) { struct searchinfo_s const * si = (s != 0) ? 
si_m.data() : si_p.data(); for (int i = 0; i < si->hit_count; i++) { if (si->hits[i].aligned) { if (si->hits[i].nwalignment != nullptr) { xfree(si->hits[i].nwalignment); } } } } progress_update(seqno); } progress_done(); cluster_query_exit(si_p.data()); if (opt_strand > 1) { cluster_query_exit(si_m.data()); } } auto cluster(char * dbname, char * cmdline, char * progheader) -> void { if (opt_centroids != nullptr) { fp_centroids = fopen_output(opt_centroids); if (fp_centroids == nullptr) { fatal("Unable to open centroids file for writing"); } } if (opt_uc != nullptr) { fp_uc = fopen_output(opt_uc); if (fp_uc == nullptr) { fatal("Unable to open uc file for writing"); } } if (opt_alnout != nullptr) { fp_alnout = fopen_output(opt_alnout); if (fp_alnout == nullptr) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_samout != nullptr) { fp_samout = fopen_output(opt_samout); if (fp_samout == nullptr) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout != nullptr) { fp_userout = fopen_output(opt_userout); if (fp_userout == nullptr) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out != nullptr) { fp_blast6out = fopen_output(opt_blast6out); if (fp_blast6out == nullptr) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_fastapairs != nullptr) { fp_fastapairs = fopen_output(opt_fastapairs); if (fp_fastapairs == nullptr) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout != nullptr) { fp_qsegout = fopen_output(opt_qsegout); if (fp_qsegout == nullptr) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout != nullptr) { fp_tsegout = fopen_output(opt_tsegout); if (fp_tsegout == nullptr) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched != nullptr) { fp_matched = fopen_output(opt_matched); if (fp_matched == nullptr) { 
fatal("Unable to open matched output file for writing"); } } if (opt_notmatched != nullptr) { fp_notmatched = fopen_output(opt_notmatched); if (fp_notmatched == nullptr) { fatal("Unable to open notmatched output file for writing"); } } if (opt_otutabout != nullptr) { fp_otutabout = fopen_output(opt_otutabout); if (fp_otutabout == nullptr) { fatal("Unable to open OTU table (text format) output file for writing"); } } if (opt_mothur_shared_out != nullptr) { fp_mothur_shared_out = fopen_output(opt_mothur_shared_out); if (fp_mothur_shared_out == nullptr) { fatal("Unable to open OTU table (mothur format) output file for writing"); } } if (opt_biomout != nullptr) { fp_biomout = fopen_output(opt_biomout); if (fp_biomout == nullptr) { fatal("Unable to open OTU table (biom 1.0 format) output file for writing"); } } db_read(dbname, 0); otutable_init(); results_show_samheader(fp_samout, cmdline, dbname); if (opt_qmask == MASK_DUST) { dust_all(); } else if ((opt_qmask == MASK_SOFT) and (opt_hardmask != 0)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); if (opt_cluster_fast != nullptr) { db_sortbylength(); } else if ((opt_cluster_size != nullptr) or (opt_cluster_unoise != nullptr)) { db_sortbyabundance(); } dbindex_prepare(1, opt_qmask); /* tophits = the maximum number of hits we need to store */ if ((opt_maxrejects == 0) or (opt_maxrejects > seqcount)) { opt_maxrejects = seqcount; } if ((opt_maxaccepts == 0) or (opt_maxaccepts > seqcount)) { opt_maxaccepts = seqcount; } tophits = opt_maxrejects + opt_maxaccepts + MAXDELAYED; tophits = std::min(tophits, seqcount); std::vector clusterinfo_v(seqcount); clusterinfo = clusterinfo_v.data(); if (opt_log != nullptr) { uint64_t const slots = 1ULL << (static_cast(opt_wordlength) << 1ULL); fprintf(fp_log, "\n"); fprintf(fp_log, " Alphabet nt\n"); fprintf(fp_log, " Word width %" PRId64 "\n", opt_wordlength); fprintf(fp_log, " Word ones %" PRId64 "\n", opt_wordlength); fprintf(fp_log, " Spaced No\n"); fprintf(fp_log, 
" Hashed No\n"); fprintf(fp_log, " Coded No\n"); fprintf(fp_log, " Stepped No\n"); fprintf(fp_log, " Slots %" PRIu64 " (%.1fk)\n", slots, slots/1000.0); fprintf(fp_log, " DBAccel 100%%\n"); fprintf(fp_log, "\n"); } if (opt_threads == 1) { cluster_core_serial(); } else { cluster_core_parallel(); } /* find size and abundance of each cluster and save stats */ std::vector cluster_abundance_v(clusters); cluster_abundance = cluster_abundance_v.data(); std::vector cluster_size(clusters); for (int i = 0; i < seqcount; i++) { int const seqno = clusterinfo_v[i].seqno; int const clusterno = clusterinfo_v[i].clusterno; cluster_abundance_v[clusterno] += opt_sizein ? db_getabundance(seqno) : 1; ++cluster_size[clusterno]; } // refactoring: isolate in a function (returns struct abundance_stats) auto const minmax_elements = std::minmax_element(cluster_abundance_v.cbegin(), cluster_abundance_v.cend()); auto const abundance_min = cluster_abundance_v.empty() ? 0 : *std::get<0>(minmax_elements); auto const abundance_max = cluster_abundance_v.empty() ? 0 : *std::get<1>(minmax_elements); int const singletons = std::count(cluster_abundance_v.cbegin(), cluster_abundance_v.cend(), int64_t{1}); auto const max_element = std::max_element(cluster_size.cbegin(), cluster_size.cend()); auto const size_max = cluster_size.empty() ? 0 : *max_element; /* Sort sequences in clusters by their abundance or ordinal number */ /* Sequences in same cluster must always come right after each other. */ /* The centroid sequence must be the first in each cluster. 
*/ progress_init("Sorting clusters", clusters); if (opt_clusterout_sort) { qsort(clusterinfo_v.data(), seqcount, sizeof(clusterinfo_t), compare_byclusterabundance); } else { qsort(clusterinfo_v.data(), seqcount, sizeof(clusterinfo_t), compare_byclusterno); } progress_done(); progress_init("Writing clusters", seqcount); /* allocate memory for full file name of the clusters files */ std::FILE * fp_clusters = nullptr; static constexpr auto space_for_cluster_id = 25; // up to 25 digits std::vector fn_clusters; if (opt_clusters != nullptr) { fn_clusters.reserve(std::strlen(opt_clusters) + space_for_cluster_id); } int lastcluster = -1; int ordinal = 0; for (int i = 0; i < seqcount; i++) { int const seqno = clusterinfo_v[i].seqno; int const clusterno = clusterinfo_v[i].clusterno; if (clusterno != lastcluster) { /* prepare for new cluster */ /* performed with first sequence only in each cluster */ /* the first sequence is always the centroid */ if (opt_centroids != nullptr) { fasta_print_general(fp_centroids, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), cluster_abundance_v[clusterno], clusterno + 1, -1.0, -1, opt_clusterout_id ? 
clusterno : -1, nullptr, 0.0); } if (opt_uc != nullptr) { fprintf(fp_uc, "C\t%d\t%" PRId64 "\t*\t*\t*\t*\t*\t", clusterno, cluster_abundance_v[clusterno]); header_fprint_strip(fp_uc, db_getheader(seqno), db_getheaderlen(seqno), opt_xsize, opt_xee, opt_xlength); fprintf(fp_uc, "\t*\n"); } if (opt_clusters != nullptr) { /* close previous (except for first time) and open new file */ if (lastcluster != -1) { fclose(fp_clusters); } ordinal = 0; snprintf(fn_clusters.data(), fn_clusters.capacity(), "%s%d", opt_clusters, clusterno); fp_clusters = fopen_output(fn_clusters.data()); if (fp_clusters == nullptr) { fatal("Unable to open clusters file for writing"); } } lastcluster = clusterno; } /* performed for all sequences */ if (opt_clusters != nullptr) { ++ordinal; fasta_print_db_relabel(fp_clusters, seqno, ordinal); } progress_update(i); } if (lastcluster != -1) { /* performed with the last sequence */ if (opt_clusters != nullptr) { fclose(fp_clusters); } } progress_done(); if (clusters < 1) { if (not opt_quiet) { fprintf(stderr, "Clusters: 0\n"); fprintf(stderr, "Singletons: 0\n"); } if (opt_log != nullptr) { fprintf(fp_log, "Clusters: 0\n"); fprintf(fp_log, "Singletons: 0\n"); } } else { if (not opt_quiet) { fprintf(stderr, "Clusters: %d Size min %" PRId64 ", max %" PRId64 ", avg %.1f\n", clusters, abundance_min, abundance_max, 1.0 * seqcount / clusters); fprintf(stderr, "Singletons: %d, %.1f%% of seqs, %.1f%% of clusters\n", singletons, 100.0 * singletons / seqcount, 100.0 * singletons / clusters); } if (opt_log != nullptr) { fprintf(fp_log, "Clusters: %d Size min %" PRId64 ", max %" PRId64 ", avg %.1f\n", clusters, abundance_min, abundance_max, 1.0 * seqcount / clusters); fprintf(fp_log, "Singletons: %d, %.1f%% of seqs, %.1f%% of clusters\n", singletons, 100.0 * singletons / seqcount, 100.0 * singletons / clusters); fprintf(fp_log, "\n"); } } if ((opt_msaout != nullptr) or (opt_consout != nullptr) or (opt_profile != nullptr)) { int msa_target_count = 0; std::vector 
msa_target_list_v(size_max); progress_init("Multiple alignments", seqcount); std::FILE * fp_msaout = nullptr; std::FILE * fp_consout = nullptr; std::FILE * fp_profile = nullptr; if (opt_msaout != nullptr) { fp_msaout = fopen_output(opt_msaout); if (fp_msaout == nullptr) { fatal("Unable to open msaout file"); } } if (opt_consout != nullptr) { fp_consout = fopen_output(opt_consout); if (fp_consout == nullptr) { fatal("Unable to open consout file"); } } if (opt_profile != nullptr) { fp_profile = fopen_output(opt_profile); if (fp_profile == nullptr) { fatal("Unable to open profile file"); } } lastcluster = -1; for (int i = 0; i < seqcount; i++) { int const clusterno = clusterinfo_v[i].clusterno; int const seqno = clusterinfo_v[i].seqno; char * cigar = clusterinfo_v[i].cigar; int const strand = clusterinfo_v[i].strand; if (clusterno != lastcluster) { if (lastcluster != -1) { /* compute msa & consensus */ msa(fp_msaout, fp_consout, fp_profile, lastcluster, msa_target_count, msa_target_list_v, cluster_abundance_v[lastcluster]); } /* start new cluster */ msa_target_count = 0; lastcluster = clusterno; } /* add current sequence to the cluster */ msa_target_list_v[msa_target_count].seqno = seqno; msa_target_list_v[msa_target_count].cigar = cigar; msa_target_list_v[msa_target_count].strand = strand; ++msa_target_count; progress_update(i); } if (lastcluster != -1) { /* compute msa & consensus */ msa(fp_msaout, fp_consout, fp_profile, lastcluster, msa_target_count, msa_target_list_v, cluster_abundance_v[lastcluster]); } progress_done(); if (fp_profile != nullptr) { fclose(fp_profile); } if (fp_msaout != nullptr) { fclose(fp_msaout); } if (fp_consout != nullptr) { fclose(fp_consout); } } // cluster_abundance not used below that point // cluster_size not used below that point /* free cigar strings for all aligned sequences */ for (auto & clusterinfo : clusterinfo_v) { if (clusterinfo.cigar != nullptr) { xfree(clusterinfo.cigar); } } // clusterinfo not used after this point if 
(fp_biomout != nullptr) { otutable_print_biomout(fp_biomout); fclose(fp_biomout); } if (fp_otutabout != nullptr) { otutable_print_otutabout(fp_otutabout); fclose(fp_otutabout); } if (fp_mothur_shared_out != nullptr) { otutable_print_mothur_shared_out(fp_mothur_shared_out); fclose(fp_mothur_shared_out); } otutable_done(); if (opt_matched != nullptr) { fclose(fp_matched); } if (opt_notmatched != nullptr) { fclose(fp_notmatched); } if (opt_fastapairs != nullptr) { fclose(fp_fastapairs); } if (opt_qsegout != nullptr) { fclose(fp_qsegout); } if (opt_tsegout != nullptr) { fclose(fp_tsegout); } if (fp_blast6out != nullptr) { fclose(fp_blast6out); } if (fp_userout != nullptr) { fclose(fp_userout); clean_up(); // free userfields allocation } if (fp_alnout != nullptr) { fclose(fp_alnout); } if (fp_samout != nullptr) { fclose(fp_samout); } if (fp_uc != nullptr) { fclose(fp_uc); } if (fp_centroids != nullptr) { fclose(fp_centroids); } dbindex_free(); db_free(); show_rusage(); } auto cluster_fast(char * cmdline, char * progheader) -> void { cluster(opt_cluster_fast, cmdline, progheader); } auto cluster_smallmem(char * cmdline, char * progheader) -> void { cluster(opt_cluster_smallmem, cmdline, progheader); } auto cluster_size(char * cmdline, char * progheader) -> void { cluster(opt_cluster_size, cmdline, progheader); } auto cluster_unoise(char * cmdline, char * progheader) -> void { cluster(opt_cluster_unoise, cmdline, progheader); } vsearch-2.30.1/src/cluster.h000066400000000000000000000052531506776541700157000ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto cluster_smallmem(char * cmdline, char * progheader) -> void; auto cluster_fast(char * cmdline, char * progheader) -> void; auto cluster_size(char * cmdline, char * progheader) -> void; auto cluster_unoise(char * cmdline, char * progheader) -> void; vsearch-2.30.1/src/cpu.cc000066400000000000000000000202251506776541700151400ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include // int32_t #include // std::memcpy /* This file contains code dependent on special cpu features. */ /* The file may be compiled several times with different cpu options. 
*/

/*
  increment_counters_from_bitmap (and its _sse2 / _ssse3 variants on x86_64)

  For every set bit in 'bitmap', add one to the matching 16-bit element of
  'counters'. The bitmap is consumed 16 bits at a time and each step updates
  16 consecutive counters, so (totalbits + 15) / 16 steps are executed and
  the last step always touches a full group of 16 counters, even when
  'totalbits' is not a multiple of 16.

  The increment is done by subtracting a 0 / -1 mask with *signed* 16-bit
  saturating arithmetic.

  counters:  array of 16-bit counters, accessed through vector-typed
             pointers (presumably requires suitable vector alignment and
             room for a whole number of 16-counter groups -- confirm
             against the callers)
  bitmap:    packed bit array, one bit per counter
  totalbits: number of bits (counters) to process

  Exactly one of the three architecture-specific implementations below is
  compiled.
*/

#ifdef __aarch64__

/* AArch64 / NEON implementation */
void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
{
  // per-byte bit masks: each of the eight bit values appears twice, once
  // for each of the two bytes of the 16-bit word duplicated below
  const uint8x16_t c1 =
    { 0x01, 0x01, 0x02, 0x02, 0x04, 0x04, 0x08, 0x08,
      0x10, 0x10, 0x20, 0x20, 0x40, 0x40, 0x80, 0x80 };

  unsigned short * p = (unsigned short *) (bitmap);
  int16x8_t * q = (int16x8_t *) (counters);
  // number of 16-bit groups, rounded up
  const auto r = (totalbits + 15) / 16;

  for (auto j = 0U; j < r; j++)
    {
      // load and duplicate short
      uint16x8_t r0 = vdupq_n_u16(*p);
      ++p;

      // cast to bytes
      uint8x16_t r1 = vreinterpretq_u8_u16(r0);

      // bit test with mask giving 0x00 or 0xff
      uint8x16_t r2 = vtstq_u8(r1, c1);

      // transpose to duplicate even bytes
      uint8x16_t r3 = vtrn1q_u8(r2, r2);

      // transpose to duplicate odd bytes
      uint8x16_t r4 = vtrn2q_u8(r2, r2);

      // cast to signed 0x0000 or 0xffff
      int16x8_t r5 = vreinterpretq_s16_u8(r3);

      // cast to signed 0x0000 or 0xffff
      int16x8_t r6 = vreinterpretq_s16_u8(r4);

      // subtract signed 0 or -1 (i.e. add 0 or 1) with saturation to counter
      *q = vqsubq_s16(*q, r5);
      ++q;

      // subtract signed 0 or -1 (i.e. add 0 or 1) with saturation to counter
      *q = vqsubq_s16(*q, r6);
      ++q;
    }
}

#elif defined __PPC__

/* PowerPC / AltiVec implementation */
void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
{
  // permutation selecting byte 1 for the first eight lanes and byte 0 for
  // the last eight lanes
  const __vector unsigned char c1 =
    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
  // each mask byte has exactly one bit cleared: OR-ing a data byte with it
  // yields 0xff only if that bit is set in the data byte
  const __vector unsigned char c2 =
    { 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f,
      0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f };
  // all ones, compared against to detect the 0xff bytes
  const __vector unsigned char c3 =
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

  unsigned short * p = (unsigned short *) (bitmap);
  __vector signed short * q = (__vector signed short *) (counters);
  // number of 16-bit groups, rounded up
  const auto r = (totalbits + 15) / 16;

  for (auto j = 0U; j < r; j++)
    {
      // only the first two bytes of r0 are filled in; the permutation
      // below reads nothing but bytes 0 and 1
      __vector unsigned char r0;
      std::memcpy(&r0, p, 2);
      ++p;
      // spread the two bitmap bytes over eight lanes each
      __vector unsigned char r1 = vec_perm(r0, r0, c1);
      // 0xff where the tested bit is set
      __vector unsigned char r2 = vec_or(r1, c2);
      // all-ones / all-zeros byte mask
      __vector __bool char r3 = vec_cmpeq(r2, c3);
      // widen the byte masks to two vectors of 16-bit masks
      __vector signed short r4 = (__vector signed short) vec_unpackl(r3);
      __vector signed short r5 = (__vector signed short) vec_unpackh(r3);
      // saturating subtract of the mask from 8 counters at a time
      *q = vec_subs(*q, r4);
      ++q;
      *q = vec_subs(*q, r5);
      ++q;
    }
}

#elif __x86_64__ || defined(SIMDE_VERSION)

// NOTE(review): the header names after the two #include directives below
// are missing in this copy of the file -- restore them before compiling
#ifdef __x86_64__
#include
#else
#define SIMDE_ENABLE_NATIVE_ALIASES
#include
#endif

/* x86_64 (SSE2 or SSSE3) and SIMDe implementation: the same body is given
   one of three names depending on the SIMDE_VERSION and SSSE3 macros */
#ifdef SIMDE_VERSION
void increment_counters_from_bitmap(count_t * counters,
                                    unsigned char * bitmap,
                                    unsigned int totalbits)
#elif defined(SSSE3)
void increment_counters_from_bitmap_ssse3(count_t * counters,
                                          unsigned char * bitmap,
                                          unsigned int totalbits)
#else
void increment_counters_from_bitmap_sse2(count_t * counters,
                                         unsigned char * bitmap,
                                         unsigned int totalbits)
#endif
{
  /*
    Increment selected elements in an array of 16 bit counters.
    The counters to increment are indicated by 1's in the bitmap.

    We read 16 bytes from the bitmap, but use only two bytes (16 bits).
    Convert these 16 bits into 16 bytes with either 0x00 or 0xFF.
    Extend these to 16 words (32 bytes) with either 0x0000 or 0xFFFF.
    Use these values to increment 16 words in an array by subtraction.

    See article below for some hints:
    http://stackoverflow.com/questions/21622212/
    how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb

    Because the efficient PSHUFB instruction is a SSSE3 instruction
    lacking in many AMD cpus, we provide slightly slower alternative
    SSE2 code.
  */

  // NOTE(review): the template argument of the three static_cast
  // expressions below is missing in this copy of the file (presumably a
  // signed 32-bit integer type, as required by _mm_set_epi32) -- confirm

  // 0xffffffff -> 1111'1111'1111'1111'1111'1111'1111'1111 (32 bits)
  static constexpr auto all_ones = static_cast(0xffffffff);
  // 0x7fbfdfef -> 0111'1111'1011'1111'1101'1111'1110'1111 (32 bits)
  static constexpr auto mask1 = static_cast(0x7fbfdfef);
  // 0xf7fbfdfe -> 1111'0111'1111'1011'1111'1101'1111'1110 (32 bits)
  static constexpr auto mask2 = static_cast(0xf7fbfdfe);

#if defined(SSSE3) || defined(SIMDE_VERSION)
  // shuffle control: byte 0 of the source goes to the low eight bytes,
  // byte 1 to the high eight bytes
  const auto c1 = _mm_set_epi32(0x01010101, 0x01010101, 0x00000000, 0x00000000);
#endif
  // every byte of c2 has exactly one bit cleared (a different bit for each
  // of eight consecutive bytes)
  const auto c2 = _mm_set_epi32(mask1, mask2, mask1, mask2);
  // all ones, compared against to detect the 0xff bytes
  const auto c3 = _mm_set_epi32(all_ones, all_ones, all_ones, all_ones);

  auto * p = (unsigned short *) (bitmap);
  auto * q = (__m128i *) (counters);
  // number of 16-bit groups, rounded up
  const auto r = (totalbits + 15) / 16;

  for (auto j = 0U; j < r; j++)
    {
      // unaligned 16-byte load, of which only bytes 0 and 1 are used;
      // p advances by one short (2 bytes) per iteration
      // NOTE(review): this reads up to 14 bytes beyond the last bitmap
      // word -- presumably the bitmap is padded by its allocator; confirm
      const auto xmm0 = _mm_loadu_si128((__m128i *) p++);
#if defined(SSSE3) || defined(SIMDE_VERSION)
      // replicate byte 0 and byte 1 eight times each
      const auto xmm1 = _mm_shuffle_epi8(xmm0, c1);
#else
      // SSE2 equivalent of the shuffle: three rounds of self-interleaving
      // turn (b0, b1, ...) into b0 x 8 followed by b1 x 8
      const auto xmm6 = _mm_unpacklo_epi8(xmm0, xmm0);
      const auto xmm7 = _mm_unpacklo_epi16(xmm6, xmm6);
      const auto xmm1 = _mm_unpacklo_epi32(xmm7, xmm7);
#endif
      // a byte becomes 0xff only if its tested bit is set
      const auto xmm2 = _mm_or_si128(xmm1, c2);
      // 0xff / 0x00 byte mask
      const auto xmm3 = _mm_cmpeq_epi8(xmm2, c3);
      // widen the byte masks to 16-bit masks (0xffff / 0x0000)
      const auto xmm4 = _mm_unpacklo_epi8(xmm3, xmm3);
      const auto xmm5 = _mm_unpackhi_epi8(xmm3, xmm3);
      // subtract 0 or -1 (i.e. add 0 or 1) with signed saturation
      *q = _mm_subs_epi16(*q, xmm4);
      ++q;
      *q = _mm_subs_epi16(*q, xmm5);
      ++q;
    }
}

#else

#error Unknown architecture

#endif
vsearch-2.30.1/src/cpu.h000066400000000000000000000060721506776541700150060ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved.
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ using count_t = unsigned short; #ifdef __x86_64__ auto increment_counters_from_bitmap_sse2(count_t * counters, unsigned char * bitmap, unsigned int totalbits) -> void; auto increment_counters_from_bitmap_ssse3(count_t * counters, unsigned char * bitmap, unsigned int totalbits) -> void; #else auto increment_counters_from_bitmap(count_t * counters, unsigned char * bitmap, unsigned int totalbits) -> void; #endif vsearch-2.30.1/src/cut.cc000066400000000000000000000415461506776541700151550ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/progress.hpp" #include // std::count, std::for_each, std::equal #include #include // macros PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf #include // std::next #include #include // std::move #include // anonymous namespace: limit visibility and usage to this translation unit namespace { struct statistics { int fragment_no = 0; int fragment_rev_no = 0; int fragment_discarded_no = 0; int fragment_discarded_rev_no = 0; int64_t cut = 0; int64_t uncut = 0; int64_t matches = 0; }; struct a_file { char * name = nullptr; std::FILE * handle = nullptr; }; struct a_strand { a_file forward; a_file reverse; }; struct file_purpose { a_strand cut; a_strand discarded; }; struct restriction_pattern { std::string pattern; std::string coded_pattern; int cut_fwd; int cut_rev; }; auto cut_a_sequence(fastx_handle input_handle, struct restriction_pattern const & restriction, struct file_purpose const & fastaout, struct statistics & counters, std::vector & rc_buffer) -> void { auto const pattern_length = static_cast(restriction.pattern.size()); char const * seq = fasta_get_sequence(input_handle); auto const seq_length = static_cast(fasta_get_sequence_length(input_handle)); // failed refactoring: use transform to create a coded std::string // and find() to search for pattern occurrences, IUPAC chars make it // harder to compare sequences /* get reverse complement */ rc_buffer.clear(); rc_buffer.resize(seq_length + 1); reverse_complement(rc_buffer.data(), seq, seq_length); int64_t local_matches = 0; int frag_start = 0; int frag_length = seq_length; int rc_start = seq_length; int rc_length = 0; for (int i = 0; i < seq_length - pattern_length + 1; ++i) { auto const match = std::equal(restriction.coded_pattern.cbegin(), restriction.coded_pattern.cend(), std::next(seq, i), [](char const & lhs, char const & rhs) -> bool { return is_equivalent_4bit_rhs(lhs, rhs); }); if (not 
match) { continue; } ++local_matches; frag_length = i + restriction.cut_fwd - frag_start; rc_length = rc_start - (seq_length - (i + restriction.cut_rev)); rc_start -= rc_length; if ((frag_length > 0) and (fastaout.cut.forward.name != nullptr)) { fasta_print_general(fastaout.cut.forward.handle, nullptr, std::next(seq, frag_start), frag_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_no, -1.0, -1, -1, nullptr, 0.0); } if ((rc_length > 0) and (fastaout.cut.reverse.name != nullptr)) { fasta_print_general(fastaout.cut.reverse.handle, nullptr, &rc_buffer[rc_start], rc_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_rev_no, -1.0, -1, -1, nullptr, 0.0); } frag_start += frag_length; } if (local_matches > 0) { ++counters.cut; frag_length = seq_length - frag_start; rc_length = rc_start; rc_start = 0; } if ((local_matches > 0) and (frag_length > 0) and (fastaout.cut.forward.name != nullptr)) { fasta_print_general(fastaout.cut.forward.handle, nullptr, std::next(seq, frag_start), frag_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_no, -1.0, -1, -1, nullptr, 0.0); } if ((local_matches > 0) and (rc_length > 0) and (fastaout.cut.reverse.name != nullptr)) { fasta_print_general(fastaout.cut.reverse.handle, nullptr, &rc_buffer[rc_start], rc_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_rev_no, -1.0, -1, -1, nullptr, 0.0); } if (local_matches == 0) { ++counters.uncut; } if ((local_matches == 0) and (fastaout.discarded.forward.name != nullptr)) { fasta_print_general(fastaout.discarded.forward.handle, nullptr, seq, seq_length, fasta_get_header(input_handle), 
static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_discarded_no, -1.0, -1, -1, nullptr, 0.0); } if ((local_matches == 0) and (fastaout.discarded.reverse.name != nullptr)) { fasta_print_general(fastaout.discarded.reverse.handle, nullptr, rc_buffer.data(), seq_length, fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), fasta_get_abundance(input_handle), ++counters.fragment_discarded_rev_no, -1.0, -1, -1, nullptr, 0.0); } counters.matches += local_matches; } auto ckeck_if_output_is_set(struct Parameters const & parameters) -> void { if ((parameters.opt_fastaout == nullptr) and (parameters.opt_fastaout_discarded == nullptr) and (parameters.opt_fastaout_rev == nullptr) and (parameters.opt_fastaout_discarded_rev == nullptr)) { fatal("No output files specified"); } } auto open_output_files(struct Parameters const & parameters) -> struct file_purpose { struct file_purpose fastaout; fastaout.cut.forward.name = parameters.opt_fastaout; fastaout.discarded.forward.name = parameters.opt_fastaout_discarded; fastaout.cut.reverse.name = parameters.opt_fastaout_rev; fastaout.discarded.reverse.name = parameters.opt_fastaout_discarded_rev; if (fastaout.cut.forward.name != nullptr) { fastaout.cut.forward.handle = fopen_output(fastaout.cut.forward.name); } if (fastaout.discarded.forward.name != nullptr) { fastaout.discarded.forward.handle = fopen_output(fastaout.discarded.forward.name); } if (fastaout.cut.reverse.name != nullptr) { fastaout.cut.reverse.handle = fopen_output(fastaout.cut.reverse.name); } if (fastaout.discarded.reverse.name != nullptr) { fastaout.discarded.reverse.handle = fopen_output(fastaout.discarded.reverse.name); } return fastaout; } auto check_output_files(struct file_purpose const & fastaout) -> void { if (fastaout.cut.forward.name != nullptr) { if (fastaout.cut.forward.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if 
(fastaout.discarded.forward.name != nullptr) { if (fastaout.discarded.forward.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (fastaout.cut.reverse.name != nullptr) { if (fastaout.cut.reverse.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (fastaout.discarded.reverse.name != nullptr) { if (fastaout.discarded.reverse.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } } auto check_if_contains_circumflex(std::string const & pattern) -> void { auto const occurrences = std::count(pattern.cbegin(), pattern.cend(), '^'); if (occurrences == 0) { fatal("No forward sequence cut site (^) found in pattern"); } if (occurrences > 1) { fatal("Multiple cut sites not supported"); } } auto check_if_contains_underscore(std::string const & pattern) -> void { auto const occurrences = std::count(pattern.cbegin(), pattern.cend(), '_'); if (occurrences == 0) { fatal("No reverse sequence cut site (_) found in pattern"); } if (occurrences > 1) { fatal("Multiple cut sites not supported"); } } auto locate_forward_restriction_site(std::string pattern) -> int { auto const underscore_position = pattern.find('_'); pattern.erase(underscore_position, 1); return static_cast(pattern.find('^')); } auto locate_reverse_restriction_site(std::string pattern) -> int { auto const circumflex_position = pattern.find('^'); pattern.erase(circumflex_position, 1); return static_cast(pattern.find('_')); } auto remove_restriction_sites(std::string pattern) -> std::string { auto const circumflex_position = pattern.find('^'); pattern.erase(circumflex_position, 1); auto const underscore_position = pattern.find('_'); return pattern.erase(underscore_position, 1); } auto reencode_restriction_pattern(std::string raw_pattern) -> std::string { auto pattern = remove_restriction_sites(std::move(raw_pattern)); auto encode_characters = [](char const & character) -> char { return map_4bit(character); }; 
std::transform(pattern.cbegin(), pattern.cend(), pattern.begin(), encode_characters); return pattern; } auto check_if_pattern_is_empty(std::string const & pattern) -> void { if (pattern.empty()) { fatal("Empty cut pattern string"); } } auto search_illegal_characters(std::string const & pattern) -> void { auto character_is_illegal = [](char const & character) -> void { if (map_4bit(character) == '\0') { fatal("Illegal character in cut pattern"); } }; std::for_each(pattern.cbegin(), pattern.cend(), character_is_illegal); } auto stats_message(std::FILE * output_stream, struct statistics const & counters) -> void { static_cast(std::fprintf(output_stream, "%" PRId64 " sequence(s) cut %" PRId64 " times, %" PRId64 " sequence(s) never cut.\n", counters.cut, counters.matches, counters.uncut)); } auto output_stats_message(struct Parameters const & parameters, struct statistics const & counters, char const * filename) -> void { if (filename == nullptr) { return; } stats_message(parameters.fp_log, counters); } auto output_stats_message(struct Parameters const & parameters, struct statistics const & counters) -> void { if (parameters.opt_quiet) { return; } stats_message(stderr, counters); } auto close_output_files(struct file_purpose const & fastaout) -> void { for (auto * fp_outputfile : { fastaout.cut.forward.handle, fastaout.discarded.forward.handle, fastaout.cut.reverse.handle, fastaout.discarded.reverse.handle}) { if (fp_outputfile != nullptr) { static_cast(std::fclose(fp_outputfile)); } } } } // end of anonymous namespace auto cut(struct Parameters const & parameters) -> void { ckeck_if_output_is_set(parameters); auto * input_handle = fasta_open(parameters.opt_cut); assert(input_handle != nullptr); // verified by fasta_open() auto const fastaout = open_output_files(parameters); check_output_files(fastaout); auto const raw_pattern = parameters.opt_cut_pattern; // check for the expected number of restriction sites check_if_contains_circumflex(raw_pattern); 
check_if_contains_underscore(raw_pattern); // locate restriction sites and trim pattern struct restriction_pattern const restriction = { remove_restriction_sites(raw_pattern), reencode_restriction_pattern(raw_pattern), locate_forward_restriction_site(raw_pattern), locate_reverse_restriction_site(raw_pattern) }; check_if_pattern_is_empty(restriction.pattern); search_illegal_characters(restriction.pattern); auto const filesize = fasta_get_size(input_handle); Progress progress("Cutting sequences", filesize, parameters); struct statistics counters; std::vector rc_buffer; while (fasta_next(input_handle, false, chrmap_no_change_vector.data())) { cut_a_sequence(input_handle, restriction, fastaout, counters, rc_buffer); progress.update(fasta_get_position(input_handle)); } output_stats_message(parameters, counters); output_stats_message(parameters, counters, parameters.opt_log); close_output_files(fastaout); fasta_close(input_handle); } vsearch-2.30.1/src/cut.h000066400000000000000000000047451506776541700150170ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto cut(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/db.cc000066400000000000000000000334371506776541700147470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::min, std::max #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::fprintf, std::size_t #include // std::qsort #include // std::memcpy, std::strcmp #include constexpr uint64_t memchunk = 16777216; // 2^24 static fastx_handle h = nullptr; static bool is_fastq = false; static uint64_t sequences = 0; static uint64_t nucleotides = 0; static uint64_t longest = 0; static uint64_t shortest = 0; static uint64_t longestheader = 0; static uint64_t dataalloc = 0; static uint64_t datalen = 0; static size_t seqindex_alloc = 0; seqinfo_t * seqindex = nullptr; char * datap = nullptr; auto db_setinfo(bool new_is_fastq, uint64_t new_sequences, uint64_t new_nucleotides, uint64_t new_longest, uint64_t new_shortest, uint64_t new_longestheader) -> void { is_fastq = new_is_fastq; sequences = new_sequences; nucleotides = new_nucleotides; longest = new_longest; shortest = new_shortest; longestheader = new_longestheader; } auto db_is_fastq() -> bool { return is_fastq; } auto db_getquality(uint64_t seqno) -> char * { if (is_fastq) { return datap + seqindex[seqno].qual_p; } return nullptr; } auto db_add(bool const is_fastq, char const * header, char const * sequence, char const * quality, size_t const headerlength, size_t const sequencelength, int64_t const abundance) -> void { /* Add a sequence to the database. Assumes that the database has been initialized. 
*/ /* grow space for data, if necessary */ size_t const dataalloc_old = dataalloc; size_t needed = datalen + headerlength + 1 + sequencelength + 1; if (is_fastq) { needed += sequencelength + 1; } while (dataalloc < needed) { dataalloc += memchunk; } if (dataalloc > dataalloc_old) { datap = (char *) xrealloc(datap, dataalloc); } /* store the header */ size_t const header_p = datalen; std::memcpy(datap + header_p, header, headerlength + 1); datalen += headerlength + 1; /* store sequence */ size_t const sequence_p = datalen; std::memcpy(datap + sequence_p, sequence, sequencelength + 1); datalen += sequencelength + 1; size_t const quality_p = datalen; if (is_fastq) { /* store quality */ std::memcpy(datap + quality_p, quality, sequencelength + 1); datalen += sequencelength + 1; } /* grow space for index, if necessary */ size_t const seqindex_alloc_old = seqindex_alloc; while ((sequences + 1) * sizeof(seqinfo_t) > seqindex_alloc) { seqindex_alloc += memchunk; } if (seqindex_alloc > seqindex_alloc_old) { seqindex = (seqinfo_t *) xrealloc(seqindex, seqindex_alloc); } /* update index */ seqinfo_t * seqindex_p = seqindex + sequences; seqindex_p->headerlen = headerlength; seqindex_p->seqlen = sequencelength; seqindex_p->header_p = header_p; seqindex_p->seq_p = sequence_p; seqindex_p->qual_p = quality_p; seqindex_p->size = abundance; /* update statistics */ ++sequences; nucleotides += sequencelength; longest = std::max((uint64_t)sequencelength, longest); shortest = std::min((uint64_t)sequencelength, shortest); longestheader = std::max((uint64_t)headerlength, longestheader); } auto db_read(const char * filename, int upcase) -> void { h = fastx_open(filename); if (h == nullptr) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } is_fastq = fastx_is_fastq(h); int64_t const filesize = fastx_get_size(h); char * prompt = nullptr; if (xsprintf(&prompt, "Reading file %s", filename) == -1) { fatal("Out of memory"); } progress_init(prompt, filesize); longest = 0; 
shortest = std::numeric_limits::max(); // refactoring: direct initialization longestheader = 0; sequences = 0; nucleotides = 0; int64_t discarded_short = 0; int64_t discarded_long = 0; int64_t discarded_unoise = 0; /* allocate space for data */ dataalloc = 0; datap = nullptr; datalen = 0; /* allocate space for index */ seqindex_alloc = 0; seqindex = nullptr; while (fastx_next(h, opt_notrunclabels == 0, (upcase != 0) ? chrmap_upcase_vector.data() : chrmap_no_change_vector.data())) { size_t const sequencelength = fastx_get_sequence_length(h); int64_t const abundance = fastx_get_abundance(h); if (sequencelength < (size_t) opt_minseqlength) { ++discarded_short; } else if (sequencelength > (size_t) opt_maxseqlength) { ++discarded_long; } else if ((opt_cluster_unoise != nullptr) && (abundance < opt_minsize)) { ++discarded_unoise; } else { db_add(is_fastq, fastx_get_header(h), fastx_get_sequence(h), is_fastq ? fastx_get_quality(h) : nullptr, fastx_get_header_length(h), sequencelength, abundance); } progress_update(fastx_get_position(h)); } progress_done(); xfree(prompt); fastx_close(h); if (not opt_quiet) { if (sequences > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, " "min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", db_getnucleotidecount(), db_getsequencecount()); } } if (opt_log != nullptr) { if (sequences > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, " "min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n\n", db_getnucleotidecount(), db_getsequencecount()); } } /* Warn about discarded sequences */ if (discarded_short 
!= 0) { fprintf(stderr, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); if (opt_log != nullptr) { fprintf(fp_log, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); } } if (discarded_long != 0) { fprintf(stderr, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); if (opt_log != nullptr) { fprintf(fp_log, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); } } if (discarded_unoise != 0) { fprintf(stderr, "minsize %" PRId64 ": %" PRId64 " %s discarded.\n", opt_minsize, discarded_unoise, (discarded_unoise == 1 ? "sequence" : "sequences")); if (opt_log != nullptr) { fprintf(fp_log, "minsize %" PRId64 ": %" PRId64 " %s discarded.\n", opt_minsize, discarded_unoise, (discarded_unoise == 1 ? 
"sequence" : "sequences")); } } show_rusage(); } auto db_getsequencecount() -> uint64_t { return sequences; } auto db_getnucleotidecount() -> uint64_t { return nucleotides; } auto db_getlongestheader() -> uint64_t { return longestheader; } auto db_getlongestsequence() -> uint64_t { return longest; } auto db_getshortestsequence() -> uint64_t { return shortest; } auto db_free() -> void { if (datap != nullptr) { xfree(datap); } if (seqindex != nullptr) { xfree(seqindex); } } auto compare_bylength(const void * a, const void * b) -> int { auto * lhs = (seqinfo_t *) a; auto * rhs = (seqinfo_t *) b; /* longest first, then by abundance, then by label, otherwise keep order */ if (lhs->seqlen < rhs->seqlen) { return +1; } if (lhs->seqlen > rhs->seqlen) { return -1; } if (lhs->size < rhs->size) { return +1; } if (lhs->size > rhs->size) { return -1; } auto const result = std::strcmp(datap + lhs->header_p, datap + rhs->header_p); if (result != 0) { return result; } if (lhs < rhs) { return -1; } if (lhs > rhs) { return +1; } return 0; } auto compare_bylength_shortest_first(const void * a, const void * b) -> int { auto * lhs = (seqinfo_t *) a; auto * rhs = (seqinfo_t *) b; /* shortest first, then by abundance, then by label, otherwise keep order */ if (lhs->seqlen < rhs->seqlen) { return -1; } if (lhs->seqlen > rhs->seqlen) { return +1; } if (lhs->size < rhs->size) { return +1; } if (lhs->size > rhs->size) { return -1; } auto const result = std::strcmp(datap + lhs->header_p, datap + rhs->header_p); if (result != 0) { return result; } if (lhs < rhs) { return -1; } if (lhs > rhs) { return +1; } return 0; } inline auto compare_byabundance(const void * a, const void * b) -> int { auto * lhs = (seqinfo_t *) a; auto * rhs = (seqinfo_t *) b; /* most abundant first, then by label, otherwise keep order */ if (lhs->size > rhs->size) { return -1; } if (lhs->size < rhs->size) { return +1; } auto const result = std::strcmp(datap + lhs->header_p, datap + rhs->header_p); if (result != 0) { 
return result; } if (lhs < rhs) { return -1; } if (lhs > rhs) { return +1; } return 0; } auto db_sortbylength() -> void { progress_init("Sorting by length", 100); qsort(seqindex, sequences, sizeof(seqinfo_t), compare_bylength); progress_done(); } auto db_sortbylength_shortest_first() -> void { progress_init("Sorting by length", 100); qsort(seqindex, sequences, sizeof(seqinfo_t), compare_bylength_shortest_first); progress_done(); } auto db_sortbyabundance() -> void { progress_init("Sorting by abundance", 100); qsort(seqindex, sequences, sizeof(seqinfo_t), compare_byabundance); progress_done(); } vsearch-2.30.1/src/db.h000066400000000000000000000100101506776541700145670ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include // uint64_t #include // std::size_t struct seqinfo_s { std::size_t header_p; std::size_t seq_p; std::size_t qual_p; unsigned int headerlen; unsigned int seqlen; unsigned int size; }; using seqinfo_t = struct seqinfo_s; extern char * datap; extern seqinfo_t * seqindex; inline auto db_getheader(uint64_t seqno) -> char * { return datap + seqindex[seqno].header_p; } inline auto db_getsequence(uint64_t seqno) -> char * { return datap + seqindex[seqno].seq_p; } inline auto db_getabundance(uint64_t seqno) -> uint64_t { return seqindex[seqno].size; } inline auto db_getsequencelen(uint64_t seqno) -> uint64_t { return seqindex[seqno].seqlen; } inline auto db_getheaderlen(uint64_t seqno) -> uint64_t { return seqindex[seqno].headerlen; } auto db_read(const char * filename, int upcase) -> void; auto db_free() -> void; auto db_getsequencecount() -> uint64_t; auto db_getnucleotidecount() -> uint64_t; auto db_getlongestheader() -> uint64_t; auto db_getlongestsequence() -> uint64_t; auto db_getshortestsequence() -> uint64_t; /* Note: the sorting functions below must be called after db_read, but before dbindex_prepare */ auto db_sortbylength() -> void; auto db_sortbylength_shortest_first() -> void; auto db_sortbyabundance() -> void; auto db_is_fastq() -> bool; auto db_getquality(uint64_t seqno) -> char *; auto db_setinfo(bool new_is_fastq, uint64_t new_sequences, uint64_t new_nucleotides, uint64_t new_longest, uint64_t new_shortest, uint64_t new_longestheader) -> void; vsearch-2.30.1/src/dbhash.cc000066400000000000000000000130601506776541700156010ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "bitmap.h" #include "utils/seqcmp.hpp" #include // int64_t, uint64_t #include // std::memset #include static struct bitmap_s * dbhash_bitmap; static uint64_t dbhash_size; static unsigned int dbhash_shift; static uint64_t dbhash_mask; std::vector dbhash_table; auto dbhash_open(uint64_t const maxelements) -> void { /* adjust size of hash table for 2/3 fill rate */ /* and use a multiple of 2 */ dbhash_size = 1; dbhash_shift = 0; while (3 * maxelements > 2 * dbhash_size) { dbhash_size <<= 1U; ++dbhash_shift; } dbhash_mask = dbhash_size - 1; dbhash_table.resize(dbhash_size); dbhash_bitmap = bitmap_init(dbhash_size); bitmap_reset_all(dbhash_bitmap); } auto dbhash_close() -> void { bitmap_free(dbhash_bitmap); dbhash_bitmap = nullptr; } auto dbhash_search_first(char * seq, uint64_t const seqlen, struct dbhash_search_info_s * info) -> int64_t { auto const hash = hash_cityhash64(seq, seqlen); info->hash = hash; info->seq = seq; info->seqlen = seqlen; auto index = hash & dbhash_mask; auto * bp = &dbhash_table[index]; while ((bitmap_get(dbhash_bitmap, index) != 0U) and ((bp->hash != hash) or (seqlen != db_getsequencelen(bp->seqno)) or (seqcmp(seq, db_getsequence(bp->seqno), seqlen) != 0))) { index = (index + 1) & dbhash_mask; bp = &dbhash_table[index]; } info->index = index; if (bitmap_get(dbhash_bitmap, index) != 0U) { return bp->seqno; } return -1; } auto dbhash_search_next(struct dbhash_search_info_s * info) -> int64_t { auto const hash 
= info->hash; auto * seq = info->seq; auto const seqlen = info->seqlen; auto index = (info->index + 1) & dbhash_mask; auto * bp = &dbhash_table[index]; while ((bitmap_get(dbhash_bitmap, index) != 0U) and ((bp->hash != hash) or (seqlen != db_getsequencelen(bp->seqno)) or (seqcmp(seq, db_getsequence(bp->seqno), seqlen) != 0))) { index = (index + 1) & dbhash_mask; bp = &dbhash_table[index]; } info->index = index; if (bitmap_get(dbhash_bitmap, index) != 0U) { return bp->seqno; } return -1; } auto dbhash_add(char * seq, uint64_t seqlen, uint64_t seqno) -> void { struct dbhash_search_info_s info; auto ret = dbhash_search_first(seq, seqlen, &info); while (ret >= 0) { ret = dbhash_search_next(&info); } bitmap_set(dbhash_bitmap, info.index); auto & bucket = dbhash_table[info.index]; bucket.hash = info.hash; bucket.seqno = seqno; } auto dbhash_add_all() -> void { progress_init("Hashing database sequences", db_getsequencecount()); std::vector normalized(db_getlongestsequence() + 1); for (uint64_t seqno = 0; seqno < db_getsequencecount(); ++seqno) { auto const * seq = db_getsequence(seqno); auto const seqlen = db_getsequencelen(seqno); string_normalize(normalized.data(), seq, seqlen); dbhash_add(normalized.data(), seqlen, seqno); progress_update(seqno + 1); } progress_done(); } vsearch-2.30.1/src/dbhash.h000066400000000000000000000061561506776541700154530ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include // uint64_t struct dbhash_bucket_s { uint64_t hash = 0; uint64_t seqno = 0; }; struct dbhash_search_info_s { char * seq = nullptr; uint64_t seqlen = 0; uint64_t hash = 0; uint64_t index = 0; }; auto dbhash_open(uint64_t maxelements) -> void; auto dbhash_close() -> void; auto dbhash_add(char * seq, uint64_t seqlen, uint64_t seqno) -> void; auto dbhash_add_all() -> void; auto dbhash_search_first(char * seq, uint64_t seqlen, struct dbhash_search_info_s * info) -> int64_t; auto dbhash_search_next(struct dbhash_search_info_s * info) -> int64_t; auto dbhash_search_finish(struct dbhash_search_info_s * info) -> void; vsearch-2.30.1/src/dbindex.cc000066400000000000000000000177241506776541700160000ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "bitmap.h" #include "dbindex.h" #include "unique.h" #include #include // uint64_t #include // std::FILE, std::fprintf #include // std::memset #include // std::next unsigned int * kmercount; uint64_t * kmerhash; unsigned int * kmerindex; struct bitmap_s * * kmerbitmap; unsigned int * dbindex_map; unsigned int kmerhashsize; uint64_t kmerindexsize; unsigned int dbindex_count; uhandle_s * dbindex_uh; constexpr unsigned int bitmap_threshold = 8; static unsigned int bitmap_mincount; auto dbindex_getbitmap(unsigned int const kmer) -> unsigned char * { auto * a_bitmap_s = *std::next(kmerbitmap, kmer); if (a_bitmap_s != nullptr) { return a_bitmap_s->bitmap; } return nullptr; } auto dbindex_getmatchcount(unsigned int const kmer) -> unsigned int { return *std::next(kmercount, kmer); } auto dbindex_getmatchlist(unsigned int const kmer) -> unsigned int * { return std::next(kmerindex, *std::next(kmerhash, kmer)); } auto dbindex_getmapping(unsigned int const index) -> unsigned int { return *std::next(dbindex_map, index); } auto dbindex_getcount() -> unsigned int { return dbindex_count; } auto fprint_kmer(std::FILE * output_handle, unsigned int const kmer_length, uint64_t const kmer) -> void { static constexpr std::array sym_nt_2bit = {{'A', 'C', 'G', 'T'}}; for (auto i = 0U; i < kmer_length; ++i) { std::fprintf(output_handle, "%c", sym_nt_2bit[(kmer >> (2 * (kmer_length - i - 1))) & 3]); } } auto dbindex_addsequence(unsigned int seqno, int seqmask) -> void { #if 0 std::printf("Adding seqno %d as index element no %d\n", seqno, dbindex_count); #endif unsigned int uniquecount = 0; unsigned int const * uniquelist = nullptr; unique_count(dbindex_uh, opt_wordlength, db_getsequencelen(seqno), db_getsequence(seqno), &uniquecount, &uniquelist, seqmask); dbindex_map[dbindex_count] = seqno; for (auto i = 0U; i < uniquecount; i++) { auto const kmer = uniquelist[i]; if (kmerbitmap[kmer] != nullptr) { ++kmercount[kmer]; bitmap_set(kmerbitmap[kmer], 
dbindex_count); } else { kmerindex[kmerhash[kmer] + kmercount[kmer]] = dbindex_count; ++kmercount[kmer]; } } ++dbindex_count; } auto dbindex_addallsequences(int seqmask) -> void { unsigned int const seqcount = db_getsequencecount(); progress_init("Creating k-mer index", seqcount); for (auto seqno = 0U; seqno < seqcount ; seqno++) { dbindex_addsequence(seqno, seqmask); progress_update(seqno); } progress_done(); } auto dbindex_prepare(int use_bitmap, int seqmask) -> void { dbindex_uh = unique_init(); unsigned int const seqcount = db_getsequencecount(); kmerhashsize = 1U << (2 * opt_wordlength); /* allocate memory for kmer count array */ kmercount = (unsigned int *) xmalloc(kmerhashsize * sizeof(unsigned int)); std::memset(kmercount, 0, kmerhashsize * sizeof(unsigned int)); /* first scan, just count occurences */ progress_init("Counting k-mers", seqcount); for (auto seqno = 0U; seqno < seqcount ; seqno++) { unsigned int uniquecount = 0; unsigned int const * uniquelist = nullptr; unique_count(dbindex_uh, opt_wordlength, db_getsequencelen(seqno), db_getsequence(seqno), &uniquecount, &uniquelist, seqmask); for (auto i = 0U; i < uniquecount; i++) { ++kmercount[uniquelist[i]]; } progress_update(seqno); } progress_done(); #if 0 /* dump kmer counts */ auto * f = fopen_output("kmercounts.txt"); for (auto kmer = 0U; kmer < kmerhashsize; kmer++) { fprint_kmer(f, 8, kmer); std::fprintf(f, "\t%d\t%d\n", kmer, kmercount[kmer]); } std::fclose(f); #endif /* determine minimum kmer count for bitmap usage */ if (use_bitmap != 0) { bitmap_mincount = seqcount / bitmap_threshold; } else { bitmap_mincount = seqcount + 1; } /* allocate and zero bitmap pointers */ kmerbitmap = (struct bitmap_s **) xmalloc(kmerhashsize * sizeof(struct bitmap_s *)); std::memset(kmerbitmap, 0, kmerhashsize * sizeof(struct bitmap_s *)); /* hash / bitmap setup */ /* convert hash counts to position in index */ kmerhash = (uint64_t *) xmalloc((kmerhashsize + 1) * sizeof(uint64_t)); uint64_t sum = 0; for (auto i = 
0U; i < kmerhashsize; i++) { kmerhash[i] = sum; if (kmercount[i] >= bitmap_mincount) { kmerbitmap[i] = bitmap_init(seqcount + 127); // pad for xmm bitmap_reset_all(kmerbitmap[i]); } else { sum += kmercount[i]; } } kmerindexsize = sum; kmerhash[kmerhashsize] = sum; #if 0 if (not opt_quiet) std::fprintf(stderr, "Unique %ld-mers: %u\n", opt_wordlength, kmerindexsize); #endif /* reset counts */ std::memset(kmercount, 0, kmerhashsize * sizeof(unsigned int)); /* allocate space for actual data */ kmerindex = (unsigned int *) xmalloc(kmerindexsize * sizeof(unsigned int)); /* allocate space for mapping from indexno to seqno */ dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int)); dbindex_count = 0; show_rusage(); } auto dbindex_free() -> void { xfree(kmerhash); xfree(kmerindex); xfree(kmercount); xfree(dbindex_map); for (auto kmer = 0U; kmer < kmerhashsize; kmer++) { if (kmerbitmap[kmer] != nullptr) { bitmap_free(kmerbitmap[kmer]); } } xfree(kmerbitmap); unique_exit(dbindex_uh); } vsearch-2.30.1/src/dbindex.h000066400000000000000000000070441506776541700156340ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "bitmap.h" #include // std::FILE #include // uint64_t struct uhandle_s; extern unsigned int * kmercount; /* number of matching seqnos for each kmer */ extern uint64_t * kmerhash; /* index into the list below for each kmer */ extern unsigned int * kmerindex; /* the list of matching seqnos for kmers */ extern struct bitmap_s * * kmerbitmap; extern unsigned int * dbindex_map; extern unsigned int dbindex_count; extern unsigned int kmerhashsize; extern uint64_t kmerindexsize; extern uhandle_s * dbindex_uh; auto fprint_kmer(std::FILE * output_handle, unsigned int kmer_length, uint64_t kmer) -> void; auto dbindex_prepare(int use_bitmap, int seqmask) -> void; auto dbindex_addallsequences(int seqmask) -> void; auto dbindex_addsequence(unsigned int seqno, int seqmask) -> void; auto dbindex_free() -> void; auto dbindex_getbitmap(unsigned int kmer) -> unsigned char *; auto dbindex_getmatchcount(unsigned int kmer) -> unsigned int; auto dbindex_getmatchlist(unsigned int kmer) -> unsigned int *; auto dbindex_getmapping(unsigned int index) -> unsigned int; auto dbindex_getcount() -> unsigned int; vsearch-2.30.1/src/derep.cc000066400000000000000000000725731506776541700154650ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/seqcmp.hpp" #include // std::count_if, std::min #include // macros PRIu64 and PRId64 #include // std::log10, std::pow #include // int64_t, uint64_t #include // std::qsort #include // std::FILE, std::fprintf, std::fclose #include // std::strlen, std::strcmp #include #include #include // anonymous namespace: limit visibility and usage to this translation unit namespace { // refactoring: // replace with std::unordered_map (default hashing) // if performance are bad, see Victor_Ciura's Cpp Talk "So You Think You Can Hash" // then make a CityHash hasher object and use it with std::unordered_map using Hash = decltype(&hash_cityhash64); Hash hash_function = hash_cityhash64; struct bucket { uint64_t hash = 0; unsigned int seqno_first = 0; unsigned int seqno_last = 0; unsigned int size = 0; unsigned int count = 0; bool deleted = false; char * header = nullptr; char * seq = nullptr; char * qual = nullptr; }; auto count_selected(std::vector const & hashtable, struct Parameters const & parameters) -> uint64_t { auto size_in_range = [&](struct bucket const & bucket) -> bool { auto const size = bucket.size; return ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)); }; auto const selected = std::count_if(hashtable.begin(), hashtable.end(), size_in_range); return std::min(static_cast(selected), static_cast(parameters.opt_topn)); } // refactoring: same as find_median_abundance() auto find_median_size(std::vector const & hashtable) -> double { static constexpr auto half = 0.5; if (hashtable.empty()) { return 0.0; } auto const table_size = hashtable.size(); auto const midpoint = std::ldiv(static_cast(table_size), 2L); auto const is_odd = ((table_size % 2) != 0U); if (is_odd) { // index is zero-based, so if size == 3, midpoint == 1 auto const index = midpoint.quot; return hashtable[index].size; } // pair number of elements: // index is zero-based, so if size == 
4, midpoint == 2, lhs index == 1 auto const lhs_index = midpoint.quot - 1; auto const rhs_index = midpoint.quot; auto const lhs_size = hashtable[lhs_index].size; auto const rhs_size = hashtable[rhs_index].size; // sorted by decreasing abundance: lhs size > rhs size // limit risk of integer additon overflow: // a >= b ; (a + b) / 2 == b + (a - b) / 2 return rhs_size + ((lhs_size - rhs_size) * half); } auto rehash(std::vector & hashtable) -> void { // new double-size hash table uint64_t const new_hashtable_size = 2 * hashtable.size(); uint64_t const new_hash_mask = new_hashtable_size - 1; std::vector new_hashtable(new_hashtable_size); // rehash all entries from the old to the new table for (auto const & old_bucket : hashtable) { if (old_bucket.size != 0U) { auto new_index = old_bucket.hash & new_hash_mask; while (new_hashtable[new_index].size != 0U) { new_index = (new_index + 1) & new_hash_mask; } new_hashtable[new_index] = old_bucket; } } hashtable.swap(new_hashtable); } // refactorig: duplicate of q2p()? 
inline auto convert_quality_symbol_to_probability(int const quality_symbol, struct Parameters const & parameters) -> double { static constexpr auto minimal_quality_value = 2; static constexpr auto maximal_probability = 0.75; auto const quality_value = quality_symbol - static_cast(parameters.opt_fastq_ascii); if (quality_value < minimal_quality_value) { return maximal_probability; } static constexpr auto base = 10.0; return std::pow(base, -quality_value / base); } inline auto convert_probability_to_quality_symbol(double const probability, struct Parameters const & parameters) -> int { static constexpr auto base = 10.0; auto quality_value = static_cast(std::trunc(-base * std::log10(probability))); quality_value = std::min(quality_value, parameters.opt_fastq_qmaxout); quality_value = std::max(quality_value, parameters.opt_fastq_qminout); return static_cast(quality_value + parameters.opt_fastq_asciiout); } } // end of anonymous namespace auto derep_compare_full(void const * void_lhs, void const * void_rhs) -> int { auto * lhs = (struct bucket *) void_lhs; auto * rhs = (struct bucket *) void_rhs; /* highest abundance first, then by label, otherwise keep order */ if (lhs->deleted and not rhs->deleted) // refactoring: deleted is always set to false for derep_fulllength { return +1; } if (not lhs->deleted and rhs->deleted) // refactoring: deleted is always set to false for derep_fulllength { return -1; } // same status if (lhs->size < rhs->size) { return +1; } if (lhs->size > rhs->size) { return -1; } // same abundance if (lhs->size == 0) { return 0; } auto const result = std::strcmp(lhs->header, rhs->header); if (result != 0) { return result; } // same header (label) if (lhs->seqno_first < rhs->seqno_first) { return -1; } if (lhs->seqno_first > rhs->seqno_first) { return +1; } // same ordinal value (impossible) return 0; // unreachable } // used by --derep_fulllength, --derep_id, and --fastx_uniques auto derep(struct Parameters const & parameters, char * input_filename, 
bool const use_header) -> void { /* dereplicate full length sequences, optionally require identical headers */ /* derep_fulllength output options: --output, --uc (only FASTA, depreciated) fastx_uniques output options: --fastaout, --fastqout, --uc, --tabbedout */ show_rusage(); auto * input_handle = fastx_open(input_filename); if (input_handle == nullptr) { fatal("Unrecognized input file type (not proper FASTA or FASTQ format)"); // unreachable? case already handled in fastx_open(), assert(h != nullptr) should always be true } if (not fastx_is_empty(input_handle)) { if (fastx_is_fastq(input_handle)) { if (parameters.opt_fastx_uniques == nullptr) { fatal("FASTQ input is only allowed with the fastx_uniques command"); } } else { if (parameters.opt_fastqout != nullptr) { fatal("Cannot write FASTQ output when input file is not in FASTQ " "format"); } if (parameters.opt_tabbedout != nullptr) { fatal("Cannot write tab separated output file when input file is " "not in FASTQ format"); } } } std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; std::FILE * fp_uc = nullptr; std::FILE * fp_tabbedout = nullptr; if (parameters.opt_fastx_uniques != nullptr) { if ((parameters.opt_uc == nullptr) and (parameters.opt_fastaout == nullptr) and (parameters.opt_fastqout == nullptr) and (parameters.opt_tabbedout == nullptr)) { fatal("Output file for dereplication with fastx_uniques must be " "specified with --fastaout, --fastqout, --tabbedout, or --uc"); } } else { if ((parameters.opt_output == nullptr) and (parameters.opt_uc == nullptr)) { fatal("Output file for dereplication must be specified with --output " "or --uc"); } } if (parameters.opt_fastx_uniques != nullptr) { if (parameters.opt_fastaout != nullptr) { fp_fastaout = fopen_output(parameters.opt_fastaout); if (fp_fastaout == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (parameters.opt_fastqout != nullptr) { fp_fastqout = fopen_output(parameters.opt_fastqout); if (fp_fastqout == nullptr) 
{ fatal("Unable to open FASTQ output file for writing"); } } if (parameters.opt_tabbedout != nullptr) { fp_tabbedout = fopen_output(parameters.opt_tabbedout); if (fp_tabbedout == nullptr) { fatal("Unable to open tab delimited output file for writing"); } } } else { if (parameters.opt_output != nullptr) { fp_fastaout = fopen_output(parameters.opt_output); if (fp_fastaout == nullptr) { fatal("Unable to open FASTA output file for writing"); } } } if (parameters.opt_uc != nullptr) { fp_uc = fopen_output(parameters.opt_uc); if (fp_uc == nullptr) { fatal("Unable to open output (uc) file for writing"); } } auto const filesize = fastx_get_size(input_handle); /* allocate initial memory for 1024 clusters with sequences of length 1023 */ uint64_t alloc_clusters = 1024; uint64_t alloc_seqs = 1024; int64_t alloc_seqlen = 1023; uint64_t hashtablesize = 2 * alloc_clusters; uint64_t hash_mask = hashtablesize - 1; std::vector hashtable(hashtablesize); show_rusage(); constexpr auto terminal = std::numeric_limits::max(); std::vector nextseqtab; std::vector headertab; std::vector match_strand; auto const extra_info = (parameters.opt_uc != nullptr) or (parameters.opt_tabbedout != nullptr); if (extra_info) { /* If the uc or tabbedout option is in effect, we need to keep some extra info. Allocate and init memory for this. 
*/ /* Links to other sequences in cluster */ nextseqtab.resize(alloc_seqs, terminal); /* Pointers to the header strings */ headertab.resize(alloc_seqs); /* Matching strand */ match_strand.resize(alloc_seqs); } show_rusage(); std::vector seq_up(alloc_seqlen + 1); std::vector rc_seq_up(alloc_seqlen + 1); std::string const prompt = std::string("Dereplicating file ") + input_filename; progress_init(prompt.c_str(), filesize); uint64_t sequencecount = 0; uint64_t nucleotidecount = 0; auto shortest = std::numeric_limits::max(); int64_t longest = 0; uint64_t discarded_short = 0; uint64_t discarded_long = 0; uint64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; auto average = 0.0; while (fastx_next(input_handle, not parameters.opt_notrunclabels, chrmap_no_change_vector.data())) { int64_t const seqlen = fastx_get_sequence_length(input_handle); if (seqlen < parameters.opt_minseqlength) { ++discarded_short; continue; } if (seqlen > parameters.opt_maxseqlength) { ++discarded_long; continue; } nucleotidecount += seqlen; longest = std::max(seqlen, longest); shortest = std::min(seqlen, shortest); /* check allocations */ if (seqlen > alloc_seqlen) { alloc_seqlen = seqlen; seq_up.resize(alloc_seqlen + 1); rc_seq_up.resize(alloc_seqlen + 1); show_rusage(); } if (extra_info and (sequencecount + 1 > alloc_seqs)) { uint64_t const new_alloc_seqs = 2 * alloc_seqs; nextseqtab.resize(new_alloc_seqs, terminal); headertab.resize(new_alloc_seqs); match_strand.resize(new_alloc_seqs); alloc_seqs = new_alloc_seqs; show_rusage(); } if (clusters + 1 > alloc_clusters) { uint64_t const new_alloc_clusters = 2 * alloc_clusters; rehash(hashtable); alloc_clusters = new_alloc_clusters; hashtablesize = 2 * alloc_clusters; hash_mask = hashtablesize - 1; show_rusage(); } auto const * seq = fastx_get_sequence(input_handle); auto const * header = fastx_get_header(input_handle); auto const headerlen = fastx_get_header_length(input_handle); auto const * qual = fastx_get_quality(input_handle); // 
nullptr if FASTA /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); /* reverse complement if necessary */ if (parameters.opt_strand) { reverse_complement(rc_seq_up.data(), seq_up.data(), seqlen); } /* Find free bucket or bucket for identical sequence. Make sure sequences are exactly identical in case of any hash collision. With 64-bit hashes, there is about 50% chance of a collision when the number of sequences is about 5e9. */ auto const hash_header = use_header ? hash_function(header, headerlen) : uint64_t{0}; auto const hash = hash_function(seq_up.data(), seqlen) ^ hash_header; auto j = hash & hash_mask; auto * bp = &hashtable[j]; // refactoring: rename to "cluster" while ((bp->size != 0U) and ((hash != bp->hash) or (seqcmp(seq_up.data(), bp->seq, seqlen) != 0) or (use_header and (std::strcmp(header, bp->header) != 0)))) { j = (j + 1) & hash_mask; bp = &hashtable[j]; } if (parameters.opt_strand and (bp->size == 0U)) { /* no match on plus strand */ /* check minus strand as well */ auto const rc_hash = hash_function(rc_seq_up.data(), seqlen) ^ hash_header; auto k = rc_hash & hash_mask; auto * rc_bp = &hashtable[k]; while ((rc_bp->size != 0U) and ((rc_hash != rc_bp->hash) or (seqcmp(rc_seq_up.data(), rc_bp->seq, seqlen) != 0U) or (use_header and (std::strcmp(header, rc_bp->header) != 0)))) { k = (k + 1) & hash_mask; rc_bp = &hashtable[k]; } if (rc_bp->size != 0U) { bp = rc_bp; j = k; if (extra_info) { match_strand[sequencecount] = 1; } } } auto const abundance = parameters.opt_sizein ? 
fastx_get_abundance(input_handle) : int64_t{1}; sumsize += abundance; if (bp->size != 0U) { /* at least one identical sequence already */ if (extra_info) { unsigned int const last = bp->seqno_last; nextseqtab[last] = sequencecount; bp->seqno_last = sequencecount; headertab[sequencecount] = header; } int64_t const s1 = bp->size; int64_t const s2 = abundance; int64_t const s3 = s1 + s2; if (parameters.opt_fastqout != nullptr) { /* update quality scores */ for (int i = 0; i < seqlen; i++) { int const q1 = bp->qual[i]; int const q2 = qual[i]; auto const p1 = convert_quality_symbol_to_probability(q1, parameters); auto const p2 = convert_quality_symbol_to_probability(q2, parameters); auto p3 = 0.0; /* how to compute the new quality score? */ if (parameters.opt_fastq_qout_max) { // fastq_qout_max /* min error prob, highest quality */ p3 = std::min(p1, p2); } else { // fastq_qout_avg /* average, as in USEARCH */ p3 = (p1 * s1 + p2 * s2) / s3; } // fastq_qout_min /* max error prob, lowest quality */ // p3 = std::max(p1, p2); // fastq_qout_first /* keep first */ // p3 = p1; // fastq_qout_last /* keep last */ // p3 = p2; // fastq_qout_ef /* Compute as multiple independent observations Edgar & Flyvbjerg (2015) But what about s1 and s2? 
*/ // p3 = p1 * p2 / 3.0 / (1.0 - p1 - p2 + (4.0 * p1 * p2 / 3.0)); /* always worst quality possible, certain error */ // p3 = 1.0; // always best quality possible, perfect, no errors */ // p3 = 0.0; int const q3 = convert_probability_to_quality_symbol(p3, parameters); bp->qual[i] = q3; } } bp->size = s3; ++bp->count; } else { /* no identical sequences yet */ bp->size = abundance; bp->hash = hash; bp->seqno_first = sequencecount; bp->seqno_last = sequencecount; bp->seq = xstrdup(seq); bp->header = xstrdup(header); bp->count = 1; if (qual != nullptr) { bp->qual = xstrdup(qual); } else { bp->qual = nullptr; } ++clusters; } maxsize = std::max(bp->size, maxsize); ++sequencecount; progress_update(fastx_get_position(input_handle)); } progress_done(); fastx_close(input_handle); show_rusage(); if (not parameters.opt_quiet) { if (sequencecount > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (parameters.opt_log != nullptr) { if (sequencecount > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (discarded_short != 0U) { fprintf(stderr, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); if (parameters.opt_log != nullptr) { fprintf(fp_log, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? 
"sequence" : "sequences")); } } if (discarded_long != 0U) { fprintf(stderr, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); if (parameters.opt_log != nullptr) { fprintf(fp_log, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); } } show_rusage(); progress_init("Sorting", 1); qsort(hashtable.data(), hashtablesize, sizeof(struct bucket), derep_compare_full); progress_done(); show_rusage(); auto const median = find_median_size(hashtable); average = 1.0 * sumsize / clusters; if (clusters < 1) { if (not parameters.opt_quiet) { fprintf(stderr, "0 unique sequences\n"); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "0 unique sequences\n\n"); } } else { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", clusters, average, median, maxsize); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", clusters, average, median, maxsize); } } /* count selected */ auto const selected = count_selected(hashtable, parameters); show_rusage(); /* write output */ if ((parameters.opt_output != nullptr) or (parameters.opt_fastaout != nullptr)) { progress_init("Writing FASTA output file", clusters); int64_t relabel_count = 0; for (uint64_t i = 0; i < clusters; ++i) { auto const & cluster = hashtable[i]; int64_t const size = cluster.size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++relabel_count; fasta_print_general(fp_fastaout, nullptr, cluster.seq, std::strlen(cluster.seq), cluster.header, std::strlen(cluster.header), size, relabel_count, -1.0, -1, -1, nullptr, 0.0); if (relabel_count == parameters.opt_topn) { break; } } progress_update(i); } progress_done(); 
fclose(fp_fastaout); } if (parameters.opt_fastqout != nullptr) { progress_init("Writing FASTQ output file", clusters); int64_t relabel_count = 0; for (uint64_t i = 0; i < clusters; ++i) { auto const & cluster = hashtable[i]; int64_t const size = cluster.size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++relabel_count; fastq_print_general(fp_fastqout, cluster.seq, std::strlen(cluster.seq), cluster.header, std::strlen(cluster.header), cluster.qual, size, relabel_count, -1.0); if (relabel_count == parameters.opt_topn) { break; } } progress_update(i); } progress_done(); fclose(fp_fastqout); } show_rusage(); if (parameters.opt_uc != nullptr) { progress_init("Writing uc file, first part", clusters); for (uint64_t i = 0; i < clusters; ++i) { auto const & cluster = hashtable[i]; int64_t const len = std::strlen(cluster.seq); fprintf(fp_uc, "S\t%" PRId64 "\t%" PRId64 "\t*\t*\t*\t*\t*\t%s\t*\n", i, len, cluster.header); for (auto next = nextseqtab[cluster.seqno_first]; next != terminal; next = nextseqtab[next]) { fprintf(fp_uc, "H\t%" PRId64 "\t%" PRId64 "\t%.1f\t%s\t0\t0\t*\t%s\t%s\n", i, len, 100.0, ((match_strand[next] != 0) ? 
"-" : "+"), headertab[next].c_str(), cluster.header); } progress_update(i); } progress_done(); progress_init("Writing uc file, second part", clusters); for (uint64_t i = 0; i < clusters; ++i) { auto const & cluster = hashtable[i]; fprintf(fp_uc, "C\t%" PRId64 "\t%u\t*\t*\t*\t*\t*\t%s\t*\n", i, cluster.size, cluster.header); progress_update(i); } fclose(fp_uc); progress_done(); } if (parameters.opt_tabbedout != nullptr) { progress_init("Writing tab separated file", clusters); for (uint64_t i = 0; i < clusters; ++i) { auto const & cluster = hashtable[i]; if (parameters.opt_relabel != nullptr) { fprintf(fp_tabbedout, "%s\t%s%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", cluster.header, parameters.opt_relabel, i + 1, i, (uint64_t) 0, cluster.count, cluster.header); } else { fprintf(fp_tabbedout, "%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", cluster.header, cluster.header, i, (uint64_t) 0, cluster.count, cluster.header); } uint64_t j = 1; for (auto next = nextseqtab[cluster.seqno_first]; next != terminal; next = nextseqtab[next]) { if (parameters.opt_relabel != nullptr) { fprintf(fp_tabbedout, "%s\t%s%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", headertab[next].c_str(), parameters.opt_relabel, i + 1, i, j, cluster.count, cluster.header); } else { fprintf(fp_tabbedout, "%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", headertab[next].c_str(), cluster.header, i, j, cluster.count, cluster.header); } ++j; } progress_update(i); } fclose(fp_tabbedout); progress_done(); } show_rusage(); if (selected < clusters) { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } } show_rusage(); /* Free all seqs and headers */ for (auto & 
bucket : hashtable) { if (bucket.size == 0U) { continue; } xfree(bucket.seq); xfree(bucket.header); if (bucket.qual != nullptr) { xfree(bucket.qual); } } show_rusage(); show_rusage(); } vsearch-2.30.1/src/derep.h000066400000000000000000000050171506776541700153140ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto derep(struct Parameters const & parameters, char * input_filename, bool use_header) -> void; vsearch-2.30.1/src/derep_prefix.cc000066400000000000000000000367261506776541700170420ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/seqcmp.hpp" #include "utils/span.hpp" #include // std::max, std::transform #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::qsort #include // std::FILE, std::fprintf, std::fclose #include // std::strcmp #include // std::next #include #include struct bucket { uint64_t hash = 0; unsigned int seqno_first = 0; unsigned int seqno_last = 0; unsigned int size = 0; unsigned int count = 0; bool deleted = false; char * header = nullptr; char * seq = nullptr; char * qual = nullptr; }; // refactoring: FNV-1A is the hashing function used in std::hash // (designed for fast hash-table and checksum, not crypto). The // function below might be redundant with std? auto compute_hashes_of_all_prefixes(std::vector & prefix_hashes, Span const sequence) -> void { // Fowler-Noll-Vo (FNV-1A) hash function static constexpr auto FNV_offset_basis = uint64_t{14695981039346656037U}; static constexpr auto FNV_prime = uint64_t{1099511628211U}; auto FNV1a_hash = FNV_offset_basis; prefix_hashes[0] = FNV_offset_basis; auto incremental_hash = [&FNV1a_hash](char const nucleotide) -> uint64_t { FNV1a_hash ^= static_cast(nucleotide); FNV1a_hash *= FNV_prime; return FNV1a_hash; }; std::transform(sequence.cbegin(), sequence.cend(), std::next(prefix_hashes.begin()), incremental_hash); } auto derep_compare_prefix(const void * a, const void * b) -> int { auto * lhs = (struct bucket *) a; auto * rhs = (struct bucket *) b; /* deleted(?) 
first, then by highest abundance, then by label, otherwise keep order */ if (static_cast(lhs->deleted) > static_cast(rhs->deleted)) { return +1; } if (static_cast(lhs->deleted) < static_cast(rhs->deleted)) { return -1; } // both are deleted, compare abundances if (lhs->size < rhs->size) { return +1; } if (lhs->size > rhs->size) { return -1; } // both are deleted, same abundances, compare sequence headers auto const result = std::strcmp(db_getheader(lhs->seqno_first), db_getheader(rhs->seqno_first)); if (result != 0) { return result; } // both are deleted, same abundances, same sequence headers, compare input order if (lhs->seqno_first < rhs->seqno_first) { return -1; } if (lhs->seqno_first > rhs->seqno_first) { return +1; } return 0; } auto derep_prefix(struct Parameters const & parameters) -> void { std::FILE * fp_output = nullptr; std::FILE * fp_uc = nullptr; if (parameters.opt_strand) { fatal("Option '--strand both' not supported with --derep_prefix"); } if (parameters.opt_output != nullptr) { fp_output = fopen_output(parameters.opt_output); if (fp_output == nullptr) { fatal("Unable to open output file for writing"); } } if (parameters.opt_uc != nullptr) { fp_uc = fopen_output(parameters.opt_uc); if (fp_uc == nullptr) { fatal("Unable to open output (uc) file for writing"); } } db_read(parameters.opt_derep_prefix, 0); db_sortbylength_shortest_first(); show_rusage(); int64_t const dbsequencecount = db_getsequencecount(); /* adjust size of hash table for 2/3 fill rate */ int64_t hashtablesize = 1; while (3 * dbsequencecount > 2 * hashtablesize) { hashtablesize <<= 1U; } int const hash_mask = hashtablesize - 1; std::vector hashtable(hashtablesize); int64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; double median = 0.0; double average = 0.0; /* alloc and init table of links to other sequences in cluster */ constexpr auto terminal = std::numeric_limits::max(); std::vector nextseqtab(dbsequencecount, terminal); std::vector seq_up(db_getlongestsequence() + 
1); /* make table of hash values of prefixes */ unsigned int const len_longest = db_getlongestsequence(); unsigned int const len_shortest = db_getshortestsequence(); std::vector prefix_hashes(len_longest + 1); progress_init("Dereplicating", dbsequencecount); for (int64_t i = 0; i < dbsequencecount; i++) { unsigned int const seqlen = db_getsequencelen(i); auto * seq = db_getsequence(i); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); auto const abundance = parameters.opt_sizein ? db_getabundance(i) : 1; sumsize += abundance; /* Look for matching identical or prefix sequences. Use a hash function that can quickly be applied iteratively on longer and longer sequences. Hash values are generated for all prefixes and saved. Should start at exact sequence and then try shorter and shorter sequences. No need to check shorter sequences than the shortest in the database. Three cases: 1) Exact match: Update count, point to next 2) Prefix match: Mark old, insert new, update count, point to next 3) No match: Insert new entry */ compute_hashes_of_all_prefixes(prefix_hashes, Span{seq_up.data(), seqlen}); /* first, look for an identical match */ auto prefix_len = seqlen; uint64_t hash = prefix_hashes[prefix_len]; auto * bp = &hashtable[hash & hash_mask]; while ((bp->size != 0U) and ((bp->deleted) or (bp->hash != hash) or (prefix_len != db_getsequencelen(bp->seqno_first)) or (seqcmp(seq_up.data(), db_getsequence(bp->seqno_first), prefix_len) != 0))) { ++bp; if (bp > &hashtable.back()) { bp = hashtable.data(); } } /* at this point, bp points either to (1) a free empty hash bucket, or (2) a bucket with an exact match. 
*/ auto const orig_hash = hash; auto * orig_bp = bp; if (bp->size != 0U) { /* exact match */ bp->size += abundance; auto const last = bp->seqno_last; nextseqtab[last] = i; bp->seqno_last = i; maxsize = std::max(bp->size, maxsize); } else { /* look for prefix match */ while ((bp->size == 0U) and (prefix_len > len_shortest)) { --prefix_len; hash = prefix_hashes[prefix_len]; bp = &hashtable[hash & hash_mask]; while ((bp->size != 0U) and ((bp->deleted) or (bp->hash != hash) or (prefix_len != db_getsequencelen(bp->seqno_first)) or (seqcmp(seq_up.data(), db_getsequence(bp->seqno_first), prefix_len) != 0))) { ++bp; if (bp > &hashtable.back()) { bp = hashtable.data(); } } } if (bp->size != 0U) { /* prefix match */ /* get necessary info, then delete prefix from hash */ auto const first = bp->seqno_first; auto const last = bp->seqno_last; auto const size = bp->size; bp->deleted = true; /* create new hash entry */ bp = orig_bp; bp->size = size + abundance; bp->hash = orig_hash; bp->seqno_first = i; nextseqtab[i] = first; bp->seqno_last = last; maxsize = std::max(bp->size, maxsize); } else { /* no match */ orig_bp->size = abundance; orig_bp->hash = orig_hash; orig_bp->seqno_first = i; orig_bp->seqno_last = i; maxsize = std::max(abundance, maxsize); ++clusters; } } progress_update(i); } progress_done(); show_rusage(); progress_init("Sorting", 1); qsort(hashtable.data(), hashtablesize, sizeof(struct bucket), derep_compare_prefix); progress_done(); if (clusters > 0) { if ((clusters % 2) != 0) { median = hashtable[(clusters - 1) / 2].size; } else { median = (hashtable[(clusters / 2) - 1].size + hashtable[clusters / 2].size) / 2.0; } } average = 1.0 * sumsize / clusters; if (clusters < 1) { if (not parameters.opt_quiet) { fprintf(stderr, "0 unique sequences\n"); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "0 unique sequences\n\n"); } } else { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", 
clusters, average, median, maxsize); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", clusters, average, median, maxsize); } } show_rusage(); /* count selected */ int64_t selected = 0; for (int64_t i = 0; i < clusters; i++) { int64_t const size = hashtable[i].size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++selected; if (selected == parameters.opt_topn) { break; } } } /* write output */ if (parameters.opt_output != nullptr) { progress_init("Writing output file", clusters); int64_t relabel_count = 0; for (int64_t i = 0; i < clusters; i++) { auto const & bp = hashtable[i]; int64_t const size = bp.size; if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++relabel_count; fasta_print_general(fp_output, nullptr, db_getsequence(bp.seqno_first), db_getsequencelen(bp.seqno_first), db_getheader(bp.seqno_first), db_getheaderlen(bp.seqno_first), size, relabel_count, -1.0, -1, -1, nullptr, 0.0); if (relabel_count == parameters.opt_topn) { break; } } progress_update(i); } progress_done(); fclose(fp_output); } show_rusage(); if (parameters.opt_uc != nullptr) { progress_init("Writing uc file, first part", clusters); for (int64_t i = 0; i < clusters; i++) { auto const & bp = hashtable[i]; auto * h = db_getheader(bp.seqno_first); int64_t const len = db_getsequencelen(bp.seqno_first); fprintf(fp_uc, "S\t%" PRId64 "\t%" PRId64 "\t*\t*\t*\t*\t*\t%s\t*\n", i, len, h); for (auto next = nextseqtab[bp.seqno_first]; next != terminal; next = nextseqtab[next]) { fprintf(fp_uc, "H\t%" PRId64 "\t%" PRIu64 "\t%.1f\t+\t0\t0\t*\t%s\t%s\n", i, db_getsequencelen(next), 100.0, db_getheader(next), h); } progress_update(i); } progress_done(); show_rusage(); progress_init("Writing uc file, second part", clusters); for (int64_t i = 0; i < clusters; i++) { auto const & bp = hashtable[i]; fprintf(fp_uc, "C\t%" PRId64 
"\t%u\t*\t*\t*\t*\t*\t%s\t*\n", i, bp.size, db_getheader(bp.seqno_first)); progress_update(i); } fclose(fp_uc); progress_done(); show_rusage(); } if (selected < clusters) { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } } db_free(); } vsearch-2.30.1/src/derep_prefix.h000066400000000000000000000047561506776541700167020ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto derep_prefix(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/derep_smallmem.cc000066400000000000000000000453011506776541700173410ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "city.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" // #include "util.h" // hash_cityhash128 #include // std::min, std::max #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::fprintf, std::fclose #include // std::qsort #include // refactoring: unused? 
#include #include #include using Hash = decltype(&hash_cityhash128); static Hash hash_function = hash_cityhash128; struct sm_bucket { uint128 hash; uint64_t size; }; static struct sm_bucket * hashtable = nullptr; static uint64_t hashtablesize = 0; auto find_median() -> double { /* find the median size, based on an iterative search starting at e.g. 1 */ uint64_t cand = 1; /* candidate for the median */ uint64_t below = 0; /* closest value below the candidate */ uint64_t above = 0; /* closest value above the candidate */ uint64_t cand_count = 0; /* number of clusters with same size as cand */ uint64_t below_count = 0; /* number of clusters with smaller size than cand */ uint64_t above_count = 0; /* number of clusters with larger size than cand */ while (true) { cand_count = 0; below_count = 0; above_count = 0; for (uint64_t i = 0; i < hashtablesize; i++) { auto const v = hashtable[i].size; if (v > 0) { if (v > cand) { if ((above_count == 0) or (v < above)) { above = v; } ++above_count; } else if (v < cand) { if ((below_count == 0) or (v > below)) { below = v; } ++below_count; } else { ++cand_count; } } } if (below_count + cand_count + above_count == 0U) { // fix -Wfloat-equal return 0; // unreachable? } if (above_count + cand_count >= below_count) // mid >= below_count { if (above_count <= below_count + cand_count) // mid <= below_count + cand_count { if (above_count == below_count + cand_count) // mid == below_count + cand_count // same as: // (below_count + cand_count + above_count) / 2 == below_count + cand_count // which simplifies into: // above_count == below_count + cand_count { return (cand + above) / 2.0; } if (above_count + cand_count == below_count) // mid == below_count // same as: // (below_count + cand_count + above_count) / 2 == below_count // which simplifies into: // above_count + cand_count == below_count { return (below + cand) / 2.0; // cannot reach? } return cand; } cand = above; } else { cand = below; // cannot reach? 
} } } inline auto hash2bucket(uint128 hash, uint64_t htsize) -> uint64_t { // extract hash's first uint64_t, cast to the size of the hash table return hash.first % htsize; } inline auto next_bucket(uint64_t prev_bucket, uint64_t htsize) -> uint64_t { return (prev_bucket + 1) % htsize; } auto rehash_smallmem() -> void { /* allocate new hash table, 50% larger */ auto const new_hashtablesize = 3 * hashtablesize / 2; auto * new_hashtable = (struct sm_bucket *) xmalloc(sizeof(struct sm_bucket) * new_hashtablesize); /* zero new hash table */ for (uint64_t j = 0; j < new_hashtablesize; j++) { new_hashtable[j].hash.first = 0; new_hashtable[j].hash.second = 0; new_hashtable[j].size = 0; } /* rehash all from old to new */ for (uint64_t i = 0; i < hashtablesize; i++) { auto * old_bp = hashtable + i; if (old_bp->size != 0U) { auto k = hash2bucket(old_bp->hash, new_hashtablesize); while (new_hashtable[k].size != 0U) { k = next_bucket(k, new_hashtablesize); } auto * new_bp = new_hashtable + k; * new_bp = * old_bp; } } /* free old table */ xfree(hashtable); /* update variables */ hashtable = new_hashtable; hashtablesize = new_hashtablesize; } auto derep_smallmem(struct Parameters const & parameters) -> void { /* dereplicate full length sequences using a small amount of memory output options: --fastaout */ show_rusage(); auto * input_filename = parameters.opt_derep_smallmem; auto * h = fastx_open(input_filename); if (h == nullptr) // refactoring: already checked by fastx_open()? 
{ fatal("Unrecognized input file type (not proper FASTA or FASTQ format)."); } if (h->is_pipe) { fatal("The derep_smallmem command does not support input from a pipe."); } std::FILE * fp_fastaout = nullptr; if (parameters.opt_fastaout != nullptr) { fp_fastaout = fopen_output(parameters.opt_fastaout); if (fp_fastaout == nullptr) { fatal("Unable to open FASTA output file for writing"); } } else { fatal("Output file for dereplication must be specified with --fastaout"); } auto const filesize = fastx_get_size(h); /* allocate initial memory for sequences of length up to 1023 chars */ int64_t alloc_seqlen = 1024; /* allocate initial hashtable with 1024 buckets */ hashtablesize = 1024; hashtable = (struct sm_bucket *) xmalloc(sizeof(struct sm_bucket) * hashtablesize); /* zero hash table */ for (uint64_t j = 0; j < hashtablesize; j++) { hashtable[j].hash.first = 0; hashtable[j].hash.second = 0; hashtable[j].size = 0; } show_rusage(); std::vector seq_up(alloc_seqlen + 1); std::vector rc_seq_up(alloc_seqlen + 1); std::string const prompt = std::string("Dereplicating file ") + input_filename; progress_init(prompt.c_str(), filesize); uint64_t sequencecount = 0; uint64_t nucleotidecount = 0; int64_t shortest = std::numeric_limits::max(); int64_t longest = 0; uint64_t discarded_short = 0; uint64_t discarded_long = 0; uint64_t clusters = 0; int64_t sumsize = 0; uint64_t maxsize = 0; /* first pass */ while (fastx_next(h, not parameters.opt_notrunclabels, chrmap_no_change_vector.data())) { int64_t const seqlen = fastx_get_sequence_length(h); if (seqlen < parameters.opt_minseqlength) { ++discarded_short; continue; } if (seqlen > parameters.opt_maxseqlength) { ++discarded_long; continue; } nucleotidecount += seqlen; longest = std::max(seqlen, longest); shortest = std::min(seqlen, shortest); /* check allocations */ if (seqlen > alloc_seqlen) { alloc_seqlen = seqlen; seq_up.resize(alloc_seqlen + 1); rc_seq_up.resize(alloc_seqlen + 1); show_rusage(); } if (100 * (clusters + 1) > 95 * 
hashtablesize) { // keep hash table fill rate at max 95% */ rehash_smallmem(); show_rusage(); } auto const * seq = fastx_get_sequence(h); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); /* reverse complement if necessary */ if (parameters.opt_strand) { reverse_complement(rc_seq_up.data(), seq_up.data(), seqlen); } /* Find free bucket or bucket for identical sequence. Make sure sequences are exactly identical in case of any hash collision. With 64-bit hashes, there is about 50% chance of a collision when the number of sequences is about 5e9. */ auto const hash = hash_function(seq_up.data(), seqlen); auto j = hash2bucket(hash, hashtablesize); auto * bp = hashtable + j; while ((bp->size != 0U) and (hash != bp->hash)) { j = next_bucket(j, hashtablesize); bp = hashtable + j; } if (parameters.opt_strand and (bp->size == 0U)) { /* no match on plus strand */ /* check minus strand as well */ auto const rc_hash = hash_function(rc_seq_up.data(), seqlen); auto k = hash2bucket(rc_hash, hashtablesize); auto * rc_bp = hashtable + k; while ((rc_bp->size != 0U) and (rc_hash != rc_bp->hash)) { k = next_bucket(k, hashtablesize); rc_bp = hashtable + k; } if (rc_bp->size != 0U) { bp = rc_bp; j = k; // cppcheck: 'j' is assigned a value that is never used } } int const abundance = fastx_get_abundance(h); int64_t const ab = parameters.opt_sizein ? 
abundance : 1; sumsize += ab; if (bp->size != 0U) { /* at least one identical sequence already */ bp->size += ab; } else { /* no identical sequences yet */ bp->size = ab; bp->hash = hash; ++clusters; } maxsize = std::max(bp->size, maxsize); ++sequencecount; progress_update(fastx_get_position(h)); } progress_done(); fastx_close(h); show_rusage(); if (not parameters.opt_quiet) { if (sequencecount > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (parameters.opt_log != nullptr) { if (sequencecount > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", nucleotidecount, sequencecount, shortest, longest, nucleotidecount * 1.0 / sequencecount); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n", nucleotidecount, sequencecount); } } if (discarded_short != 0U) { fprintf(stderr, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); if (parameters.opt_log != nullptr) { fprintf(fp_log, "minseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_minseqlength, discarded_short, (discarded_short == 1 ? "sequence" : "sequences")); } } if (discarded_long != 0U) { fprintf(stderr, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? "sequence" : "sequences")); if (parameters.opt_log != nullptr) { fprintf(fp_log, "maxseqlength %" PRId64 ": %" PRId64 " %s discarded.\n\n", parameters.opt_maxseqlength, discarded_long, (discarded_long == 1 ? 
"sequence" : "sequences")); } } show_rusage(); if (clusters < 1) { if (not parameters.opt_quiet) { fprintf(stderr, "0 unique sequences\n"); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "0 unique sequences\n\n"); } } else { auto const average = 1.0 * sumsize / clusters; const auto median = find_median(); if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n", clusters, average, median, maxsize); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " unique sequences, avg cluster %.1lf, median %.0f, max %" PRIu64 "\n\n", clusters, average, median, maxsize); } } show_rusage(); /* second pass with output */ auto * h2 = fastx_open(input_filename); if (h2 == nullptr) { fatal("Cannot open and read from the input file."); } progress_init("Writing FASTA output file", filesize); uint64_t selected = 0; while (fastx_next(h2, not parameters.opt_notrunclabels, chrmap_no_change_vector.data())) { int64_t const seqlen = fastx_get_sequence_length(h2); if ((seqlen < parameters.opt_minseqlength) or (seqlen > parameters.opt_maxseqlength)) { continue; } auto const * seq = fastx_get_sequence(h2); /* normalize sequence: uppercase and replace U by T */ string_normalize(seq_up.data(), seq, seqlen); /* reverse complement if necessary */ if (parameters.opt_strand) { reverse_complement(rc_seq_up.data(), seq_up.data(), seqlen); } auto const hash = hash_function(seq_up.data(), seqlen); auto j = hash2bucket(hash, hashtablesize); auto * bp = hashtable + j; while ((bp->size != 0U) and (hash != bp->hash)) { j = next_bucket(j, hashtablesize); bp = hashtable + j; } if (parameters.opt_strand and (bp->size == 0U)) { /* no match on plus strand */ /* check minus strand as well */ auto const rc_hash = hash_function(rc_seq_up.data(), seqlen); auto k = hash2bucket(rc_hash, hashtablesize); auto * rc_bp = hashtable + k; while ((rc_bp->size != 0U) and (rc_hash != rc_bp->hash)) { k = next_bucket(k, hashtablesize); 
rc_bp = hashtable + k; } if (rc_bp->size != 0U) { bp = rc_bp; j = k; // cppcheck: 'j' is assigned a value that is never used } } int64_t const size = bp->size; if (size > 0) { /* print sequence */ auto const * header = fastx_get_header(h2); int const headerlen = fastx_get_header_length(h2); if ((size >= parameters.opt_minuniquesize) and (size <= parameters.opt_maxuniquesize)) { ++selected; fasta_print_general(fp_fastaout, nullptr, seq, seqlen, header, headerlen, size, selected, -1.0, -1, -1, nullptr, 0.0); } bp->size = -1; } progress_update(fastx_get_position(h2)); } progress_done(); fastx_close(h2); fclose(fp_fastaout); show_rusage(); if (selected < clusters) { if (not parameters.opt_quiet) { fprintf(stderr, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } if (parameters.opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " uniques written, %" PRId64 " clusters discarded (%.1f%%)\n\n", selected, clusters - selected, 100.0 * (clusters - selected) / clusters); } } show_rusage(); xfree(hashtable); show_rusage(); } vsearch-2.30.1/src/derep_smallmem.h000066400000000000000000000047601506776541700172070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto derep_smallmem(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/dynlibs.cc000066400000000000000000000120011506776541700160060ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "dynlibs.h" #include "utils/fatal.hpp" #include // std::FILE #include #ifdef HAVE_ZLIB_H # ifdef _WIN32 const std::string gz_libname = "zlib1.dll"; HMODULE gz_lib; # else # ifdef __APPLE__ const std::string gz_libname = "libz.dylib"; # else const std::string gz_libname = "libz.so.1"; # endif void * gz_lib; # endif gzFile ZEXPORT (*gzdopen_p) OF((int, const char *)); int ZEXPORT (*gzclose_p) OF((gzFile)); int ZEXPORT (*gzread_p) OF((gzFile, void *, unsigned)); #endif #ifdef HAVE_BZLIB_H # ifdef _WIN32 const std::string bz2_libname = "libbz2.dll"; HMODULE bz2_lib; # else # ifdef __APPLE__ const std::string bz2_libname = "libbz2.dylib"; # else const std::string bz2_libname = "libbz2.so.1"; # endif void * bz2_lib; # endif BZFILE* (*BZ2_bzReadOpen_p)(int*, FILE*, int, int, void*, int); void (*BZ2_bzReadClose_p)(int*, BZFILE*); int (*BZ2_bzRead_p)(int*, BZFILE*, void*, int); #endif auto dynlibs_open() -> void { #ifdef HAVE_ZLIB_H #ifdef _WIN32 gz_lib = LoadLibraryA(gz_libname.data()); #else gz_lib = dlopen(gz_libname.data(), RTLD_LAZY); #endif if (gz_lib != nullptr) { gzdopen_p = (gzFile (*)(int, const char*)) arch_dlsym(gz_lib, "gzdopen"); gzclose_p = (int (*)(gzFile)) arch_dlsym(gz_lib, "gzclose"); gzread_p = (int (*)(gzFile, void*, unsigned)) arch_dlsym(gz_lib, "gzread"); if (not ((gzdopen_p != nullptr) && (gzclose_p != nullptr) && (gzread_p != nullptr))) { fatal("Invalid compression library (zlib)"); } } #endif #ifdef 
HAVE_BZLIB_H #ifdef _WIN32 bz2_lib = LoadLibraryA(bz2_libname.data()); #else bz2_lib = dlopen(bz2_libname.data(), RTLD_LAZY); #endif if (bz2_lib != nullptr) { BZ2_bzReadOpen_p = (BZFILE* (*)(int*, FILE*, int, int, void*, int)) arch_dlsym(bz2_lib, "BZ2_bzReadOpen"); BZ2_bzReadClose_p = (void (*)(int*, BZFILE*)) arch_dlsym(bz2_lib, "BZ2_bzReadClose"); BZ2_bzRead_p = (int (*)(int*, BZFILE*, void*, int)) arch_dlsym(bz2_lib, "BZ2_bzRead"); if (not ((BZ2_bzReadOpen_p != nullptr) && (BZ2_bzReadClose_p != nullptr) && (BZ2_bzRead_p != nullptr))) { fatal("Invalid compression library (bz2)"); } } #endif } auto dynlibs_close() -> void { #ifdef HAVE_ZLIB_H if (gz_lib != nullptr) { #ifdef _WIN32 FreeLibrary(gz_lib); #else dlclose(gz_lib); #endif } gz_lib = nullptr; #endif #ifdef HAVE_BZLIB_H if (bz2_lib != nullptr) { #ifdef _WIN32 FreeLibrary(bz2_lib); #else dlclose(bz2_lib); #endif } bz2_lib = nullptr; #endif } vsearch-2.30.1/src/dynlibs.h000066400000000000000000000061721506776541700156640ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Declarations for the run-time loaded compression libraries.
// The library handles and function pointers declared here are set by
// dynlibs_open() and reset by dynlibs_close().

// zlib: library handle (HMODULE on Windows, void * elsewhere) and
// pointers to the zlib functions.
// NOTE(review): dynlibs.cc defines only gzdopen_p, gzclose_p and
// gzread_p; gzgetc_p, gzrewind_p, gzungetc_p and gzerror_p are declared
// here without a definition in that file -- confirm they are unused or
// defined elsewhere.
#ifdef HAVE_ZLIB_H
#ifdef _WIN32
extern HMODULE gz_lib;
#else
extern void * gz_lib;
#endif
extern gzFile (*gzdopen_p)(int, const char *);
extern int (*gzclose_p)(gzFile);
extern int (*gzread_p)(gzFile, void*, unsigned);
extern int (*gzgetc_p)(gzFile);
extern int (*gzrewind_p)(gzFile);
extern int (*gzungetc_p)(int, gzFile);
extern const char * (*gzerror_p)(gzFile, int*);
#endif

// bzip2: library handle and pointers to the three reading functions
// resolved by dynlibs_open().
#ifdef HAVE_BZLIB_H
#ifdef _WIN32
extern HMODULE bz2_lib;
#else
extern void * bz2_lib;
#endif
extern BZFILE* (*BZ2_bzReadOpen_p)(int*, FILE*, int, int, void*, int);
extern void (*BZ2_bzReadClose_p)(int*, BZFILE*);
extern int (*BZ2_bzRead_p)(int*, BZFILE*, void*, int);
#endif

// Load the libraries and resolve the symbols above.
auto dynlibs_open() -> void;
// Unload the libraries and reset the handles to null.
auto dynlibs_close() -> void;
vsearch-2.30.1/src/eestats.cc000066400000000000000000000402301506776541700160170ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see .
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::max, std::min #include // macros PRIu64 and PRId64 #include // std::pow #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::exit, EXIT_FAILURE #include // refactoring: not used? 
#include #include inline auto fastq_get_qual_eestats(char const q) -> int { int const qual = q - opt_fastq_ascii; if (qual < opt_fastq_qmin) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); if (fp_log != nullptr) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", qual, opt_fastq_qmin); } exit(EXIT_FAILURE); } else if (qual > opt_fastq_qmax) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(stderr, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); if (fp_log != nullptr) { fprintf(fp_log, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", qual, opt_fastq_qmax); fprintf(fp_log, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", qual); } exit(EXIT_FAILURE); } return qual; } auto q2p(int const quality_value) -> double { static constexpr auto base = 10.0; return std::pow(base, -quality_value / base); } auto ee_start(int const pos, int const resolution) -> int64_t { return pos * (resolution * (pos + 1) + 2) / 2; } auto fastq_eestats(struct Parameters const & parameters) -> void { if (opt_output == nullptr) { fatal("Output file for fastq_eestats must be specified with --output"); } fastx_handle h = fastq_open(parameters.opt_fastq_eestats); uint64_t const filesize = fastq_get_size(h); std::FILE * fp_output = nullptr; if (opt_output != nullptr) { fp_output = fopen_output(opt_output); if (fp_output == nullptr) { fatal("Unable to open output file for writing"); } } progress_init("Reading FASTQ file", filesize); uint64_t seq_count = 0; int64_t len_alloc = 10; const int resolution = 1000; int const max_quality = opt_fastq_qmax - opt_fastq_qmin + 1; int64_t ee_size = ee_start(len_alloc, resolution); std::vector 
read_length_table(len_alloc); std::vector qual_length_table(len_alloc * (max_quality + 1)); std::vector ee_length_table(ee_size); std::vector sum_ee_length_table(len_alloc); std::vector sum_pe_length_table(len_alloc); int64_t len_min = std::numeric_limits::max(); int64_t len_max = 0; while (fastq_next(h, false, chrmap_upcase_vector.data())) { ++seq_count; int64_t const len = fastq_get_sequence_length(h); char const * q = fastq_get_quality(h); /* update length statistics */ int64_t const new_alloc = len + 1; if (new_alloc > len_alloc) { int64_t const new_ee_size = ee_start(new_alloc, resolution); read_length_table.resize(new_alloc); qual_length_table.resize(new_alloc * (max_quality + 1)); ee_length_table.resize(new_ee_size); sum_ee_length_table.resize(new_alloc); sum_pe_length_table.resize(new_alloc); len_alloc = new_alloc; ee_size = new_ee_size; } len_min = std::min(len, len_min); len_max = std::max(len, len_max); /* update quality statistics */ double ee = 0.0; for (int64_t i = 0; i < len; i++) { ++read_length_table[i]; /* quality score */ auto const qual = std::max(fastq_get_qual_eestats(q[i]), 0); ++qual_length_table[((max_quality + 1) * i) + qual]; /* probability of error (Pe) */ auto const probability_of_error = q2p(qual); sum_pe_length_table[i] += probability_of_error; /* expected number of errors */ ee += probability_of_error; auto const e_int = std::min(resolution * (i + 1), (int) (resolution * ee)); ++ee_length_table[ee_start(i, resolution) + e_int]; sum_ee_length_table[i] += ee; } progress_update(fastq_get_position(h)); } progress_done(); fprintf(fp_output, "Pos\tRecs\tPctRecs\t" "Min_Q\tLow_Q\tMed_Q\tMean_Q\tHi_Q\tMax_Q\t" "Min_Pe\tLow_Pe\tMed_Pe\tMean_Pe\tHi_Pe\tMax_Pe\t" "Min_EE\tLow_EE\tMed_EE\tMean_EE\tHi_EE\tMax_EE\n"); for (int64_t i = 0; i < len_max; i++) { int64_t const reads = read_length_table[i]; double const pctrecs = 100.0 * reads / seq_count; /* q */ double min_q = -1.0; double low_q = -1.0; double med_q = -1.0; double hi_q = -1.0; double 
max_q = -1.0; double qsum = 0; double n = 0; for (int q = 0; q <= max_quality; q++) { double const x = qual_length_table[((max_quality + 1) * i) + q]; if (x > 0) { qsum += q * x; n += x; if (min_q < 0) { min_q = q; } if ((low_q < 0) && (n >= 0.25 * reads)) { low_q = q; } if ((med_q < 0) && (n >= 0.50 * reads)) { med_q = q; } if ((hi_q < 0) && (n >= 0.75 * reads)) { hi_q = q; } max_q = q; } } double const mean_q = 1.0 * qsum / reads; /* pe */ double min_pe = -1.0; double low_pe = -1.0; double med_pe = -1.0; double hi_pe = -1.0; double max_pe = -1.0; double pesum = 0; n = 0; for (int q = max_quality; q >= 0; q--) { double const x = qual_length_table[((max_quality + 1) * i) + q]; if (x > 0) { double const pe = q2p(q); pesum += pe * x; n += x; if (min_pe < 0) { min_pe = pe; } if ((low_pe < 0) && (n >= 0.25 * reads)) { low_pe = pe; } if ((med_pe < 0) && (n >= 0.50 * reads)) { med_pe = pe; } if ((hi_pe < 0) && (n >= 0.75 * reads)) { hi_pe = pe; } max_pe = pe; } } double const mean_pe = 1.0 * pesum / reads; /* expected errors */ double min_ee = -1.0; double low_ee = -1.0; double med_ee = -1.0; double hi_ee = -1.0; double max_ee = -1.0; int64_t const ee_offset = ee_start(i, resolution); int64_t const max_errors = resolution * (i + 1); n = 0; for (int64_t e = 0; e <= max_errors; e++) { int64_t const x = ee_length_table[ee_offset + e]; if (x > 0) { n += x; if (min_ee < 0) { min_ee = e; } if ((low_ee < 0) && (n >= 0.25 * reads)) { low_ee = e; } if ((med_ee < 0) && (n >= 0.50 * reads)) { med_ee = e; } if ((hi_ee < 0) && (n >= 0.75 * reads)) { hi_ee = e; } max_ee = e; } } double const mean_ee = sum_ee_length_table[i] / reads; min_ee = (min_ee + 0.5) / resolution; low_ee = (low_ee + 0.5) / resolution; med_ee = (med_ee + 0.5) / resolution; hi_ee = (hi_ee + 0.5) / resolution; max_ee = (max_ee + 0.5) / resolution; fprintf(fp_output, "%" PRId64 "\t%" PRId64 "\t%.1lf" "\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.1lf\t%.1lf" "\t%.2lg\t%.2lg\t%.2lg\t%.2lg\t%.2lg\t%.2lg" 
"\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", i + 1, reads, pctrecs, min_q, low_q, med_q, mean_q, hi_q, max_q, min_pe, low_pe, med_pe, mean_pe, hi_pe, max_pe, min_ee, low_ee, med_ee, mean_ee, hi_ee, max_ee); } fclose(fp_output); fastq_close(h); } auto fastq_eestats2(struct Parameters const & parameters) -> void { if (opt_output == nullptr) { fatal("Output file for fastq_eestats2 must be specified with --output"); } fastx_handle h = fastq_open(parameters.opt_fastq_eestats2); uint64_t const filesize = fastq_get_size(h); std::FILE * fp_output = nullptr; if (opt_output != nullptr) { fp_output = fopen_output(opt_output); if (fp_output == nullptr) { fatal("Unable to open output file for writing"); } } progress_init("Reading FASTQ file", filesize); uint64_t seq_count = 0; uint64_t symbols = 0; uint64_t longest = 0; int len_steps = 0; std::vector count_table; while (fastq_next(h, false, chrmap_upcase_vector.data())) { ++seq_count; auto const len = fastq_get_sequence_length(h); auto const * q = fastq_get_quality(h); /* update length statistics */ if (len > longest) { longest = len; // opt_length_cutoffs_longest is an int between 1 and INT_MAX int const high = std::min(longest, (uint64_t) (opt_length_cutoffs_longest)); auto const new_len_steps = 1 + std::max(0, ((high - opt_length_cutoffs_shortest) / opt_length_cutoffs_increment)); if (new_len_steps > len_steps) { count_table.resize(new_len_steps * opt_ee_cutoffs_count); len_steps = new_len_steps; } } /* update quality statistics */ symbols += len; auto ee = 0.0; for (uint64_t i = 0; i < len; i++) { /* quality score */ auto const qual = std::max(fastq_get_qual_eestats(q[i]), 0); auto const pe = q2p(qual); ee += pe; for (int x = 0; x < len_steps; x++) { uint64_t const len_cutoff = opt_length_cutoffs_shortest + (x * opt_length_cutoffs_increment); if (i + 1 == len_cutoff) { for (int y = 0; y < opt_ee_cutoffs_count; y++) { if (ee <= opt_ee_cutoffs_values[y]) { ++count_table[(x * opt_ee_cutoffs_count) + y]; } } } } } 
progress_update(fastq_get_position(h)); } progress_done(); fprintf(fp_output, "%" PRIu64 " reads", seq_count); if (seq_count > 0) { fprintf(fp_output, ", max len %" PRIu64 ", avg %.1f", longest, 1.0 * symbols / seq_count); } fprintf(fp_output, "\n\n"); fprintf(fp_output, "Length"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_output, " MaxEE %.2f", opt_ee_cutoffs_values[y]); } fprintf(fp_output, "\n"); fprintf(fp_output, "------"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_output, " ----------------"); } fprintf(fp_output, "\n"); for (int x = 0; x < len_steps; x++) { int const len_cutoff = opt_length_cutoffs_shortest + (x * opt_length_cutoffs_increment); if (len_cutoff > opt_length_cutoffs_longest) { break; } fprintf(fp_output, "%6d", len_cutoff); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_output, " %8" PRIu64 "(%5.1f%%)", count_table[(x * opt_ee_cutoffs_count) + y], 100.0 * count_table[(x * opt_ee_cutoffs_count) + y] / seq_count); } fprintf(fp_output, "\n"); } if (fp_log != nullptr) { fprintf(fp_log, "%" PRIu64 " reads, max len %" PRIu64 ", avg %.1f\n\n", seq_count, longest, 1.0 * symbols / seq_count); fprintf(fp_log, "Length"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_log, " MaxEE %.2f", opt_ee_cutoffs_values[y]); } fprintf(fp_log, "\n"); fprintf(fp_log, "------"); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_log, " ----------------"); } fprintf(fp_log, "\n"); for (int x = 0; x < len_steps; x++) { int const len_cutoff = opt_length_cutoffs_shortest + (x * opt_length_cutoffs_increment); if (len_cutoff > opt_length_cutoffs_longest) { break; } fprintf(fp_log, "%6d", len_cutoff); for (int y = 0; y < opt_ee_cutoffs_count; y++) { fprintf(fp_log, " %8" PRIu64 "(%5.1f%%)", count_table[(x * opt_ee_cutoffs_count) + y], 100.0 * count_table[(x * opt_ee_cutoffs_count) + y] / seq_count); } fprintf(fp_log, "\n"); } } fclose(fp_output); fastq_close(h); } 
vsearch-2.30.1/src/eestats.h000066400000000000000000000050621506776541700156650ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_eestats(struct Parameters const & parameters) -> void; auto fastq_eestats2(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fasta.cc000066400000000000000000000551111506776541700154510ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "attributes.h" #include "utils/fatal.hpp" #include // std::min #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::size_t, std::snprintf #include // std::memchr #include // std::next #include // anonymous namespace: limit visibility and usage to this translation unit namespace { enum struct Action : unsigned char { warn, // (0) symbol is stripped, with a warning accept, // (1) reject, // (2) fatal printable symbol ('.', '-') show, // (3) fatal non-printable symbol (0-32, but not 127?) 
skip, // (4) symbol is stripped, silently count // (5) track the number of lines }; /* How to handle input characters for FASTA 0=warn, 1=accept, 2=reject, 3=show, 4=skip, 5=count 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 4, 4, 4, 3, 3, // 0-15 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, // 32-47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 48-63 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, // 64-79 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, // 80-95 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, // 96-111 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, // 112-127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 */ const std::vector char_actions = { Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::skip, Action::count, Action::skip, Action::skip, Action::skip, Action::show, Action::show, // 0-15 Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, Action::show, // 16-31 Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::reject, Action::reject, Action::warn, // 32-47 Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, // 48-63 
Action::warn, Action::accept, Action::accept, Action::accept, Action::accept, Action::warn, Action::warn, Action::accept, Action::accept, Action::warn, Action::warn, Action::accept, Action::warn, Action::accept, Action::accept, Action::warn, // 64-79 Action::warn, Action::warn, Action::accept, Action::accept, Action::accept, Action::accept, Action::accept, Action::accept, Action::warn, Action::accept, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, // 80-95 Action::warn, Action::accept, Action::accept, Action::accept, Action::accept, Action::warn, Action::warn, Action::accept, Action::accept, Action::warn, Action::warn, Action::accept, Action::warn, Action::accept, Action::accept, Action::warn, // 96-111 Action::warn, Action::warn, Action::accept, Action::accept, Action::accept, Action::accept, Action::accept, Action::accept, Action::warn, Action::accept, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, // 112-127 Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, 
Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn, Action::warn }; auto map_action(char const nucleotide) -> Action { auto const current_char = static_cast(nucleotide); return char_actions[current_char]; } // __attribute__((noreturn)) auto report_illegal_symbol_and_exit(unsigned char symbol, uint64_t line_number) -> void { static constexpr std::size_t max_buffer_size = 200; static std::array msg {{}}; static_cast(std::snprintf( msg.data(), max_buffer_size, "Illegal character '%c' in sequence on line %" PRIu64 " of FASTA file", symbol, line_number)); fatal(msg.data()); } // __attribute__((noreturn)) auto report_unprintable_symbol_and_exit(unsigned char symbol, uint64_t line_number) -> void { static constexpr std::size_t max_buffer_size = 200; static std::array msg {{}}; static_cast(std::snprintf( msg.data(), max_buffer_size, "Illegal unprintable ASCII character no %d in sequence on line %" PRIu64 " of FASTA file", symbol, line_number)); fatal(msg.data()); } } // end of anonymous namespace auto fasta_open(const char * filename) -> fastx_handle { auto * input_handle = fastx_open(filename); if (fastx_is_fastq(input_handle) and not input_handle->is_empty) { fatal("FASTA file expected, FASTQ file found (%s)", filename); } return input_handle; } auto 
fasta_close(fastx_handle input_handle) -> void { fastx_close(input_handle); } auto fasta_filter_sequence(fastx_handle input_handle, unsigned char const * char_mapping) -> void { /* Strip unwanted characters from the sequence and raise warnings or errors on certain characters. */ auto * source = input_handle->sequence_buffer.data; auto * dest = source; while (*source != '\0') { auto const current_char = static_cast(*source); switch (map_action(*source)) { case Action::warn: /* stripped */ ++input_handle->stripped_all; ++input_handle->stripped[current_char]; break; case Action::accept: /* legal character */ *dest = static_cast(char_mapping[current_char]); dest = std::next(dest); break; case Action::reject: /* fatal character */ report_illegal_symbol_and_exit(current_char, input_handle->lineno); break; case Action::show: /* fatal unprintable character */ report_unprintable_symbol_and_exit(current_char, input_handle->lineno); break; case Action::skip: /* silently stripped chars (whitespace) */ break; case Action::count: /* newline (silently stripped) */ ++input_handle->lineno; break; } source = std::next(source); } /* add nullchar after sequence */ *dest = '\0'; input_handle->sequence_buffer.length = dest - input_handle->sequence_buffer.data; } auto fasta_next(fastx_handle input_handle, bool truncateatspace, const unsigned char * char_mapping) -> bool { input_handle->lineno_start = input_handle->lineno; input_handle->header_buffer.length = 0; input_handle->header_buffer.data[0] = 0; input_handle->sequence_buffer.length = 0; input_handle->sequence_buffer.data[0] = 0; std::size_t rest = fastx_file_fill_buffer(input_handle); if (rest == 0) { return false; } /* read header */ /* check initial > character */ if (input_handle->file_buffer.data[input_handle->file_buffer.position] != '>') { fprintf(stderr, "Found character %02x\n", (unsigned char)(input_handle->file_buffer.data[input_handle->file_buffer.position])); fatal("Invalid FASTA - header must start with > character"); 
} ++input_handle->file_buffer.position; --rest; char const * line_end = nullptr; while (line_end == nullptr) { /* get more data if buffer empty*/ rest = fastx_file_fill_buffer(input_handle); if (rest == 0) { fatal("Invalid FASTA - header must be terminated with newline"); } /* find new line char ('LF') */ auto * const current_position = std::next(input_handle->file_buffer.data, static_cast(input_handle->file_buffer.position)); line_end = static_cast(std::memchr(current_position, '\n', rest)); /* copy to header buffer */ uint64_t len = rest; if (line_end != nullptr) { /* LF found, copy up to and including LF */ len = line_end - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; ++input_handle->lineno; } buffer_extend(& input_handle->header_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len); input_handle->file_buffer.position += len; rest -= len; } /* read one or more sequence lines */ while (true) { /* get more data, if necessary */ rest = fastx_file_fill_buffer(input_handle); /* end if no more data */ if (rest == 0) { break; } /* end if new sequence starts */ if ((line_end != nullptr) and (input_handle->file_buffer.data[input_handle->file_buffer.position] == '>')) { break; } /* find LF */ auto * const current_position = std::next(input_handle->file_buffer.data, input_handle->file_buffer.position); line_end = static_cast(std::memchr(current_position, '\n', rest)); uint64_t len = rest; if (line_end != nullptr) { /* LF found, copy up to and including LF */ len = line_end - current_position + 1; } buffer_extend(& input_handle->sequence_buffer, current_position, len); input_handle->file_buffer.position += len; rest -= len; } ++input_handle->seqno; fastx_filter_header(input_handle, truncateatspace); fasta_filter_sequence(input_handle, char_mapping); return true; } auto fasta_get_abundance(fastx_handle input_handle) -> int64_t { // return 1 if not present auto const size = 
header_get_size(input_handle->header_buffer.data, input_handle->header_buffer.length); if (size > 0) { return size; } return 1; } auto fasta_get_abundance_and_presence(fastx_handle input_handle) -> int64_t { // return 0 if not present return header_get_size(input_handle->header_buffer.data, input_handle->header_buffer.length); } auto fasta_get_position(fastx_handle input_handle) -> uint64_t { return input_handle->file_position; } auto fasta_get_size(fastx_handle input_handle) -> uint64_t { return input_handle->file_size; } auto fasta_get_lineno(fastx_handle input_handle) -> uint64_t { return input_handle->lineno_start; } auto fasta_get_seqno(fastx_handle input_handle) -> uint64_t { return input_handle->seqno; } auto fasta_get_header_length(fastx_handle input_handle) -> uint64_t { return input_handle->header_buffer.length; } auto fasta_get_sequence_length(fastx_handle input_handle) -> uint64_t { return input_handle->sequence_buffer.length; } auto fasta_get_header(fastx_handle input_handle) -> char const * { return input_handle->header_buffer.data; } auto fasta_get_sequence(fastx_handle input_handle) -> char const * { return input_handle->sequence_buffer.data; } /* fasta output */ auto fasta_print_sequence(std::FILE * output_handle, char const * seq, uint64_t const len, int const width) -> void { /* The actual length of the sequence may be longer than "len", but only "len" characters are printed. Specify width of lines - zero (or <1) means linearize (all on one line). 
*/ auto const sequence_length = static_cast(len); if (width < 1) // no sequence folding { std::fprintf(output_handle, "%.*s\n", sequence_length, seq); } else // sequence folding every 'width' { auto rest = sequence_length; for (auto i = 0; i < sequence_length; i += width) { std::fprintf(output_handle, "%.*s\n", std::min(rest, width), seq + i); rest -= width; } } } auto fasta_print(std::FILE * output_handle, char const * header, char const * seq, uint64_t const len) -> void { std::fprintf(output_handle, ">%s\n", header); fasta_print_sequence(output_handle, seq, len, opt_fasta_width); } inline auto fprint_seq_label(std::FILE * output_handle, char const * seq, int const len) -> void { /* normalize first? */ std::fprintf(output_handle, "%.*s", len, seq); } auto fasta_print_general(std::FILE * output_handle, char const * prefix, char const * seq, int const len, char const * header, int const header_length, unsigned int const abundance, int const ordinal, double const expected_error, int const clustersize, int const clusterid, char const * score_name, double const score) -> void { std::fprintf(output_handle, ">"); if (prefix != nullptr) { std::fprintf(output_handle, "%s", prefix); } if (opt_relabel_self) { fprint_seq_label(output_handle, seq, len); } else if (opt_relabel_sha1) { fprint_seq_digest_sha1(output_handle, seq, len); } else if (opt_relabel_md5) { fprint_seq_digest_md5(output_handle, seq, len); } else if ((opt_relabel != nullptr) and (ordinal > 0)) { std::fprintf(output_handle, "%s%d", opt_relabel, ordinal); } else { bool const strip_size = opt_xsize or (opt_sizeout and (abundance > 0)); bool const strip_ee = opt_xee or ((opt_eeout or opt_fastq_eeout) and (expected_error >= 0.0)); bool const strip_length = opt_xlength or opt_lengthout; header_fprint_strip(output_handle, header, header_length, strip_size, strip_ee, strip_length); } if (opt_label_suffix != nullptr) { std::fprintf(output_handle, "%s", opt_label_suffix); } if (opt_sample != nullptr) { 
std::fprintf(output_handle, ";sample=%s", opt_sample); } if (clustersize > 0) { std::fprintf(output_handle, ";seqs=%d", clustersize); } if (clusterid >= 0) { std::fprintf(output_handle, ";clusterid=%d", clusterid); } if (opt_sizeout and (abundance > 0)) { std::fprintf(output_handle, ";size=%u", abundance); } if ((opt_eeout or opt_fastq_eeout) and (expected_error >= 0.0)) { if (expected_error < 0.000000001) { std::fprintf(output_handle, ";ee=%.13lf", expected_error); } else if (expected_error < 0.00000001) { std::fprintf(output_handle, ";ee=%.12lf", expected_error); } else if (expected_error < 0.0000001) { std::fprintf(output_handle, ";ee=%.11lf", expected_error); } else if (expected_error < 0.000001) { std::fprintf(output_handle, ";ee=%.10lf", expected_error); } else if (expected_error < 0.00001) { std::fprintf(output_handle, ";ee=%.9lf", expected_error); } else if (expected_error < 0.0001) { std::fprintf(output_handle, ";ee=%.8lf", expected_error); } else if (expected_error < 0.001) { std::fprintf(output_handle, ";ee=%.7lf", expected_error); } else if (expected_error < 0.01) { std::fprintf(output_handle, ";ee=%.6lf", expected_error); } else if (expected_error < 0.1) { std::fprintf(output_handle, ";ee=%.5lf", expected_error); } else { std::fprintf(output_handle, ";ee=%.4lf", expected_error); } } if (opt_lengthout) { std::fprintf(output_handle, ";length=%d", len); } if (score_name != nullptr) { std::fprintf(output_handle, ";%s=%.4lf", score_name, score); } if (opt_relabel_keep and (((opt_relabel != nullptr) and (ordinal > 0)) or opt_relabel_sha1 or opt_relabel_md5 or opt_relabel_self)) { std::fprintf(output_handle, " %s", header); } std::fprintf(output_handle, "\n"); if (seq != nullptr) { fasta_print_sequence(output_handle, seq, len, opt_fasta_width); } } auto fasta_print_db_relabel(std::FILE * output_handle, uint64_t seqno, int ordinal) -> void { fasta_print_general(output_handle, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), 
db_getheaderlen(seqno), db_getabundance(seqno), ordinal, -1.0, -1, -1, nullptr, 0.0); } auto fasta_print_db_relabel(std::FILE * output_handle, uint64_t seqno, std::size_t ordinal) -> void { fasta_print_general(output_handle, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), db_getabundance(seqno), static_cast(ordinal), -1.0, -1, -1, nullptr, 0.0); } auto fasta_print_db(std::FILE * output_handle, uint64_t seqno) -> void { fasta_print_general(output_handle, nullptr, db_getsequence(seqno), db_getsequencelen(seqno), db_getheader(seqno), db_getheaderlen(seqno), db_getabundance(seqno), 0, -1.0, -1, -1, nullptr, 0.0); } vsearch-2.30.1/src/fasta.h000066400000000000000000000111241506776541700153070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include // std::FILE #include // uint64_t /* fasta input */ auto fasta_open_rest(fastx_handle input_handle) -> void; auto fasta_open(const char * filename) -> fastx_handle; auto fasta_close(fastx_handle input_handle) -> void; auto fasta_next(fastx_handle input_handle, bool truncateatspace, const unsigned char * char_mapping) -> bool; auto fasta_get_position(fastx_handle input_handle) -> uint64_t; auto fasta_get_size(fastx_handle input_handle) -> uint64_t; auto fasta_get_lineno(fastx_handle input_handle) -> uint64_t; auto fasta_get_seqno(fastx_handle input_handle) -> uint64_t; auto fasta_get_header(fastx_handle input_handle) -> char const *; auto fasta_get_sequence(fastx_handle input_handle) -> char const *; auto fasta_get_header_length(fastx_handle input_handle) -> uint64_t; auto fasta_get_sequence_length(fastx_handle input_handle) -> uint64_t; auto fasta_get_abundance(fastx_handle input_handle) -> int64_t; auto fasta_get_abundance_and_presence(fastx_handle input_handle) -> int64_t; /* fasta output */ auto fasta_print(std::FILE * output_handle, char const * header, char const * seq, uint64_t len) -> void; auto fasta_print_general(std::FILE * output_handle, char const * prefix, char const * seq, int len, char const * header, int header_length, unsigned int abundance, int ordinal, double expected_error, int clustersize, int clusterid, char const * score_name, double score) -> void; auto fasta_print_db(std::FILE * output_handle, uint64_t seqno) -> void; auto fasta_print_db_relabel(std::FILE * output_handle, uint64_t seqno, int ordinal) -> void; auto fasta_print_db_relabel(std::FILE * output_handle, uint64_t seqno, std::size_t ordinal) -> void; vsearch-2.30.1/src/fasta2fastq.cc000066400000000000000000000110711506776541700165670ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/check_output_filehandle.hpp" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/open_file.hpp" #include "utils/progress.hpp" #include #include // std::FILE, std::size_t, std::fclose #include auto fasta2fastq(struct Parameters const & parameters) -> void { auto const max_ascii_value = static_cast(parameters.opt_fastq_asciiout + parameters.opt_fastq_qmaxout); assert(parameters.opt_fastqout != nullptr); // check performed in auto * fp_input = fasta_open(parameters.opt_fasta2fastq); assert(fp_input != nullptr); // check performed in fasta_open(fastx_open()) auto const output_handle = open_output_file(parameters.opt_fastqout); check_mandatory_fastq_output_handle(parameters.opt_fastqout, (not output_handle)); static constexpr auto initial_length = 1024U; std::vector quality(initial_length, max_ascii_value); Progress progress("Converting FASTA file to FASTQ", fasta_get_size(fp_input), parameters); auto counter = 0; while (fasta_next(fp_input, false, chrmap_no_change_vector.data())) { /* get sequence length and allocate more mem if necessary */ auto const length = fastq_get_sequence_length(fp_input); if (quality.size() < length + 1) { quality.resize(length + 1, max_ascii_value); } // note: adding '\0' and the end of the quality string is not necessary, // fastq_print_general() uses 'length' for both sequence and quality ++counter; /* write to fastq file */ fastq_print_general(output_handle.get(), 
fastq_get_sequence(fp_input), static_cast(length), fasta_get_header(fp_input), static_cast(fasta_get_header_length(fp_input)), quality.data(), static_cast(fastq_get_abundance(fp_input)), counter, -1.0); progress.update(fasta_get_position(fp_input)); } fasta_close(fp_input); } vsearch-2.30.1/src/fasta2fastq.h000066400000000000000000000047551506776541700164440ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fasta2fastq(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fastq.cc000066400000000000000000000571721506776541700155020ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "attributes.h" #include "utils/fatal.hpp" #include #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::snprintf, std::size_t #include // std::memcmp, std::memchr, std::strlen #include // anonymous namespace: limit visibility and usage to this translation unit namespace { // refactoring: eliminate and replace with an overload of buffer_filter_extend()? 
const std::vector chrmap_identity = { /* identity map: does nothing */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; const std::vector char_fq_action_seq = { /* How to handle input characters for FASTQ: All IUPAC characters are valid. CR (^M) silently stripped. LF is newline. 
Rest is fatal 0=stripped, 1=legal, 2=fatal, 3=silently stripped, 4=newline @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, }; const std::vector char_fq_action_qual = { /* Quality characters, any from 33 to 126 is valid (legal). CR (^M) silently stripped. LF is newline. 
Rest is fatal @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; } // end of anonymous namespace auto fastq_fatal(uint64_t const lineno, const char * msg) -> void { char * string = nullptr; if (xsprintf(&string, "Invalid line %lu in FASTQ file: %s", lineno, msg) == -1) { fatal("Out of memory"); } if (string != nullptr) { fatal(string); xfree(string); } else { fatal("Out of memory"); } } auto buffer_filter_extend(fastx_handle input_handle, struct fastx_buffer_s * dest_buffer, char const * source_buf, uint64_t const len, unsigned int const * char_action, unsigned char const * char_mapping, bool * ok, char * illegal_char) -> void { buffer_makespace(dest_buffer, len + 1); /* Strip unwanted characters from the string and raise warnings or errors on certain characters. 
*/ auto const * p = source_buf; auto * d = dest_buffer->data + dest_buffer->length; auto * q = d; *ok = true; for (auto i = 0ULL; i < len; i++) { auto const c = *p++; char const m = char_action[(unsigned char) (c)]; switch (m) { case 0: /* stripped */ input_handle->stripped_all++; input_handle->stripped[(unsigned char) (c)]++; break; case 1: /* legal character */ *q++ = char_mapping[(unsigned char) (c)]; break; case 2: /* fatal character */ if (*ok) { *illegal_char = c; } *ok = false; break; case 3: /* silently stripped chars (whitespace) */ break; case 4: /* newline (silently stripped) */ break; } } /* add zero after sequence */ *q = 0; dest_buffer->length += q - d; } auto fastq_open(const char * filename) -> fastx_handle { auto * input_handle = fastx_open(filename); if (not fastx_is_fastq(input_handle)) { fatal("FASTQ file expected, FASTA file found (%s)", filename); } return input_handle; } auto fastq_close(fastx_handle input_handle) -> void { fastx_close(input_handle); } auto fastq_next(fastx_handle input_handle, bool const truncateatspace, const unsigned char * char_mapping) -> bool { input_handle->header_buffer.length = 0; input_handle->header_buffer.data[0] = 0; input_handle->sequence_buffer.length = 0; input_handle->sequence_buffer.data[0] = 0; input_handle->plusline_buffer.length = 0; input_handle->plusline_buffer.data[0] = 0; input_handle->quality_buffer.length = 0; input_handle->quality_buffer.data[0] = 0; input_handle->lineno_start = input_handle->lineno; static constexpr auto max_message_length = std::size_t{200}; std::array message {{}}; auto ok = true; char illegal_char = '\0'; auto rest = fastx_file_fill_buffer(input_handle); /* check end of file */ if (rest == 0) { return false; } /* read header */ /* check initial @ character */ if (input_handle->file_buffer.data[input_handle->file_buffer.position] != '@') { fastq_fatal(input_handle->lineno, "Header line must start with '@' character"); } input_handle->file_buffer.position++; --rest; char const * 
line_end = nullptr; while (line_end == nullptr) { /* get more data if buffer empty */ rest = fastx_file_fill_buffer(input_handle); if (rest == 0) { fastq_fatal(input_handle->lineno, "Unexpected end of file"); } /* find LF */ line_end = (char *) std::memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to header buffer */ auto len = rest; if (line_end != nullptr) { /* LF found, copy up to and including LF */ len = line_end - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_extend(&input_handle->header_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len); input_handle->file_buffer.position += len; rest -= len; } /* read sequence line(s) */ line_end = nullptr; while (true) { /* get more data, if necessary */ rest = fastx_file_fill_buffer(input_handle); /* cannot end here */ if (rest == 0) { fastq_fatal(input_handle->lineno, "Unexpected end of file"); } /* end when new line starting with + is seen */ if ((line_end != nullptr) && (input_handle->file_buffer.data[input_handle->file_buffer.position] == '+')) { break; } /* find LF */ line_end = (char *) std::memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to sequence buffer */ auto len = rest; if (line_end != nullptr) { /* LF found, copy up to and including LF */ len = line_end - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_filter_extend(input_handle, &input_handle->sequence_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len, char_fq_action_seq.data(), char_mapping, &ok, &illegal_char); input_handle->file_buffer.position += len; rest -= len; if (! 
ok) { if ((illegal_char >= 32) && (illegal_char < 127)) { snprintf(message.data(), max_message_length, "Illegal sequence character '%c'", illegal_char); } else { snprintf(message.data(), max_message_length, "Illegal sequence character (unprintable, no %d)", (unsigned char) illegal_char); } fastq_fatal(input_handle->lineno - ((line_end != nullptr) ? 1 : 0), message.data()); } } /* read + line */ /* skip + character */ input_handle->file_buffer.position++; --rest; line_end = nullptr; while (line_end == nullptr) { /* get more data if buffer empty */ rest = fastx_file_fill_buffer(input_handle); /* cannot end here */ if (rest == 0) { fastq_fatal(input_handle->lineno, "Unexpected end of file"); } /* find LF */ line_end = (char *) std::memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to plusline buffer */ auto len = rest; if (line_end != nullptr) { /* LF found, copy up to and including LF */ len = line_end - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_extend(&input_handle->plusline_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len); input_handle->file_buffer.position += len; rest -= len; } /* check that the plus line is empty or identical to @ line */ auto plusline_invalid = false; if (input_handle->header_buffer.length == input_handle->plusline_buffer.length) { if ((memcmp(input_handle->header_buffer.data, input_handle->plusline_buffer.data, input_handle->header_buffer.length) != 0)) { plusline_invalid = true; } } else { if ((input_handle->plusline_buffer.length > 2) || ((input_handle->plusline_buffer.length == 2) && (input_handle->plusline_buffer.data[0] != '\r'))) { plusline_invalid = true; } } if (plusline_invalid) { fastq_fatal(input_handle->lineno - ((line_end != nullptr) ? 
1 : 0), "'+' line must be empty or identical to header"); } /* read quality line(s) */ line_end = nullptr; while (true) { /* get more data, if necessary */ rest = fastx_file_fill_buffer(input_handle); /* end if no more data */ if (rest == 0) { break; } /* end if next entry starts : LF + '@' + correct length */ if ((line_end != nullptr) && (input_handle->file_buffer.data[input_handle->file_buffer.position] == '@') && (input_handle->quality_buffer.length == input_handle->sequence_buffer.length)) { break; } /* find LF */ line_end = (char *) std::memchr(input_handle->file_buffer.data + input_handle->file_buffer.position, '\n', rest); /* copy to quality buffer */ auto len = rest; if (line_end != nullptr) { /* LF found, copy up to and including LF */ len = line_end - (input_handle->file_buffer.data + input_handle->file_buffer.position) + 1; input_handle->lineno++; } buffer_filter_extend(input_handle, &input_handle->quality_buffer, input_handle->file_buffer.data + input_handle->file_buffer.position, len, char_fq_action_qual.data(), chrmap_identity.data(), &ok, &illegal_char); input_handle->file_buffer.position += len; rest -= len; /* break if quality line already too long */ if (input_handle->quality_buffer.length > input_handle->sequence_buffer.length) { break; } if (! ok) { if ((illegal_char >= 32) && (illegal_char < 127)) { snprintf(message.data(), max_message_length, "Illegal quality character '%c'", illegal_char); } else { snprintf(message.data(), max_message_length, "Illegal quality character (unprintable, no %d)", (unsigned char) illegal_char); } fastq_fatal(input_handle->lineno - ((line_end != nullptr) ? 1 : 0), message.data()); } } if (input_handle->sequence_buffer.length != input_handle->quality_buffer.length) { fastq_fatal(input_handle->lineno - ((line_end != nullptr) ? 
1 : 0), "Sequence and quality lines must be equally long"); } fastx_filter_header(input_handle, truncateatspace); input_handle->seqno++; return true; } auto fastq_get_quality(fastx_handle input_handle) -> char const * { return input_handle->quality_buffer.data; } auto fastq_get_quality_length(fastx_handle input_handle) -> uint64_t { return input_handle->quality_buffer.length; } auto fastq_get_position(fastx_handle input_handle) -> uint64_t { return input_handle->file_position; } auto fastq_get_size(fastx_handle input_handle) -> uint64_t { return input_handle->file_size; } auto fastq_get_lineno(fastx_handle input_handle) -> uint64_t { return input_handle->lineno_start; } auto fastq_get_seqno(fastx_handle input_handle) -> uint64_t { return input_handle->seqno; } auto fastq_get_header_length(fastx_handle input_handle) -> uint64_t { return input_handle->header_buffer.length; } auto fastq_get_sequence_length(fastx_handle input_handle) -> uint64_t { return input_handle->sequence_buffer.length; } auto fastq_get_header(fastx_handle input_handle) -> char const * { return input_handle->header_buffer.data; } auto fastq_get_sequence(fastx_handle input_handle) -> char const * { return input_handle->sequence_buffer.data; } auto fastq_get_abundance(fastx_handle input_handle) -> int64_t { // return 1 if not present auto const size = header_get_size(input_handle->header_buffer.data, input_handle->header_buffer.length); if (size > 0) { return size; } return 1; } auto fastq_get_abundance_and_presence(fastx_handle input_handle) -> int64_t { // return 0 if not present return header_get_size(input_handle->header_buffer.data, input_handle->header_buffer.length); } inline auto fprint_seq_label(std::FILE * output_handle, char const * seq, int const len) -> void { /* normalize first? 
*/ std::fprintf(output_handle, "%.*s", len, seq); } auto fastq_print_general(FILE * output_handle, char const * seq, int const len, char const * header, int const header_len, char const * quality, int const abundance, int const ordinal, double const expected_error) -> void { std::fprintf(output_handle, "@"); if (opt_relabel_self) { fprint_seq_label(output_handle, seq, len); } else if (opt_relabel_sha1) { fprint_seq_digest_sha1(output_handle, seq, len); } else if (opt_relabel_md5) { fprint_seq_digest_md5(output_handle, seq, len); } else if ((opt_relabel != nullptr) && (ordinal > 0)) { std::fprintf(output_handle, "%s%d", opt_relabel, ordinal); } else { auto const xsize = opt_xsize || (opt_sizeout && (abundance > 0)); auto const xee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (expected_error >= 0.0)); auto const xlength = opt_xlength || opt_lengthout; header_fprint_strip(output_handle, header, header_len, xsize, xee, xlength); } if (opt_label_suffix != nullptr) { std::fprintf(output_handle, "%s", opt_label_suffix); } if (opt_sample != nullptr) { std::fprintf(output_handle, ";sample=%s", opt_sample); } if (opt_sizeout && (abundance > 0)) { std::fprintf(output_handle, ";size=%u", abundance); } if ((opt_eeout || opt_fastq_eeout) && (expected_error >= 0.0)) { if (expected_error < 0.000000001) { std::fprintf(output_handle, ";ee=%.13lf", expected_error); } else if (expected_error < 0.00000001) { std::fprintf(output_handle, ";ee=%.12lf", expected_error); } else if (expected_error < 0.0000001) { std::fprintf(output_handle, ";ee=%.11lf", expected_error); } else if (expected_error < 0.000001) { std::fprintf(output_handle, ";ee=%.10lf", expected_error); } else if (expected_error < 0.00001) { std::fprintf(output_handle, ";ee=%.9lf", expected_error); } else if (expected_error < 0.0001) { std::fprintf(output_handle, ";ee=%.8lf", expected_error); } else if (expected_error < 0.001) { std::fprintf(output_handle, ";ee=%.7lf", expected_error); } else if (expected_error < 0.01) { 
std::fprintf(output_handle, ";ee=%.6lf", expected_error); } else if (expected_error < 0.1) { std::fprintf(output_handle, ";ee=%.5lf", expected_error); } else { std::fprintf(output_handle, ";ee=%.4lf", expected_error); } } if (opt_lengthout) { std::fprintf(output_handle, ";length=%d", len); } if (opt_relabel_keep && (((opt_relabel != nullptr) && (ordinal > 0)) || opt_relabel_sha1 || opt_relabel_md5 || opt_relabel_self)) { std::fprintf(output_handle, " %.*s", header_len, header); } std::fprintf(output_handle, "\n%.*s\n+\n%.*s\n", len, seq, len, quality); } auto fastq_print(std::FILE * output_handle, char const * header, char const * sequence, char const * quality) -> void { auto const slen = static_cast(std::strlen(sequence)); auto const hlen = static_cast(std::strlen(header)); fastq_print_general(output_handle, sequence, slen, header, hlen, quality, 0, 0, -1.0); } vsearch-2.30.1/src/fastq.h000066400000000000000000000101111506776541700153220ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include // std::FILE #include // uint64_t auto fastq_open_rest(fastx_handle input_handle) -> void; auto fastq_open(const char * filename) -> fastx_handle; auto fastq_close(fastx_handle input_handle) -> void; auto fastq_next(fastx_handle input_handle, bool truncateatspace, const unsigned char * char_mapping) -> bool; auto fastq_get_position(fastx_handle input_handle) -> uint64_t; auto fastq_get_size(fastx_handle input_handle) -> uint64_t; auto fastq_get_lineno(fastx_handle input_handle) -> uint64_t; auto fastq_get_seqno(fastx_handle input_handle) -> uint64_t; auto fastq_get_header(fastx_handle input_handle) -> char const *; auto fastq_get_sequence(fastx_handle input_handle) -> char const *; auto fastq_get_quality(fastx_handle input_handle) -> char const *; auto fastq_get_abundance(fastx_handle input_handle) -> int64_t; auto fastq_get_abundance_and_presence(fastx_handle input_handle) -> int64_t; auto fastq_get_header_length(fastx_handle input_handle) -> uint64_t; auto fastq_get_sequence_length(fastx_handle input_handle) -> uint64_t; auto fastq_get_quality_length(fastx_handle input_handle) -> uint64_t; auto fastq_print(std::FILE * output_handle, char const * header, char const * sequence, char const * quality) -> void; auto fastq_print_general(std::FILE * output_handle, char const * seq, int len, char const * header, int header_len, char const * quality, int abundance, int ordinal, double expected_error) -> void; vsearch-2.30.1/src/fastq_chars.cc000066400000000000000000000302601506776541700166470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/maps.hpp" #include "utils/span.hpp" #include // std::find_if #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::distance #include #ifndef NDEBUG #include constexpr long int char_max = std::numeric_limits::max(); #endif constexpr unsigned int n_characters = 256; // anonymous namespace: limit visibility and usage to this translation unit namespace { struct statistics { std::vector sequence_chars; std::vector quality_chars; std::vector tail_chars; std::vector maxrun; uint64_t total_chars = 0; uint64_t seq_count = 0; unsigned char qmin_n = 255; unsigned char qmax_n = 0; char qmin = '\0'; char qmax = '\0'; char fastq_ascii = '\0'; char fastq_qmin = '\0'; char fastq_qmax = '\0'; }; auto guess_quality_offset(struct statistics & stats) -> void { static constexpr auto lowerbound = ';'; // char 59 (-5 to offset +64) static constexpr auto upperbound = 'K'; // char 75 (+1 after offset +33 normal range) if ((stats.qmin < lowerbound) or (stats.qmax < upperbound)) { stats.fastq_ascii = static_cast(default_ascii_offset); // +33, from vsearch.h } else { stats.fastq_ascii = alternative_ascii_offset; // +64, from vsearch.h } stats.fastq_qmax = static_cast(stats.qmax - stats.fastq_ascii); stats.fastq_qmin = static_cast(stats.qmin - stats.fastq_ascii); } auto find_lowest_quality_symbol(struct statistics & stats) -> void { auto lowest = std::find_if(stats.quality_chars.cbegin(), stats.quality_chars.cend(), [](uint64_t const counter) -> bool { return counter != 0; }); if (lowest == stats.quality_chars.cend()) { return; } auto const index = std::distance(stats.quality_chars.cbegin(), lowest); assert(index >= 0); assert(index <= char_max); stats.qmin = static_cast(index); } auto find_highest_quality_symbol(struct statistics & stats) -> void { // note: searching using reverse iterators auto highest = std::find_if(stats.quality_chars.rbegin(), 
stats.quality_chars.rend(), [](uint64_t const counter) -> bool { return counter != 0; } ); if (highest == stats.quality_chars.rend()) { return; } auto const index = std::distance(highest, stats.quality_chars.rend()) - 1; assert(index >= 0); assert(index <= char_max); stats.qmax = static_cast(index); } auto search_trailing_homopolymers(Span const symbols, int64_t const tail_length_signed) -> char { // search for trailing homopolymers of length >= 'tail_length' assert(tail_length_signed >= 0); auto const tail_length = static_cast(tail_length_signed); if (symbols.size() < tail_length) { return '\0'; } auto const last_symbol = symbols.back(); auto const tail = symbols.last(tail_length); if (std::all_of( tail.begin(), tail.end(), [last_symbol](char const symbol) -> bool { return symbol == last_symbol; }) ) { return last_symbol; } return '\0'; } auto stats_message(std::FILE * output_stream, struct statistics const & stats) -> void { static constexpr char first_char_in_Illumina_1_5 = 'B'; // 66th char static constexpr char last_char_in_original_Sanger = 'I'; // 73th char assert(stats.sequence_chars['n'] == 0); // sequences are uppercased, no results for lowercase symbols std::fprintf(output_stream, "Read %" PRIu64 " sequences.\n", stats.seq_count); if (stats.seq_count == 0) { return; } std::fprintf(output_stream, "Qmin %d, Qmax %d, Range %d\n", stats.qmin, stats.qmax, stats.qmax - stats.qmin + 1); std::fprintf(output_stream, "Guess: -fastq_qmin %d -fastq_qmax %d -fastq_ascii %d\n", stats.fastq_qmin, stats.fastq_qmax, stats.fastq_ascii); if (stats.fastq_ascii == alternative_ascii_offset) { if (stats.qmin < alternative_ascii_offset) { std::fprintf(output_stream, "Guess: Solexa format (phred+64)\n"); } else if (stats.qmin < first_char_in_Illumina_1_5) { std::fprintf(output_stream, "Guess: Illumina 1.3+ format (phred+64)\n"); } else { // Illumina 1.5+ Phred+64, quality values ranging from 3 to 41 (ascii: 67 to 105) // Q2 (ascii 66, 'B') is the Read Segment Quality Control 
// Entry point of the fastq_chars command.
//
// Reads every record of the FASTQ file named by
// parameters.opt_fastq_chars, counts sequence symbols, quality symbols,
// runs of identical sequence symbols and trailing quality homopolymers,
// then derives the quality range/offset and reports the statistics.
//
// parameters: run-time options; this function reads opt_fastq_chars,
//             opt_fastq_tail and opt_log, and forwards the struct to
//             output_stats_message()
auto fastq_chars(struct Parameters const & parameters) -> void {
  struct statistics stats;
  // all per-symbol tables are sized to n_characters (defined outside
  // this function) so that they can be indexed directly by symbol
  stats.sequence_chars.resize(n_characters);
  stats.quality_chars.resize(n_characters);
  stats.tail_chars.resize(n_characters);
  stats.maxrun.resize(n_characters);

  auto * fastq_handle = fastq_open(parameters.opt_fastq_chars);

  auto const filesize = fastq_get_size(fastq_handle);

  progress_init("Reading FASTQ file", filesize);

  // presumably chrmap_upcase_vector uppercases sequences while reading
  // (stats_message() asserts that no lowercase 'n' was counted)
  while (fastq_next(fastq_handle, false, chrmap_upcase_vector.data())) {
    auto const seq_length = fastq_get_sequence_length(fastq_handle);
    auto const * seq_ptr = fastq_get_sequence(fastq_handle);
    auto const * qual_ptr = fastq_get_quality(fastq_handle);

    ++stats.seq_count;
    stats.total_chars += seq_length;

    // run tracking restarts for each record; -1 matches no symbol
    auto run_char = -1;
    auto run = 0;

    // walk the sequence and the quality string in lockstep
    for (auto i = 0ULL ; i < seq_length ; ++i) {
      auto const seq_symbol = static_cast(*seq_ptr);
      std::advance(seq_ptr, 1);
      auto const qual_symbol = static_cast(*qual_ptr);
      std::advance(qual_ptr, 1);

      ++stats.sequence_chars[seq_symbol];
      ++stats.quality_chars[qual_symbol];

      // track the range of quality symbols assigned to 'N' bases
      if (seq_symbol == 'N') {
        stats.qmin_n = std::min(qual_symbol, stats.qmin_n);
        stats.qmax_n = std::max(qual_symbol, stats.qmax_n);
      }

      // NOTE(review): 'run' restarts at 0 on a new symbol and is only
      // incremented on repeats, so maxrun stores (run length - 1) and
      // symbols that never repeat keep a maxrun of 0 -- confirm that
      // this is the intended definition of MaxRun
      if (seq_symbol == run_char) {
        ++run;
        stats.maxrun[run_char] = std::max(run, stats.maxrun[run_char]);
      }
      else {
        run_char = seq_symbol;
        run = 0;
      }
    }

    // search for trailing homopolymers in quality strings
    auto const tail_char = search_trailing_homopolymers(Span{fastq_get_quality(fastq_handle), seq_length}, parameters.opt_fastq_tail);
    // '\0' means "no trailing homopolymer found"
    if (tail_char != '\0') {
      ++stats.tail_chars[tail_char];
    }

    progress_update(fastq_get_position(fastq_handle));
  }
  progress_done();

  fastq_close(fastq_handle);

  // derive summary values from the counters before reporting
  find_lowest_quality_symbol(stats);
  find_highest_quality_symbol(stats);
  guess_quality_offset(stats);

  // report to stderr (skipped when opt_quiet is set), then to the log
  // file (skipped when opt_log is null)
  output_stats_message(parameters, stats);
  output_stats_message(parameters, stats, parameters.opt_log);
}
Tomas Flouri All rights reserved. Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_chars(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fastq_join.cc000066400000000000000000000257331506776541700165170ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::transform #include // macros PRIu64 and PRId64 #include // uint64_t #include // std::FILE, std::fprintf, std::fclose #include // anonymous namespace: limit visibility and usage to this translation unit namespace { struct input_file { char * name = nullptr; fastx_handle handle = nullptr; }; struct input_files { input_file forward; input_file reverse; }; struct output_file { char * name = nullptr; std::FILE * handle = nullptr; }; struct output_files { output_file fasta; output_file fastq; }; auto check_parameters(struct Parameters const & parameters) -> void { if (parameters.opt_reverse == nullptr) { fatal("No reverse reads file specified with --reverse"); } if ((parameters.opt_fastqout == nullptr) and (parameters.opt_fastaout == nullptr)) { fatal("No output files specified"); } if (parameters.opt_join_padgap.length() != parameters.opt_join_padgapq.length()) { fatal("Strings given by --join_padgap and --join_padgapq differ in length"); } } auto open_input_files(struct Parameters const & parameters) -> struct input_files { struct input_files infiles; infiles.forward.name = parameters.opt_fastq_join; infiles.reverse.name = parameters.opt_reverse; if (infiles.forward.name != nullptr) { infiles.forward.handle = fastq_open(infiles.forward.name); } if (infiles.reverse.name != nullptr) { infiles.reverse.handle = fastq_open(infiles.reverse.name); } return infiles; } auto open_output_files(struct Parameters const & parameters) -> struct output_files { struct output_files outfiles; outfiles.fasta.name = parameters.opt_fastaout; outfiles.fastq.name = parameters.opt_fastqout; if (outfiles.fasta.name != nullptr) { outfiles.fasta.handle = fopen_output(outfiles.fasta.name); } if (outfiles.fastq.name != nullptr) { outfiles.fastq.handle = fopen_output(outfiles.fastq.name); } return outfiles; } auto check_output_files(struct output_files const & outfiles) -> void { if (outfiles.fasta.name != 
// Entry point of the fastq_join command.
//
// Reads forward and reverse FASTQ records in lockstep and, for each
// pair, writes one joined record made of: the forward read, the padding
// string (opt_join_padgap / opt_join_padgapq), and the reverse read
// (sequence reverse-complemented, quality string reversed). Output goes
// to the fastq and/or fasta files requested in 'parameters'.
//
// Calls fatal() when the two input files do not contain the same number
// of reads, or (through the helpers) when options or output files are
// invalid.
auto fastq_join(struct Parameters const & parameters) -> void {
  /* check parameters */

  check_parameters(parameters);

  /* open and check input and output files */

  auto const infiles = open_input_files(parameters);
  // check_input_files(infiles)? already done by the function fastq_open()
  auto const outfiles = open_output_files(parameters);
  check_output_files(outfiles);

  /* main */

  auto const filesize = fastq_get_size(infiles.forward.handle);
  progress_init("Joining reads", filesize);

  /* do it */

  // initial capacity of the working buffers; they grow when needed
  constexpr auto bufferlength = 1024U;
  auto const padlen = parameters.opt_join_padgap.length();
  // number of pairs joined so far
  uint64_t total = 0;
  std::string final_sequence;
  final_sequence.reserve(bufferlength + padlen + bufferlength);
  std::string final_quality;
  final_quality.reserve(final_sequence.capacity());
  std::string reverse_sequence;
  reverse_sequence.reserve(bufferlength);
  std::string reverse_quality;
  reverse_quality.reserve(bufferlength);

  // the forward file drives the loop; each forward read must have a
  // matching reverse read
  while (fastq_next(infiles.forward.handle, false, chrmap_no_change_vector.data())) {
    if (not fastq_next(infiles.reverse.handle, false, chrmap_no_change_vector.data())) {
      fatal("More forward reads than reverse reads");
    }

    final_sequence.clear();
    final_quality.clear();
    reverse_sequence.clear();
    reverse_quality.clear();

    auto const fwd_seq_length = fastq_get_sequence_length(infiles.forward.handle);
    auto const rev_seq_length = fastq_get_sequence_length(infiles.reverse.handle);
    // length of the joined record: forward + padding + reverse
    auto const needed = fwd_seq_length + padlen + rev_seq_length;

    /* allocate enough memory */

    if (rev_seq_length > reverse_sequence.capacity()) {
      reverse_sequence.reserve(rev_seq_length);
    }
    if (rev_seq_length > reverse_quality.capacity()) {
      reverse_quality.reserve(rev_seq_length);
    }
    if (needed > final_sequence.capacity()) {
      final_sequence.reserve(needed);
    }

    /* reverse read: reverse-complement sequence */

    reverse_sequence.assign(fastq_get_sequence(infiles.reverse.handle), rev_seq_length);
    std::reverse(reverse_sequence.begin(), reverse_sequence.end());
    std::transform(reverse_sequence.begin(),
                   reverse_sequence.end(),
                   reverse_sequence.begin(),
                   [](char const nucleotide) -> char { return map_complement(nucleotide); });

    /* reverse read: reverse quality */

    reverse_quality.assign(fastq_get_quality(infiles.reverse.handle), rev_seq_length);
    std::reverse(reverse_quality.begin(), reverse_quality.end());

    /* join them */

    // NOTE(review): both assignments below build the result from
    // temporaries and assign it, so the capacity reserved above for
    // final_sequence/final_quality is probably not what ends up being
    // used -- assign()/append() would reuse the buffers; confirm
    final_sequence = std::string{fastq_get_sequence(infiles.forward.handle), fwd_seq_length} + parameters.opt_join_padgap + reverse_sequence;
    final_quality = std::string{fastq_get_quality(infiles.forward.handle), fwd_seq_length} + parameters.opt_join_padgapq + reverse_quality;

    /* write output */

    // the joined record keeps the header and abundance of the forward
    // read; 'total + 1' is passed as the 1-based ordinal of the pair
    if (parameters.opt_fastqout != nullptr) {
      fastq_print_general(outfiles.fastq.handle,
                          final_sequence.c_str(),
                          static_cast(needed),
                          fastq_get_header(infiles.forward.handle),
                          static_cast(fastq_get_header_length(infiles.forward.handle)),
                          final_quality.c_str(),
                          static_cast(fastq_get_abundance(infiles.forward.handle)),
                          static_cast(total + 1),
                          -1.0);
    }

    // NOTE(review): the fasta branch calls fasta_get_abundance() on a
    // handle opened with fastq_open(), while the fastq branch calls
    // fastq_get_abundance() -- presumably equivalent; confirm
    if (parameters.opt_fastaout != nullptr) {
      fasta_print_general(outfiles.fasta.handle,
                          nullptr,
                          final_sequence.c_str(),
                          static_cast(needed),
                          fastq_get_header(infiles.forward.handle),
                          static_cast(fastq_get_header_length(infiles.forward.handle)),
                          static_cast(fasta_get_abundance(infiles.forward.handle)),
                          static_cast(total + 1),
                          -1.0,
                          -1,
                          -1,
                          nullptr,
                          0);
    }

    ++total;
    progress_update(fastq_get_position(infiles.forward.handle));
  }

  progress_done();

  // the forward file is exhausted: any remaining reverse read is an error
  if (fastq_next(infiles.reverse.handle, false, chrmap_no_change_vector.data())) {
    fatal("More reverse reads than forward reads");
  }

  // report to stderr (skipped when opt_quiet is set), then to the log
  // file (skipped when opt_log is null)
  output_stats_message(parameters, total);
  output_stats_message(parameters, total, parameters.opt_log);

  /* clean up */

  close_output_files(outfiles);
  close_input_files(infiles);
}
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_join(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fastq_mergepairs.cc000066400000000000000000001313531506776541700177120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "kmerhash.h" #include "utils/fatal.hpp" #include "utils/kmer_hash_struct.hpp" #include "utils/maps.hpp" #include "utils/span.hpp" #include "utils/xpthread.hpp" #include // std::copy, std::min, std::max #include #include #include // macros PRIu64 and PRId64 #include // std::pow, std::sqrt, std::round, std::log10, std::log2 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::exit, EXIT_FAILURE #include // std::strlen #include #include /* chunk constants */ constexpr auto chunk_size = 500; /* read pairs per chunk */ constexpr auto chunk_factor = 2; /* chunks per thread */ /* scores in bits */ constexpr auto k = 5; static int merge_mindiagcount = 4; static double merge_minscore = 16.0; constexpr auto merge_dropmax = 16.0; constexpr auto merge_mismatchmax = -4.0; /* static variables */ static std::FILE * fp_fastqout = nullptr; static std::FILE * fp_fastaout = nullptr; static std::FILE * fp_fastqout_notmerged_fwd = nullptr; static std::FILE * fp_fastqout_notmerged_rev = nullptr; static std::FILE * fp_fastaout_notmerged_fwd = nullptr; static std::FILE * fp_fastaout_notmerged_rev = nullptr; static std::FILE * fp_eetabbedout = nullptr; static fastx_handle fastq_fwd; static fastx_handle fastq_rev; static int64_t merged = 0; static int64_t notmerged = 0; static int64_t total = 0; static double sum_read_length = 0.0; static double sum_squared_fragment_length = 0.0; static double sum_fragment_length = 0.0; static pthread_t * pthread; static pthread_attr_t attr; constexpr auto n_quality_symbols = 128U; std::array, n_quality_symbols> merge_qual_same {{}}; std::array, n_quality_symbols> merge_qual_diff {{}}; std::array, n_quality_symbols> match_score {{}}; std::array, n_quality_symbols> mism_score {{}}; std::array q2p {{}}; static double sum_ee_fwd = 0.0; static double sum_ee_rev = 0.0; static double sum_ee_merged = 0.0; static uint64_t sum_errors_fwd = 0.0; static uint64_t sum_errors_rev = 0.0; static 
uint64_t failed_undefined = 0; static uint64_t failed_minlen = 0; static uint64_t failed_maxlen = 0; static uint64_t failed_maxns = 0; static uint64_t failed_minovlen = 0; static uint64_t failed_maxdiffs = 0; static uint64_t failed_maxdiffpct = 0; static uint64_t failed_staggered = 0; static uint64_t failed_indel = 0; static uint64_t failed_repeat = 0; static uint64_t failed_minmergelen = 0; static uint64_t failed_maxmergelen = 0; static uint64_t failed_maxee = 0; static uint64_t failed_minscore = 0; static uint64_t failed_nokmers = 0; /* reasons for not merging: - undefined - ok - input seq too short (after truncation) - input seq too long - too many Ns in input - overlap too short - too many differences (maxdiffs) - too high percentage of differences (maxdiffpct) - staggered - indels in overlap region - potential repeats in overlap region / multiple overlaps - merged sequence too short - merged sequence too long - expected error too high - alignment score too low, insignificant, potential indel - too few kmers on same diag found */ enum struct Reason : char { undefined, ok, minlen, maxlen, maxns, minovlen, maxdiffs, maxdiffpct, staggered, indel, repeat, minmergelen, maxmergelen, maxee, minscore, nokmers }; enum struct State: char { empty, filled, inprogress, processed }; struct merge_data_s { std::vector fwd_header; std::vector rev_header; std::vector fwd_sequence; std::vector rev_sequence; std::vector fwd_quality; std::vector rev_quality; int64_t header_alloc = 0; int64_t seq_alloc = 0; int64_t fwd_length = 0; int64_t rev_length = 0; int64_t fwd_trunc = 0; int64_t rev_trunc = 0; int64_t pair_no = 0; std::vector merged_sequence; std::vector merged_quality_v; int64_t merged_length = 0; int64_t merged_seq_alloc = 0; double ee_merged = 0; double ee_fwd = 0; double ee_rev = 0; int64_t fwd_errors = 0; int64_t rev_errors = 0; int64_t offset = 0; bool merged = false; Reason reason = Reason::undefined; State state = State::empty; }; using merge_data_t = struct 
merge_data_s; struct chunk_s { int size = 0; /* size of merge_data = number of pairs of reads */ State state = State::empty; /* state of chunk: empty, read, processed */ std::vector merge_data = std::vector(chunk_size); }; std::vector chunks; static int chunk_count; static int chunk_read_next; static int chunk_process_next; static int chunk_write_next; static bool finished_reading = false; static bool finished_all = false; static int pairs_read = 0; static int pairs_written = 0; static pthread_mutex_t mutex_chunks; static pthread_cond_t cond_chunks; // refactoring: replace with check_optional_output_handle() auto fileopenw(char const * filename) -> std::FILE * { assert(filename != nullptr); auto * output_handle = fopen_output(filename); if (output_handle == nullptr) { fatal("Unable to open file for writing (%s)", filename); } return output_handle; } inline auto get_qual(char const quality_symbol) -> int { assert(quality_symbol >= 33); assert(quality_symbol <= 126); auto const quality_value = static_cast(quality_symbol - opt_fastq_ascii); if (quality_value < opt_fastq_qmin) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", quality_value, opt_fastq_qmin); if (fp_log != nullptr) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", quality_value, opt_fastq_qmin); } exit(EXIT_FAILURE); } else if (quality_value > opt_fastq_qmax) { fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", quality_value, opt_fastq_qmax); fprintf(stderr, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", quality_value); if (fp_log != nullptr) { fprintf(fp_log, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", quality_value, opt_fastq_qmax); fprintf(fp_log, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", 
quality_value); } exit(EXIT_FAILURE); } return quality_value; } inline auto q_to_p(int const quality_symbol) -> double { static constexpr auto low_quality_threshold = 2; static constexpr auto max_probability = 0.75; static constexpr auto quality_divider = 10.0; static constexpr auto power_base = 10.0; assert(quality_symbol >= 33); assert(quality_symbol <= 126); auto const quality_value = static_cast(quality_symbol - opt_fastq_ascii); // refactor: extract branch to a separate operation if (quality_value < low_quality_threshold) { return max_probability; } // probability = 10^-(quality / 10) return std::pow(power_base, -quality_value / quality_divider); } auto precompute_qual() -> void { /* Precompute tables of scores etc */ auto const qmaxout = static_cast(opt_fastq_qmaxout); auto const qminout = static_cast(opt_fastq_qminout); for (auto x = 33; x <= 126; x++) { auto const px = q_to_p(x); q2p[x] = px; for (auto y = 33; y <= 126; y++) { auto const py = q_to_p(y); auto p = 0.0; auto q = 0.0; /* Quality score equations from Edgar & Flyvbjerg (2015) */ /* Match */ p = px * py / 3.0 / (1.0 - px - py + 4.0 * px * py / 3.0); q = std::round(-10.0 * std::log10(p)); q = std::min(q, qmaxout); q = std::max(q, qminout); merge_qual_same[x][y] = opt_fastq_ascii + q; /* Mismatch, x is highest quality */ p = px * (1.0 - py / 3.0) / (px + py - 4.0 * px * py / 3.0); q = std::round(-10.0 * std::log10(p)); q = std::min(q, qmaxout); q = std::max(q, qminout); merge_qual_diff[x][y] = opt_fastq_ascii + q; /* observed match, p = probability that they truly are identical, given error probabilites of px and py, resp. */ // Given two initially identical aligned bases, and // the error probabilities px and py, // what is the probability of observing a match (or a mismatch)? 
p = 1.0 - px - py + (px * py * 4.0 / 3.0); match_score[x][y] = std::log2(p / 0.25); // Use a minimum mismatch penalty mism_score[x][y] = std::min(std::log2((1.0 - p) / 0.75), merge_mismatchmax); } } } auto merge_sym(char * sym, char * qual, char fwd_sym, char rev_sym, char fwd_qual, char rev_qual) -> void { if (rev_sym == 'N') { * sym = fwd_sym; * qual = fwd_qual; } else if (fwd_sym == 'N') { * sym = rev_sym; * qual = rev_qual; } else if (fwd_sym == rev_sym) { /* agreement */ * sym = fwd_sym; * qual = merge_qual_same[(unsigned) fwd_qual][(unsigned) rev_qual]; } else { /* disagreement */ if (fwd_qual > rev_qual) { * sym = fwd_sym; * qual = merge_qual_diff[(unsigned) fwd_qual][(unsigned) rev_qual]; } else { * sym = rev_sym; * qual = merge_qual_diff[(unsigned) rev_qual][(unsigned) fwd_qual]; } } } auto keep(merge_data_t const & a_read_pair) -> void { ++merged; sum_fragment_length += a_read_pair.merged_length; sum_squared_fragment_length += a_read_pair.merged_length * a_read_pair.merged_length; sum_ee_merged += a_read_pair.ee_merged; sum_ee_fwd += a_read_pair.ee_fwd; sum_ee_rev += a_read_pair.ee_rev; sum_errors_fwd += a_read_pair.fwd_errors; sum_errors_rev += a_read_pair.rev_errors; if (opt_fastqout != nullptr) { fastq_print_general(fp_fastqout, a_read_pair.merged_sequence.data(), a_read_pair.merged_length, a_read_pair.fwd_header.data(), std::strlen(a_read_pair.fwd_header.data()), a_read_pair.merged_quality_v.data(), 0, merged, a_read_pair.ee_merged); } if (opt_fastaout != nullptr) { fasta_print_general(fp_fastaout, nullptr, a_read_pair.merged_sequence.data(), a_read_pair.merged_length, a_read_pair.fwd_header.data(), std::strlen(a_read_pair.fwd_header.data()), 0, merged, a_read_pair.ee_merged, -1, -1, nullptr, 0.0); } if (opt_eetabbedout != nullptr) { fprintf(fp_eetabbedout, "%.2lf\t%.2lf\t%" PRId64 "\t%" PRId64 "\n", a_read_pair.ee_fwd, a_read_pair.ee_rev, a_read_pair.fwd_errors, a_read_pair.rev_errors); } } auto discard(merge_data_t const & a_read_pair) -> void { 
// Build the merged sequence and quality string of a read pair from the
// overlap position stored in a_read_pair.offset, and accumulate the
// expected errors of the forward read, the reverse read and the merged
// read.
//
// The merged read is written in three consecutive parts:
//  1. the part of the forward read that precedes the overlap,
//  2. the overlap, where both strands are combined by merge_sym(),
//  3. the rest of the reverse read, complemented and read backwards.
//
// On return, merged_length is set and a terminating 0 is stored at that
// index in merged_sequence and merged_quality_v, so both vectors must
// already hold at least merged_length + 1 elements (they are not
// resized here). The pair is flagged as merged (Reason::ok) when the
// expected error of the merged read does not exceed opt_fastq_maxee,
// otherwise its reason is set to Reason::maxee.
//
// presumably a_read_pair.offset is the overlap position returned by
// optimize() -- it is set outside this function; confirm against caller
auto merge(merge_data_t & a_read_pair) -> void {
  /* length of 5' overhang of the forward sequence not merged with the
     reverse sequence */

  auto const fwd_5prime_overhang = (a_read_pair.fwd_trunc > a_read_pair.offset) ?
    a_read_pair.fwd_trunc - a_read_pair.offset : 0;

  // reset struct members
  a_read_pair.ee_merged = 0.0;
  a_read_pair.ee_fwd = 0.0;
  a_read_pair.ee_rev = 0.0;
  a_read_pair.fwd_errors = 0;
  a_read_pair.rev_errors = 0;

  auto sym = '\0';
  auto qual = '\0';
  auto fwd_sym = '\0';
  auto fwd_qual = '\0';
  auto rev_sym = '\0';
  auto rev_qual = '\0';
  // cursors: fwd_pos moves up the forward read, rev_pos moves down the
  // reverse read, merged_pos is the next position to write
  int64_t fwd_pos = 0;
  int64_t rev_pos = 0;
  int64_t merged_pos = 0;
  auto ee = 0.0;

  merged_pos = 0;

  // 5' overhang in forward sequence

  fwd_pos = 0;

  while (fwd_pos < fwd_5prime_overhang) {
    sym = a_read_pair.fwd_sequence[fwd_pos];
    qual = a_read_pair.fwd_quality[fwd_pos];

    a_read_pair.merged_sequence[merged_pos] = sym;
    a_read_pair.merged_quality_v[merged_pos] = qual;

    // q2p is indexed by quality symbol and yields an error probability
    ee = q2p[(unsigned) qual];
    a_read_pair.ee_merged += ee;
    a_read_pair.ee_fwd += ee;

    ++fwd_pos;
    ++merged_pos;
  }

  // Merged region

  // when the offset exceeds the forward length, the last bases of the
  // reverse read are skipped: the overlap starts further inside it
  auto const rev_3prime_overhang = a_read_pair.offset > a_read_pair.fwd_trunc ?
    a_read_pair.offset - a_read_pair.fwd_trunc : 0;

  rev_pos = a_read_pair.rev_trunc - 1 - rev_3prime_overhang;

  while ((fwd_pos < a_read_pair.fwd_trunc) and (rev_pos >= 0)) {
    fwd_sym = a_read_pair.fwd_sequence[fwd_pos];
    rev_sym = map_complement(a_read_pair.rev_sequence[rev_pos]);
    fwd_qual = a_read_pair.fwd_quality[fwd_pos];
    rev_qual = a_read_pair.rev_quality[rev_pos];

    // NOTE(review): fwd_qual and rev_qual hold raw quality symbols
    // (they also index q2p below), and get_qual() asserts that symbols
    // are >= 33, so the '< 2' tests look like they can never be true --
    // a comparison on quality values may have been intended; confirm
    merge_sym(& sym,
              & qual,
              fwd_qual < 2 ? 'N' : fwd_sym,
              rev_qual < 2 ? 'N' : rev_sym,
              fwd_qual,
              rev_qual);

    // a strand whose symbol differs from the merged symbol counts as an
    // error on that strand
    if (sym != fwd_sym) {
      ++a_read_pair.fwd_errors;
    }
    if (sym != rev_sym) {
      ++a_read_pair.rev_errors;
    }

    a_read_pair.merged_sequence[merged_pos] = sym;
    a_read_pair.merged_quality_v[merged_pos] = qual;
    a_read_pair.ee_merged += q2p[(unsigned) qual];
    a_read_pair.ee_fwd += q2p[(unsigned) fwd_qual];
    a_read_pair.ee_rev += q2p[(unsigned) rev_qual];

    ++fwd_pos;
    --rev_pos;
    ++merged_pos;
  }

  // 5' overhang in reverse sequence

  while (rev_pos >= 0) {
    sym = map_complement(a_read_pair.rev_sequence[rev_pos]);
    qual = a_read_pair.rev_quality[rev_pos];

    a_read_pair.merged_sequence[merged_pos] = sym;
    a_read_pair.merged_quality_v[merged_pos] = qual;
    ++merged_pos;

    ee = q2p[(unsigned) qual];
    a_read_pair.ee_merged += ee;
    a_read_pair.ee_rev += ee;

    --rev_pos;
  }

  auto const mergelen = merged_pos;
  a_read_pair.merged_length = mergelen;

  // zero-terminate both merged strings
  a_read_pair.merged_sequence[mergelen] = 0;
  a_read_pair.merged_quality_v[mergelen] = 0;

  // final filter: expected error of the merged read
  if (a_read_pair.ee_merged <= opt_fastq_maxee) {
    a_read_pair.reason = Reason::ok;
    a_read_pair.merged = true;
  }
  else {
    a_read_pair.reason = Reason::maxee;
  }
}
i - a_read_pair.rev_trunc : 0; auto const rev_3prime_overhang = i > a_read_pair.fwd_trunc ? i - a_read_pair.fwd_trunc : 0; auto const overlap = i - fwd_3prime_overhang - rev_3prime_overhang; auto const fwd_pos_start = a_read_pair.fwd_trunc - fwd_3prime_overhang - 1; auto const rev_pos_start = a_read_pair.rev_trunc - rev_3prime_overhang - overlap; auto fwd_pos = fwd_pos_start; auto rev_pos = rev_pos_start; auto score = 0.0; int64_t diffs = 0; auto score_high = 0.0; auto dropmax = 0.0; for (int64_t j = 0; j < overlap; j++) { /* for each pair of bases in the overlap */ auto const fwd_sym = a_read_pair.fwd_sequence[fwd_pos]; auto const rev_sym = map_complement(a_read_pair.rev_sequence[rev_pos]); unsigned int const fwd_qual = a_read_pair.fwd_quality[fwd_pos]; unsigned int const rev_qual = a_read_pair.rev_quality[rev_pos]; --fwd_pos; ++rev_pos; if (fwd_sym == rev_sym) { score += match_score[fwd_qual][rev_qual]; score_high = std::max(score, score_high); } else { score += mism_score[fwd_qual][rev_qual]; ++diffs; if (score < score_high - dropmax) { dropmax = score_high - score; } } } if (dropmax >= merge_dropmax) { score = 0.0; } if (score >= merge_minscore) { ++hits; } if (score > best_score) { best_score = score; best_i = i; best_diffs = diffs; } } } if (hits > 1) { a_read_pair.reason = Reason::repeat; return 0; } if ((not opt_fastq_allowmergestagger) and (best_i > a_read_pair.fwd_trunc)) { a_read_pair.reason = Reason::staggered; return 0; } if (best_diffs > opt_fastq_maxdiffs) { a_read_pair.reason = Reason::maxdiffs; return 0; } if ((100.0 * best_diffs / best_i) > opt_fastq_maxdiffpct) { a_read_pair.reason = Reason::maxdiffpct; return 0; } if (kmers == 0) { a_read_pair.reason = Reason::nokmers; return 0; } if (best_score < merge_minscore) { a_read_pair.reason = Reason::minscore; return 0; } if (best_i < opt_fastq_minovlen) { a_read_pair.reason = Reason::minovlen; return 0; } int const mergelen = a_read_pair.fwd_trunc + a_read_pair.rev_trunc - best_i; if (mergelen < 
opt_fastq_minmergelen) { a_read_pair.reason = Reason::minmergelen; return 0; } if (mergelen > opt_fastq_maxmergelen) { a_read_pair.reason = Reason::maxmergelen; return 0; } return best_i; } auto process(merge_data_t & a_read_pair, struct kh_handle_s & kmerhash) -> void { a_read_pair.merged = false; auto skip = false; /* check length */ if ((a_read_pair.fwd_length < opt_fastq_minlen) or (a_read_pair.rev_length < opt_fastq_minlen)) { a_read_pair.reason = Reason::minlen; skip = true; } if ((a_read_pair.fwd_length > opt_fastq_maxlen) or (a_read_pair.rev_length > opt_fastq_maxlen)) { a_read_pair.reason = Reason::maxlen; skip = true; } /* truncate sequences by quality */ int64_t fwd_trunc = a_read_pair.fwd_length; if (not skip) { for (int64_t i = 0; i < a_read_pair.fwd_length; i++) { if (get_qual(a_read_pair.fwd_quality[i]) <= opt_fastq_truncqual) { fwd_trunc = i; break; } } if (fwd_trunc < opt_fastq_minlen) { a_read_pair.reason = Reason::minlen; skip = true; } } a_read_pair.fwd_trunc = fwd_trunc; auto rev_trunc = a_read_pair.rev_length; if (not skip) { for (int64_t i = 0; i < a_read_pair.rev_length; i++) { if (get_qual(a_read_pair.rev_quality[i]) <= opt_fastq_truncqual) { rev_trunc = i; break; } } if (rev_trunc < opt_fastq_minlen) { a_read_pair.reason = Reason::minlen; skip = true; } } a_read_pair.rev_trunc = rev_trunc; /* count n's */ /* replace quality of N's by zero */ if (not skip) { int64_t fwd_ncount = 0; for (int64_t i = 0; i < fwd_trunc; i++) { if (a_read_pair.fwd_sequence[i] == 'N') { a_read_pair.fwd_quality[i] = opt_fastq_ascii; ++fwd_ncount; } } if (fwd_ncount > opt_fastq_maxns) { a_read_pair.reason = Reason::maxns; skip = true; } } if (not skip) { int64_t rev_ncount = 0; for (int64_t i = 0; i < rev_trunc; i++) { if (a_read_pair.rev_sequence[i] == 'N') { a_read_pair.rev_quality[i] = opt_fastq_ascii; ++rev_ncount; } } if (rev_ncount > opt_fastq_maxns) { a_read_pair.reason = Reason::maxns; skip = true; } } a_read_pair.offset = 0; if (not skip) { 
a_read_pair.offset = optimize(a_read_pair, kmerhash); } if (a_read_pair.offset > 0) { merge(a_read_pair); } a_read_pair.state = State::processed; } auto read_pair(merge_data_t & a_read_pair) -> bool { if (fastq_next(fastq_fwd, false, chrmap_upcase_vector.data())) { if (not fastq_next(fastq_rev, false, chrmap_upcase_vector.data())) { fatal("More forward reads than reverse reads"); } /* allocate more memory if necessary */ int64_t const fwd_header_len = fastq_get_header_length(fastq_fwd); int64_t const rev_header_len = fastq_get_header_length(fastq_rev); int64_t const header_needed = std::max(fwd_header_len, rev_header_len) + 1; if (header_needed > a_read_pair.header_alloc) { a_read_pair.header_alloc = header_needed; a_read_pair.fwd_header.resize(header_needed); a_read_pair.rev_header.resize(header_needed); } a_read_pair.fwd_length = fastq_get_sequence_length(fastq_fwd); a_read_pair.rev_length = fastq_get_sequence_length(fastq_rev); int64_t const seq_needed = std::max(a_read_pair.fwd_length, a_read_pair.rev_length) + 1; sum_read_length += a_read_pair.fwd_length + a_read_pair.rev_length; if (seq_needed > a_read_pair.seq_alloc) { a_read_pair.seq_alloc = seq_needed; a_read_pair.fwd_sequence.resize(seq_needed); a_read_pair.rev_sequence.resize(seq_needed); a_read_pair.fwd_quality.resize(seq_needed); a_read_pair.rev_quality.resize(seq_needed); } int64_t const merged_seq_needed = a_read_pair.fwd_length + a_read_pair.rev_length + 1; if (merged_seq_needed > a_read_pair.merged_seq_alloc) { a_read_pair.merged_seq_alloc = merged_seq_needed; a_read_pair.merged_sequence.resize(merged_seq_needed); a_read_pair.merged_quality_v.resize(merged_seq_needed); } /* make local copies of the seq, header and qual */ auto const fwd_header_view = Span { fastq_get_header(fastq_fwd), fastq_get_header_length(fastq_fwd)}; std::copy(fwd_header_view.cbegin(), fwd_header_view.cend(), a_read_pair.fwd_header.begin()); a_read_pair.fwd_header[fwd_header_view.size()] = '\0'; // fix issue when reusing 
/* Route one processed read pair to keep() when it was merged,
   to discard() otherwise. */
auto keep_or_discard(merge_data_t const & a_read_pair) -> void
{
  if (a_read_pair.merged)
    {
      keep(a_read_pair);
    }
  else
    {
      discard(a_read_pair);
    }
}


/*
  Fill empty chunks with read pairs from the input files.

  Entered and left with mutex_chunks locked (pair_worker() takes the
  lock before its loop). The mutex is released while a chunk is being
  read and re-acquired before the shared counters and chunk state are
  updated.
*/
inline auto chunk_perform_read() -> void
{
  while ((not finished_reading) and (chunks[chunk_read_next].state == State::empty))
    {
      xpthread_mutex_unlock(&mutex_chunks);
      progress_update(fastq_get_position(fastq_fwd));
      /* read up to chunk_size pairs; read_pair() returns false when
         no further pair is available */
      auto r = 0;
      while ((r < chunk_size) and read_pair(chunks[chunk_read_next].merge_data[r]))
        {
          ++r;
        }
      chunks[chunk_read_next].size = r;
      xpthread_mutex_lock(&mutex_chunks);
      pairs_read += r;
      /* an empty chunk is not handed over: it stays State::empty */
      if (r > 0)
        {
          chunks[chunk_read_next].state = State::filled;
          chunk_read_next = (chunk_read_next + 1) % chunk_count;
        }
      /* a short chunk means the input is exhausted */
      if (r < chunk_size)
        {
          finished_reading = true;
          if (pairs_written >= pairs_read)
            {
              finished_all = true;
            }
        }
      xpthread_cond_broadcast(&cond_chunks);
    }
}


/*
  Write out every chunk that is ready, in chunk order.

  Entered and left with mutex_chunks locked; the mutex is released
  while the pairs of a chunk are passed to keep_or_discard().
*/
inline auto chunk_perform_write() -> void
{
  while (chunks[chunk_write_next].state == State::processed)
    {
      xpthread_mutex_unlock(&mutex_chunks);
      for (auto i = 0; i < chunks[chunk_write_next].size; i++)
        {
          keep_or_discard(chunks[chunk_write_next].merge_data[i]);
        }
      xpthread_mutex_lock(&mutex_chunks);
      pairs_written += chunks[chunk_write_next].size;
      /* give the chunk back to the reader */
      chunks[chunk_write_next].state = State::empty;
      /* everything read has been written: let all workers terminate */
      if (finished_reading and (pairs_written >= pairs_read))
        {
          finished_all = true;
        }
      chunk_write_next = (chunk_write_next + 1) % chunk_count;
      xpthread_cond_broadcast(&cond_chunks);
    }
}


/*
  Process at most one filled chunk.

  Entered and left with mutex_chunks locked. The chunk is marked
  State::inprogress and chunk_process_next is advanced before the mutex
  is released, then process() runs on every pair of the claimed chunk
  without the lock.
*/
inline auto chunk_perform_process(struct kh_handle_s & kmerhash) -> void
{
  auto const chunk_current = chunk_process_next;
  if (chunks[chunk_current].state == State::filled)
    {
      chunks[chunk_current].state = State::inprogress;
      chunk_process_next = (chunk_current + 1) % chunk_count;
      xpthread_cond_broadcast(&cond_chunks);
      xpthread_mutex_unlock(&mutex_chunks);
      for (auto i = 0; i < chunks[chunk_current].size; i++)
        {
          process(chunks[chunk_current].merge_data[i], kmerhash);
        }
      xpthread_mutex_lock(&mutex_chunks);
      chunks[chunk_current].state = State::processed;
      xpthread_cond_broadcast(&cond_chunks);
    }
}


/*
  Thread entry point. vp carries the thread index t (pair_all() passes
  the index cast to void *).

  Division of labour, as coded below:
    - 1 thread:   reads, processes and writes in turn;
    - 2 threads:  thread 0 reads + processes, thread 1 writes + processes;
    - >2 threads: thread 0 reads + processes, the last thread
                  writes + processes, all others only process.

  Each thread owns its own kmerhash. mutex_chunks is held for the whole
  loop except where the chunk_perform_*() helpers or xpthread_cond_wait()
  release it. Always returns nullptr.
*/
auto pair_worker(void * vp) -> void *
{
  /* new */

  auto t = (int64_t) vp;

  struct kh_handle_s kmerhash;

  xpthread_mutex_lock(&mutex_chunks);

  while (not finished_all)
    {
      if (opt_threads == 1)
        {
          /* One thread does it all */
          chunk_perform_read();
          chunk_perform_process(kmerhash);
          chunk_perform_write();
        }
      else if (opt_threads == 2)
        {
          if (t == 0)
            {
              /* first thread reads and processes */
              /* sleep until there is something to process or to read */
              while (not
                     (
                      finished_all
                      or
                      (chunks[chunk_process_next].state == State::filled)
                      or
                      ((not finished_reading) and
                       chunks[chunk_read_next].state == State::empty)))
                {
                  xpthread_cond_wait(&cond_chunks, &mutex_chunks);
                }

              chunk_perform_read();
              chunk_perform_process(kmerhash);
            }
          else /* t == 1 */
            {
              /* second thread writes and processes */
              /* sleep until there is something to process or to write */
              while (not
                     (
                      finished_all
                      or
                      (chunks[chunk_process_next].state == State::filled)
                      or
                      (chunks[chunk_write_next].state == State::processed)
                      )
                     )
                {
                  xpthread_cond_wait(&cond_chunks, &mutex_chunks);
                }

              chunk_perform_write();
              chunk_perform_process(kmerhash);
            }
        }
      else
        {
          if (t == 0)
            {
              /* first thread reads and processes */
              while (not
                     (
                      finished_all
                      or
                      ((not finished_reading) and
                       (chunks[chunk_read_next].state == State::empty))
                      or
                      (chunks[chunk_process_next].state == State::filled)
                      )
                     )
                {
                  xpthread_cond_wait(&cond_chunks, &mutex_chunks);
                }

              chunk_perform_read();
              chunk_perform_process(kmerhash);
            }
          else if (t == opt_threads - 1)
            {
              /* last thread writes and processes */
              while (not
                     (
                      finished_all
                      or
                      (chunks[chunk_write_next].state == State::processed)
                      or
                      (chunks[chunk_process_next].state == State::filled)
                      )
                     )
                {
                  xpthread_cond_wait(&cond_chunks, &mutex_chunks);
                }

              chunk_perform_write();
              chunk_perform_process(kmerhash);
            }
          else
            {
              /* the other threads are only processing */
              while (not
                     (
                      finished_all
                      or
                      (chunks[chunk_process_next].state == State::filled)
                      )
                     )
                {
                  xpthread_cond_wait(&cond_chunks, &mutex_chunks);
                }

              chunk_perform_process(kmerhash);
            }
        }
    }

  xpthread_mutex_unlock(&mutex_chunks);

  return nullptr;
}
xpthread_mutex_destroy(&mutex_chunks); } auto print_stats(std::FILE * output_handle) -> void { fprintf(output_handle, "%10" PRIu64 " Pairs\n", total); fprintf(output_handle, "%10" PRIu64 " Merged", merged); if (total > 0) { fprintf(output_handle, " (%.1lf%%)", 100.0 * merged / total); } fprintf(output_handle, "\n"); fprintf(output_handle, "%10" PRIu64 " Not merged", notmerged); if (total > 0) { fprintf(output_handle, " (%.1lf%%)", 100.0 * notmerged / total); } fprintf(output_handle, "\n"); if (notmerged > 0) { fprintf(output_handle, "\nPairs that failed merging due to various reasons:\n"); } if (failed_undefined != 0U) { fprintf(output_handle, "%10" PRIu64 " undefined reason\n", failed_undefined); } if (failed_minlen != 0U) { fprintf(output_handle, "%10" PRIu64 " reads too short (after truncation)\n", failed_minlen); } if (failed_maxlen != 0U) { fprintf(output_handle, "%10" PRIu64 " reads too long (after truncation)\n", failed_maxlen); } if (failed_maxns != 0U) { fprintf(output_handle, "%10" PRIu64 " too many N's\n", failed_maxns); } if (failed_nokmers != 0U) { fprintf(output_handle, "%10" PRIu64 " too few kmers found on same diagonal\n", failed_nokmers); } if (failed_repeat != 0U) { fprintf(output_handle, "%10" PRIu64 " multiple potential alignments\n", failed_repeat); } if (failed_maxdiffs != 0U) { fprintf(output_handle, "%10" PRIu64 " too many differences\n", failed_maxdiffs); } if (failed_maxdiffpct != 0U) { fprintf(output_handle, "%10" PRIu64 " too high percentage of differences\n", failed_maxdiffpct); } if (failed_minscore != 0U) { fprintf(output_handle, "%10" PRIu64 " alignment score too low, or score drop too high\n", failed_minscore); } if (failed_minovlen != 0U) { fprintf(output_handle, "%10" PRIu64 " overlap too short\n", failed_minovlen); } if (failed_maxee != 0U) { fprintf(output_handle, "%10" PRIu64 " expected error too high\n", failed_maxee); } if (failed_minmergelen != 0U) { fprintf(output_handle, "%10" PRIu64 " merged fragment too short\n", 
failed_minmergelen); } if (failed_maxmergelen != 0U) { fprintf(output_handle, "%10" PRIu64 " merged fragment too long\n", failed_maxmergelen); } if (failed_staggered != 0U) { fprintf(output_handle, "%10" PRIu64 " staggered read pairs\n", failed_staggered); } if (failed_indel != 0U) { fprintf(output_handle, "%10" PRIu64 " indel errors\n", failed_indel); } fprintf(output_handle, "\n"); if (total > 0) { fprintf(output_handle, "Statistics of all reads:\n"); auto const mean_read_length = sum_read_length / (2.0 * pairs_read); fprintf(output_handle, "%10.2f Mean read length\n", mean_read_length); } if (merged > 0) { fprintf(output_handle, "\n"); fprintf(output_handle, "Statistics of merged reads:\n"); auto const mean = sum_fragment_length / merged; fprintf(output_handle, "%10.2f Mean fragment length\n", mean); auto const stdev = sqrt((sum_squared_fragment_length - 2.0 * mean * sum_fragment_length + mean * mean * merged) / (merged + 0.0)); fprintf(output_handle, "%10.2f Standard deviation of fragment length\n", stdev); fprintf(output_handle, "%10.2f Mean expected error in forward sequences\n", sum_ee_fwd / merged); fprintf(output_handle, "%10.2f Mean expected error in reverse sequences\n", sum_ee_rev / merged); fprintf(output_handle, "%10.2f Mean expected error in merged sequences\n", sum_ee_merged / merged); fprintf(output_handle, "%10.2f Mean observed errors in merged region of forward sequences\n", 1.0 * sum_errors_fwd / merged); fprintf(output_handle, "%10.2f Mean observed errors in merged region of reverse sequences\n", 1.0 * sum_errors_rev / merged); fprintf(output_handle, "%10.2f Mean observed errors in merged region\n", 1.0 * (sum_errors_fwd + sum_errors_rev) / merged); } } auto fastq_mergepairs(struct Parameters const & parameters) -> void { /* fatal error if specified overlap is too small */ if (opt_fastq_minovlen < 5) { fatal("Overlap specified with --fastq_minovlen must be at least 5"); } /* relax default parameters in case of short overlaps */ if 
(opt_fastq_minovlen < 9) { merge_mindiagcount = opt_fastq_minovlen - 4; merge_minscore = 1.6 * opt_fastq_minovlen; } /* open input files */ fastq_fwd = fastq_open(parameters.opt_fastq_mergepairs); fastq_rev = fastq_open(opt_reverse); /* open output files */ if (opt_fastqout != nullptr) { fp_fastqout = fileopenw(opt_fastqout); } if (opt_fastaout != nullptr) { fp_fastaout = fileopenw(opt_fastaout); } if (opt_fastqout_notmerged_fwd != nullptr) { fp_fastqout_notmerged_fwd = fileopenw(opt_fastqout_notmerged_fwd); } if (opt_fastqout_notmerged_rev != nullptr) { fp_fastqout_notmerged_rev = fileopenw(opt_fastqout_notmerged_rev); } if (opt_fastaout_notmerged_fwd != nullptr) { fp_fastaout_notmerged_fwd = fileopenw(opt_fastaout_notmerged_fwd); } if (opt_fastaout_notmerged_rev != nullptr) { fp_fastaout_notmerged_rev = fileopenw(opt_fastaout_notmerged_rev); } if (opt_eetabbedout != nullptr) { fp_eetabbedout = fileopenw(opt_eetabbedout); } /* precompute merged quality values */ precompute_qual(); /* main */ uint64_t const filesize = fastq_get_size(fastq_fwd); progress_init("Merging reads", filesize); if (not fastq_fwd->is_empty) { pair_all(); } progress_done(); if (fastq_next(fastq_rev, true, chrmap_upcase_vector.data())) { fatal("More reverse reads than forward reads"); } if (fp_log != nullptr) { print_stats(fp_log); } else { print_stats(stderr); } /* clean up */ if (opt_eetabbedout != nullptr) { fclose(fp_eetabbedout); } if (opt_fastaout_notmerged_rev != nullptr) { fclose(fp_fastaout_notmerged_rev); } if (opt_fastaout_notmerged_fwd != nullptr) { fclose(fp_fastaout_notmerged_fwd); } if (opt_fastqout_notmerged_rev != nullptr) { fclose(fp_fastqout_notmerged_rev); } if (opt_fastqout_notmerged_fwd != nullptr) { fclose(fp_fastqout_notmerged_fwd); } if (opt_fastaout != nullptr) { fclose(fp_fastaout); } if (opt_fastqout != nullptr) { fclose(fp_fastqout); } fastq_close(fastq_rev); fastq_rev = nullptr; fastq_close(fastq_fwd); fastq_fwd = nullptr; } 
vsearch-2.30.1/src/fastq_mergepairs.h000066400000000000000000000047621506776541700175570ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_mergepairs(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fastq_stats.cc000066400000000000000000000657031506776541700167170ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/span.hpp" #include #include // std::max, std::min, std::find_if, std::transform, std::minmax_element, std::for_each #include #include // macros PRIu64 (for uint64_t) and PRId64 (for int64_t) #include // std::pow #include // int64_t, uint64_t #include // std::fprintf, std::size_t #include // std::plus #include // std::distance, std::next #include #include // std::partial_sum, std::inner_product, std::iota, std::accumulate #include #include constexpr auto n_eight_bit_values = std::size_t{256}; using Length_vs_Quality_counts = std::vector>; // anonymous namespace: limit visibility and usage to this translation unit namespace { struct Distributions { double avgq = 0.0; double avgp = 0.0; double avgee = 0.0; double rate = 0.0; }; struct Stats { uint64_t len_min; uint64_t len_max; double n_symbols; uint64_t seq_count; double n_sequences; std::vector length_dist; std::vector quality_dist; std::vector distributions; }; auto q2p(double quality_score) -> double { static constexpr auto base = 10.0; return std::pow(base, -quality_score / base); } auto q2p(uint64_t quality_score) -> double { static constexpr auto base = 10.0; auto const quality_score_double = static_cast(quality_score); return std::pow(base, -quality_score_double / base); } auto check_quality_score(struct Parameters const & parameters, unsigned int const quality_score) -> void { auto const is_in_accepted_range = (quality_score >= static_cast(parameters.opt_fastq_qmin)) and (quality_score <= static_cast(parameters.opt_fastq_qmax)); if (is_in_accepted_range) { return; } std::string const message = std::string("FASTQ quality value (") + std::to_string(quality_score) + ") out of range (" + std::to_string(parameters.opt_fastq_qmin) + "-" + std::to_string(parameters.opt_fastq_qmax) + ").\n" + "Please adjust the FASTQ quality base character or range with the\n" + "--fastq_ascii, --fastq_qmin or --fastq_qmax options. 
For a complete\n" + "diagnosis with suggested values, please run vsearch --fastq_chars file."; fatal(message.c_str()); } auto check_minmax_scores(Span const qualities, std::vector const & symbol_to_score, struct Parameters const & parameters) -> void { if (qualities.empty()) { return; } auto const minmax_scores = std::minmax_element(qualities.begin(), qualities.end()); auto const qmin = symbol_to_score[*std::get<0>(minmax_scores)]; auto const qmax = symbol_to_score[*std::get<1>(minmax_scores)]; check_quality_score(parameters, qmin); check_quality_score(parameters, qmax); } auto const is_observed = [](uint64_t const count) -> bool { return count != 0UL; }; auto find_smallest(std::vector const & observables) -> unsigned long { auto const first_hit = std::find_if(observables.begin(), observables.end(), is_observed); if (first_hit == observables.end()) { return 0UL; } return static_cast( std::distance(observables.begin(), first_hit)); } auto find_largest(std::vector const & observables) -> unsigned long { auto const last_hit = std::find_if(observables.rbegin(), observables.rend(), is_observed); if (last_hit == observables.rend()) { return 0UL; } return static_cast( std::distance(last_hit, observables.rend()) - 1); } auto compute_cumulative_sum(std::vector const & read_length_table) -> std::vector { std::vector cumulative_sum_of_lengths(read_length_table.size()); std::partial_sum(read_length_table.cbegin(), read_length_table.cend(), cumulative_sum_of_lengths.begin()); return cumulative_sum_of_lengths; } auto compute_number_of_symbols(std::vector const & n_reads_per_length) -> double { // total number of nucleotides = sum(read_length * n_reads_with_that_length) std::vector read_lengths(n_reads_per_length.size()); std::iota(read_lengths.begin(), read_lengths.end(), 0UL); return std::inner_product(read_lengths.begin(), read_lengths.end(), n_reads_per_length.begin(), double{0}); } auto compute_n_symbols_per_length(Length_vs_Quality_counts const & qual_length_table) -> 
std::vector { // sum_counts is the sum of observed valid symbols for each length // (invalid symbols are guaranteed to be set to zero) std::vector sum_counts(qual_length_table.size()); std::transform( qual_length_table.begin(), qual_length_table.end(), sum_counts.begin(), [](std::vector const & quality_symbols) -> std::uint64_t { return std::accumulate(quality_symbols.begin(), quality_symbols.end(), std::uint64_t{0}); }); return sum_counts; } auto precompute_quality_scores(struct Parameters const & parameters) -> std::vector { // quality score = quality symbol - opt_fastq_ascii // // opt_fastq_ascii is a fix value and quality_symbol increases // linearly, so a vector of quality_scores can be generated by // std::iota std::vector quality_scores(n_eight_bit_values); auto starting_position = std::next(quality_scores.begin(), parameters.opt_fastq_ascii); std::iota(starting_position, quality_scores.end(), 0UL); return quality_scores; } auto compute_sum_quality_scores_per_length(Length_vs_Quality_counts const & qual_length_table, struct Parameters const & parameters) -> std::vector { // sum_quality_scores is the sum of observed scores for each length std::vector sum_quality_scores(qual_length_table.size()); auto const quality_scores = precompute_quality_scores(parameters); std::transform( qual_length_table.begin(), qual_length_table.end(), sum_quality_scores.begin(), [&quality_scores](std::vector const & quality_symbols) -> std::uint64_t { return std::inner_product(quality_symbols.begin(), quality_symbols.end(), quality_scores.begin(), std::uint64_t{0}); }); return sum_quality_scores; } auto precompute_probability_values(struct Parameters const & parameters) -> std::vector { // probability value = 10 ^ - (quality score / 10) // quality score = quality symbol - opt_fastq_ascii // // opt_fastq_ascii is a fix value and quality_symbol increases // linearly, so a vector of quality_scores can be generated by // std::iota and then transformed into probability values auto const 
quality_scores = precompute_quality_scores(parameters); std::vector probability_values(n_eight_bit_values); std::transform(quality_scores.cbegin(), quality_scores.cend(), probability_values.begin(), [](uint64_t const quality_score) -> double { return q2p(quality_score); }); return probability_values; } auto compute_sum_error_probabilities_per_length(Length_vs_Quality_counts const & qual_length_table, struct Parameters const & parameters) -> std::vector { // sum_error_probabilities is the sum of observed probabilities for each length std::vector sum_error_probabilities(qual_length_table.size()); auto const probability_values = precompute_probability_values(parameters); std::transform( qual_length_table.begin(), qual_length_table.end(), sum_error_probabilities.begin(), [&probability_values](std::vector const & quality_symbols) -> double { return std::inner_product(quality_symbols.begin(), quality_symbols.end(), probability_values.begin(), double{0}); }); return sum_error_probabilities; } auto compute_distribution_of_quality_symbols(Length_vs_Quality_counts const & length_vs_quality) -> std::vector { // for each quality symbol: sum symbol observations for each position std::vector distribution(n_eight_bit_values); std::for_each(length_vs_quality.begin(), length_vs_quality.end(), [& distribution](std::vector const & observations) -> void { std::transform(observations.begin(), observations.end(), distribution.begin(), distribution.begin(), std::plus{}); }); return distribution; } auto compute_distributions( unsigned int const len_max, Length_vs_Quality_counts const & qual_length_table, std::vector const & sumee_length_table, struct Parameters const & parameters) -> std::vector { std::vector distributions(len_max + 1); auto const sum_counts = compute_n_symbols_per_length(qual_length_table); auto const sum_quality_scores = compute_sum_quality_scores_per_length(qual_length_table, parameters); auto const sum_error_probabilities = 
compute_sum_error_probabilities_per_length(qual_length_table, parameters); auto position = std::size_t{0}; for (auto & distribution: distributions) { auto const n_symbols = static_cast(sum_counts[position]); auto const length = static_cast(position + 1); auto const sum_quality_score = static_cast(sum_quality_scores[position]); distribution.avgq = sum_quality_score / n_symbols; distribution.avgp = sum_error_probabilities[position] / n_symbols; distribution.avgee = sumee_length_table[position] / n_symbols; distribution.rate = distributions[position].avgee / length; ++position; } return distributions; } auto find_first_complete_EE_filtering(struct Stats const & stats, std::vector> const & ee_length_table) -> uint64_t { // - find the first position where no reads remain when filtering // with the threshold EE > 1.0 // - by construction, more stringent filtering thresholds also have no reads, // - downstream positions also contain no reads auto read_count_is_null = [](std::array const & read_counts) -> bool { return read_counts.front() == 0; }; auto const iterator = std::find_if(ee_length_table.cbegin(), ee_length_table.cend(), read_count_is_null); if (iterator == ee_length_table.end()) { return stats.len_max; } return static_cast(std::distance(ee_length_table.begin(), iterator)); } // section 1 auto report_read_length_distribution(std::FILE * log_handle, struct Stats const & stats, std::vector const & read_length_table) -> void { assert(log_handle != nullptr); std::fprintf(log_handle, "\n%s\n%s\n%s\n", "Read length distribution", " L N Pct AccPct", "------- ---------- ------- -------"); for (auto length = stats.len_max; length >= stats.len_min; --length) { if (read_length_table[length] != 0) { auto const previous_count = (length != 0) ? static_cast(stats.length_dist[length - 1]) : 0; std::fprintf(log_handle, "%2s%5" PRIu64 " %10" PRIu64 " %5.1lf%% %5.1lf%%\n", (length == stats.len_max ? 
">=" : " "), length, read_length_table[length], static_cast(read_length_table[length]) * 100.0 / stats.n_sequences, 100.0 * (stats.n_sequences - previous_count) / stats.n_sequences); } if (length == 0UL) { break; } } } // section 2 auto report_q_score_distribution( std::FILE * log_handle, struct Stats const & stats, std::vector const & symbol_to_probability, std::vector const & symbol_to_score) -> void { assert(log_handle != nullptr); auto const qmin = static_cast(find_smallest(stats.quality_dist)); auto const qmax = static_cast(find_largest(stats.quality_dist)); std::fprintf(log_handle, "\n%s\n%s\n%s\n", "Q score distribution", "ASCII Q Pe N Pct AccPct", "----- --- ------- ---------- ------- -------"); uint64_t qual_accum = 0; for (auto quality_symbol = qmax ; quality_symbol >= qmin ; --quality_symbol) { if (stats.quality_dist[quality_symbol] == 0) { continue; } qual_accum += stats.quality_dist[quality_symbol]; std::fprintf(log_handle, " %c %3" PRIu64 " %7.5lf %10" PRIu64 " %6.1lf%% %6.1lf%%\n", quality_symbol, symbol_to_score[quality_symbol], symbol_to_probability[quality_symbol], stats.quality_dist[quality_symbol], 100.0 * static_cast(stats.quality_dist[quality_symbol]) / stats.n_symbols, 100.0 * static_cast(qual_accum) / stats.n_symbols); } } // section 3 auto report_length_vs_quality_distribution(std::FILE * log_handle, struct Stats const & stats) -> void { assert(log_handle != nullptr); std::fprintf(log_handle, "\n%s\n%s\n", " L PctRecs AvgQ P(AvgQ) AvgP AvgEE Rate RatePct", "----- ------- ---- ------- -------- ----- --------- --------"); for (auto length = uint64_t{2}; length <= stats.len_max; ++length) { auto const previous_count = static_cast(stats.length_dist[length - 1]); auto const & distribution = stats.distributions[length - 1]; auto const PctRecs = 100.0 * (stats.n_sequences - previous_count) / stats.n_sequences; auto const AvgQ = distribution.avgq; auto const AvgP = distribution.avgp; auto const AvgEE = distribution.avgee; auto const Rate = 
distribution.rate; std::fprintf(log_handle, "%5" PRIu64 " %6.1lf%% %4.1lf %7.5lf %8.6lf %5.2lf %9.6lf %7.3lf%%\n", length, PctRecs, AvgQ, q2p(AvgQ), AvgP, AvgEE, Rate, 100.0 * Rate); } } // section 4 auto report_expected_error_and_length_filtering(std::FILE * log_handle, struct Stats const & stats, std::vector> const & ee_length_table) -> void { assert(log_handle != nullptr); std::fprintf(log_handle, "\n%s\n%s\n", " L 1.0000 0.5000 0.2500 0.1000 1.0000 0.5000 0.2500 0.1000", "----- ------- ------- ------- ------- ------- ------- ------- -------"); std::vector read_percentage(ee_length_table[0].size()); auto const max_length = find_first_complete_EE_filtering(stats, ee_length_table); for (auto length = max_length; length >= 1UL; --length) { auto const & read_count = ee_length_table[length - 1]; std::transform( read_count.begin(), read_count.end(), read_percentage.begin(), [&stats](unsigned long const count) -> double { return 100.0 * static_cast(count) / stats.n_sequences; }); std::fprintf(log_handle, "%5" PRIu64 " %7" PRIu64 " %7" PRIu64 " %7" PRIu64 " %7" PRIu64 " " "%6.2lf%% %6.2lf%% %6.2lf%% %6.2lf%%\n", length, read_count[0], read_count[1], read_count[2], read_count[3], read_percentage[0], read_percentage[1], read_percentage[2], read_percentage[3]); } } // section 5 auto report_minimum_quality_and_length_filtering(std::FILE * log_handle, struct Stats const & stats, std::vector> const & q_length_table) -> void { assert(log_handle != nullptr); std::fprintf(log_handle, "\n%s\n%s\n%s\n", "Truncate at first Q", " Len Q=5 Q=10 Q=15 Q=20", "----- ------ ------ ------ ------"); auto const mid_length = std::max(uint64_t{1}, stats.len_max / 2); std::vector read_percentage(q_length_table[0].size()); for (auto length = stats.len_max; length >= mid_length; --length) { auto const & read_count = q_length_table[length - 1]; std::transform( read_count.begin(), read_count.end(), read_percentage.begin(), [&stats](unsigned long const count) -> double { return 100.0 * 
static_cast(count) / stats.n_sequences; }); std::fprintf(log_handle, "%5" PRIu64 " %5.1lf%% %5.1lf%% %5.1lf%% %5.1lf%%\n", length, read_percentage[0], read_percentage[1], read_percentage[2], read_percentage[3]); } } // closing section auto report_sequence_stats(std::FILE * log_handle, struct Stats const & stats) -> void { assert(log_handle != nullptr); static constexpr auto a_million = double{1000000}; auto const n_sequences = static_cast(stats.seq_count); std::fprintf(log_handle, "\n%10" PRIu64 " Recs (%.1lfM), 0 too long\n", stats.seq_count, n_sequences / a_million); if (stats.seq_count != 0) { std::fprintf(log_handle, "%10.1lf Avg length\n", 1.0 * stats.n_symbols / n_sequences); } std::fprintf(log_handle, "%9.1lfM Bases\n", stats.n_symbols / a_million); } } // end of anonymous namespace auto fastq_stats(struct Parameters const & parameters) -> void { constexpr auto initial_memory_allocation = std::size_t{512}; constexpr std::array quality_thresholds = {5, 10, 15, 20}; constexpr std::array ee_thresholds = { 1.0, 0.5, 0.25, 0.1 }; auto * input_handle = fastq_open(parameters.opt_fastq_stats); auto const filesize = fastq_get_size(input_handle); progress_init("Reading FASTQ file", filesize); auto const symbol_to_score = precompute_quality_scores(parameters); auto const symbol_to_probability = precompute_probability_values(parameters); std::vector read_length_table(initial_memory_allocation); Length_vs_Quality_counts qual_length_table(initial_memory_allocation, std::vector(n_eight_bit_values)); std::vector> ee_length_table(initial_memory_allocation); std::vector> q_length_table(initial_memory_allocation); std::vector sumee_length_table(initial_memory_allocation); // refactoring: separate parse_fastq() and compute_distributions() // note: fastq parsing represents 99% of total wallclock time while (fastq_next(input_handle, false, chrmap_upcase_vector.data())) { /* update length statistics */ auto const length = fastq_get_sequence_length(input_handle); if (length + 1 > 
read_length_table.size()) { read_length_table.resize(length + 1); qual_length_table.resize(length + 1, std::vector(n_eight_bit_values)); ee_length_table.resize(length + 1); q_length_table.resize(length + 1); sumee_length_table.resize(length + 1); } ++read_length_table[length]; // can NOT be derived from qual_length_table /* update quality statistics */ auto const * quality_symbols = fastq_get_quality(input_handle); auto expected_error = 0.0; auto qmin = std::numeric_limits::max(); // lowest Q value observed so far in this read check_minmax_scores(Span{quality_symbols, length}, symbol_to_score, parameters); // refactoring: replace for-loop below with three functions: // 0) // - transform(quality_symbols, qual_length_table, [](auto& position){ ++position[quality_symbol]; }) // 1) // - search first position with symbol <= 20 + offset // - increment counts from begin to position // - do the same for 15, 10 and 5 // 2) // - create a temporary vector // - populate with probability values // - std::partial_sum() to compte EE // - search first position with EE <= 1.0 // - increment counts from begin to position // - do the same for EE <= 0.5, 0.25, and 0.1 for (auto i = 0UL; i < length; ++i) { auto const quality_symbol = static_cast(quality_symbols[i]); auto const quality_score = symbol_to_score[quality_symbol]; ++qual_length_table[i][quality_symbol]; qmin = std::min(quality_score, qmin); // increment quality observations if the current Q > 5, 10, 15, or 20 std::transform(quality_thresholds.begin(), quality_thresholds.end(), q_length_table[i].begin(), q_length_table[i].begin(), [qmin](uint64_t const threshold, uint64_t current_value) -> uint64_t { return current_value + (qmin > threshold ? 
1 : 0); }); expected_error += symbol_to_probability[quality_symbol]; sumee_length_table[i] += expected_error; // can NOT be derived from qual_length_table // increment EE observations if the current EE <= 1.0, 0.5, 0.25, or 0.1 std::transform(ee_thresholds.begin(), ee_thresholds.end(), ee_length_table[i].begin(), ee_length_table[i].begin(), [expected_error](double const threshold, uint64_t current_value) -> uint64_t { return current_value + (expected_error <= threshold ? 1 : 0); }); } progress_update(fastq_get_position(input_handle)); } progress_done(); fastq_close(input_handle); // note: operations below represent 1% of total wallclock time /* compute various distributions */ auto const stats = Stats{ find_smallest(read_length_table), find_largest(read_length_table), compute_number_of_symbols(read_length_table), std::accumulate(read_length_table.begin(), read_length_table.end(), std::uint64_t{0}), static_cast(std::accumulate(read_length_table.begin(), read_length_table.end(), std::uint64_t{0})), compute_cumulative_sum(read_length_table), compute_distribution_of_quality_symbols(qual_length_table), compute_distributions(find_largest(read_length_table), qual_length_table, sumee_length_table, parameters) }; /* print report */ if (fp_log != nullptr) { report_read_length_distribution(fp_log, stats, read_length_table); // section 1 report_q_score_distribution(fp_log, stats, symbol_to_probability, symbol_to_score); // section 2 report_length_vs_quality_distribution(fp_log, stats); // section 3 report_expected_error_and_length_filtering(fp_log, stats, ee_length_table); // section 4 report_minimum_quality_and_length_filtering(fp_log, stats, q_length_table); // section 5 report_sequence_stats(fp_log, stats); // closing section } if (not parameters.opt_quiet) { std::fprintf(stderr, "Read %" PRIu64 " sequences.\n", stats.seq_count); } } vsearch-2.30.1/src/fastq_stats.h000066400000000000000000000047551506776541700165610ustar00rootroot00000000000000/* VSEARCH: a versatile open 
source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_stats(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fastqops.cc000066400000000000000000000237461506776541700162240ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::max #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include constexpr auto initial_memory_allocation = 512; auto fastx_revcomp(struct Parameters const & parameters) -> void { uint64_t buffer_alloc = initial_memory_allocation; std::vector seq_buffer(buffer_alloc); std::vector qual_buffer(buffer_alloc); if ((parameters.opt_fastaout == nullptr) && (parameters.opt_fastqout == nullptr)) { fatal("No output files specified"); } auto * input_handle = fastx_open(parameters.opt_fastx_revcomp); if (input_handle == nullptr) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } if ((parameters.opt_fastqout != nullptr) && ! 
(input_handle->is_fastq || input_handle->is_empty)) { fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); } auto const filesize = fastx_get_size(input_handle); std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; if (parameters.opt_fastaout != nullptr) { fp_fastaout = fopen_output(parameters.opt_fastaout); if (fp_fastaout == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (parameters.opt_fastqout != nullptr) { fp_fastqout = fopen_output(parameters.opt_fastqout); if (fp_fastqout == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (input_handle->is_fastq) { progress_init("Reading FASTQ file", filesize); } else { progress_init("Reading FASTA file", filesize); } auto count = 0; while (fastx_next(input_handle, false, chrmap_no_change_vector.data())) { ++count; /* header */ auto const hlen = fastx_get_header_length(input_handle); auto const * header = fastx_get_header(input_handle); auto const abundance = fastx_get_abundance(input_handle); /* sequence */ auto const length = fastx_get_sequence_length(input_handle); if (length + 1 > buffer_alloc) { buffer_alloc = length + 1; seq_buffer.resize(buffer_alloc); qual_buffer.resize(buffer_alloc); } auto const * p = fastx_get_sequence(input_handle); reverse_complement(seq_buffer.data(), p, length); /* quality values */ auto const * q = fastx_get_quality(input_handle); if (fastx_is_fastq(input_handle)) { /* reverse quality values */ for (uint64_t i = 0; i < length; i++) { qual_buffer[i] = q[length - 1 - i]; } qual_buffer[length] = 0; } if (parameters.opt_fastaout != nullptr) { fasta_print_general(fp_fastaout, nullptr, seq_buffer.data(), length, header, hlen, abundance, count, -1.0, -1, -1, nullptr, 0.0); } if (parameters.opt_fastqout != nullptr) { fastq_print_general(fp_fastqout, seq_buffer.data(), length, header, hlen, qual_buffer.data(), abundance, count, -1.0); } progress_update(fastx_get_position(input_handle)); } progress_done(); 
if (parameters.opt_fastaout != nullptr) { fclose(fp_fastaout); } if (parameters.opt_fastqout != nullptr) { fclose(fp_fastqout); } fastx_close(input_handle); } auto fastq_convert(struct Parameters const & parameters) -> void { if (parameters.opt_fastqout == nullptr) { fatal("No output file specified with --fastqout"); } auto * input_handle = fastq_open(parameters.opt_fastq_convert); if (input_handle == nullptr) { fatal("Unable to open FASTQ file"); } auto const filesize = fastq_get_size(input_handle); std::FILE * fp_fastqout = nullptr; fp_fastqout = fopen_output(parameters.opt_fastqout); if (fp_fastqout == nullptr) { fatal("Unable to open FASTQ output file for writing"); } progress_init("Reading FASTQ file", filesize); std::vector normalized_quality; auto n_entries = 1; static constexpr auto default_expected_error = -1.0; // refactoring: print no ee value? while (fastq_next(input_handle, false, chrmap_no_change_vector.data())) { /* header */ auto const * header = fastq_get_header(input_handle); auto const abundance = fastq_get_abundance(input_handle); /* sequence */ auto const length = fastq_get_sequence_length(input_handle); auto const * sequence = fastq_get_sequence(input_handle); /* convert quality values */ // refactoring: std::copy(quality, normalized_quality); // - subspan(0, end - 1) // - substract parameters.opt_fastq_ascii // - clamp qminout < x < qmaxout // - add parameters.opt_fastq_asciiout // - clamp 33 < x < 126 normalized_quality.resize(length + 1); auto const * quality = fastq_get_quality(input_handle); for (uint64_t i = 0; i < length; i++) { int q = quality[i] - parameters.opt_fastq_ascii; if (q < parameters.opt_fastq_qmin) { fprintf(stderr, "\nFASTQ quality score (%d) below minimum (%" PRId64 ") in entry no %" PRIu64 " starting on line %" PRIu64 "\n", q, parameters.opt_fastq_qmin, fastq_get_seqno(input_handle) + 1, fastq_get_lineno(input_handle)); fatal("FASTQ quality score too low"); } if (q > parameters.opt_fastq_qmax) { fprintf(stderr, "\nFASTQ 
quality score (%d) above maximum (%" PRId64 ") in entry no %" PRIu64 " starting on line %" PRIu64 "\n", q, parameters.opt_fastq_qmax, fastq_get_seqno(input_handle) + 1, fastq_get_lineno(input_handle)); fatal("FASTQ quality score too high"); } q = std::max(q, parameters.opt_fastq_qminout); q = std::min(q, parameters.opt_fastq_qmaxout); q += parameters.opt_fastq_asciiout; q = std::max(q, 33); q = std::min(q, 126); normalized_quality[i] = q; } int const hlen = fastq_get_header_length(input_handle); fastq_print_general(fp_fastqout, sequence, length, header, hlen, normalized_quality.data(), abundance, n_entries, default_expected_error); // refactoring: prefer function overload? ++n_entries; normalized_quality.clear(); progress_update(fastq_get_position(input_handle)); } progress_done(); fclose(fp_fastqout); fastq_close(input_handle); } vsearch-2.30.1/src/fastqops.h000066400000000000000000000050611506776541700160540ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto fastq_convert(struct Parameters const & parameters) -> void; auto fastx_revcomp(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/fastx.cc000066400000000000000000000506661506776541700155120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "dynlibs.h" #include "utils/fatal.hpp" #include "utils/span.hpp" #include // dup, STDIN_FILENO, STDOUT_FILENO #include // std::find_first_of #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::fopen, std::size_t, std::fread, std::fileno #include // std::exit, EXIT_FAILURE #include // std::memcpy, std::memcmp, std::strcmp #include // std::distance #include /* file compression and format detector */ /* basic file buffering function for fastq and fastx parsers */ constexpr uint64_t fastx_buffer_alloc = 8192; #ifdef HAVE_BZLIB_H constexpr auto BZ_VERBOSE_0 = 0; // constexpr auto BZ_VERBOSE_1 = 1; // constexpr auto BZ_VERBOSE_2 = 2; // constexpr auto BZ_VERBOSE_3 = 3; // constexpr auto BZ_VERBOSE_4 = 4; constexpr auto BZ_MORE_MEM = 0; /* faster decompression using more memory */ // constexpr auto BZ_LESS_MEM = 1; /* slower decompression but requires less memory */ #endif constexpr std::array magic_gzip = {0x1f, 0x8b}; constexpr std::array magic_bzip = {'B', 'Z'}; auto buffer_init(struct fastx_buffer_s * buffer) -> void { buffer->alloc = fastx_buffer_alloc; buffer->data = static_cast(xmalloc(buffer->alloc)); buffer->data[0] = 0; buffer->length = 0; buffer->position = 0; } auto buffer_free(struct fastx_buffer_s * buffer) -> void { if (buffer->data != nullptr) { xfree(buffer->data); } buffer->data = nullptr; buffer->alloc = 0; buffer->length = 0; 
buffer->position = 0; } auto buffer_makespace(struct fastx_buffer_s * buffer, uint64_t size) -> void { /* make sure there is space for x more chars in buffer */ if (buffer->length + size > buffer->alloc) { /* alloc space for x more characters, but round up to nearest block size */ buffer->alloc = ((buffer->length + size + fastx_buffer_alloc - 1) / fastx_buffer_alloc) * fastx_buffer_alloc; buffer->data = (char *) xrealloc(buffer->data, buffer->alloc); } } auto buffer_extend(struct fastx_buffer_s * dest_buffer, char * source_buf, uint64_t len) -> void { buffer_makespace(dest_buffer, len + 1); std::memcpy(dest_buffer->data + dest_buffer->length, source_buf, len); dest_buffer->length += len; dest_buffer->data[dest_buffer->length] = 0; } auto find_header_end_first_blank(Span raw_header) -> std::size_t { static const std::vector blanks {' ', '\t', '\0', '\r', '\n'}; auto * result = std::find_first_of(raw_header.begin(), raw_header.end(), blanks.begin(), blanks.end()); if (result != raw_header.end()) { *result = '\0'; } return std::distance(raw_header.begin(), result); } auto find_header_end(Span raw_header) -> std::size_t { static const std::vector blanks {'\0', '\r', '\n'}; auto * result = std::find_first_of(raw_header.begin(), raw_header.end(), blanks.begin(), blanks.end()); if (result != raw_header.end()) { *result = '\0'; } return std::distance(raw_header.begin(), result); } auto warn(char const * const format, unsigned char const symbol, uint64_t const line_number) -> void { std::fprintf(stderr, "\nWARNING: "); std::fprintf(stderr, format, symbol, symbol, line_number); std::fprintf(stderr, "\n"); if (fp_log != nullptr) { std::fprintf(fp_log, "\nWARNING: "); std::fprintf(fp_log, format, symbol, symbol, line_number); std::fprintf(fp_log, "\n"); } } auto fastx_filter_header(fastx_handle input_handle, bool truncateatspace) -> void { // truncate header (in-place) auto raw_header = Span{input_handle->header_buffer.data, input_handle->header_buffer.length}; auto const 
count = truncateatspace ? find_header_end_first_blank(raw_header) : find_header_end(raw_header); input_handle->header_buffer.length = count; // scan for unusual symbols auto const trimmed_header = raw_header.first(count); for (auto const symbol: trimmed_header) { auto const is_illegal = ((symbol == 127) or ((symbol > '\0') and (symbol < ' ') and not (symbol == '\t'))); if (is_illegal) { fatal("Illegal character encountered in FASTA/FASTQ header.\n" "Unprintable ASCII character no %d on line %" PRIu64 ".", symbol, input_handle->lineno_start); } auto const symbol_unsigned = static_cast(symbol); auto const is_not_ascii = (symbol_unsigned > 127); if (is_not_ascii) { warn("Non-ASCII character encountered in FASTA/FASTQ header.\n" "Character no %d (0x%2x) on line %" PRIu64 ".", symbol_unsigned, input_handle->lineno_start); } } } auto fopen_input(const char * filename) -> std::FILE * { /* open the input stream given by filename, but use stdin if name is - */ if (std::strcmp(filename, "-") == 0) { auto const file_descriptor = dup(STDIN_FILENO); if (file_descriptor < 0) { return nullptr; } return fdopen(file_descriptor, "rb"); } return std::fopen(filename, "rb"); } auto fastx_open(char const * filename) -> fastx_handle { // refactoring: duplicate function to output a struct fastx_s input_handle_s; auto * input_handle = (fastx_handle) xmalloc(sizeof(struct fastx_s)); input_handle->fp = nullptr; #ifdef HAVE_ZLIB_H input_handle->fp_gz = nullptr; #endif #ifdef HAVE_BZLIB_H input_handle->fp_bz = nullptr; int bzError = 0; #endif input_handle->fp = fopen_input(filename); if (input_handle->fp == nullptr) { fatal("Unable to open file for reading (%s)", filename); } /* Get mode and size of original (uncompressed) file */ xstat_t fs; if (xfstat(fileno(input_handle->fp), & fs) != 0) { fatal("Unable to get status for input file (%s)", filename); } input_handle->is_pipe = S_ISFIFO(fs.st_mode); if (input_handle->is_pipe) { input_handle->file_size = 0; } else { input_handle->file_size = 
fs.st_size; } if (opt_gzip_decompress) { input_handle->format = Format::gzip; } else if (opt_bzip2_decompress) { input_handle->format = Format::bzip; } else if (input_handle->is_pipe) { input_handle->format = Format::plain; } else { /* autodetect compression (plain, gzipped or bzipped) */ /* read two characters and compare with magic */ std::array magic {{}}; input_handle->format = Format::plain; // refactoring: fread() see C++ Weekly - Ep 482 - Safely Wrapping C APIs auto const bytes_read = fread(magic.data(), 1, 2, input_handle->fp); if (bytes_read >= 2) { if (memcmp(magic.data(), magic_gzip.data(), 2) == 0) { input_handle->format = Format::gzip; } else if (memcmp(magic.data(), magic_bzip.data(), 2) == 0) { input_handle->format = Format::bzip; } } else { /* consider it an empty file or a tiny fasta file, uncompressed */ } /* close and reopen to avoid problems with gzip library */ /* rewind was not enough */ fclose(input_handle->fp); input_handle->fp = fopen_input(filename); if (input_handle->fp == nullptr) { fatal("Unable to open file for reading (%s)", filename); } } if (input_handle->format == Format::gzip) { /* GZIP: Keep original file open, then open as gzipped file as well */ #ifdef HAVE_ZLIB_H if (gz_lib == nullptr) { fatal("Files compressed with gzip are not supported"); } input_handle->fp_gz = (*gzdopen_p)(fileno(input_handle->fp), "rb"); if (input_handle->fp_gz == nullptr) { // dup? 
fatal("Unable to open gzip compressed file (%s)", filename); } #else fatal("Files compressed with gzip are not supported"); #endif } if (input_handle->format == Format::bzip) { /* BZIP2: Keep original file open, then open as bzipped file as well */ #ifdef HAVE_BZLIB_H if (bz2_lib == nullptr) { fatal("Files compressed with bzip2 are not supported"); } input_handle->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, input_handle->fp, BZ_VERBOSE_0, BZ_MORE_MEM, nullptr, 0); if (input_handle->fp_bz == nullptr) { fatal("Unable to open bzip2 compressed file (%s)", filename); } #else fatal("Files compressed with bzip2 are not supported"); #endif } /* init buffers */ input_handle->file_position = 0; buffer_init(& input_handle->file_buffer); /* start filling up file buffer */ auto const rest = fastx_file_fill_buffer(input_handle); /* examine first char and see if it starts with > or @ */ auto filetype = 0; input_handle->is_empty = true; input_handle->is_fastq = false; if (rest > 0) { input_handle->is_empty = false; auto * first = input_handle->file_buffer.data; if (*first == '>') { filetype = 1; } else if (*first == '@') { filetype = 2; input_handle->is_fastq = true; } if (filetype == 0) { /* close files if unrecognized file type */ switch (input_handle->format) { case Format::plain: break; case Format::gzip: #ifdef HAVE_ZLIB_H (*gzclose_p)(input_handle->fp_gz); input_handle->fp_gz = nullptr; break; #endif case Format::bzip: #ifdef HAVE_BZLIB_H (*BZ2_bzReadClose_p)(&bzError, input_handle->fp_bz); input_handle->fp_bz = nullptr; break; #endif default: fatal("Internal error"); } fclose(input_handle->fp); input_handle->fp = nullptr; if (rest >= 2) { if (memcmp(first, magic_gzip.data(), 2) == 0) { fatal("File appears to be gzip compressed. Please use --gzip_decompress"); } if (memcmp(first, magic_bzip.data(), 2) == 0) { fatal("File appears to be bzip2 compressed. 
Please use --bzip2_decompress"); } } fatal("File type not recognized."); return nullptr; } } /* more initialization */ buffer_init(& input_handle->header_buffer); buffer_init(& input_handle->sequence_buffer); buffer_init(& input_handle->plusline_buffer); buffer_init(& input_handle->quality_buffer); input_handle->stripped_all = 0; input_handle->lineno = 1; input_handle->lineno_start = 1; input_handle->seqno = -1; return input_handle; } auto fastx_is_fastq(fastx_handle input_handle) -> bool { return input_handle->is_fastq or input_handle->is_empty; } auto fastx_is_empty(fastx_handle input_handle) -> bool { return input_handle->is_empty; } auto fastx_is_pipe(fastx_handle input_handle) -> bool { return input_handle->is_pipe; } auto fastx_close(fastx_handle input_handle) -> void { /* Warn about stripped chars */ if (input_handle->stripped_all != 0U) { fprintf(stderr, "WARNING: %" PRIu64 " invalid characters stripped from %s file:", input_handle->stripped_all, (input_handle->is_fastq ? "FASTQ" : "FASTA")); for (int i = 0; i < 256; i++) { if (input_handle->stripped[i] != 0U) { fprintf(stderr, " %c(%" PRIu64 ")", i, input_handle->stripped[i]); } } fprintf(stderr, "\n"); fprintf(stderr, "REMINDER: vsearch does not support amino acid sequences\n"); if (opt_log != nullptr) { fprintf(fp_log, "WARNING: %" PRIu64 " invalid characters stripped from %s file:", input_handle->stripped_all, (input_handle->is_fastq ? 
"FASTQ" : "FASTA")); for (int i = 0; i < 256; i++) { if (input_handle->stripped[i] != 0U) { fprintf(fp_log, " %c(%" PRIu64 ")", i, input_handle->stripped[i]); } } fprintf(fp_log, "\n"); fprintf(fp_log, "REMINDER: vsearch does not support amino acid sequences\n"); } } #ifdef HAVE_BZLIB_H int bz_error = 0; #endif switch (input_handle->format) { case Format::plain: break; case Format::gzip: #ifdef HAVE_ZLIB_H (*gzclose_p)(input_handle->fp_gz); input_handle->fp_gz = nullptr; break; #endif case Format::bzip: #ifdef HAVE_BZLIB_H (*BZ2_bzReadClose_p)(&bz_error, input_handle->fp_bz); input_handle->fp_bz = nullptr; break; #endif default: fatal("Internal error"); } fclose(input_handle->fp); input_handle->fp = nullptr; buffer_free(& input_handle->file_buffer); buffer_free(& input_handle->header_buffer); buffer_free(& input_handle->sequence_buffer); buffer_free(& input_handle->plusline_buffer); buffer_free(& input_handle->quality_buffer); input_handle->file_size = 0; input_handle->file_position = 0; input_handle->lineno = 0; input_handle->seqno = -1; xfree(input_handle); input_handle = nullptr; } auto fastx_file_fill_buffer(fastx_handle input_handle) -> uint64_t { /* read more data if necessary */ uint64_t const rest = input_handle->file_buffer.length - input_handle->file_buffer.position; if (rest > 0) { return rest; } uint64_t space = input_handle->file_buffer.alloc - input_handle->file_buffer.length; if (space == 0) { /* back to beginning of buffer */ input_handle->file_buffer.position = 0; input_handle->file_buffer.length = 0; space = input_handle->file_buffer.alloc; } int bytes_read = 0; #ifdef HAVE_BZLIB_H int bzError = 0; #endif switch (input_handle->format) { case Format::plain: bytes_read = fread(input_handle->file_buffer.data + input_handle->file_buffer.position, 1, space, input_handle->fp); break; case Format::gzip: #ifdef HAVE_ZLIB_H bytes_read = (*gzread_p)(input_handle->fp_gz, input_handle->file_buffer.data + input_handle->file_buffer.position, space); if 
(bytes_read < 0) { fatal("Unable to read gzip compressed file"); } break; #endif case Format::bzip: #ifdef HAVE_BZLIB_H bytes_read = (*BZ2_bzRead_p)(& bzError, input_handle->fp_bz, input_handle->file_buffer.data + input_handle->file_buffer.position, space); if ((bytes_read < 0) or not ((bzError == BZ_OK) or (bzError == BZ_STREAM_END) or (bzError == BZ_SEQUENCE_ERROR))) { fatal("Unable to read from bzip2 compressed file"); } break; #endif default: fatal("Internal error"); } if (not input_handle->is_pipe) { #ifdef HAVE_ZLIB_H if (input_handle->format == Format::gzip) { /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */ int const fd = dup(fileno(input_handle->fp)); input_handle->file_position = xlseek(fd, 0, SEEK_CUR); close(fd); } else #endif { input_handle->file_position = xftello(input_handle->fp); } } input_handle->file_buffer.length += bytes_read; return bytes_read; } auto fastx_next(fastx_handle input_handle, bool truncateatspace, const unsigned char * char_mapping) -> bool { if (input_handle->is_fastq) { return fastq_next(input_handle, truncateatspace, char_mapping); } return fasta_next(input_handle, truncateatspace, char_mapping); } auto fastx_get_position(fastx_handle input_handle) -> uint64_t { if (input_handle->is_fastq) { return fastq_get_position(input_handle); } return fasta_get_position(input_handle); } auto fastx_get_size(fastx_handle input_handle) -> uint64_t { if (input_handle->is_fastq) { return fastq_get_size(input_handle); } return fasta_get_size(input_handle); } auto fastx_get_lineno(fastx_handle input_handle) -> uint64_t { if (input_handle->is_fastq) { return fastq_get_lineno(input_handle); } return fasta_get_lineno(input_handle); } auto fastx_get_seqno(fastx_handle input_handle) -> uint64_t { if (input_handle->is_fastq) { return fastq_get_seqno(input_handle); } return fasta_get_seqno(input_handle); } auto fastx_get_header(fastx_handle input_handle) -> char const * { if (input_handle->is_fastq) { return 
fastq_get_header(input_handle); } return fasta_get_header(input_handle); } auto fastx_get_sequence(fastx_handle input_handle) -> char const * { if (input_handle->is_fastq) { return fastq_get_sequence(input_handle); } return fasta_get_sequence(input_handle); } auto fastx_get_header_length(fastx_handle input_handle) -> uint64_t { if (input_handle->is_fastq) { return fastq_get_header_length(input_handle); } return fasta_get_header_length(input_handle); } auto fastx_get_sequence_length(fastx_handle input_handle) -> uint64_t { if (input_handle->is_fastq) { return fastq_get_sequence_length(input_handle); } return fasta_get_sequence_length(input_handle); } auto fastx_get_quality(fastx_handle input_handle) -> char const * { if (input_handle->is_fastq) { return fastq_get_quality(input_handle); } return nullptr; } auto fastx_get_abundance(fastx_handle input_handle) -> int64_t { if (input_handle->is_fastq) { return fastq_get_abundance(input_handle); } return fasta_get_abundance(input_handle); } vsearch-2.30.1/src/fastx.h000066400000000000000000000117341506776541700153450ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Number of entries in per-symbol tables.
// NOTE(review): presumably the size of fastx_s::stripped (one counter
// per byte value) -- confirm against the original header.
constexpr auto byte_range = 256U;


// Growable character buffer used for file data, headers, sequences,
// '+' lines and quality strings.
struct fastx_buffer_s
{
  char * data = nullptr;   // start of the storage
  uint64_t length = 0;     // number of bytes currently holding data
  uint64_t alloc = 0;      // number of bytes allocated
  uint64_t position = 0;   // read cursor: bytes before it are consumed
};

auto buffer_init(struct fastx_buffer_s * buffer) -> void;
auto buffer_free(struct fastx_buffer_s * buffer) -> void;
auto buffer_extend(struct fastx_buffer_s * dest_buffer, char * source_buf, uint64_t len) -> void;
auto buffer_makespace(struct fastx_buffer_s * buffer, uint64_t size) -> void;


// Compression layer of an input file.
enum struct Format : unsigned char { undefined, plain, bzip, gzip };


// State of one open FASTA/FASTQ input.
struct fastx_s
{
  bool is_pipe = false;    // input is a FIFO: size unknown, no autodetection
  bool is_fastq = false;   // first character of the data was '@'
  bool is_empty = false;   // first read returned no data

  std::FILE * fp = nullptr;   // underlying file, always open while in use

#ifdef HAVE_ZLIB_H
  gzFile fp_gz = nullptr;     // gzip reader layered on fp (Format::gzip)
#endif

#ifdef HAVE_BZLIB_H
  BZFILE * fp_bz = nullptr;   // bzip2 reader layered on fp (Format::bzip)
#endif

  struct fastx_buffer_s file_buffer;       // raw (decompressed) input data

  // parts of the current record
  struct fastx_buffer_s header_buffer;
  struct fastx_buffer_s sequence_buffer;
  struct fastx_buffer_s plusline_buffer;
  struct fastx_buffer_s quality_buffer;

  uint64_t file_size = 0;       // size of the file on disk, 0 for pipes
  uint64_t file_position = 0;   // current offset in the file on disk

  uint64_t lineno = 0;
  uint64_t lineno_start = 0;    // line number used in header error messages
  int64_t seqno = 0;

  uint64_t stripped_all = 0;    // total number of invalid symbols stripped
  // per-symbol count of stripped symbols, indexed by byte value
  // NOTE(review): template arguments look lost in extraction,
  // presumably std::array<uint64_t, byte_range> -- confirm.
  std::array stripped {{}};

  Format format = Format::undefined;
};

// Opaque-style handle passed to all fastx_* functions.
using fastx_handle = struct fastx_s *;
-> char const *; auto fastx_get_sequence(fastx_handle input_handle) -> char const *; auto fastx_get_header_length(fastx_handle input_handle) -> uint64_t; auto fastx_get_sequence_length(fastx_handle input_handle) -> uint64_t; auto fastx_get_quality(fastx_handle input_handle) -> char const *; auto fastx_get_abundance(fastx_handle input_handle) -> int64_t; auto fastx_file_fill_buffer(fastx_handle input_handle) -> uint64_t; vsearch-2.30.1/src/filter.cc000066400000000000000000000531761506776541700156510ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::min, std::max #include // macros PRIu64 and PRId64 #include // std::pow, std::signbit #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose #include // std::exit, EXIT_FAILURE #include inline auto fastq_get_qual(char const quality_symbol) -> int { int const quality_score = quality_symbol - opt_fastq_ascii; if (quality_score < opt_fastq_qmin) { std::fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", quality_score, opt_fastq_qmin); if (fp_log != nullptr) { std::fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) below qmin (%" PRId64 ")\n", quality_score, opt_fastq_qmin); } std::exit(EXIT_FAILURE); } else if (quality_score > opt_fastq_qmax) { std::fprintf(stderr, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", quality_score, opt_fastq_qmax); std::fprintf(stderr, "By default, quality values range from 0 to 41.\n" "To allow higher quality 
values, " "please use the option --fastq_qmax %d\n", quality_score); if (fp_log != nullptr) { std::fprintf(fp_log, "\n\nFatal error: FASTQ quality value (%d) above qmax (%" PRId64 ")\n", quality_score, opt_fastq_qmax); std::fprintf(fp_log, "By default, quality values range from 0 to 41.\n" "To allow higher quality values, " "please use the option --fastq_qmax %d\n", quality_score); } std::exit(EXIT_FAILURE); } return quality_score; } struct analysis_res { bool discarded = false; bool truncated = false; int start = 0; int length = 0; double ee = -1.0; }; auto analyse(fastx_handle input_handle) -> struct analysis_res { auto const fastq_trunclen = static_cast(opt_fastq_trunclen); auto const fastq_trunclen_keep = static_cast(opt_fastq_trunclen_keep); struct analysis_res res; res.length = static_cast(fastx_get_sequence_length(input_handle)); auto const old_length = res.length; /* strip left (5') end */ if (opt_fastq_stripleft < res.length) { res.start += opt_fastq_stripleft; res.length -= opt_fastq_stripleft; } else { res.start = res.length; res.length = 0; } /* strip right (3') end */ if (opt_fastq_stripright < res.length) { res.length -= opt_fastq_stripright; } else { res.length = 0; } /* truncate trailing (3') part */ if (opt_fastq_trunclen >= 0) { res.length = std::min(res.length, fastq_trunclen); } /* truncate trailing (3') part, but keep if short */ if (opt_fastq_trunclen_keep >= 0) { res.length = std::min(res.length, fastq_trunclen_keep); } if (input_handle->is_fastq) { /* truncate by quality and expected errors (ee) */ res.ee = 0.0; static constexpr auto base = 10.0; auto const * quality_symbols = fastx_get_quality(input_handle) + res.start; for (auto i = 0; i < res.length; ++i) { auto const quality_score = fastq_get_qual(quality_symbols[i]); auto const expected_error = std::pow(base, -quality_score / base); res.ee += expected_error; if ((quality_score <= opt_fastq_truncqual) or (res.ee > opt_fastq_truncee) or (res.ee > opt_fastq_truncee_rate * (i + 1))) { 
res.ee -= expected_error; res.length = i; break; } if (quality_score < opt_fastq_minqual) { res.discarded = true; } } /* filter by expected errors (ee) */ if (res.ee > opt_fastq_maxee) { res.discarded = true; } if ((res.length > 0) and ((res.ee / res.length) > opt_fastq_maxee_rate)) { res.discarded = true; } } /* filter by length */ if ((opt_fastq_trunclen >= 0) and (res.length < opt_fastq_trunclen)) { res.discarded = true; } if (res.length < opt_fastq_minlen) { res.discarded = true; } if (res.length > opt_fastq_maxlen) { res.discarded = true; } /* filter by n's */ // refactoring: std::count_if(); int64_t ncount = 0; auto const * nucleotides = fastx_get_sequence(input_handle) + res.start; for (auto i = 0; i < res.length; ++i) { auto const nucleotide = nucleotides[i]; if ((nucleotide == 'N') or (nucleotide == 'n')) { ++ncount; } } if (ncount > opt_fastq_maxns) { res.discarded = true; } /* filter by abundance */ auto const abundance = fastx_get_abundance(input_handle); if (abundance < opt_minsize) { res.discarded = true; } if (abundance > opt_maxsize) { res.discarded = true; } res.truncated = res.length < old_length; return res; } auto filter(bool const fastq_only, char * filename) -> void { static constexpr auto dbl_max = std::numeric_limits::max(); // refactoring: redundant? 
static constexpr auto long_min = std::numeric_limits::min(); if ((opt_fastqout == nullptr) and (opt_fastaout == nullptr) and (opt_fastqout_discarded == nullptr) and (opt_fastaout_discarded == nullptr) and (opt_fastqout_rev == nullptr) and (opt_fastaout_rev == nullptr) and (opt_fastqout_discarded_rev == nullptr) and (opt_fastaout_discarded_rev == nullptr)) { fatal("No output files specified"); } auto * forward_handle = fastx_open(filename); fastx_handle reverse_handle = nullptr; // refactoring: direct initialization if (forward_handle == nullptr) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } if (not (forward_handle->is_fastq or forward_handle->is_empty)) { if (fastq_only) { fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead"); } else if (opt_eeout or (opt_fastq_ascii != 33) or opt_fastq_eeout or (opt_fastq_maxee < dbl_max) or (opt_fastq_maxee_rate < dbl_max) or (opt_fastqout != nullptr) or (opt_fastq_qmax < 41) or (opt_fastq_qmin > 0) or (opt_fastq_truncee < dbl_max) or (opt_fastq_truncee_rate < dbl_max) or (opt_fastq_truncqual < long_min) or (opt_fastq_minqual > 0) or (opt_fastqout_discarded != nullptr) or (opt_fastqout_discarded_rev != nullptr) or (opt_fastqout_rev != nullptr)) { fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_minqual, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncee_rate, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev"); } } auto const filesize = fastx_get_size(forward_handle); if (opt_reverse != nullptr) { reverse_handle = fastx_open(opt_reverse); if (reverse_handle == nullptr) { fatal("Unrecognized file type (not proper FASTA or FASTQ format) for reverse reads"); } if (forward_handle->is_fastq != reverse_handle->is_fastq) { fatal("The forward and reverse input 
sequence must in the same format, either FASTA or FASTQ"); } } std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; std::FILE * fp_fastaout_discarded = nullptr; std::FILE * fp_fastqout_discarded = nullptr; std::FILE * fp_fastaout_rev = nullptr; std::FILE * fp_fastqout_rev = nullptr; std::FILE * fp_fastaout_discarded_rev = nullptr; std::FILE * fp_fastqout_discarded_rev = nullptr; if (opt_fastaout != nullptr) { fp_fastaout = fopen_output(opt_fastaout); if (fp_fastaout == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout != nullptr) { fp_fastqout = fopen_output(opt_fastqout); if (fp_fastqout == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (opt_fastaout_discarded != nullptr) { fp_fastaout_discarded = fopen_output(opt_fastaout_discarded); if (fp_fastaout_discarded == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout_discarded != nullptr) { fp_fastqout_discarded = fopen_output(opt_fastqout_discarded); if (fp_fastqout_discarded == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (reverse_handle != nullptr) { if (opt_fastaout_rev != nullptr) { fp_fastaout_rev = fopen_output(opt_fastaout_rev); if (fp_fastaout_rev == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout_rev != nullptr) { fp_fastqout_rev = fopen_output(opt_fastqout_rev); if (fp_fastqout_rev == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (opt_fastaout_discarded_rev != nullptr) { fp_fastaout_discarded_rev = fopen_output(opt_fastaout_discarded_rev); if (fp_fastaout_discarded_rev == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout_discarded_rev != nullptr) { fp_fastqout_discarded_rev = fopen_output(opt_fastqout_discarded_rev); if (fp_fastqout_discarded_rev == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } } progress_init("Reading input file", 
filesize); int64_t kept = 0; int64_t discarded = 0; int64_t truncated = 0; while (fastx_next(forward_handle, false, chrmap_no_change_vector.data())) { if ((reverse_handle != nullptr) and not fastx_next(reverse_handle, false, chrmap_no_change_vector.data())) { fatal("More forward reads than reverse reads"); } struct analysis_res res1; res1.ee = 0.0; struct analysis_res res2; res1 = analyse(forward_handle); if (reverse_handle != nullptr) { res2 = analyse(reverse_handle); } if (res1.discarded or res2.discarded) { /* discard the sequence(s) */ ++discarded; if (opt_fastaout_discarded != nullptr) { fasta_print_general(fp_fastaout_discarded, nullptr, fastx_get_sequence(forward_handle) + res1.start, res1.length, fastx_get_header(forward_handle), fastx_get_header_length(forward_handle), fastx_get_abundance(forward_handle), discarded, res1.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout_discarded != nullptr) { fastq_print_general(fp_fastqout_discarded, fastx_get_sequence(forward_handle) + res1.start, res1.length, fastx_get_header(forward_handle), fastx_get_header_length(forward_handle), fastx_get_quality(forward_handle) + res1.start, fastx_get_abundance(forward_handle), discarded, res1.ee); } if (reverse_handle != nullptr) { if (opt_fastaout_discarded_rev != nullptr) { fasta_print_general(fp_fastaout_discarded_rev, nullptr, fastx_get_sequence(reverse_handle) + res2.start, res2.length, fastx_get_header(reverse_handle), fastx_get_header_length(reverse_handle), fastx_get_abundance(reverse_handle), discarded, res2.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout_discarded_rev != nullptr) { fastq_print_general(fp_fastqout_discarded_rev, fastx_get_sequence(reverse_handle) + res2.start, res2.length, fastx_get_header(reverse_handle), fastx_get_header_length(reverse_handle), fastx_get_quality(reverse_handle) + res2.start, fastx_get_abundance(reverse_handle), discarded, res2.ee); } } } else { /* keep the sequence(s) */ ++kept; if (res1.truncated or res2.truncated) { ++truncated; } if 
(opt_fastaout != nullptr) { fasta_print_general(fp_fastaout, nullptr, fastx_get_sequence(forward_handle) + res1.start, res1.length, fastx_get_header(forward_handle), fastx_get_header_length(forward_handle), fastx_get_abundance(forward_handle), kept, res1.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout != nullptr) { fastq_print_general(fp_fastqout, fastx_get_sequence(forward_handle) + res1.start, res1.length, fastx_get_header(forward_handle), fastx_get_header_length(forward_handle), fastx_get_quality(forward_handle) + res1.start, fastx_get_abundance(forward_handle), kept, res1.ee); } if (reverse_handle != nullptr) { if (opt_fastaout_rev != nullptr) { fasta_print_general(fp_fastaout_rev, nullptr, fastx_get_sequence(reverse_handle) + res2.start, res2.length, fastx_get_header(reverse_handle), fastx_get_header_length(reverse_handle), fastx_get_abundance(reverse_handle), kept, res2.ee, -1, -1, nullptr, 0.0); } if (opt_fastqout_rev != nullptr) { fastq_print_general(fp_fastqout_rev, fastx_get_sequence(reverse_handle) + res2.start, res2.length, fastx_get_header(reverse_handle), fastx_get_header_length(reverse_handle), fastx_get_quality(reverse_handle) + res2.start, fastx_get_abundance(reverse_handle), kept, res2.ee); } } } progress_update(fastx_get_position(forward_handle)); } progress_done(); if ((reverse_handle != nullptr) and fastx_next(reverse_handle, false, chrmap_no_change_vector.data())) { fatal("More reverse reads than forward reads"); } if (not opt_quiet) { std::fprintf(stderr, "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n", kept, truncated, discarded); } if (opt_log != nullptr) { std::fprintf(fp_log, "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n", kept, truncated, discarded); } if (reverse_handle != nullptr) { if (opt_fastaout_rev != nullptr) { std::fclose(fp_fastaout_rev); } if (opt_fastqout_rev != nullptr) { std::fclose(fp_fastqout_rev); } if 
(opt_fastaout_discarded_rev != nullptr) { std::fclose(fp_fastaout_discarded_rev); } if (opt_fastqout_discarded_rev != nullptr) { std::fclose(fp_fastqout_discarded_rev); } fastx_close(reverse_handle); } if (opt_fastaout != nullptr) { std::fclose(fp_fastaout); } if (opt_fastqout != nullptr) { std::fclose(fp_fastqout); } if (opt_fastaout_discarded != nullptr) { std::fclose(fp_fastaout_discarded); } if (opt_fastqout_discarded != nullptr) { std::fclose(fp_fastqout_discarded); } fastx_close(forward_handle); } // anonymous namespace: limit visibility and usage to this translation unit namespace { auto check_parameters(struct Parameters const & parameters) -> void { auto const is_negative = std::signbit(parameters.opt_fastq_truncee_rate); if (is_negative) { fatal("--fastq_truncee_rate cannot be negative"); } if (parameters.opt_fastq_minqual < 0) { fatal("--fastq_minqual cannot be negative"); } } } // end of anonymous namespace auto fastq_filter(struct Parameters const & parameters) -> void { check_parameters(parameters); filter(true, parameters.opt_fastq_filter); } auto fastx_filter(struct Parameters const & parameters) -> void { check_parameters(parameters); filter(false, parameters.opt_fastx_filter); } vsearch-2.30.1/src/filter.h000066400000000000000000000050571506776541700155060ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto fastq_filter(struct Parameters const & parameters) -> void; auto fastx_filter(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/getseq.cc000066400000000000000000000426051506776541700156470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Implement fastx_getseq, fastx_getseqs and fastx_getsubseq as described here: https://drive5.com/usearch/manual/cmd_fastx_getseqs.html */ #include "vsearch.h" #include "utils/compare_strings_nocase.hpp" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/open_file.hpp" #include "utils/span.hpp" #include // std::copy, std::max, std::min, std::search, std::equal #include #include #include // isalnum #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::snprintf, std::fileno, std::fgets, EOF, std::size_t #include // std::strlen, std::strcpy, std::strstr #include std::vector> labels_data; // anonymous namespace: limit visibility and usage to this translation unit namespace { auto find_length_longest_label(std::vector> const & labels) -> std::size_t { auto longest = std::size_t{0}; for (auto const & label : labels) { longest = std::max(longest, label.size()); } return longest; } } // end of anonymous namespace auto read_labels_file(char * filename) -> void { auto labels_alloc = 0U; auto labels_count = 0U; auto labels_longest = std::size_t{0}; auto fp_labels = open_input_file(filename); if (fp_labels.get() == nullptr) { fatal("Unable to open labels file 
(%s)", filename); } xstat_t file_status; if (xfstat(fileno(fp_labels.get()), & file_status) != 0) { fatal("Unable to get status for labels file (%s)", filename); } auto const is_pipe = S_ISFIFO(file_status.st_mode); // linuxism uint64_t const file_size = is_pipe ? 0: file_status.st_size; progress_init("Reading labels", file_size); static constexpr auto a_memory_chunck = 1024U; while (true) { static constexpr auto buffer_size = 1024U; std::array buffer {{}}; auto * return_value = std::fgets(buffer.data(), buffer_size, fp_labels.get()); if (return_value == nullptr) { break; } auto length = std::strlen(buffer.data()); if ((length != 0) and (buffer[length - 1] == '\n')) { buffer[length - 1] = '\0'; --length; } labels_longest = std::max(length, labels_longest); if (labels_count + 1 > labels_alloc) { labels_alloc += a_memory_chunck; labels_data.resize(labels_alloc); } labels_data[labels_count].resize(length); std::copy(buffer.begin(), std::next(buffer.begin(), length), labels_data[labels_count].begin()); ++labels_count; } progress_done(); // definitive number of labels is known labels_data.resize(labels_count); static constexpr auto max_label_length = std::size_t{1023}; if (labels_longest >= max_label_length) { if (not opt_quiet) { fprintf(stderr, "WARNING: Labels longer than 1023 characters are not supported\n"); } if (opt_log != nullptr) { fprintf(fp_log, "WARNING: Labels longer than 1023 characters are not supported\n"); } } } auto test_label_match(fastx_handle input_handle) -> bool { char const * header = fastx_get_header(input_handle); auto const header_length = fastx_get_header_length(input_handle); int const hlen = static_cast(header_length); auto const header_view = Span{header, header_length}; auto const longest_label = find_length_longest_label(labels_data); std::vector field_buffer; int field_len = 0; if (opt_label_field != nullptr) { field_len = std::strlen(opt_label_field); int field_buffer_size = field_len + 2; if (opt_label_word != nullptr) { 
field_buffer_size += std::strlen(opt_label_word); } else { field_buffer_size += static_cast(longest_label); } field_buffer.resize(field_buffer_size); snprintf(field_buffer.data(), field_buffer_size, "%s=", opt_label_field); } if (opt_label != nullptr) { auto const needle_view = Span{opt_label, std::strlen(opt_label)}; if (opt_label_substr_match) { return (contains_substring(header_view, needle_view)); } return (are_same_string(header_view, needle_view)); } if (opt_labels != nullptr) { if (opt_label_substr_match) { for (auto & label: labels_data) { auto const label_view = Span{label.data(), label.size()}; if (contains_substring(header_view, label_view)) { return true; } } } else { for (auto const & label: labels_data) { if (are_same_string(header_view, label)) { return true; } } } } else if (opt_label_word != nullptr) { char const * needle = opt_label_word; if (opt_label_field != nullptr) { std::strcpy(&field_buffer[field_len + 1], needle); needle = field_buffer.data(); } int const wlen = std::strlen(needle); char const * hit = header_view.data(); while (true) { hit = std::strstr(hit, needle); if (hit == nullptr) { break; } if (opt_label_field != nullptr) { /* check of field */ if (((hit == header) or (*(hit - 1) == ';')) and ((hit + wlen == header + hlen) or (*(hit + wlen) == ';'))) { return true; } } else { /* check of full word */ if (((hit == header) or (isalnum(*(hit - 1)) == 0)) and ((hit + wlen == header + hlen) or (isalnum(*(hit + wlen)) == 0))) { return true; } } ++hit; } } else if (opt_label_words != nullptr) { for (auto & label: labels_data) { char const * needle = label.data(); if (opt_label_field != nullptr) { std::copy(label.begin(), label.end(), &field_buffer[field_len + 1]); needle = field_buffer.data(); } int const wlen = std::strlen(needle); char const * hit = header; while (true) { hit = std::strstr(hit, needle); if (hit == nullptr) { break; } if (opt_label_field != nullptr) { /* check of field */ if (((hit == header) or (*(hit - 1) == ';')) and 
((hit + wlen == header + hlen) or (*(hit + wlen) == ';'))) { return true; } } else { /* check of full word */ if (((hit == header) or (isalnum(*(hit - 1)) == 0)) and ((hit + wlen == header + hlen) or (isalnum(*(hit + wlen)) == 0))) { return true; } } ++hit; } } // end of range-for loop } return false; } auto getseq(struct Parameters const & parameters, char * filename) -> void { if ((opt_fastqout == nullptr) and (opt_fastaout == nullptr) and (opt_notmatched == nullptr) and (opt_notmatchedfq == nullptr)) { fatal("No output files specified"); } if (parameters.opt_fastx_getseq != nullptr) { if (opt_label == nullptr) { fatal("Missing label option"); } } else if (parameters.opt_fastx_getsubseq != nullptr) { if (opt_label == nullptr) { fatal("Missing label option"); } if ((opt_subseq_start < 1) or (opt_subseq_end < 1)) { fatal("The argument to options subseq_start and subseq_end must be at least 1"); } if (opt_subseq_start > opt_subseq_end) { fatal("The argument to option subseq_start must be equal or less than to subseq_end"); } } else if (parameters.opt_fastx_getseqs != nullptr) { int label_options = 0; if (opt_label != nullptr) { ++label_options; } if (opt_labels != nullptr) { ++label_options; } if (opt_label_word != nullptr) { ++label_options; } if (opt_label_words != nullptr) { ++label_options; } if (label_options != 1) { fatal("Specify one label option (label, labels, label_word or label_words)"); } if (opt_labels != nullptr) { read_labels_file(opt_labels); } if (opt_label_words != nullptr) { read_labels_file(opt_label_words); } } fastx_handle h1 = nullptr; h1 = fastx_open(filename); if (h1 == nullptr) { fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); } if (((opt_fastqout != nullptr) or (opt_notmatchedfq != nullptr)) and not (h1->is_fastq or h1->is_empty)) { fatal("Cannot write FASTQ output from FASTA input"); } uint64_t const filesize = fastx_get_size(h1); std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; std::FILE * 
fp_notmatched = nullptr; std::FILE * fp_notmatchedfq = nullptr; if (opt_fastaout != nullptr) { fp_fastaout = fopen_output(opt_fastaout); if (fp_fastaout == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (opt_fastqout != nullptr) { fp_fastqout = fopen_output(opt_fastqout); if (fp_fastqout == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (opt_notmatched != nullptr) { fp_notmatched = fopen_output(opt_notmatched); if (fp_notmatched == nullptr) { fatal("Unable to open FASTA output file (notmatched) for writing"); } } if (opt_notmatchedfq != nullptr) { fp_notmatchedfq = fopen_output(opt_notmatchedfq); if (fp_notmatchedfq == nullptr) { fatal("Unable to open FASTQ output file (notmatchedfq) for writing"); } } progress_init("Extracting sequences", filesize); int64_t kept = 0; int64_t discarded = 0; while (fastx_next(h1, opt_notrunclabels == 0, chrmap_no_change_vector.data())) { bool const match = test_label_match(h1); int64_t start = 1; int64_t end = fastx_get_sequence_length(h1); if (parameters.opt_fastx_getsubseq != nullptr) { start = std::max(opt_subseq_start, start); end = std::min(opt_subseq_end, end); } int64_t const length = end - start + 1; if (match) { /* keep the sequence(s) */ ++kept; if (opt_fastaout != nullptr) { fasta_print_general(fp_fastaout, nullptr, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_abundance(h1), kept, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout != nullptr) { fastq_print_general(fp_fastqout, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_quality(h1) + start - 1, fastx_get_abundance(h1), kept, -1.0); } } else { /* discard the sequence */ ++discarded; if (opt_notmatched != nullptr) { fasta_print_general(fp_notmatched, nullptr, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_abundance(h1), discarded, -1.0, -1, -1, 
nullptr, 0.0); } if (opt_notmatchedfq != nullptr) { fastq_print_general(fp_notmatchedfq, fastx_get_sequence(h1) + start - 1, length, fastx_get_header(h1), fastx_get_header_length(h1), fastx_get_quality(h1) + start - 1, fastx_get_abundance(h1), discarded, -1.0); } } progress_update(fastx_get_position(h1)); } progress_done(); if (not opt_quiet) { fprintf(stderr, "%" PRId64 " of %" PRId64 " sequences extracted", kept, kept + discarded); if (kept + discarded > 0) { fprintf(stderr, " (%.1lf%%)", 100.0 * kept / (kept + discarded)); } fprintf(stderr, "\n"); } if (opt_log != nullptr) { fprintf(fp_log, "%" PRId64 " of %" PRId64 " sequences extracted", kept, kept + discarded); if (kept + discarded > 0) { fprintf(fp_log, " (%.1lf%%)", 100.0 * kept / (kept + discarded)); } fprintf(fp_log, "\n"); } if (opt_fastaout != nullptr) { fclose(fp_fastaout); } if (opt_fastqout != nullptr) { fclose(fp_fastqout); } if (opt_notmatched != nullptr) { fclose(fp_notmatched); } if (opt_notmatchedfq != nullptr) { fclose(fp_notmatchedfq); } fastx_close(h1); } auto fastx_getseq(struct Parameters const & parameters) -> void { getseq(parameters, parameters.opt_fastx_getseq); } auto fastx_getseqs(struct Parameters const & parameters) -> void { getseq(parameters, parameters.opt_fastx_getseqs); } auto fastx_getsubseq(struct Parameters const & parameters) -> void { getseq(parameters, parameters.opt_fastx_getsubseq); } vsearch-2.30.1/src/getseq.h000066400000000000000000000051641506776541700155100ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto fastx_getseq(struct Parameters const & parameters) -> void; auto fastx_getseqs(struct Parameters const & parameters) -> void; auto fastx_getsubseq(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/kmerhash.cc000066400000000000000000000173331506776541700161610ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "city.h" #include "utils/kmer_hash_struct.hpp" #include "utils/maps.hpp" #include // std::max #include using Hash = decltype(&CityHash64); static Hash hash_function = CityHash64; // anonymous namespace: limit visibility and usage to this translation unit namespace { auto reset_buckets(std::vector & hash) -> void { auto const current_size = hash.size(); hash.clear(); hash.resize(current_size); } } // end of anonymous namespace inline auto kh_insert_kmer(struct kh_handle_s & kmer_hash, int const k_offset, unsigned int const kmer, unsigned int const pos) -> void { /* find free bucket in hash */ auto bucket = hash_function((char *) &kmer, (k_offset + 3) / 4) & kmer_hash.hash_mask; while (kmer_hash.hash[bucket].pos != 0U) { bucket = (bucket + 1) & kmer_hash.hash_mask; } kmer_hash.hash[bucket].kmer = kmer; kmer_hash.hash[bucket].pos = pos; } auto kh_insert_kmers(struct kh_handle_s & kmer_hash, int const k_offset, char const * seq, int const len) -> void { int const kmers = 1U << (2U * k_offset); unsigned int const kmer_mask = kmers - 1; reset_buckets(kmer_hash.hash); /* reallocate hash table if necessary */ if (kmer_hash.alloc < 2 * len) { while (kmer_hash.alloc < 2 * len) { kmer_hash.alloc *= 2; 
} kmer_hash.hash.resize(kmer_hash.alloc); } kmer_hash.size = 1; while (kmer_hash.size < 2 * len) { kmer_hash.size *= 2; } kmer_hash.hash_mask = kmer_hash.size - 1; kmer_hash.maxpos = len; unsigned int bad = kmer_mask; unsigned int kmer = 0; char const * s = seq; for (int pos = 0; pos < len; pos++) { int const c = *s; ++s; bad <<= 2ULL; bad |= map_mask_ambig(c); bad &= kmer_mask; kmer <<= 2ULL; kmer |= map_2bit(c); kmer &= kmer_mask; if (bad == 0U) { /* 1-based pos of start of kmer */ kh_insert_kmer(kmer_hash, k_offset, kmer, pos - k_offset + 1 + 1); } } } auto kh_find_best_diagonal(struct kh_handle_s & kmer_hash, int const k_offset, char const * seq, int const len) -> int { std::vector diag_counts(kmer_hash.maxpos, 0); int const kmers = 1U << (2U * k_offset); unsigned int const kmer_mask = kmers - 1; unsigned int bad = kmer_mask; unsigned int kmer = 0; char const * seq_cursor = seq + len - 1; for (int pos = 0; pos < len; pos++) { int const nucleotide = *seq_cursor; --seq_cursor; bad <<= 2ULL; bad |= map_mask_ambig(nucleotide); bad &= kmer_mask; kmer <<= 2ULL; kmer |= map_2bit(map_complement(nucleotide)); kmer &= kmer_mask; if (bad == 0U) { /* find matching buckets in hash */ unsigned int j = hash_function((char *) &kmer, (k_offset + 3) / 4) & kmer_hash.hash_mask; while (kmer_hash.hash[j].pos != 0U) { if (kmer_hash.hash[j].kmer == kmer) { int const fpos = kmer_hash.hash[j].pos - 1; int const diag = fpos - (pos - k_offset + 1); if (diag >= 0) { ++diag_counts[diag]; } } j = (j + 1) & kmer_hash.hash_mask; } } } int best_diag_count = -1; int best_diag = -1; int good_diags = 0; for (int d = 0; d < kmer_hash.maxpos - k_offset + 1; d++) { int const diag_len = kmer_hash.maxpos - d; int const minmatch = std::max(1, diag_len - k_offset + 1 - (k_offset * std::max(diag_len / 20, 0))); int const c = diag_counts[d]; if (c >= minmatch) { ++good_diags; } if (c > best_diag_count) { best_diag_count = c; best_diag = d; } } if (good_diags == 1) { return best_diag; } return -1; } auto 
kh_find_diagonals(struct kh_handle_s & kmer_hash, int const k_offset, char const * seq, int const len, std::vector & diags) -> void { int const kmers = 1U << (2U * k_offset); unsigned int const kmer_mask = kmers - 1; unsigned int bad = kmer_mask; unsigned int kmer = 0; char const * seq_cursor = seq + len - 1; for (int pos = 0; pos < len; pos++) { int const nucleotide = *seq_cursor--; bad <<= 2ULL; bad |= map_mask_ambig(nucleotide); bad &= kmer_mask; kmer <<= 2ULL; kmer |= map_2bit(map_complement(nucleotide)); kmer &= kmer_mask; if (bad == 0U) { /* find matching buckets in hash */ unsigned int j = hash_function((char *) &kmer, (k_offset + 3) / 4) & kmer_hash.hash_mask; while (kmer_hash.hash[j].pos != 0U) { if (kmer_hash.hash[j].kmer == kmer) { int const fpos = kmer_hash.hash[j].pos - 1; int const diag = len + fpos - (pos - k_offset + 1); if (diag >= 0) { ++diags[diag]; } } j = (j + 1) & kmer_hash.hash_mask; } } } } vsearch-2.30.1/src/kmerhash.h000066400000000000000000000055651506776541700160270ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include auto kh_insert_kmers(struct kh_handle_s & kmer_hash, int k_offset, char const * seq, int len) -> void; auto kh_find_best_diagonal(struct kh_handle_s & kmer_hash, int k_offset, char const * seq, int len) -> int; auto kh_find_diagonals(struct kh_handle_s & kmer_hash, int k_offset, char const * seq, int len, std::vector & diags) -> void; vsearch-2.30.1/src/linmemalign.cc000066400000000000000000000517531506776541700166570ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "linmemalign.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include // std::max #include // macros PRIu64 and PRId64 #include // int64_t #include // std::printf, std::size_t, std::snprintf, std::sscanf #include // #include /* Compute the optimal global alignment of two sequences in linear space using the divide and conquer method. These functions are based on the following articles: - Hirschberg (1975) Comm ACM 18:341-343 - Myers & Miller (1988) CABIOS 4:11-17 The method has been adapted for the use of different gap penalties for query/target/left/interior/right gaps. 
scorematrix consists of 16x16 int64_t integers Sequences and alignment matrix: A/a/i/query/q/downwards/vertical/top/bottom B/b/j/target/t/rightwards/horizontal/left/right f corresponds to score ending with gap in A/query EE corresponds to score ending with gap in B/target */ constexpr auto minimal_length = int64_t{64}; LinearMemoryAligner::LinearMemoryAligner(struct Scoring const & scoring) : go_q_l(scoring.gap_open_query_left), go_t_l(scoring.gap_open_target_left), go_q_i(scoring.gap_open_query_interior), go_t_i(scoring.gap_open_target_interior), go_q_r(scoring.gap_open_query_right), go_t_r(scoring.gap_open_target_right), ge_q_l(scoring.gap_extension_query_left), ge_t_l(scoring.gap_extension_target_left), ge_q_i(scoring.gap_extension_query_interior), ge_t_i(scoring.gap_extension_target_interior), ge_q_r(scoring.gap_extension_query_right), ge_t_r(scoring.gap_extension_target_right) { scorematrix_fill(scoring); } /* Expected score matrix (if option N is mismatch): 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - A C M G R S V T W Y H K D B N 0 - M X X 0 X 0 0 0 X 0 0 0 0 0 0 X 1 A X M X 0 X 0 0 0 X 0 0 0 0 0 0 X 2 C X X M 0 X 0 0 0 X 0 0 0 0 0 0 X 3 M 0 0 0 M 0 0 0 0 0 0 0 0 0 0 0 X 4 G X X X 0 M 0 0 0 X 0 0 0 0 0 0 X 5 R 0 0 0 0 0 M 0 0 0 0 0 0 0 0 0 X 6 S 0 0 0 0 0 0 M 0 0 0 0 0 0 0 0 X 7 V 0 0 0 0 0 0 0 M 0 0 0 0 0 0 0 X 8 T X X X 0 X 0 0 0 M 0 0 0 0 0 0 X 9 W 0 0 0 0 0 0 0 0 0 M 0 0 0 0 0 X 10 Y 0 0 0 0 0 0 0 0 0 0 M 0 0 0 0 X 11 H 0 0 0 0 0 0 0 0 0 0 0 M 0 0 0 X 12 K 0 0 0 0 0 0 0 0 0 0 0 0 M 0 0 X 13 D 0 0 0 0 0 0 0 0 0 0 0 0 0 M 0 X 14 B 0 0 0 0 0 0 0 0 0 0 0 0 0 0 M X 15 N X X X X X X X X X X X X X X X M M = match, X = mismatch Map from ascii to 4-bit nucleotide code -: 0 A: 1 C: 2 M: 3 G: 4 R: 5 S: 6 V: 7 T: 8 W: 9 Y: 10 H: 11 K: 12 D: 13 B: 14 N: 15 */ auto LinearMemoryAligner::scorematrix_fill(struct Scoring const & scoring) -> void { std::vector const nucleotides = {'-', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}; // fill-in the 
score matrix for (auto const row_nuc : nucleotides) { auto const row = map_4bit(row_nuc); for (auto const column_nuc : nucleotides) { auto const column = map_4bit(column_nuc); if (is_ambiguous_4bit(row) or is_ambiguous_4bit(column)) { continue; // then score is 0; already zero-initialized } if (row == column) { // diagonal scorematrix[row][column] = scoring.match; } else { scorematrix[row][column] = scoring.mismatch; } } } // if alignment with N is set to be a mismatch if (opt_n_mismatch) { // last column for (auto & row : scorematrix) { row.back() = scoring.mismatch; } // last row auto & last_row = scorematrix.back(); std::fill(last_row.begin(), last_row.end(), scoring.mismatch); } } auto LinearMemoryAligner::alloc_vectors(std::size_t const size) -> void { if (vector_alloc >= size) { return; } vector_alloc = size; HH.resize(vector_alloc); EE.resize(vector_alloc); XX.resize(vector_alloc); YY.resize(vector_alloc); } auto LinearMemoryAligner::cigar_reset() -> void { if (cigar_alloc < 1) { cigar_alloc = minimal_length; cigar_string.resize(cigar_alloc); } cigar_string[0] = '\0'; cigar_length = 0; op = '\0'; op_run = 0; } auto LinearMemoryAligner::cigar_flush() -> void { if (op_run <= 0) { return; } while (true) { /* try writing string until enough memory has been allocated */ auto const rest = cigar_alloc - cigar_length; auto n = 0; if (op_run > 1) { n = std::snprintf(&cigar_string[cigar_length], rest, "%" PRId64 "%c", op_run, op); } else { n = std::snprintf(&cigar_string[cigar_length], rest, "%c", op); } if (n < 0) { fatal("snprintf returned a negative number.\n"); } else if (n >= rest) { cigar_alloc += std::max(n - rest + 1, minimal_length); cigar_string.resize(cigar_alloc); } else { cigar_length += n; break; } } } auto LinearMemoryAligner::subst_score(char const lhs, char const rhs) -> int64_t { /* return substitution score for replacing char lhs (sequence a), with char rhs (sequence b) */ return scorematrix[map_4bit(rhs)][map_4bit(lhs)]; } auto 
LinearMemoryAligner::cigar_add(char _op, int64_t run) -> void { if (op == _op) { op_run += run; } else { cigar_flush(); op = _op; op_run = run; } } auto LinearMemoryAligner::diff(int64_t a_start, int64_t b_start, int64_t a_len, int64_t b_len, bool gap_b_left, /* gap open left of b */ bool gap_b_right, /* gap open right of b */ bool a_left, /* includes left end of a */ bool a_right, /* includes right end of a */ bool b_left, /* includes left end of b */ bool b_right) -> void /* includes right end of b */ { static constexpr auto int64_min = std::numeric_limits::min(); // auto span_A = Span{std::next(a_seq, a_start), a_len}; // auto span_B = Span{std::next(b_seq, b_start), b_len}; if (b_len == 0) { /* B and possibly A is empty */ if (a_len > 0) { // Delete a_len from A // AAA // --- cigar_add('D', a_len); } } else if (a_len == 0) { /* A is empty, B is not */ // Delete b_len from B // --- // BBB cigar_add('I', b_len); } else if (a_len == 1) { /* Convert 1 symbol from A to b_len symbols from B b_len >= 1 */ int64_t MaxScore = 0; int64_t best = 0; int64_t Score = 0; /* First possibility */ // Delete 1 from A, Insert b_len from B // A---- // -BBBB /* gap penalty for gap in B of length 1 */ if (not gap_b_left) { Score -= b_left ? go_t_l : go_t_i; } Score -= b_left ? ge_t_l : ge_t_i; /* gap penalty for gap in A of length b_len */ Score -= a_right ? go_q_r + (b_len * ge_q_r) : go_q_i + (b_len * ge_q_i); MaxScore = Score; best = -1; /* Second possibility */ // Insert b_len from B, Delete 1 from A // ----A // BBBB- /* gap penalty for gap in A of length b_len */ Score -= a_left ? go_q_l + (b_len * ge_q_l) : go_q_i + (b_len * ge_q_i); /* gap penalty for gap in B of length 1 */ if (not gap_b_right) { Score -= b_right ? go_t_r : go_t_i; } Score -= b_right ? 
ge_t_r : ge_t_i; if (Score > MaxScore) { MaxScore = Score; best = b_len; } /* Third possibility */ for (int64_t i = 0; i < b_len; i++) { // Insert zero or more from B, replace 1, insert rest of B // -A-- // BBBB Score = 0; if (i > 0) { Score -= a_left ? go_q_l + (i * ge_q_l) : go_q_i + (i * ge_q_i); } Score += subst_score(a_seq[a_start], b_seq[b_start + i]); if (i < b_len - 1) { Score -= a_right ? go_q_r + ((b_len - 1 - i) * ge_q_r) : go_q_i + ((b_len - 1 - i) * ge_q_i); } if (Score > MaxScore) { MaxScore = Score; best = i; } } if (best == -1) { cigar_add('D', 1); cigar_add('I', b_len); } else if (best == b_len) { cigar_add('I', b_len); cigar_add('D', 1); } else { if (best > 0) { cigar_add('I', best); } cigar_add('M', 1); if (best < b_len - 1) { cigar_add('I', b_len - 1 - best); } } } else { /* a_len >= 2, b_len >= 1 */ int64_t const I = a_len / 2; // rename: median? // Compute HH & EE in forward phase // Upper part /* initialize HH and EE for values corresponding to empty seq A vs B of i symbols, i.e. a gap of length i in A */ HH[0] = 0; EE[0] = 0; for (int64_t i = 1; i <= b_len; i++) { HH[i] = - (a_left ? go_q_l + (i * ge_q_l) : go_q_i + (i * ge_q_i)); EE[i] = int64_min; } /* compute matrix */ for (int64_t i = 1; i <= I; i++) { auto p = HH[0]; int64_t h = - (b_left ? (gap_b_left ? 0 : go_t_l) + (i * ge_t_l) : (gap_b_left ? 0 : go_t_i) + (i * ge_t_i)); HH[0] = h; auto f = int64_min; for (int64_t j = 1; j <= b_len; j++) { f = std::max(f, h - go_q_i) - ge_q_i; if (b_right and (j == b_len)) { EE[j] = std::max(EE[j], HH[j] - go_t_r) - ge_t_r; } else { EE[j] = std::max(EE[j], HH[j] - go_t_i) - ge_t_i; } h = p + subst_score(a_seq[a_start + i - 1], b_seq[b_start + j - 1]); h = std::max(f, h); h = std::max(EE[j], h); p = HH[j]; HH[j] = h; } } EE[0] = HH[0]; // Compute XX & YY in reverse phase // Lower part /* initialize XX and YY */ XX[0] = 0; YY[0] = 0; for (int64_t i = 1; i <= b_len; i++) { XX[i] = - (a_right ? 
go_q_r + (i * ge_q_r) : go_q_i + (i * ge_q_i)); YY[i] = int64_min; } /* compute matrix */ for (int64_t i = 1; i <= a_len - I; i++) { auto p = XX[0]; int64_t h = - (b_right ? (gap_b_right ? 0 : go_t_r) + (i * ge_t_r) : (gap_b_right ? 0 : go_t_i) + (i * ge_t_i)); XX[0] = h; auto f = int64_min; for (int64_t j = 1; j <= b_len; j++) { f = std::max(f, h - go_q_i) - ge_q_i; if (b_left and (j == b_len)) { YY[j] = std::max(YY[j], XX[j] - go_t_l) - ge_t_l; } else { YY[j] = std::max(YY[j], XX[j] - go_t_i) - ge_t_i; } h = p + subst_score(a_seq[a_start + a_len - i], b_seq[b_start + b_len - j]); h = std::max(f, h); h = std::max(YY[j], h); p = XX[j]; XX[j] = h; } } YY[0] = XX[0]; /* find maximum score along division line */ auto MaxScore0 = int64_min; int64_t best0 = -1; /* solutions with diagonal at break */ for (int64_t i = 0; i <= b_len; i++) { auto const Score = HH[i] + XX[b_len - i]; if (Score > MaxScore0) { MaxScore0 = Score; best0 = i; } } auto MaxScore1 = int64_min; int64_t best1 = -1; /* solutions that end with a gap in b from both ends at break */ for (int64_t i = 0; i <= b_len; i++) { int64_t g = 0; if (b_left and (i == 0)) { g = go_t_l; } else if (b_right and (i == b_len)) { g = go_t_r; } else { g = go_t_i; } auto const Score = EE[i] + YY[b_len - i] + g; if (Score > MaxScore1) { MaxScore1 = Score; best1 = i; } } int64_t P = 0; // rename: is_parted?? convert to bool? 
int64_t best = 0; if (MaxScore0 > MaxScore1) { P = 0; best = best0; } else if (MaxScore1 > MaxScore0) { P = 1; best = best1; } else { if (best0 <= best1) { P = 0; best = best0; } else { P = 1; best = best1; } } /* recursively compute upper left and lower right parts */ if (P == 0) { diff(a_start, b_start, I, best, gap_b_left, false, a_left, false, b_left, b_right and (best == b_len)); diff(a_start + I, b_start + best, a_len - I, b_len - best, false, gap_b_right, false, a_right, b_left and (best == 0), b_right); } else if (P == 1) { diff(a_start, b_start, I - 1, best, gap_b_left, true, a_left, false, b_left, b_right and (best == b_len)); cigar_add('D', 2); diff(a_start + I + 1, b_start + best, a_len - I - 1, b_len - best, true, gap_b_right, false, a_right, b_left and (best == 0), b_right); } } } auto LinearMemoryAligner::align(char * _a_seq, char * _b_seq, int64_t a_len, int64_t b_len) -> char * { /* copy parameters */ a_seq = _a_seq; b_seq = _b_seq; /* init cigar operations */ cigar_reset(); /* allocate enough memory for vectors */ alloc_vectors(b_len + 1); /* perform alignment */ diff(0, 0, a_len, b_len, false, false, true, true, true, true); /* ensure entire cigar has been written */ cigar_flush(); /* return cigar */ return cigar_string.data(); } auto LinearMemoryAligner::alignstats(char * cigar, char * _a_seq, char * _b_seq, int64_t * _nwscore, int64_t * _nwalignmentlength, int64_t * _nwmatches, int64_t * _nwmismatches, int64_t * _nwgaps) -> void { static constexpr auto is_N = 15; // 4-bit code for 'N' or 'n' a_seq = _a_seq; b_seq = _b_seq; int64_t nwscore = 0; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; int64_t a_pos = 0; int64_t b_pos = 0; auto * p = cigar; int64_t g = 0; while (*p != '\0') { int64_t runlength = 1; auto scanlength = 0; std::sscanf(p, "%" PRId64 "%n", &runlength, &scanlength); p += scanlength; switch (*p++) { case 'M': nwalignmentlength += runlength; for (int64_t k = 0; k < runlength; k++) 
{ auto const a_nuc = a_seq[a_pos]; auto const b_nuc = b_seq[b_pos]; nwscore += subst_score(a_nuc, b_nuc); if (opt_n_mismatch and ((map_4bit(a_nuc) == is_N) or (map_4bit(b_nuc) == is_N))) { ++nwmismatches; } else if ((map_4bit(a_nuc) & map_4bit(b_nuc)) != 0U) { ++nwmatches; } else { ++nwmismatches; } ++a_pos; ++b_pos; } break; case 'I': if ((a_pos == 0) and (b_pos == 0)) { g = go_q_l + (runlength * ge_q_l); } else if (*p == '\0') // last operation? { g = go_q_r + (runlength * ge_q_r); } else { g = go_q_i + (runlength * ge_q_i); } nwscore -= g; ++nwgaps; nwalignmentlength += runlength; b_pos += runlength; break; case 'D': if ((a_pos == 0) and (b_pos == 0)) { g = go_t_l + (runlength * ge_t_l); } else if (*p == '\0') // last operation? { g = go_t_r + (runlength * ge_t_r); } else { g = go_t_i + (runlength * ge_t_i); } nwscore -= g; ++nwgaps; nwalignmentlength += runlength; a_pos += runlength; break; } // end of switch } // end of cigar parsing *_nwscore = nwscore; *_nwalignmentlength = nwalignmentlength; *_nwmatches = nwmatches; *_nwmismatches = nwmismatches; *_nwgaps = nwgaps; } // TODO: include guards span.hpp, linmemalign.h *DONE* // scorematrix as vector of vectors? fix scorematrix_create? *DONE* // pass nucleotides to subst_score(char const lhs, char const rhs) *DONE* // struct scoring as class private member (rename struct members everywhere), // inject struct Span in diff(), // pass a Pair // design a struct Pair? // struct Pair { // Span seq_A; // Span seq_B; // }; vsearch-2.30.1/src/linmemalign.h000066400000000000000000000131631506776541700165120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef LINMEMALIGN_HPP #define LINMEMALIGN_HPP #include // std::FILE, std::size_t #include // int64_t #include struct Scoring { // aligned nucleotides int64_t match = 0; int64_t mismatch = 0; // general gap penalties int64_t gap_open_query_interior = 0; // q int64_t gap_extension_query_interior = 0; // r // specific gap penalties int64_t gap_open_query_left = 0; // go_q_l int64_t gap_open_target_left = 0; // go_t_l // gap_open_query_interior // go_q_i int64_t gap_open_target_interior = 0; // go_t_i int64_t gap_open_query_right = 0; // go_q_r int64_t gap_open_target_right = 0; // go_t_r int64_t gap_extension_query_left = 0; // ge_q_l int64_t gap_extension_target_left = 0; // ge_t_l // gap_extension_query_interior // ge_q_i int64_t gap_extension_target_interior = 0; // ge_t_i int64_t gap_extension_query_right = 0; // ge_q_r int64_t gap_extension_target_right = 0; // ge_t_r }; class LinearMemoryAligner { private: static constexpr auto matrix_size = 16U; char op = '\0'; int64_t op_run = 0; int64_t cigar_alloc = 0; int64_t cigar_length = 0; std::vector cigar_string; char * a_seq = nullptr; char * b_seq = nullptr; // initialize a 16x16 matrix std::vector> scorematrix = std::vector>(matrix_size, std::vector(matrix_size)); /* gap penalties for open/extension query/target left/interior/right */ int64_t go_q_l = 0; int64_t go_t_l = 0; int64_t go_q_i = 0; int64_t go_t_i = 0; int64_t go_q_r = 0; int64_t go_t_r = 0; int64_t ge_q_l = 0; int64_t ge_t_l = 0; int64_t 
ge_q_i = 0; int64_t ge_t_i = 0; int64_t ge_q_r = 0; int64_t ge_t_r = 0; std::size_t vector_alloc = 0; std::vector HH; std::vector EE; std::vector XX; std::vector YY; // initializers auto scorematrix_fill(struct Scoring const & scoring) -> void; auto cigar_reset() -> void; auto cigar_flush() -> void; auto cigar_add(char _op, int64_t run) -> void; auto subst_score(char lhs, char rhs) -> int64_t; auto diff(int64_t a_start, int64_t b_start, int64_t a_len, int64_t b_len, bool gap_b_left, /* gap open left of b */ bool gap_b_right, /* gap open right of b */ bool a_left, /* includes left end of a */ bool a_right, /* includes right end of a */ bool b_left, /* includes left end of b */ bool b_right) -> void; /* includes right end of b */ auto alloc_vectors(std::size_t size) -> void; public: explicit LinearMemoryAligner(struct Scoring const & scoring); auto align(char * _a_seq, char * _b_seq, int64_t a_len, int64_t b_len) -> char *; auto alignstats(char * cigar, char * a_seq, char * b_seq, int64_t * nwscore, int64_t * nwalignmentlength, int64_t * nwmatches, int64_t * nwmismatches, int64_t * nwgaps) -> void; }; #endif // LINMEMALIGN_HPP vsearch-2.30.1/src/mask.cc000066400000000000000000000313101506776541700153010ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "vsearch.h"
#include "mask.h"
#include "utils/check_output_filehandle.hpp"
#include "utils/fatal.hpp"
#include "utils/maps.hpp"
#include "utils/open_file.hpp"
#include "utils/xpthread.hpp"
// NOTE(review): the header names of the system includes below are missing
// in this copy (text between angle brackets appears to have been stripped,
// as are the template arguments of std::array and std::vector further
// down) -- restore them from the upstream file before compiling.
#include
#include // std::toupper, std::isupper
#include // int64_t, uint64_t
#include // std::FILE
#include // std::strcpy
#include //
#include
#include


// length of the window examined by each call to wo()
constexpr int dust_window = 64;


// Search the first `len` symbols of `s` for the region with the highest
// repetitiveness score, based on counts of repeated 3-symbol words.
// The bounds of the best region are stored in *beg and *end, and its score
// is returned. Returns 0, without writing to *beg and *end, when the input
// is too short.
auto wo(int len, const char *s, int *beg, int *end) -> int
{
  static constexpr auto dust_word = 3;
  static constexpr auto word_count = 1U << (2U * dust_word); // 64
  static constexpr auto bitmask = word_count - 1;
  const auto l1 = len - dust_word + 1 - 5; /* smallest possible region is 8 */
  if (l1 < 0)
    {
      return 0;
    }

  auto bestv = 0;
  auto besti = 0;
  auto bestj = 0;
  std::array counts {{}};
  std::array words {{}};
  auto word = 0;

  // words[j]: 2-bit encoding of the (up to) three symbols ending at position j
  for (auto j = 0; j < len; j++)
    {
      word <<= 2U;
      word |= map_2bit(s[j]);
      words[j] = word & bitmask;
    }

  // for every start position i, extend the region one symbol at a time
  for (auto i = 0; i < l1; i++)
    {
      counts.fill(0); // reset counts to zero

      auto sum = 0;

      for (auto j = dust_word - 1; j < len - i; j++)
        {
          word = words[i + j];
          const auto c = counts[word];
          if (c != 0)
            {
              // a word already seen c times adds c to the running sum
              sum += c;
              const auto v = 10 * sum / j;

              if (v > bestv)
                {
                  bestv = v;
                  besti = i;
                  bestj = j;
                }
            }
          ++counts[word];
        }
    }

  *beg = besti;
  *end = besti + bestj;

  return bestv;
}


// Mask, in place, the regions of `seq` (len symbols) that wo() scores above
// dust_level. Windows of dust_window symbols are examined in steps of half
// a window. Masked symbols become 'N' when opt_hardmask is set; otherwise
// the sequence is first upper-cased and masked symbols are lower-cased.
auto dust(char * seq, int len) -> void
{
  static constexpr auto dust_level = 20;
  static constexpr auto half_dust_window = dust_window / 2;
  auto a = 0;
  auto b = 0;

  /* make a local copy of the original sequence */
  std::vector local_seq(len + 1);
  strcpy(local_seq.data(), seq);
  // refactoring:
  // std::string local_seq2;
  // local_seq2.reserve(len + 1);
  // local_seq2.insert(0, m);
  // local_seq2.insert(len, 1, '\0');

  if (opt_hardmask == 0)
    {
      /* convert sequence to upper case unless hardmask in effect */
      for (auto i = 0; i < len; i++)
        {
          seq[i] = toupper(seq[i]);
        }
      seq[len] = 0;
    }

  for (auto i = 0; i < len; i += half_dust_window)
    {
      // the last window may be shorter than dust_window
      const auto l = (len > i + dust_window) ? dust_window : len - i;
      const auto v = wo(l, &local_seq[i], &a, &b);

      if (v > dust_level)
        {
          if (opt_hardmask != 0)
            {
              for (auto j = a + i; j <= b + i; j++)
                {
                  seq[j] = 'N';
                }
            }
          else
            {
              for (auto j = a + i; j <= b + i; j++)
                {
                  seq[j] = local_seq[j] | 32U; // check_5th_bit (0x20)
                }
            }

          if (b < half_dust_window)
            {
              i += half_dust_window - b;
            }
        }
    }
}


// state shared by dust_all() and its worker threads
static pthread_t * pthread;
static pthread_attr_t attr;
static pthread_mutex_t mutex;
static auto nextseq = 0;   // next sequence number to hand out (guarded by mutex)
static auto seqcount = 0;  // number of sequences in the database


// Thread entry point: repeatedly take the next unprocessed sequence number
// under the mutex and run dust() on that sequence, until all sequences
// have been handed out.
auto dust_all_worker(void * vp) -> void *
{
  (void) vp; // not used, but required for thread creation
  while (true)
    {
      xpthread_mutex_lock(&mutex);
      const auto seqno = nextseq;
      if (seqno < seqcount)
        {
          ++nextseq;
          progress_update(seqno);
          xpthread_mutex_unlock(&mutex);
          // masking itself runs outside the lock
          dust(db_getsequence(seqno), db_getsequencelen(seqno));
        }
      else
        {
          xpthread_mutex_unlock(&mutex);
          break;
        }
    }
  return nullptr;
}


// Run dust() on every sequence of the database, using opt_threads worker
// threads, and wait for all of them to finish.
auto dust_all() -> void
{
  nextseq = 0;
  seqcount = db_getsequencecount();
  progress_init("Masking", seqcount);

  xpthread_mutex_init(&mutex, nullptr);

  xpthread_attr_init(&attr);
  xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

  std::vector pthread_v(opt_threads);
  pthread = pthread_v.data();

  for (auto t = 0; t < opt_threads; t++)
    {
      xpthread_create(&pthread_v[t], &attr, dust_all_worker, (void *) (int64_t) t);
    }

  for (auto t = 0; t < opt_threads; t++)
    {
      xpthread_join(pthread_v[t], nullptr);
    }

  xpthread_attr_destroy(&attr);

  xpthread_mutex_destroy(&mutex);

  progress_done();
}


// Replace, in place, every symbol of `seq` (len symbols) that has bit 0x20
// set with 'N'.
auto hardmask(char * seq, int len) -> void
{
  /* convert all lower case letters in seq to N */

  // auto const * const end = std::next(seq, len);
  // refactoring: std::transform(seq, end, seq, [](unsigned char nuc){ if (std::islower(nuc) != 0) { return hardmask_char; } return nuc;});
  static constexpr auto check_5th_bit = 32U; // 0x20
  static constexpr auto hardmask_char = 'N';
  for (auto i = 0; i < len; i++)
    {
      if ((seq[i] & check_5th_bit) != 0U)
        {
          seq[i] = hardmask_char;
        }
    }
}


// Apply hardmask() to every sequence of the database.
auto hardmask_all() -> void
{
  for (uint64_t i = 0; i < db_getsequencecount(); i++)
    {
      hardmask(db_getsequence(i), db_getsequencelen(i));
    }
}


// Read the sequences named by opt_maskfasta, mask them according to
// opt_qmask and opt_hardmask, and write all of them, relabelled, to
// opt_output.
auto maskfasta(struct Parameters const & parameters) -> void
{
  auto const output_handle = open_output_file(parameters.opt_output);
  check_mandatory_output_handle(parameters.opt_output, (not output_handle));

  db_read(parameters.opt_maskfasta, 0);
  show_rusage();

  seqcount = db_getsequencecount();

  if (parameters.opt_qmask == MASK_DUST)
    {
      dust_all();
    }
  else if ((parameters.opt_qmask == MASK_SOFT) && parameters.opt_hardmask)
    {
      hardmask_all();
    }
  show_rusage();

  progress_init("Writing output", seqcount);
  for (auto i = 0; i < seqcount; i++)
    {
      fasta_print_db_relabel(output_handle.get(), i, i + 1);
      progress_update(i);
    }
  progress_done();
  show_rusage();

  db_free();
}


// Read the sequences named by opt_fastx_mask, mask them, and write to
// opt_fastaout and/or opt_fastqout the sequences whose percentage of
// unmasked residues lies within [opt_min_unmasked_pct,
// opt_max_unmasked_pct]. Counts of kept and discarded sequences are
// reported on stderr (unless opt_quiet) and in the log file (if opt_log).
auto fastx_mask(struct Parameters const & parameters) -> void
{
  std::FILE * fp_fastaout = nullptr;
  std::FILE * fp_fastqout = nullptr;

  if ((parameters.opt_fastaout == nullptr) && (parameters.opt_fastqout == nullptr))
    {
      fatal("Specify output files for masking with --fastaout and/or --fastqout");
    }

  if (parameters.opt_fastaout != nullptr)
    {
      fp_fastaout = fopen_output(parameters.opt_fastaout);
      if (fp_fastaout == nullptr)
        {
          fatal("Unable to open mask output FASTA file for writing");
        }
    }

  if (parameters.opt_fastqout != nullptr)
    {
      fp_fastqout = fopen_output(parameters.opt_fastqout);
      if (fp_fastqout == nullptr)
        {
          fatal("Unable to open mask output FASTQ file for writing");
        }
    }

  db_read(parameters.opt_fastx_mask, 0);
  show_rusage();

  if ((fp_fastqout != nullptr) && ! db_is_fastq())
    {
      fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores");
    }

  seqcount = db_getsequencecount();

  if (parameters.opt_qmask == MASK_DUST)
    {
      dust_all();
    }
  else if ((parameters.opt_qmask == MASK_SOFT) && parameters.opt_hardmask)
    {
      hardmask_all();
    }
  show_rusage();

  auto kept = 0;
  auto discarded_less = 0;
  auto discarded_more = 0;
  progress_init("Writing output", seqcount);
  for (auto i = 0; i < seqcount; i++)
    {
      auto unmasked = 0;
      auto * seq = db_getsequence(i);
      const int len = db_getsequencelen(i);
      // count unmasked residues: everything without masking, non-'N'
      // symbols with hard masking, upper-case symbols with soft masking
      if (parameters.opt_qmask == MASK_NONE)
        {
          unmasked = len;
        }
      else if (parameters.opt_hardmask)
        {
          for (auto j = 0; j < len; j++)
            {
              if (seq[j] != 'N')
                {
                  ++unmasked;
                }
            }
        }
      else
        {
          for (auto j = 0; j < len; j++)
            {
              if (isupper(seq[j]) != 0)
                {
                  ++unmasked;
                }
            }
        }
      // NOTE(review): for len == 0 this is 0.0 / 0 (not a number), which
      // fails both comparisons below, so the sequence is kept -- confirm
      // this is intended
      auto const unmasked_pct = 100.0 * unmasked / len;

      if (unmasked_pct < parameters.opt_min_unmasked_pct)
        {
          ++discarded_less;
        }
      else if (unmasked_pct > parameters.opt_max_unmasked_pct)
        {
          ++discarded_more;
        }
      else
        {
          ++kept;

          if (parameters.opt_fastaout != nullptr)
            {
              fasta_print_general(fp_fastaout,
                                  nullptr,
                                  seq,
                                  len,
                                  db_getheader(i),
                                  db_getheaderlen(i),
                                  db_getabundance(i),
                                  kept,
                                  -1.0,
                                  -1,
                                  -1,
                                  nullptr,
                                  0.0);
            }

          if (parameters.opt_fastqout != nullptr)
            {
              fastq_print_general(fp_fastqout,
                                  seq,
                                  len,
                                  db_getheader(i),
                                  db_getheaderlen(i),
                                  db_getquality(i),
                                  db_getabundance(i),
                                  kept,
                                  -1.0);
            }
        }

      progress_update(i);
    }
  progress_done();

  if (! parameters.opt_quiet)
    {
      if (parameters.opt_min_unmasked_pct > 0.0)
        {
          fprintf(stderr, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, parameters.opt_min_unmasked_pct);
        }
      if (parameters.opt_max_unmasked_pct < 100.0)
        {
          fprintf(stderr, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, parameters.opt_max_unmasked_pct);
        }
      fprintf(stderr, "%d sequences kept\n", kept);
    }

  if (parameters.opt_log != nullptr)
    {
      if (parameters.opt_min_unmasked_pct > 0.0)
        {
          fprintf(fp_log, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, parameters.opt_min_unmasked_pct);
        }
      if (parameters.opt_max_unmasked_pct < 100.0)
        {
          fprintf(fp_log, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, parameters.opt_max_unmasked_pct);
        }
      fprintf(fp_log, "%d sequences kept\n", kept);
    }

  show_rusage();
  db_free();

  if (fp_fastaout != nullptr)
    {
      fclose(fp_fastaout);
    }
  if (fp_fastqout != nullptr)
    {
      fclose(fp_fastqout);
    }
}
vsearch-2.30.1/src/mask.h000066400000000000000000000055671506776541700151620ustar00rootroot00000000000000/*

  VSEARCH: a versatile open source tool for metagenomics

  Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
  All rights reserved.

  Contact: Torbjorn Rognes ,
  Department of Informatics, University of Oslo,
  PO Box 1080 Blindern, NO-0316 Oslo, Norway

  This software is dual-licensed and available under a choice
  of one of two licenses, either under the terms of the GNU
  General Public License version 3 or the BSD 2-Clause License.


  GNU General Public License version 3

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// refactoring: enum struct Masking: int { error, none, dust, soft };
// masking modes, compared against opt_qmask in mask.cc
constexpr auto MASK_ERROR = -1;
constexpr auto MASK_NONE = 0;
constexpr auto MASK_DUST = 1;
constexpr auto MASK_SOFT = 2;

// command entry points
auto maskfasta(struct Parameters const & parameters) -> void;
auto fastx_mask(struct Parameters const & parameters) -> void;

// in-place masking of a single sequence of len symbols
auto dust(char * seq, int len) -> void;
auto hardmask(char * seq, int len) -> void;

// masking of all sequences in the database
auto dust_all() -> void;
auto hardmask_all() -> void;
vsearch-2.30.1/src/md5.c000066400000000000000000000246241506776541700147020ustar00rootroot00000000000000/*
 * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
 * MD5 Message-Digest Algorithm (RFC 1321).
 *
 * Homepage:
 * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
 *
 * Author:
 * Alexander Peslyak, better known as Solar Designer
 *
 * This software was written by Alexander Peslyak in 2001. No copyright is
 * claimed, and the software is hereby placed in the public domain.
 * In case this attempt to disclaim copyright and place the software in the
 * public domain is deemed null and void, then the software is
 * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
 * general public under the following terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * There's ABSOLUTELY NO WARRANTY, express or implied.
 *
 * (This is a heavily cut-down "BSD license".)
 *
 * This differs from Colin Plumb's older public domain implementation in that
 * no exactly 32-bit integer data type is required (any 32-bit or wider
 * unsigned integer data type will do), there's no compile-time endianness
 * configuration, and the function prototypes match OpenSSL's. No code from
 * Colin Plumb's implementation has been reused; this comment merely compares
 * the properties of the two independent implementations.
 *
 * The primary goals of this implementation are portability and ease of use.
* It is meant to be fast, but not as fast as possible. Some known
 * optimizations are not included to reduce source code size and avoid
 * compile-time configuration.
 */

/* refactoring C++11: https://codereview.stackexchange.com/questions/163872/md5-implementation-in-c11 */

#ifndef HAVE_OPENSSL

/* NOTE(review): the name of the system header below is missing in this copy
 * (text between angle brackets appears to have been stripped); the code
 * uses memcpy and memset -- restore from the upstream file. */
#include

#include "md5.h"

/*
 * The basic MD5 functions.
 *
 * F and G are optimized compared to their RFC 1321 definitions for
 * architectures that lack an AND-NOT instruction, just like in Colin Plumb's
 * implementation.
 */
#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | ~(z)))

/*
 * The MD5 transformation for all four rounds.
 */
#define STEP(f, a, b, c, d, x, t, s) \
  (a) += f((b), (c), (d)) + (x) + (t); \
  (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
  (a) += (b);

/*
 * SET reads 4 input bytes in little-endian byte order and stores them
 * in a properly aligned word in host byte order.
 *
 * The check for little-endian architectures that tolerate unaligned
 * memory accesses is just an optimization. Nothing will break if it
 * doesn't work.
 */
#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
#define SET(n) \
  (*(MD5_u32plus *)&ptr[(n) * 4])
#define GET(n) \
  SET(n)
#else
#define SET(n) \
  (ctx->block[(n)] = \
  (MD5_u32plus)ptr[(n) * 4] | \
  ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
  ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
  ((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
#define GET(n) \
  (ctx->block[(n)])
#endif

/*
 * This processes one or more 64-byte data blocks, but does NOT update
 * the bit counters. There are no alignment requirements.
 *
 * size is the number of bytes to process; the loop below consumes 64 bytes
 * per iteration and stops when size reaches zero, so callers pass a
 * non-zero multiple of 64. Returns a pointer to the first byte after the
 * processed data.
 */
static void *body(MD5_CTX *ctx, void *data, unsigned long size)
{
  unsigned char *ptr;
  MD5_u32plus a = 0;
  MD5_u32plus b = 0;
  MD5_u32plus c = 0;
  MD5_u32plus d = 0;
  MD5_u32plus saved_a = 0;
  MD5_u32plus saved_b = 0;
  MD5_u32plus saved_c = 0;
  MD5_u32plus saved_d = 0;

  ptr = data;

  /* work on local copies of the state; written back after the loop */
  a = ctx->a;
  b = ctx->b;
  c = ctx->c;
  d = ctx->d;

  do {
    saved_a = a;
    saved_b = b;
    saved_c = c;
    saved_d = d;

    /* Round 1 */
    STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
    STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
    STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
    STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
    STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
    STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
    STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
    STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
    STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
    STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
    STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
    STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
    STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
    STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
    STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
    STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)

    /* Round 2 */
    STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
    STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
    STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
    STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
    STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
    STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
    STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
    STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
    STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
    STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
    STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
    STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
    STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
    STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
    STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
    STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)

    /* Round 3 */
    STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
    STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
    STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
    STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
    STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
    STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
    STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
    STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
    STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
    STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
    STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
    STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
    STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
    STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
    STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
    STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)

    /* Round 4 */
    STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
    STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
    STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
    STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
    STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
    STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
    STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
    STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
    STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
    STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
    STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
    STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
    STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
    STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
    STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
    STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)

    /* add this block's result to the state saved before the block */
    a += saved_a;
    b += saved_b;
    c += saved_c;
    d += saved_d;

    ptr += 64;
  } while (size -= 64);

  ctx->a = a;
  ctx->b = b;
  ctx->c = c;
  ctx->d = d;

  return ptr;
}

/*
 * Set the four state words of ctx to their initial constants and reset
 * the byte counters.
 */
void MD5_Init(MD5_CTX *ctx)
{
  ctx->a = 0x67452301;
  ctx->b = 0xefcdab89;
  ctx->c = 0x98badcfe;
  ctx->d = 0x10325476;

  ctx->lo = 0;
  ctx->hi = 0;
}

/*
 * Feed size bytes starting at data into the digest. Complete 64-byte
 * blocks are processed immediately; a trailing partial block is kept in
 * ctx->buffer until more data arrives or MD5_Final is called.
 */
void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size)
{
  MD5_u32plus saved_lo = 0;
  unsigned long used = 0;
  unsigned long free = 0;

  /* lo holds the low 29 bits of the byte count, hi the higher bits */
  saved_lo = ctx->lo;
  ctx->lo = (saved_lo + size) & 0x1fffffff;
  if (ctx->lo < saved_lo)
    {
      ctx->hi++;
    }
  ctx->hi += size >> 29;

  /* number of bytes already waiting in the buffer */
  used = saved_lo & 0x3f;

  if (used) {
    free = 64 - used;

    if (size < free) {
      /* still not a full block: just append and wait for more */
      memcpy(&ctx->buffer[used], data, size);
      return;
    }

    /* complete the buffered block and process it */
    memcpy(&ctx->buffer[used], data, free);
    data = (unsigned char *)data + free;
    size -= free;
    body(ctx, ctx->buffer, 64);
  }

  if (size >= 64) {
    /* process all remaining complete blocks directly from the input */
    data = body(ctx, data, size & ~(unsigned long)0x3f);
    size &= 0x3f;
  }

  /* keep the remaining bytes (fewer than 64) for later */
  memcpy(ctx->buffer, data, size);
}

/*
 * Finish the digest: append the 0x80 marker, zero padding and the message
 * length in bits, process the final block(s), write the 16-byte digest to
 * result in little-endian byte order, and wipe ctx.
 */
void MD5_Final(unsigned char *result, MD5_CTX *ctx)
{
  unsigned long used;
  unsigned long free;

  used = ctx->lo & 0x3f;

  ctx->buffer[used++] = 0x80;

  free = 64 - used;

  if (free < 8) {
    /* no room left for the 8-byte length: pad out and process this block */
    memset(&ctx->buffer[used], 0, free);
    body(ctx, ctx->buffer, 64);
    used = 0;
    free = 64;
  }

  memset(&ctx->buffer[used], 0, free - 8);

  /* convert the byte count to a bit count and store it in bytes 56..63 */
  ctx->lo <<= 3;
  ctx->buffer[56] = ctx->lo;
  ctx->buffer[57] = ctx->lo >> 8;
  ctx->buffer[58] = ctx->lo >> 16;
  ctx->buffer[59] = ctx->lo >> 24;
  ctx->buffer[60] = ctx->hi;
  ctx->buffer[61] = ctx->hi >> 8;
  ctx->buffer[62] = ctx->hi >> 16;
  ctx->buffer[63] = ctx->hi >> 24;

  body(ctx, ctx->buffer, 64);

  result[0] = ctx->a;
  result[1] = ctx->a >> 8;
  result[2] = ctx->a >> 16;
  result[3] = ctx->a >> 24;
  result[4] = ctx->b;
  result[5] = ctx->b >> 8;
  result[6] = ctx->b >> 16;
  result[7] = ctx->b >> 24;
  result[8] = ctx->c;
  result[9] = ctx->c >> 8;
  result[10] = ctx->c >> 16;
  result[11] = ctx->c >> 24;
  result[12] = ctx->d;
  result[13] = ctx->d >> 8;
  result[14] = ctx->d >> 16;
  result[15] = ctx->d >> 24;

  /* clear the context so that no intermediate state is left behind */
  memset(ctx, 0, sizeof(*ctx));
}

#endif
vsearch-2.30.1/src/md5.h000066400000000000000000000027751506776541700147120ustar00rootroot00000000000000/* Slightly modified for vsearch by Torbjorn Rognes, 29 Sep 2015 */

/*
 * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
 * MD5 Message-Digest Algorithm (RFC 1321).
 *
 * Homepage:
 * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
 *
 * Author:
 * Alexander Peslyak, better known as Solar Designer
 *
 * This software was written by Alexander Peslyak in 2001. No copyright is
 * claimed, and the software is hereby placed in the public domain.
#ifndef __MD5_H
#define __MD5_H

#ifdef __cplusplus
extern "C" {
#endif

/* Any 32-bit or wider unsigned integer data type will do */
typedef unsigned int MD5_u32plus;

/*
 * State of one MD5 computation. Initialise with MD5_Init(), feed data
 * with MD5_Update(), obtain the digest with MD5_Final().
 */
typedef struct {
  /* Message length in bytes: lo holds the low 29 bits, hi the rest
     (MD5_Final shifts lo left by 3 to turn it into a bit count). */
  MD5_u32plus lo, hi;
  /* The four chaining values; written out little-endian as the digest. */
  MD5_u32plus a, b, c, d;
  /* Bytes of a not-yet-complete 64-byte block. */
  unsigned char buffer[64];
  /* NOTE(review): not referenced by MD5_Init/MD5_Update/MD5_Final;
     presumably scratch space for the block transform -- confirm in md5.c. */
  MD5_u32plus block[16];
} MD5_CTX;

/* Reset ctx to the initial MD5 state. */
extern void MD5_Init(MD5_CTX *ctx);
/* Add `size` bytes starting at `data` to the digest. */
extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size);
/* Write the 16-byte digest to `result`; ctx is zeroed afterwards. */
extern void MD5_Final(unsigned char *result, MD5_CTX *ctx);

#ifdef __cplusplus
}
#endif

#endif /* __MD5_H */
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "minheap.h" #include // printf #include // qsort() /* implement a priority queue with a min heap binary array structure */ /* elements with the lowest count should be at the top (root) */ // refactoring: std::priority_queue (#include ) /* To keep track of the n best potential target sequences, we store them in a min heap. The root element corresponds to the least good target, while the best elements are found at the leaf nodes. 
This makes it simple to decide whether a new target should be included or not, because it just needs to be compared to the root note. The list will be fully sorted before use when we want to find the best element and then the second best and so on. */ auto elem_smaller(elem_t * lhs, elem_t * rhs) -> int { /* return 1 if lhs is smaller than rhs, 0 if equal or greater */ if (lhs->count < rhs->count) { return 1; } if (lhs->count > rhs->count) { return 0; } if (lhs->length > rhs->length) { return 1; } if (lhs->length < rhs->length) { return 0; } if (lhs->seqno > rhs->seqno) { return 1; } return 0; } auto minheap_compare(const void * lhs_a, const void * rhs_b) -> int { auto * lhs = (elem_t *) lhs_a; auto * rhs = (elem_t *) rhs_b; /* return -1 if a is smaller than b, +1 if greater, otherwize 0 */ /* first: lower count, larger length, lower seqno */ if (lhs->count < rhs->count) { return -1; } if (lhs->count > rhs->count) { return +1; } if (lhs->length > rhs->length) { return -1; } if (lhs->length < rhs->length) { return +1; } if (lhs->seqno > rhs->seqno) { return -1; } if (lhs->seqno < rhs->seqno) { return +1; } return 0; } auto minheap_init(int size) -> minheap_t * { auto * a_minheap = static_cast(xmalloc(sizeof(minheap_t))); a_minheap->alloc = size; a_minheap->array = static_cast(xmalloc(size * sizeof(elem_t))); a_minheap->count = 0; return a_minheap; } auto minheap_exit(minheap_t * a_minheap) -> void { xfree(a_minheap->array); xfree(a_minheap); } auto minheap_replaceroot(minheap_t * a_minheap, elem_t tmp) -> void { /* remove the element at the root, then swap children up to the root and insert tmp at suitable place */ /* start with root */ int parent = 0; int nth_child = (2 * parent) + 1; /* while at least one child */ while (nth_child < a_minheap->count) { /* if two children: swap with the one with smallest value */ if ((nth_child + 1 < a_minheap->count) and (elem_smaller(a_minheap->array + nth_child + 1, a_minheap->array + nth_child) != 0)) { ++nth_child; } /* swap 
parent and child if child has lower value */ if (elem_smaller(a_minheap->array + nth_child, &tmp) != 0) { a_minheap->array[parent] = a_minheap->array[nth_child]; } else { break; } /* step down */ parent = nth_child; nth_child = (2 * parent) + 1; } a_minheap->array[parent] = tmp; } auto minheap_add(minheap_t * a_minheap, elem_t * n) -> void { if (a_minheap->count < a_minheap->alloc) { /* space for another item at end; swap upwards */ int index = a_minheap->count; ++a_minheap->count; int pos = (index - 1) / 2; while ((index > 0) and (elem_smaller(n, a_minheap->array + pos) != 0)) { a_minheap->array[index] = a_minheap->array[pos]; index = pos; pos = (index - 1) / 2; } a_minheap->array[index] = *n; } else if (elem_smaller(a_minheap->array, n) != 0) { /* replace the root if new element is larger than root */ minheap_replaceroot(a_minheap, *n); } } auto minheap_pop(minheap_t * a_minheap) -> elem_t { /* return top element and restore order */ static const elem_t zero = {0, 0, 0}; if (a_minheap->count != 0) { elem_t top = a_minheap->array[0]; --a_minheap->count; if (a_minheap->count != 0) { const elem_t tmp = a_minheap->array[a_minheap->count]; minheap_replaceroot(a_minheap, tmp); } return top; } return zero; } auto minheap_sort(minheap_t * a_minheap) -> void { std::qsort(a_minheap->array, a_minheap->count, sizeof(elem_t), minheap_compare); } auto minheap_poplast(minheap_t * a_minheap) -> elem_t { /* return top element and restore order */ if (a_minheap->count == 0) { return elem_t{0, 0, 0}; } return a_minheap->array[--a_minheap->count]; } vsearch-2.30.1/src/minheap.h000066400000000000000000000062041506776541700156350ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
#ifndef MINHEAP_H
#define MINHEAP_H

/* This header defines types and inline functions, so it is protected
   against being included twice in the same translation unit. */

/* One candidate target sequence, ranked by its count (see the
   ordering implemented by elem_smaller() in minheap.cc). */
struct topscore
{
  unsigned int count;
  unsigned int seqno;
  unsigned int length;
};

using elem_t = struct topscore;

/* Fixed-capacity binary min heap stored in an array. */
struct minheap_s
{
  int alloc;        /* capacity: number of elements 'array' can hold */
  int count;        /* number of elements currently stored */
  elem_t * array;   /* storage, owned by the heap (see minheap_exit) */
};

using minheap_t = struct minheap_s;


/* true if the heap holds no element */
inline auto minheap_isempty(minheap_t const * a_minheap) -> bool
{
  return (a_minheap->count == 0);
}


/* forget all stored elements; capacity and storage are kept */
inline auto minheap_clear(minheap_t * a_minheap) -> void
{
  a_minheap->count = 0;
}

/* remove and return the last element of the array */
auto minheap_poplast(minheap_t * a_minheap) -> elem_t;
/* sort the array in place (least good element first) */
auto minheap_sort(minheap_t * a_minheap) -> void;
/* allocate a heap with room for 'size' elements */
auto minheap_init(int size) -> minheap_t *;
/* free a heap allocated with minheap_init() */
auto minheap_exit(minheap_t * a_minheap) -> void;
/* add an element, possibly evicting the least good one when full */
auto minheap_add(minheap_t * a_minheap, elem_t * n) -> void;
/* remove and return the root (least good) element */
auto minheap_pop(minheap_t * a_minheap) -> elem_t;

#endif // MINHEAP_H
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "msa.h" #include "utils/cigar.hpp" #include "utils/span.hpp" #include #include // std::max() #include #include // std::toupper #include // macro PRId64 #include // uint64_t #include // std::FILE, std::sscanf, std::fprintf #include // std::strtoll #include // std::strlen #include // std::next #include // std::accumulate #include #ifndef NDEBUG #include #endif /* Compute multiple sequence alignment (msa), profile, and consensus sequence of clustered sequences */ using prof_type = std::uint64_t; constexpr auto profsize = 6; auto update_profile(char const nucleotide, int const position_in_alignment, prof_type const abundance, std::vector& profile) -> void { static constexpr auto A_counter = 0; static constexpr auto C_counter = 1; static constexpr auto G_counter = 2; static constexpr auto U_counter = 3; // note: T converted to U? static constexpr auto N_counter = 4; static constexpr auto gap_counter = 5; auto const offset = profsize * position_in_alignment; // refactoring: eliminate unused cases? 
No, T and U are merged, same as IUPAC and N switch (std::toupper(nucleotide)) { case 'A': profile[offset + A_counter] += abundance; break; case 'C': profile[offset + C_counter] += abundance; break; case 'G': profile[offset + G_counter] += abundance; break; case 'T': case 'U': profile[offset + U_counter] += abundance; break; case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M': case 'B': case 'D': case 'H': case 'V': case 'N': profile[offset + N_counter] += abundance; break; case '-': profile[offset + gap_counter] += abundance; break; default: break; } } auto update_msa(char const nucleotide, int &position_in_alignment, std::vector& alignment) -> void { alignment[position_in_alignment] = nucleotide; ++position_in_alignment; } // anonymous namespace: limit visibility and usage to this translation unit namespace { } // end of anonymous namespace auto find_max_insertions_per_position(int const target_count, std::vector const & target_list_v, int const centroid_len) -> std::vector { std::vector max_insertions(centroid_len + 1); // refactoring: with template Span // auto target_list_view = Span{target_list_v.data(), target_list_v.size()}; // for (auto const & a_msa_target : target_list_view.subspan(1, target_count) { } // assert(static_cast(target_count), target_list_v.size()); for (auto i = 1; i < target_count; ++i) { auto position = 0LL; auto * cigar_start = target_list_v[i].cigar; auto const cigar_length = std::strlen(cigar_start); auto const cigar_pairs = parse_cigar_string(Span{cigar_start, cigar_length}); for (auto const & a_pair: cigar_pairs) { auto const operation = a_pair.first; auto const runlength = a_pair.second; switch (operation) { case Operation::match: case Operation::insertion: position += runlength; break; case Operation::deletion: assert(runlength <= std::numeric_limits::max()); max_insertions[position] = std::max(static_cast(runlength), max_insertions[position]); break; } } } return max_insertions; } auto find_total_alignment_length(std::vector 
const & max_insertions) -> int { auto const centroid_len = static_cast(max_insertions.size() - 1); return std::accumulate(max_insertions.begin(), max_insertions.end(), centroid_len); } auto find_longest_target_on_reverse_strand(int const target_count, std::vector const & target_list_v) -> int64_t { int64_t longest_reversed = 0; for (auto i = 0; i < target_count; ++i) { auto const & target = target_list_v[i]; if (target.strand == 0) { continue; } auto const len = static_cast(db_getsequencelen(target.seqno)); longest_reversed = std::max(len, longest_reversed); } return longest_reversed; } auto allocate_buffer_for_reverse_strand_target(int const target_count, std::vector const & target_list_v, std::vector & rc_buffer_v) -> char * { /* Find longest target sequence on reverse strand and allocate buffer */ auto const longest_reversed = find_longest_target_on_reverse_strand(target_count, target_list_v); if (longest_reversed > 0) { rc_buffer_v.resize(longest_reversed + 1); return rc_buffer_v.data(); } return nullptr; } auto blank_line_before_each_msa(std::FILE * fp_msaout) -> void { if (fp_msaout == nullptr) { return ; } static_cast(std::fprintf(fp_msaout, "\n")); } auto print_header_and_sequence(std::FILE * fp_msaout, char const * header_prefix, int const target_seqno, std::vector & aln_v) -> void { // header_prefix == "*" or "", resulting in ">*header" or ">header" if (fp_msaout == nullptr) { return ; } fasta_print_general(fp_msaout, header_prefix, aln_v.data(), static_cast(aln_v.size() - 1), db_getheader(target_seqno), static_cast(db_getheaderlen(target_seqno)), db_getabundance(target_seqno), 0, -1.0, -1, -1, nullptr, 0.0); } auto reverse_complement_target_if_need_be(int const strand, int const target_seqno, char * rc_buffer, char * target_seq) -> char * { if (strand == 0) { return target_seq; } reverse_complement(rc_buffer, target_seq, static_cast(db_getsequencelen(target_seqno))); return rc_buffer; } auto process_and_print_centroid(char *rc_buffer, std::vector const 
&target_list_v, std::vector const &max_insertions, std::vector &profile, std::vector &aln_v, std::FILE * fp_msaout) -> void { auto const centroid_len = static_cast(max_insertions.size() - 1); auto const & target = target_list_v.front(); auto const target_seqno = target.seqno; auto * const target_seq = reverse_complement_target_if_need_be(target.strand, target_seqno, rc_buffer, db_getsequence(target_seqno)); prof_type const target_abundance = opt_sizein ? db_getabundance(target_seqno) : 1; auto position_in_alignment = 0; for (auto i = 0; i < centroid_len; ++i) { for (auto j = 0; j < max_insertions[i]; ++j) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } update_profile(*std::next(target_seq, i), position_in_alignment, target_abundance, profile); update_msa(*std::next(target_seq, i), position_in_alignment, aln_v); } // insert for (auto j = 0; j < max_insertions[centroid_len]; ++j) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } /* end of sequence string */ aln_v[position_in_alignment] = '\0'; /* print header & sequence */ print_header_and_sequence(fp_msaout, "*", target_seqno, aln_v); } auto insert_gaps_in_alignment_and_profile(bool const is_inserted, int const max_insertions_at_position, int & position_in_alignment, prof_type const target_abundance, std::vector & profile, std::vector & aln_v) -> void { if (is_inserted) { return ; } for (auto i = 0; i < max_insertions_at_position; ++i) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } } auto compute_and_print_msa(int const target_count, std::vector const & target_list_v, std::vector const &max_insertions, std::vector &profile, std::vector &aln_v, std::FILE * fp_msaout) -> void { blank_line_before_each_msa(fp_msaout); /* Find longest target sequence on reverse strand and allocate buffer */ std::vector 
rc_buffer_v; char * rc_buffer = allocate_buffer_for_reverse_strand_target(target_count, target_list_v, rc_buffer_v); // ------------------------------------------------------- deal with centroid process_and_print_centroid(rc_buffer, target_list_v, max_insertions, profile, aln_v, fp_msaout); // --------------------------------- deal with other sequences in the cluster for (auto i = 1; i < target_count; ++i) { auto const & target = target_list_v[i]; auto const target_seqno = target.seqno; auto * const target_seq = reverse_complement_target_if_need_be(target.strand, target_seqno, rc_buffer, db_getsequence(target_seqno)); prof_type const target_abundance = opt_sizein ? db_getabundance(target_seqno) : 1; int position_in_alignment = 0; auto is_inserted = false; auto qpos = 0; auto tpos = 0; auto * cigar_start = target.cigar; auto const cigar_length = static_cast(std::strlen(cigar_start)); auto const * cigar_end = std::next(cigar_start, cigar_length); auto * position_in_cigar = cigar_start; while (position_in_cigar < cigar_end) { // Consume digits (if any), return the position of the // first char (M, D, or I), store it, move cursor to the next byte. 
// Operations: match (M), insertion (I), or deletion (D) auto** next_operation = &position_in_cigar; auto const runlength = find_runlength_of_leftmost_operation(position_in_cigar, next_operation); auto const operation = **next_operation; position_in_cigar = std::next(position_in_cigar); switch (operation) { case 'D': for (auto j = 0; j < runlength; ++j) { update_profile(*std::next(target_seq, tpos), position_in_alignment, target_abundance, profile); update_msa(*std::next(target_seq, tpos), position_in_alignment, aln_v); ++tpos; } for (auto j = runlength; j < max_insertions[qpos]; ++j) { update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); } is_inserted = true; break; case 'M': for (auto j = 0; j < runlength; ++j) { insert_gaps_in_alignment_and_profile(is_inserted, max_insertions[qpos], position_in_alignment, target_abundance, profile, aln_v); update_profile(*std::next(target_seq, tpos), position_in_alignment, target_abundance, profile); update_msa(*std::next(target_seq, tpos), position_in_alignment, aln_v); ++tpos; ++qpos; is_inserted = false; } break; case 'I': for (auto j = 0; j < runlength; ++j) { insert_gaps_in_alignment_and_profile(is_inserted, max_insertions[qpos], position_in_alignment, target_abundance, profile, aln_v); update_profile('-', position_in_alignment, target_abundance, profile); update_msa('-', position_in_alignment, aln_v); ++qpos; is_inserted = false; } break; default: break; } } insert_gaps_in_alignment_and_profile(is_inserted, max_insertions[qpos], position_in_alignment, target_abundance, profile, aln_v); /* end of sequence string */ aln_v[position_in_alignment] = '\0'; /* print header & sequence */ print_header_and_sequence(fp_msaout, "", target_seqno, aln_v); } } auto compute_and_print_consensus(std::vector const &max_insertions, std::vector &aln_v, std::vector &cons_v, std::vector &profile, std::FILE * fp_msaout) -> void { static constexpr std::array sym_nt_4bit = {{'-', 'A', 
'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}}; static constexpr char index_of_N = 15; // 15th char in sym_nt_4bit[] (=> 'N') auto const alignment_length = static_cast(aln_v.size() - 1); int conslen = 0; /* Censor part of the consensus sequence outside the centroid sequence */ auto const left_censored = max_insertions.front(); auto const right_censored = max_insertions.back(); for (auto i = 0; i < left_censored; ++i) { aln_v[i] = '+'; } for (auto i = alignment_length - right_censored; i < alignment_length; ++i) { aln_v[i] = '+'; } for (auto i = left_censored; i < alignment_length - right_censored; ++i) { /* find most common symbol of A, C, G and T */ char best_sym = 0; prof_type best_count = 0; for (auto nucleotide = 0U; nucleotide < 4; ++nucleotide) { auto const count = profile[(profsize * i) + nucleotide]; if (count > best_count) { best_count = count; best_sym = static_cast(1U << nucleotide); // 1, 2, 4, or 8 } } /* if no A, C, G, or T, check if there are any N's */ auto const N_count = profile[(profsize * i) + 4]; if ((best_count == 0) and (N_count > 0)) { best_count = N_count; best_sym = index_of_N; // N } /* compare to the number of gap symbols */ auto const gap_count = profile[(profsize * i) + 5]; if (best_count >= gap_count) { auto const index = static_cast(best_sym); auto const sym = sym_nt_4bit[index]; // A, C, G, T, or N aln_v[i] = sym; cons_v[conslen] = sym; ++conslen; } else { aln_v[i] = '-'; } } aln_v.back() = '\0'; cons_v[conslen] = '\0'; cons_v.resize(conslen + 1); if (fp_msaout != nullptr) { fasta_print(fp_msaout, "consensus", aln_v.data(), alignment_length); } } auto print_consensus_sequence(std::FILE *fp_consout, std::vector & cons_v, int64_t const totalabundance, int const target_count, int const cluster, int const centroid_seqno) -> void { if (fp_consout == nullptr) { return ; } fasta_print_general(fp_consout, "centroid=", cons_v.data(), static_cast(cons_v.size()), db_getheader(centroid_seqno), 
static_cast(db_getheaderlen(centroid_seqno)), totalabundance, cluster + 1, -1.0, target_count, opt_clusterout_id ? cluster : -1, nullptr, 0.0); } auto print_alignment_profile(std::FILE *fp_profile, std::vector &aln_v, std::vector const &profile, int64_t const totalabundance, int const target_count, int const cluster, int const centroid_seqno) -> void { if (fp_profile == nullptr) { return ; } // Note: gaps before Ns in profile output // 0 = A, 1 = C, 2 = G, 3 = T, 4 = N, 5 = '-' (gap) static const std::array symbol_indexes = {0, 1, 2, 3, 5, 4}; fasta_print_general(fp_profile, "centroid=", nullptr, 0, db_getheader(centroid_seqno), static_cast(db_getheaderlen(centroid_seqno)), totalabundance, cluster + 1, -1.0, target_count, opt_clusterout_id ? cluster : -1, nullptr, 0.0); aln_v.pop_back(); // remove last element ('\0') auto counter = 0; for (auto const nucleotide: aln_v) { static_cast(std::fprintf(fp_profile, "%d\t%c", counter, nucleotide)); // A, C, G and T, then gap '-', then N for (auto const symbol_index : symbol_indexes) { static_cast(std::fprintf(fp_profile, "\t%" PRId64, profile[(profsize * counter) + symbol_index])); } static_cast(std::fprintf(fp_profile, "\n")); ++counter; } static_cast(std::fprintf(fp_profile, "\n")); } auto msa(std::FILE * fp_msaout, std::FILE * fp_consout, std::FILE * fp_profile, int cluster, int const target_count, std::vector const & target_list_v, int64_t totalabundance) -> void { int const centroid_seqno = target_list_v[0].seqno; auto const centroid_length = static_cast(db_getsequencelen(centroid_seqno)); /* find max insertions in front of each position in the centroid sequence */ auto const max_insertions = find_max_insertions_per_position(target_count, target_list_v, centroid_length); auto const alignment_length = find_total_alignment_length(max_insertions); /* allocate memory for profile (for consensus) and aligned seq */ std::vector profile(static_cast(profsize) * alignment_length); // C++20 refactoring: std::vector>(alnlen); 
std::vector aln_v(alignment_length + 1); std::vector cons_v(alignment_length + 1); /* msaout: multiple sequence alignment ... */ compute_and_print_msa(target_count, target_list_v, max_insertions, profile, aln_v, fp_msaout); /* msaout: ... and consensus sequence at the end */ compute_and_print_consensus(max_insertions, aln_v, cons_v, profile, fp_msaout); /* consout: consensus sequence (dedicated input) */ print_consensus_sequence(fp_consout, cons_v, totalabundance, target_count, cluster, centroid_seqno); /* profile: multiple sequence alignment profile (dedicated input) */ print_alignment_profile(fp_profile, aln_v, profile, totalabundance, target_count, cluster, centroid_seqno); } vsearch-2.30.1/src/msa.h000066400000000000000000000054501506776541700147760ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
#ifndef MSA_H
#define MSA_H

/* This header defines a type, so it is protected against being
   included twice in the same translation unit. */

#include <cstdint>  // int64_t
#include <cstdio>  // std::FILE
#include <vector>


/* One member of a cluster, as needed to build the alignment. */
struct msa_target_s
{
  int seqno;     /* sequence number in the database */
  char * cigar;  /* cigar string of its alignment (not read for the first entry, the centroid) */
  int strand;    /* zero: use the sequence as is; non-zero: reverse-complement it first */
};

/* Compute the multiple sequence alignment, consensus sequence and
   profile of a cluster (first entry of target_list_v: the centroid).
   Each file pointer may be nullptr to skip the corresponding output. */
auto msa(std::FILE * fp_msaout, std::FILE * fp_consout, std::FILE * fp_profile,
         int cluster,
         int target_count, std::vector<struct msa_target_s> const & target_list_v,
         int64_t totalabundance) -> void;

#endif // MSA_H
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "dbindex.h" #include "mask.h" #include "udb.h" #include "unique.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include #include // uint64_t #include // std::FILE, std::fprintf, std::size_t, std::fclose #include // documentation: assuming opt_wordlength = 3 (6 bits) // input output // 0b000000 -> 0b111111 // 0b111111 -> 0b000000 // 0b111100 -> 0b110000 // 0b110000 -> 0b111100 // 0b001100 -> 0b110011 // 0b000011 -> 0b001111 // 0b001111 -> 0b000011 // 0b100001 -> 0b101101 // 0b011110 -> 0b010010 // 0b101010 -> 0b010101 // 0b010101 -> 0b101010 auto rc_kmer(unsigned int kmer) -> unsigned int { /* reverse complement a kmer where k = opt_wordlength */ assert(opt_wordlength * 2 <= 32); auto fwd = kmer; auto rev = 0U; for (auto i = int64_t{0}; i < opt_wordlength; ++i) { // compute complement of the last two bits auto const complement_bits = (fwd & 3U) ^ 3U; fwd = fwd >> 2U; rev = rev << 2U; rev |= complement_bits; } return rev; } auto orient(struct Parameters const & parameters) -> void { fastx_handle query_h = nullptr; // refactoring: use struct, like in subsample std::FILE * fp_fastaout = nullptr; std::FILE * fp_fastqout = nullptr; std::FILE * fp_tabbedout = nullptr; std::FILE * fp_notmatched = nullptr; int queries = 0; int qmatches = 0; int matches_fwd = 0; int matches_rev = 0; int notmatched = 0; /* check arguments */ if (opt_db == nullptr) { fatal("Database not specified with --db"); } if (not ((opt_fastaout != nullptr) 
or (opt_fastqout != nullptr) or (opt_notmatched != nullptr) or (opt_tabbedout != nullptr))) { fatal("Output file not specified with --fastaout, --fastqout, --notmatched or --tabbedout"); } /* prepare reading of queries */ query_h = fastx_open(parameters.opt_orient); /* open output files */ if (opt_fastaout != nullptr) { fp_fastaout = fopen_output(opt_fastaout); if (fp_fastaout == nullptr) { fatal("Unable to open fasta output file for writing"); } } if (opt_fastqout != nullptr) { if (not fastx_is_fastq(query_h)) { fatal("Cannot write FASTQ output with FASTA input"); } fp_fastqout = fopen_output(opt_fastqout); if (fp_fastqout == nullptr) { fatal("Unable to open fastq output file for writing"); } } if (opt_notmatched != nullptr) { fp_notmatched = fopen_output(opt_notmatched); if (fp_notmatched == nullptr) { fatal("Unable to open notmatched output file for writing"); } } if (opt_tabbedout != nullptr) { fp_tabbedout = fopen_output(opt_tabbedout); if (fp_tabbedout == nullptr) { fatal("Unable to open tabbedout output file for writing"); } } /* check if it may be an UDB file */ auto const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); } else { db_read(opt_db, 0); } if (not is_udb) { if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) and (opt_hardmask != 0)) { hardmask_all(); } } if (not is_udb) { dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } uhandle_s * uh_fwd = unique_init(); std::size_t alloc = 0; std::vector qseq_rev; std::vector query_qual_rev; progress_init("Orienting sequences", fasta_get_size(query_h)); while (fastx_next(query_h, (opt_notrunclabels == 0), chrmap_no_change_vector.data())) { char const * query_head = fastx_get_header(query_h); int const query_head_len = fastx_get_header_length(query_h); char const * qseq_fwd = fastx_get_sequence(query_h); int const qseqlen = fastx_get_sequence_length(query_h); int const qsize = fastx_get_abundance(query_h); char const * 
query_qual_fwd = fastx_get_quality(query_h); /* find kmers in query sequence */ unsigned int kmer_count_fwd = 0; unsigned int const * kmer_list_fwd = nullptr; unique_count(uh_fwd, opt_wordlength, qseqlen, qseq_fwd, & kmer_count_fwd, & kmer_list_fwd, opt_qmask); /* count kmers matching on each strand */ unsigned int count_fwd = 0; unsigned int count_rev = 0; constexpr auto hits_factor = 8U; for (unsigned int i = 0; i < kmer_count_fwd; i++) { unsigned int const kmer_fwd = kmer_list_fwd[i]; unsigned int const kmer_rev = rc_kmer(kmer_fwd); unsigned int const hits_fwd = dbindex_getmatchcount(kmer_fwd); unsigned int const hits_rev = dbindex_getmatchcount(kmer_rev); /* require 8 times as many matches on one stand than the other */ if (hits_fwd > hits_factor * hits_rev) { ++count_fwd; } else if (hits_rev > hits_factor * hits_fwd) { ++count_rev; } } /* get progress as amount of input file read */ auto const progress = fasta_get_position(query_h); /* update stats */ ++queries; auto strand = 2; // refactoring: enum struct Strand : char {positive = '+', negative = '-', undetermined = '?'}; auto const min_count = 1U; auto const min_factor = 4U; if ((count_fwd >= min_count) and (count_fwd >= min_factor * count_rev)) { /* fwd */ strand = 0; ++matches_fwd; ++qmatches; if (opt_fastaout != nullptr) { fasta_print_general(fp_fastaout, nullptr, qseq_fwd, qseqlen, query_head, query_head_len, qsize, qmatches, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout != nullptr) { fastq_print_general(fp_fastqout, qseq_fwd, qseqlen, query_head, query_head_len, query_qual_fwd, qsize, qmatches, -1.0); } } else if ((count_rev >= min_count) and (count_rev >= min_factor * count_fwd)) { /* rev */ strand = 1; ++matches_rev; ++qmatches; /* alloc more mem if necessary to keep reverse sequence and qual */ assert(qseqlen > 0); static_assert(sizeof(std::size_t) >= sizeof(int), "size_t is too small"); const std::size_t requirements = qseqlen + 1; // refactoring: unsigned int qseqlen if (requirements > alloc) { 
alloc = requirements; qseq_rev.resize(alloc); if (fastx_is_fastq(query_h)) { query_qual_rev.resize(alloc); } } /* get reverse complementary sequence */ reverse_complement(qseq_rev.data(), qseq_fwd, qseqlen); if (opt_fastaout != nullptr) { fasta_print_general(fp_fastaout, nullptr, qseq_rev.data(), qseqlen, query_head, query_head_len, qsize, qmatches, -1.0, -1, -1, nullptr, 0.0); } if (opt_fastqout != nullptr) { /* reverse quality scores */ if (fastx_is_fastq(query_h)) { // copy query string in reverse order for (int i = 0; i < qseqlen; i++) { query_qual_rev[i] = query_qual_fwd[qseqlen - 1 - i]; } query_qual_rev[qseqlen] = '\0'; } fastq_print_general(fp_fastqout, qseq_rev.data(), qseqlen, query_head, query_head_len, query_qual_rev.data(), qsize, qmatches, -1.0); } } else { /* undecided */ strand = 2; ++notmatched; if (opt_notmatched != nullptr) { if (fastx_is_fastq(query_h)) { fastq_print_general(fp_notmatched, qseq_fwd, qseqlen, query_head, query_head_len, query_qual_fwd, qsize, notmatched, -1.0); } else { fasta_print_general(fp_notmatched, nullptr, qseq_fwd, qseqlen, query_head, query_head_len, qsize, notmatched, -1.0, -1, -1, nullptr, 0.0); } } } if (opt_tabbedout != nullptr) { fprintf(fp_tabbedout, "%s\t%c\t%d\t%d\n", query_head, strand == 0 ? '+' : (strand == 1 ? 
'-' : '?'), count_fwd, count_rev); } /* show progress */ progress_update(progress); } progress_done(); /* clean up */ unique_exit(uh_fwd); dbindex_free(); db_free(); if (opt_tabbedout != nullptr) { fclose(fp_tabbedout); } if (opt_notmatched != nullptr) { fclose(fp_notmatched); } if (opt_fastqout != nullptr) { fclose(fp_fastqout); } if (opt_fastaout != nullptr) { fclose(fp_fastaout); } fasta_close(query_h); if (not opt_quiet) { fprintf(stderr, "Forward oriented sequences: %d", matches_fwd); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * matches_fwd / queries); } fprintf(stderr, "\n"); fprintf(stderr, "Reverse oriented sequences: %d", matches_rev); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * matches_rev / queries); } fprintf(stderr, "\n"); fprintf(stderr, "All oriented sequences: %d", qmatches); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); fprintf(stderr, "Not oriented sequences: %d", notmatched); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * notmatched / queries); } fprintf(stderr, "\n"); fprintf(stderr, "Total number of sequences: %d\n", queries); } if (opt_log != nullptr) { fprintf(fp_log, "Forward oriented sequences: %d", matches_fwd); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * matches_fwd / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "Reverse oriented sequences: %d", matches_rev); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * matches_rev / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "All oriented sequences: %d", qmatches); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "Not oriented sequences: %d", notmatched); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * notmatched / queries); } fprintf(fp_log, "\n"); fprintf(fp_log, "Total number of sequences: %d\n", queries); } } 
vsearch-2.30.1/src/orient.h000066400000000000000000000047501506776541700155200ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto orient(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/otutable.cc000066400000000000000000000360111506776541700161700ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/fatal.hpp" #include // std::copy #include #include // macros PRIu64 and PRId64 #include // std::strftime, std::localtime, std::time, std::time_t, std::tm #include // int64_t, uint64_t #include // std::FILE, std::fprintf #include // std::strcspn, std::strspn #include // std::next #include #include #include #include // std::pair #include // refactoring: is there a reason to prefer regex.h over ? #ifdef HAVE_REGEX_H #include // C: regcomp, regexec, regfree (POSIX functions) #else #include // C++: std::regex_search #endif /* Identify sample and otu identifiers in headers, and count abundance of the samples in different OTUs. 
http://www.drive5.com/usearch/manual/upp_labels_sample.html http://www.drive5.com/usearch/manual/upp_labels_otus.html TODO: - add relabel @ */ #ifndef HAVE_REGEX_H const std::regex regex_sample("(^|;)(sample|barcodelabel)=([^;]*)($|;)", std::regex::extended); const std::regex regex_otu("(^|;)otu=([^;]*)($|;)", std::regex::extended); const std::regex regex_tax("(^|;)tax=([^;]*)($|;)", std::regex::extended); #endif using string_set_t = std::set; using string_pair_t = std::pair; using string_pair_map_t = std::map; using otu_tax_map_t = std::map; using string_no_map_t = std::map; struct otutable_s { #ifdef HAVE_REGEX_H regex_t regex_sample; regex_t regex_otu; regex_t regex_tax; #endif string_set_t otu_set; string_set_t sample_set; string_pair_map_t sample_otu_count; string_pair_map_t otu_sample_count; otu_tax_map_t otu_tax_map; }; static otutable_s * otutable; auto otutable_init() -> void { otutable = new otutable_s; #ifdef HAVE_REGEX_H /* compile regular expression matchers */ if (regcomp(&otutable->regex_sample, "(^|;)(sample|barcodelabel)=([^;]*)($|;)", REG_EXTENDED) != 0) { fatal("Compilation of regular expression for sample annotation failed"); } if (regcomp(&otutable->regex_otu, "(^|;)otu=([^;]*)($|;)", REG_EXTENDED) != 0) { fatal("Compilation of regular expression for otu annotation failed"); } if (regcomp(&otutable->regex_tax, "(^|;)tax=([^;]*)($|;)", REG_EXTENDED) != 0) { fatal("Compilation of regular expression for taxonomy annotation failed"); } #endif } auto otutable_done() -> void { #ifdef HAVE_REGEX_H regfree(&otutable->regex_sample); regfree(&otutable->regex_otu); regfree(&otutable->regex_tax); #endif otutable->otu_set.clear(); otutable->sample_set.clear(); otutable->sample_otu_count.clear(); otutable->otu_sample_count.clear(); delete otutable; } auto otutable_add(char const * query_header, char const * target_header, int64_t abundance) -> void { /* read sample annotation in query */ int len_sample = 0; char const * start_sample = query_header; 
std::vector sample_name; if (query_header != nullptr) { #ifdef HAVE_REGEX_H std::array pmatch_sample {{}}; if (regexec(&otutable->regex_sample, query_header, 5, pmatch_sample.data(), 0) == 0) { /* match: use the matching sample name */ len_sample = pmatch_sample[3].rm_eo - pmatch_sample[3].rm_so; start_sample += pmatch_sample[3].rm_so; } #else std::cmatch cmatch_sample; if (std::regex_search(query_header, cmatch_sample, regex_sample)) { len_sample = cmatch_sample.length(3); start_sample += cmatch_sample.position(3); } #endif else { /* no match: use first name in header with A-Za-z0-9_ */ len_sample = std::strspn(query_header, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_" "0123456789"); } sample_name.resize(len_sample + 1); std::copy(start_sample, std::next(start_sample, len_sample), sample_name.begin()); sample_name[len_sample] = '\0'; } /* read OTU annotation in target */ int len_otu = 0; char const * start_otu = target_header; std::vector otu_name; if (target_header != nullptr) { #ifdef HAVE_REGEX_H std::array pmatch_otu {{}}; if (regexec(&otutable->regex_otu, target_header, 4, pmatch_otu.data(), 0) == 0) { /* match: use the matching otu name */ len_otu = pmatch_otu[2].rm_eo - pmatch_otu[2].rm_so; start_otu += pmatch_otu[2].rm_so; } #else std::cmatch cmatch_otu; if (std::regex_search(target_header, cmatch_otu, regex_otu)) { len_otu = cmatch_otu.length(2); start_otu += cmatch_otu.position(2); } #endif else { /* no match: use first name in header up to ; */ len_otu = std::strcspn(target_header, ";"); } otu_name.resize(len_otu + 1); std::copy(start_otu, std::next(start_otu, len_otu), otu_name.begin()); otu_name[len_otu] = '\0'; /* read tax annotation in target */ #ifdef HAVE_REGEX_H char const * start_tax = target_header; std::array pmatch_tax {{}}; if (regexec(&otutable->regex_tax, target_header, 4, pmatch_tax.data(), 0) == 0) { /* match: use the matching tax name */ int const len_tax = pmatch_tax[2].rm_eo - pmatch_tax[2].rm_so; start_tax += 
pmatch_tax[2].rm_so; std::vector tax_name(len_tax + 1); std::copy(start_tax, std::next(start_tax, len_tax), tax_name.begin()); tax_name[len_tax] = '\0'; otutable->otu_tax_map[otu_name.data()] = tax_name.data(); } #else std::cmatch cmatch_tax; if (std::regex_search(target_header, cmatch_tax, regex_tax)) { otutable->otu_tax_map[otu_name.data()] = cmatch_tax.str(2); } #endif } /* store data */ if (not sample_name.empty()) { otutable->sample_set.insert(sample_name.data()); } if (not otu_name.empty()) { otutable->otu_set.insert(otu_name.data()); } if ((not sample_name.empty()) and (not otu_name.empty()) and (abundance != 0)) { otutable->sample_otu_count[string_pair_t(sample_name.data(), otu_name.data())] += abundance; otutable->otu_sample_count[string_pair_t(otu_name.data(), sample_name.data())] += abundance; } } auto otutable_print_otutabout(std::FILE * output_handle) -> void { int64_t progress = 0; progress_init("Writing OTU table (classic)", otutable->otu_set.size()); fprintf(output_handle, "#OTU ID"); for (auto const & it_sample : otutable->sample_set) { fprintf(output_handle, "\t%s", it_sample.c_str()); } if (not otutable->otu_tax_map.empty()) { fprintf(output_handle, "\ttaxonomy"); } fprintf(output_handle, "\n"); auto it_map = otutable->otu_sample_count.begin(); for (auto it_otu = otutable->otu_set.begin(); it_otu != otutable->otu_set.end(); ++it_otu) { fprintf(output_handle, "%s", it_otu->c_str()); for (auto it_sample = otutable->sample_set.begin(); it_sample != otutable->sample_set.end(); ++it_sample) { uint64_t a = 0; if ((it_map != otutable->otu_sample_count.end()) and (it_map->first.first == *it_otu) and (it_map->first.second == *it_sample)) { a = it_map->second; ++it_map; } fprintf(output_handle, "\t%" PRIu64, a); } if (not otutable->otu_tax_map.empty()) { fprintf(output_handle, "\t"); auto it = otutable->otu_tax_map.find(*it_otu); if (it != otutable->otu_tax_map.end()) { fprintf(output_handle, "%s", it->second.c_str()); } } fprintf(output_handle, "\n"); 
progress_update(++progress); } progress_done(); } auto otutable_print_mothur_shared_out(std::FILE * output_handle) -> void { int64_t progress = 0; progress_init("Writing OTU table (mothur)", otutable->sample_set.size()); fprintf(output_handle, "label\tGroup\tnumOtus"); int64_t numotus = 0; for (auto const & it_otu : otutable->otu_set) { char const * otu_name = it_otu.c_str(); fprintf(output_handle, "\t%s", otu_name); ++numotus; } fprintf(output_handle, "\n"); auto it_map = otutable->sample_otu_count.begin(); for (auto it_sample = otutable->sample_set.begin(); it_sample != otutable->sample_set.end(); ++it_sample) { fprintf(output_handle, "vsearch\t%s\t%" PRId64, it_sample->c_str(), numotus); for (auto it_otu = otutable->otu_set.begin(); it_otu != otutable->otu_set.end(); ++it_otu) { uint64_t a = 0; if ((it_map != otutable->sample_otu_count.end()) and (it_map->first.first == *it_sample) and (it_map->first.second == *it_otu)) { a = it_map->second; ++it_map; } fprintf(output_handle, "\t%" PRIu64, a); } fprintf(output_handle, "\n"); progress_update(++progress); } progress_done(); } auto otutable_print_biomout(std::FILE * output_handle) -> void { int64_t progress = 0; progress_init("Writing OTU table (biom 1.0)", otutable->otu_sample_count.size()); int64_t const rows = otutable->otu_set.size(); int64_t const columns = otutable->sample_set.size(); static const time_t time_now = time(nullptr); struct tm const * tm_now = localtime(& time_now); std::array date {{}}; strftime(date.data(), 50, "%Y-%m-%dT%H:%M:%S", tm_now); fprintf(output_handle, "{\n" "\t\"id\":\"%s\",\n" "\t\"format\": \"Biological Observation Matrix 1.0\",\n" "\t\"format_url\": \"http://biom-format.org/documentation/format_versions/biom-1.0.html\",\n" "\t\"type\": \"OTU table\",\n" "\t\"generated_by\": \"%s %s\",\n" "\t\"date\": \"%s\",\n" "\t\"matrix_type\": \"sparse\",\n" "\t\"matrix_element_type\": \"int\",\n" "\t\"shape\": [%" PRId64 ",%" PRId64 "],\n", opt_biomout, PROG_NAME, PROG_VERSION, date.data(), 
rows, columns); string_no_map_t otu_no_map; uint64_t otu_no = 0; fprintf(output_handle, "\t\"rows\":["); for (auto it_otu = otutable->otu_set.begin(); it_otu != otutable->otu_set.end(); ++it_otu) { if (it_otu != otutable->otu_set.begin()) { fprintf(output_handle, ","); } char const * otu_name = it_otu->c_str(); fprintf(output_handle, "\n\t\t{\"id\":\"%s\", \"metadata\":", otu_name); if (otutable->otu_tax_map.empty()) { fprintf(output_handle, "null"); } else { fprintf(output_handle, R"({"taxonomy":")"); auto it = otutable->otu_tax_map.find(otu_name); if (it != otutable->otu_tax_map.end()) { fprintf(output_handle, "%s", it->second.c_str()); } fprintf(output_handle, "\"}"); } fprintf(output_handle, "}"); otu_no_map[*it_otu] = otu_no; ++otu_no; } fprintf(output_handle, "\n"); fprintf(output_handle, "\t],\n"); string_no_map_t sample_no_map; uint64_t sample_no = 0; fprintf(output_handle, "\t\"columns\":["); for (auto it_sample = otutable->sample_set.begin(); it_sample != otutable->sample_set.end(); ++it_sample) { if (it_sample != otutable->sample_set.begin()) { fprintf(output_handle, ","); } fprintf(output_handle, "\n\t\t{\"id\":\"%s\", \"metadata\":null}", it_sample->c_str()); sample_no_map[*it_sample] = sample_no++; } fprintf(output_handle, "\n\t],\n"); auto first = true; fprintf(output_handle, "\t\"data\": ["); for (auto & it_map : otutable->otu_sample_count) { if (not first) { fprintf(output_handle, ","); } otu_no = otu_no_map[it_map.first.first]; sample_no = sample_no_map[it_map.first.second]; fprintf(output_handle, "\n\t\t[%" PRIu64 ",%" PRIu64 ",%" PRIu64 "]", otu_no, sample_no, it_map.second); first = false; ++progress; progress_update(progress); } fprintf(output_handle, "\n\t]\n"); fprintf(output_handle, "}\n"); progress_done(); } vsearch-2.30.1/src/otutable.h000066400000000000000000000055331506776541700160370ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas 
Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // std::FILE #include // int64_t auto otutable_init() -> void; auto otutable_done() -> void; auto otutable_add(char const * query_header, char const * target_header, int64_t abundance) -> void; auto otutable_print_otutabout(std::FILE * output_handle) -> void; auto otutable_print_mothur_shared_out(std::FILE * output_handle) -> void; auto otutable_print_biomout(std::FILE * output_handle) -> void; vsearch-2.30.1/src/rereplicate.cc000066400000000000000000000120161506776541700166470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "utils/check_output_filehandle.hpp" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/open_file.hpp" #include // macros PRIu64 and PRId64 #include // std::FILE, std::fprintf #include // int64_t // anonymous namespace: limit visibility and usage to this translation unit namespace { } // end of anonymous namespace auto rereplicate(struct Parameters const & parameters) -> void { auto const output_handle = open_output_file(parameters.opt_output); check_mandatory_output_handle(parameters.opt_output, (not output_handle)); auto * input_handle = fasta_open(parameters.opt_rereplicate); auto const filesize = static_cast(fasta_get_size(input_handle)); progress_init("Rereplicating", filesize); int64_t n_amplicons = 0; auto missing_abundance = false; int64_t n_reads = 0; auto const truncateatspace = not parameters.opt_notrunclabels; while (fasta_next(input_handle, truncateatspace, chrmap_no_change_vector.data())) { ++n_amplicons; auto abundance = fasta_get_abundance_and_presence(input_handle); if (abundance == 0) { missing_abundance = true; abundance = 1; } for (int64_t i = 0; i < abundance; ++i) { ++n_reads; fasta_print_general(output_handle.get(), nullptr, fasta_get_sequence(input_handle), static_cast(fasta_get_sequence_length(input_handle)), fasta_get_header(input_handle), static_cast(fasta_get_header_length(input_handle)), 1, static_cast(n_reads), -1.0, -1, -1, nullptr, 0.0); } progress_update(fasta_get_position(input_handle)); } progress_done(); if (not parameters.opt_quiet) { if (missing_abundance) { std::fprintf(stderr, "WARNING: Missing abundance information for some input sequences, assumed 1\n"); } std::fprintf(stderr, "Rereplicated %" PRId64 " reads from %" PRId64 " amplicons\n", n_reads, n_amplicons); } if (parameters.opt_log != nullptr) { if (missing_abundance) { std::fprintf(stderr, "WARNING: Missing abundance information for some input sequences, assumed 1\n"); } std::fprintf(fp_log, "Rereplicated %" PRId64 " 
reads from %" PRId64 " amplicons\n", n_reads, n_amplicons); } fasta_close(input_handle); } vsearch-2.30.1/src/rereplicate.h000066400000000000000000000047551506776541700165240ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto rereplicate(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/results.cc000066400000000000000000000756041506776541700160650ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "attributes.h" #include "showalign.h" #include "tax.h" #include "userfields.h" #include "utils/cigar.hpp" #include "utils/maps.hpp" #include "utils/span.hpp" #include "utils/taxonomic_fields.h" #include "xstring.h" #include // std::max #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::snprintf, std::sscanf #include // std::strlen, std::strncmp // anonymous namespace: limit visibility and usage to this translation unit namespace { auto check_if_perfect_match(char const * opt_cluster_fast, struct hit const * hits) -> bool { if (opt_cluster_fast != nullptr) { /* cluster_fast */ /* use '=' for identical sequences, ignoring terminal gaps */ return (hits->matches == hits->internal_alignmentlength); } /* cluster_size, cluster_smallmem, cluster_unoise */ /* usearch_global, search_exact, allpairs_global */ /* use '=' for strictly identical sequences */ return (hits->matches == hits->nwalignmentlength); } } // end of anonymous namespace auto results_show_fastapairs_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, char const * qsequence, char const * qsequence_rc) -> void { /* http://www.drive5.com/usearch/manual/fastapairs.html */ if (hits == nullptr) { return; } auto const * query = (hits->strand != 0) ? 
qsequence_rc : qsequence; auto const qrow = align_getrow(Span{query, std::strlen(query)}, Span{hits->nwalignment, std::strlen(hits->nwalignment)}, hits->nwalignmentlength, false); fasta_print_general(output_handle, nullptr, &qrow[hits->trim_q_left + hits->trim_t_left], hits->internal_alignmentlength, query_head, strlen(query_head), 0, 0, -1.0, -1, -1, nullptr, 0.0); auto const trow = align_getrow(Span{db_getsequence(hits->target), db_getsequencelen(hits->target)}, Span{hits->nwalignment, std::strlen(hits->nwalignment)}, hits->nwalignmentlength, true); fasta_print_general(output_handle, nullptr, &trow[hits->trim_q_left + hits->trim_t_left], hits->internal_alignmentlength, db_getheader(hits->target), db_getheaderlen(hits->target), 0, 0, -1.0, -1, -1, nullptr, 0.0); fprintf(output_handle, "\n"); } auto results_show_qsegout_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, char const * qsequence, int64_t const qseqlen, char const * qsequence_rc) -> void { if (hits == nullptr) { return; } char const * qseg = ((hits->strand != 0) ? 
qsequence_rc : qsequence) + hits->trim_q_left; int const qseglen = qseqlen - hits->trim_q_left - hits->trim_q_right; fasta_print_general(output_handle, nullptr, qseg, qseglen, query_head, strlen(query_head), 0, 0, -1.0, -1, -1, nullptr, 0.0); } auto results_show_tsegout_one(std::FILE * output_handle, struct hit const * hits) -> void { if (hits == nullptr) { return; } auto const * tseg = db_getsequence(hits->target) + hits->trim_t_left; int const tseglen = db_getsequencelen(hits->target) - hits->trim_t_left - hits->trim_t_right; fasta_print_general(output_handle, nullptr, tseg, tseglen, db_getheader(hits->target), db_getheaderlen(hits->target), 0, 0, -1.0, -1, -1, nullptr, 0.0); } auto results_show_blast6out_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, int64_t const qseqlen) -> void { /* http://www.drive5.com/usearch/manual/blast6out.html query label target label percent identity alignment length number of mismatches number of gap opens 1-based position of start in query 1-based position of end in query 1-based position of start in target 1-based position of end in target E-value bit score Note that USEARCH shows 13 fields when there is no hit, but only 12 when there is a hit. Fixed in VSEARCH. */ if (hits == nullptr) { fprintf(output_handle, "%s\t*\t0.0\t0\t0\t0\t0\t0\t0\t0\t-1\t0\n", query_head); return; } // if 'hp->strand' then 'minus strand' else 'plus strand' int const qstart = (hits->strand != 0) ? qseqlen : 1; int const qend = (hits->strand != 0) ? 
1 : qseqlen; fprintf(output_handle, "%s\t%s\t%.1f\t%d\t%d\t%d\t%d\t%d\t%d\t%" PRIu64 "\t%d\t%d\n", query_head, db_getheader(hits->target), hits->id, hits->internal_alignmentlength, hits->mismatches, hits->internal_gaps, qstart, qend, 1, db_getsequencelen(hits->target), -1, 0); } auto results_show_uc_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, int64_t const qseqlen, int const clusterno) -> void { /* http://www.drive5.com/usearch/manual/ucout.html Columns: H/N cluster no (0-based) (target sequence no) sequence length (query) percent identity strand: + or - 0 0 compressed alignment, e.g. 9I92M14D, or "=" if perfect alignment query label target label */ if (hits == nullptr) { fprintf(output_handle, "N\t*\t*\t*\t.\t*\t*\t*\t%s\t*\n", query_head); return; } auto const is_perfect_match = check_if_perfect_match(opt_cluster_fast, hits); fprintf(output_handle, "H\t%d\t%" PRId64 "\t%.1f\t%c\t0\t0\t%s\t", clusterno, qseqlen, hits->id, (hits->strand != 0) ? '-' : '+', is_perfect_match ? 
"=" : hits->nwalignment); header_fprint_strip(output_handle, query_head, strlen(query_head), opt_xsize, opt_xee, opt_xlength); fprintf(output_handle, "\t"); header_fprint_strip(output_handle, db_getheader(hits->target), db_getheaderlen(hits->target), opt_xsize, opt_xee, opt_xlength); fprintf(output_handle, "\n"); } auto results_show_userout_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, char const * qsequence, int64_t const qseqlen, char const * qsequence_rc) -> void { /* http://drive5.com/usearch/manual/userout.html qlo, qhi, tlo, thi and raw are given more meaningful values here */ for (auto c = 0; c < userfields_requested_count; ++c) { if (c != 0) { fprintf(output_handle, "\t"); } auto const field = userfields_requested[c]; char * tsequence = nullptr; int64_t tseqlen = 0; char const * t_head = nullptr; if (hits != nullptr) { tsequence = db_getsequence(hits->target); tseqlen = db_getsequencelen(hits->target); t_head = db_getheader(hits->target); } switch (field) { case 0: /* query */ fprintf(output_handle, "%s", query_head); break; case 1: /* target */ fprintf(output_handle, "%s", (hits != nullptr) ? t_head : "*"); break; case 2: /* evalue */ fprintf(output_handle, "-1"); break; case 3: /* id */ fprintf(output_handle, "%.1f", (hits != nullptr) ? hits->id : 0.0); break; case 4: /* pctpv */ fprintf(output_handle, "%.1f", ((hits != nullptr) and (hits->internal_alignmentlength > 0)) ? 100.0 * hits->matches / hits->internal_alignmentlength : 0.0); break; case 5: /* pctgaps */ fprintf(output_handle, "%.1f", ((hits != nullptr) and (hits->internal_alignmentlength > 0)) ? 100.0 * hits->internal_indels / hits->internal_alignmentlength : 0.0); break; case 6: /* pairs */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->matches + hits->mismatches : 0); break; case 7: /* gaps */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->internal_indels : 0); break; case 8: /* qlo */ fprintf(output_handle, "%" PRId64, (hits != nullptr) ? 
((hits->strand != 0) ? qseqlen : 1) : 0); break; case 9: /* qhi */ fprintf(output_handle, "%" PRId64, (hits != nullptr) ? ((hits->strand != 0) ? 1 : qseqlen) : 0); break; case 10: /* tlo */ fprintf(output_handle, "%d", (hits != nullptr) ? 1 : 0); break; case 11: /* thi */ fprintf(output_handle, "%" PRId64, tseqlen); break; case 12: /* pv */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->matches : 0); break; case 13: /* ql */ fprintf(output_handle, "%" PRId64, qseqlen); break; case 14: /* tl */ fprintf(output_handle, "%" PRId64, (hits != nullptr) ? tseqlen : 0); break; case 15: /* qs */ fprintf(output_handle, "%" PRId64, qseqlen); break; case 16: /* ts */ fprintf(output_handle, "%" PRId64, (hits != nullptr) ? tseqlen : 0); break; case 17: /* alnlen */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->internal_alignmentlength : 0); break; case 18: /* opens */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->internal_gaps : 0); break; case 19: /* exts */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->internal_indels - hits->internal_gaps : 0); break; case 20: /* raw */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->nwscore : 0); break; case 21: /* bits */ fprintf(output_handle, "%d", 0); break; case 22: /* aln */ if (hits != nullptr) { print_uncompressed_cigar(output_handle, Span{hits->nwalignment, std::strlen(hits->nwalignment)}); } break; case 23: /* caln */ if (hits != nullptr) { fprintf(output_handle, "%s", hits->nwalignment); } break; case 24: /* qstrand */ if (hits != nullptr) { fprintf(output_handle, "%c", (hits->strand != 0) ? '-' : '+'); } break; case 25: /* tstrand */ if (hits != nullptr) { fprintf(output_handle, "%c", '+'); } break; case 26: /* qrow */ if (hits != nullptr) { auto const * query = (hits->strand != 0) ? 
qsequence_rc : qsequence; auto const qrow = align_getrow(Span{query, std::strlen(query)}, Span{hits->nwalignment, std::strlen(hits->nwalignment)}, hits->nwalignmentlength, false); fprintf(output_handle, "%.*s", hits->internal_alignmentlength, &qrow[hits->trim_q_left + hits->trim_t_left]); } break; case 27: /* trow */ if (hits != nullptr) { auto const trow = align_getrow(Span{tsequence, std::strlen(tsequence)}, Span{hits->nwalignment, std::strlen(hits->nwalignment)}, hits->nwalignmentlength, true); fprintf(output_handle, "%.*s", hits->internal_alignmentlength, &trow[hits->trim_q_left + hits->trim_t_left]); } break; case 28: /* qframe */ fprintf(output_handle, "+0"); break; case 29: /* tframe */ fprintf(output_handle, "+0"); break; case 30: /* mism */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->mismatches : 0); break; case 31: /* ids */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->matches : 0); break; case 32: /* qcov */ fprintf(output_handle, "%.1f", (hits != nullptr) ? 100.0 * (hits->matches + hits->mismatches) / qseqlen : 0.0); break; case 33: /* tcov */ fprintf(output_handle, "%.1f", (hits != nullptr) ? 100.0 * (hits->matches + hits->mismatches) / tseqlen : 0.0); break; case 34: /* id0 */ fprintf(output_handle, "%.1f", (hits != nullptr) ? hits->id0 : 0.0); break; case 35: /* id1 */ fprintf(output_handle, "%.1f", (hits != nullptr) ? hits->id1 : 0.0); break; case 36: /* id2 */ fprintf(output_handle, "%.1f", (hits != nullptr) ? hits->id2 : 0.0); break; case 37: /* id3 */ fprintf(output_handle, "%.1f", (hits != nullptr) ? hits->id3 : 0.0); break; case 38: /* id4 */ fprintf(output_handle, "%.1f", (hits != nullptr) ? hits->id4 : 0.0); break; /* new internal alignment coordinates */ case 39: /* qilo */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->trim_q_left + 1 : 0); break; case 40: /* qihi */ fprintf(output_handle, "%" PRId64, (hits != nullptr) ? 
qseqlen - hits->trim_q_right : 0); break; case 41: /* tilo */ fprintf(output_handle, "%d", (hits != nullptr) ? hits->trim_t_left + 1 : 0); break; case 42: /* tihi */ fprintf(output_handle, "%" PRId64, (hits != nullptr) ? tseqlen - hits->trim_t_right : 0); break; } } fprintf(output_handle, "\n"); } auto results_show_lcaout(std::FILE * output_handle, struct hit const * hits, int const hitcount, char const * query_head) -> void { /* Output last common ancestor (LCA) of the hits, in a similar way to the Sintax command */ /* Use a modified Boyer-Moore majority voting algorithm at each taxonomic level to find the most common name at each level */ fprintf(output_handle, "%s\t", query_head); std::array votes {{}}; std::array cand; cand.fill(-1); std::array, tax_levels> cand_level_start {{}}; std::array, tax_levels> cand_level_len {{}}; std::array level_match {{}}; auto const top_hit_id = hits[0].id; auto tophitcount = 0; for (auto t = 0; t < hitcount; ++t) { struct hit const * hp = hits + t; if ((opt_top_hits_only != 0) and (hp->id < top_hit_id)) { break; } ++tophitcount; int const seqno = hp->target; std::array new_level_start {{}}; // refactoring: std::array std::array new_level_len {{}}; tax_split(seqno, new_level_start.data(), new_level_len.data()); for (auto k = 0; k < tax_levels; ++k) { if (votes[k] == 0) { cand[k] = seqno; votes[k] = 1; for (auto j = 0; j < tax_levels; ++j) { cand_level_start[k][j] = new_level_start[j]; cand_level_len[k][j] = new_level_len[j]; } } else { auto match = true; for (auto j = 0; j <= k; ++j) { if ((new_level_len[j] != cand_level_len[k][j]) or (std::strncmp(db_getheader(cand[k]) + cand_level_start[k][j], db_getheader(seqno) + new_level_start[j], new_level_len[j]) != 0)) { match = false; break; } } if (match) { ++votes[k]; } else { --votes[k]; } } } } /* count actual matches to the candidate at each level */ for (auto t = 0; t < tophitcount; ++t) { auto const seqno = hits[t].target; std::array new_level_start {{}}; std::array new_level_len 
{{}}; tax_split(seqno, new_level_start.data(), new_level_len.data()); for (auto k = 0; k < tax_levels; ++k) { auto match = true; for (auto j = 0; j <= k; ++j) { if ((new_level_len[j] != cand_level_len[k][j]) or (std::strncmp(db_getheader(cand[k]) + cand_level_start[k][j], db_getheader(seqno) + new_level_start[j], new_level_len[j]) != 0)) { match = false; break; } } if (match) { ++level_match[k]; } } } /* output results */ if (tophitcount == 0) { fprintf(output_handle, "\n"); return; } auto comma = false; for (auto j = 0; j < tax_levels; ++j) { if (1.0 * level_match[j] / tophitcount < opt_lca_cutoff) { break; } if (cand_level_len[j][j] > 0) { fprintf(output_handle, "%s%c:%.*s", (comma ? "," : ""), taxonomic_fields[j], cand_level_len[j][j], db_getheader(cand[j]) + cand_level_start[j][j]); comma = true; } } fprintf(output_handle, "\n"); } auto results_show_alnout(std::FILE * output_handle, struct hit const * hits, int const hitcount, char const * query_head, char const * qsequence, int64_t const qseqlen) -> void { /* http://drive5.com/usearch/manual/alnout.html */ if (hitcount == 0) { if (opt_output_no_hits != 0) { fprintf(output_handle, "\n"); fprintf(output_handle,"Query >%s\n", query_head); fprintf(output_handle, "No hits\n"); } return; } fprintf(output_handle, "\n"); fprintf(output_handle,"Query >%s\n", query_head); fprintf(output_handle," %%Id TLen Target\n"); auto const top_hit_id = hits[0].id; for (auto t = 0; t < hitcount; ++t) { auto const * hp = hits + t; if ((opt_top_hits_only != 0) and (hp->id < top_hit_id)) { break; } fprintf(output_handle,"%3.0f%% %6" PRIu64 " %s\n", hp->id, db_getsequencelen(hp->target), db_getheader(hp->target)); } for (auto t = 0; t < hitcount; ++t) { auto const * hp = hits + t; if ((opt_top_hits_only != 0) and (hp->id < top_hit_id)) { break; } fprintf(output_handle,"\n"); auto * dseq = db_getsequence(hp->target); int64_t const dseqlen = db_getsequencelen(hp->target); auto const qlenlen = snprintf(nullptr, 0, "%" PRId64, qseqlen); 
auto const tlenlen = snprintf(nullptr, 0, "%" PRId64, dseqlen); auto const numwidth = std::max(qlenlen, tlenlen); fprintf(output_handle," Query %*" PRId64 "nt >%s\n", numwidth, qseqlen, query_head); fprintf(output_handle,"Target %*" PRId64 "nt >%s\n", numwidth, dseqlen, db_getheader(hp->target)); int const rowlen = (opt_rowlen == 0) ? qseqlen + dseqlen : opt_rowlen; align_show(output_handle, qsequence, qseqlen, hp->trim_q_left, "Qry", dseq, dseqlen, hp->trim_t_left, "Tgt", hp->nwalignment + hp->trim_aln_left, strlen(hp->nwalignment) - hp->trim_aln_left - hp->trim_aln_right, numwidth, 3, rowlen, hp->strand); fprintf(output_handle, "\n%d cols, %d ids (%3.1f%%), %d gaps (%3.1f%%)\n", hp->internal_alignmentlength, hp->matches, hp->id, hp->internal_indels, hp->internal_alignmentlength > 0 ? 100.0 * hp->internal_indels / hp->internal_alignmentlength : 0.0); } } auto build_sam_strings(char const * alignment, char const * queryseq, char const * targetseq, xstring & cigar, xstring & md) -> void { /* convert cigar to sam format: add "1" to operations without run length flip direction of indels in cigar string build MD-string with substitutions */ cigar.clear(); md.clear(); auto const * p = alignment; auto const * e = p + strlen(p); auto qpos = 0; auto tpos = 0; auto matched = 0; auto flag = false; /* 1: MD string ends with a number */ while (p < e) { auto run = 1; auto scanned = 0; sscanf(p, "%d%n", &run, &scanned); p += scanned; auto const op = *p; ++p; switch (op) { case 'M': cigar.add_d(run); cigar.add_c('M'); for (auto i = 0; i < run; ++i) { if (is_same_4bit(queryseq[qpos], targetseq[tpos])) { ++matched; } else { if (not flag) { md.add_d(matched); matched = 0; flag = true; } md.add_c(targetseq[tpos]); flag = false; } ++qpos; ++tpos; } break; case 'D': cigar.add_d(run); cigar.add_c('I'); qpos += run; break; case 'I': cigar.add_d(run); cigar.add_c('D'); if (not flag) { md.add_d(matched); matched = 0; flag = true; } md.add_c('^'); for (auto i = 0; i < run; ++i) { 
md.add_c(targetseq[tpos]); ++tpos; } flag = false; break; } } if (not flag) { md.add_d(matched); matched = 0; flag = true; } } auto results_show_samheader(std::FILE * output_handle, char const * cmdline, char const * dbname) -> void { if ((opt_samout != nullptr) and opt_samheader) { fprintf(output_handle, "@HD\tVN:1.0\tSO:unsorted\tGO:query\n"); std::array md5hex; for (uint64_t i = 0; i < db_getsequencecount(); ++i) { get_hex_seq_digest_md5(md5hex.data(), db_getsequence(i), db_getsequencelen(i)); fprintf(output_handle, "@SQ\tSN:%s\tLN:%" PRIu64 "\tM5:%s\tUR:file:%s\n", db_getheader(i), db_getsequencelen(i), md5hex.data(), dbname); } fprintf(output_handle, "@PG\tID:%s\tVN:%s\tCL:%s\n", PROG_NAME, PROG_VERSION, cmdline); } } auto results_show_samout(std::FILE * output_handle, struct hit const * hits, int const hitcount, char const * query_head, char const * qsequence, char const * qsequence_rc) -> void { /* SAM format output http://samtools.github.io/hts-specs/SAMv1.pdf http://www.drive5.com/usearch/manual/sam_files.html http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output http://davetang.org/muse/2011/01/28/perl-and-sam/ 1: qname, query template name 2: flag, bitwise flag (12 bits) (0x004=unmapped, 0x010=rev strand, 0x100 sec. alignment) 3: rname, reference sequence name 4: pos, 1-based leftmost mapping position (1) 5: mapq, mapping quality (255) 6: cigar, cigar string (MID) 7: rnext, ref name of next/paired read (*) 8: pnest, position of next/paired read (0) 9: tlen, obs template length (target length) 10: seq, segment of sequence 11: qual, ascii of phred based quality+33 (*) 12: optional tags (tag:type:value) Optional tags AS, XN, XM, XO, XG, NM, MD and YT used in usearch8. Usearch8: AS:i:? alignment score (i.e percent identity) XN:i:? next best alignment score (always 0?) XM:i:? number of mismatches XO:i:? number of gap opens (excluding terminal gaps) XG:i:? number of gap extensions (excluding terminal gaps) NM:i:? 
edit distance (sum of XM and XG) MD:Z:? variant string YT:Z:UU string representing alignment type */ if (hitcount == 0) { if (opt_output_no_hits != 0) { fprintf(output_handle, "%s\t%u\t%s\t%" PRIu64 "\t%u\t%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%s\t%s\n", query_head, 0x04, "*", (uint64_t) 0, 255, "*", "*", (uint64_t) 0, (uint64_t) 0, qsequence, "*"); } return; } auto const top_hit_id = hits[0].id; for (auto t = 0; t < hitcount; ++t) { auto const * hp = hits + t; if ((opt_top_hits_only != 0) and (hp->id < top_hit_id)) { break; } xstring cigar; xstring md; build_sam_strings(hp->nwalignment, (hp->strand != 0) ? qsequence_rc : qsequence, db_getsequence(hp->target), cigar, md); fprintf(output_handle, "%s\t%u\t%s\t%" PRIu64 "\t%u\t%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%s\t%s\t" "AS:i:%.0f\tXN:i:%d\tXM:i:%d\tXO:i:%d\t" "XG:i:%d\tNM:i:%d\tMD:Z:%s\tYT:Z:%s\n", query_head, (0x10 * hp->strand) | (t > 0 ? 0x100 : 0), db_getheader(hp->target), (uint64_t) 1, 255, cigar.c_str(), "*", (uint64_t) 0, (uint64_t) 0, (hp->strand != 0) ? qsequence_rc : qsequence, "*", hp->id, 0, hp->mismatches, hp->internal_gaps, hp->internal_indels, hp->mismatches + hp->internal_indels, md.c_str(), "UU"); } } auto clean_up() -> void { if (userfields_requested != nullptr) { xfree(userfields_requested); } userfields_requested = nullptr; } vsearch-2.30.1/src/results.h000066400000000000000000000117511506776541700157200ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include // std::FILE #include // int64_t auto results_show_alnout(std::FILE * output_handle, struct hit const * hits, int hitcount, char const * query_head, char const * qsequence, int64_t qseqlen) -> void; auto results_show_lcaout(std::FILE * output_handle, struct hit const * hits, int hitcount, char const * query_head) -> void; auto results_show_blast6out_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, int64_t qseqlen) -> void; auto results_show_uc_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, int64_t qseqlen, int clusterno) -> void; auto results_show_userout_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, char const * qsequence, int64_t qseqlen, char const * qsequence_rc) -> void; auto results_show_fastapairs_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, char const * qsequence, char const * qsequence_rc) -> void; auto results_show_qsegout_one(std::FILE * output_handle, struct hit const * hits, char const * query_head, char const * qsequence, int64_t qseqlen, char const * qsequence_rc) -> void; auto results_show_tsegout_one(std::FILE * output_handle, struct hit const * hits) -> void; auto results_show_samheader(std::FILE * output_handle, char const * cmdline, char const * dbname) -> void; auto results_show_samout(std::FILE * output_handle, struct hit const * hits, int hitcount, char const * query_head, char const * qsequence, char const * qsequence_rc) -> void; auto clean_up() -> void; vsearch-2.30.1/src/search.cc000066400000000000000000000663171506776541700156320ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "align_simd.h" #include "dbindex.h" #include "mask.h" #include "minheap.h" #include "otutable.h" #include "udb.h" #include "unique.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/xpthread.hpp" #include // std::min #include // macros PRIu64 and PRId64 #include // uint64_t, int64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::strlen, std::memset, std::strcpy #include #include static struct searchinfo_s * si_plus; static struct searchinfo_s * si_minus; static pthread_t * pthread; /* global constants/data, no need for synchronization */ static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; static fastx_handle query_fastx_h; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static int qmatches; static uint64_t qmatches_abundance; static int queries; static uint64_t queries_abundance; static uint64_t * dbmatched; static FILE * fp_samout = nullptr; static FILE * fp_alnout = nullptr; static FILE * fp_userout = nullptr; static FILE * fp_blast6out = nullptr; static FILE * fp_uc = nullptr; static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; static FILE * fp_dbmatched = nullptr; static FILE * fp_dbnotmatched = nullptr; static FILE * fp_otutabout = nullptr; static FILE * 
fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; static FILE * fp_lcaout = nullptr; static FILE * fp_qsegout = nullptr; static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; auto search_output_results(std::vector const & hits, char const * query_head, int qseqlen, char const * qsequence, char const * qsequence_rc, int qsize) -> void { xpthread_mutex_lock(&mutex_output); /* show results */ auto const toreport = std::min(opt_maxhits, static_cast(hits.size())); if (fp_alnout != nullptr) { results_show_alnout(fp_alnout, hits.data(), toreport, query_head, qsequence, qseqlen); } if (fp_lcaout != nullptr) { results_show_lcaout(fp_lcaout, hits.data(), toreport, query_head); } if (fp_samout != nullptr) { results_show_samout(fp_samout, hits.data(), toreport, query_head, qsequence, qsequence_rc); } if (toreport != 0) { double const top_hit_id = hits[0].id; if ((opt_otutabout != nullptr) || (opt_mothur_shared_out != nullptr) || (opt_biomout != nullptr)) { otutable_add(query_head, db_getheader(hits[0].target), qsize); } for (auto t = 0; t < toreport; t++) { auto const * hp = &hits[t]; if ((opt_top_hits_only != 0) && (hp->id < top_hit_id)) { break; } if (fp_fastapairs != nullptr) { results_show_fastapairs_one(fp_fastapairs, hp, query_head, qsequence, qsequence_rc); } if (fp_qsegout != nullptr) { results_show_qsegout_one(fp_qsegout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout != nullptr) { results_show_tsegout_one(fp_tsegout, hp); } if (fp_uc != nullptr) { if ((t==0) || (opt_uc_allhits != 0)) { results_show_uc_one(fp_uc, hp, query_head, qseqlen, hp->target); } } if (fp_userout != nullptr) { results_show_userout_one(fp_userout, hp, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, hp, query_head, qseqlen); } } } else { if ((opt_otutabout != nullptr) || (opt_mothur_shared_out != nullptr) || (opt_biomout != nullptr)) { 
otutable_add(query_head, nullptr, qsize); } if (fp_uc != nullptr) { results_show_uc_one(fp_uc, nullptr, query_head, qseqlen, 0); } if (opt_output_no_hits != 0) { if (fp_userout != nullptr) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } } if (not hits.empty()) { count_matched++; if (opt_matched != nullptr) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_matched, -1.0, -1, -1, nullptr, 0.0); } } else { count_notmatched++; if (opt_notmatched != nullptr) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } /* update matching db sequences */ for (auto const & hit : hits) { if (hit.accepted or hit.weak) { dbmatched[hit.target] += opt_sizein ? qsize : 1; } } xpthread_mutex_unlock(&mutex_output); } auto search_query(int64_t t) -> int { for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? si_minus + t : si_plus + t; /* mask query */ if (opt_qmask == MASK_DUST) { dust(si->qsequence, si->qseqlen); } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask != 0)) { hardmask(si->qsequence, si->qseqlen); } /* perform search */ search_onequery(si, opt_qmask); } std::vector hits; search_joinhits(si_plus + t, opt_strand > 1 ? si_minus + t : nullptr, hits); search_output_results(hits, si_plus[t].query_head, si_plus[t].qseqlen, si_plus[t].qsequence, opt_strand > 1 ? 
si_minus[t].qsequence : nullptr, si_plus[t].qsize); /* free memory for alignment strings */ for (auto const & hit : hits) { if (hit.aligned) { xfree(hit.nwalignment); } } return static_cast(hits.size()); } auto search_thread_run(int64_t t) -> void { while (true) { xpthread_mutex_lock(&mutex_input); if (fastx_next(query_fastx_h, (opt_notrunclabels == 0), chrmap_no_change_vector.data())) { char const * qhead = fastx_get_header(query_fastx_h); int const query_head_len = fastx_get_header_length(query_fastx_h); char const * qseq = fastx_get_sequence(query_fastx_h); int const qseqlen = fastx_get_sequence_length(query_fastx_h); int const query_no = fastx_get_seqno(query_fastx_h); int const qsize = fastx_get_abundance(query_fastx_h); for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? si_minus + t : si_plus + t; si->query_head_len = query_head_len; si->qseqlen = qseqlen; si->query_no = query_no; si->qsize = qsize; si->strand = s; /* allocate more memory for header and sequence, if necessary */ if (si->query_head_len + 1 > si->query_head_alloc) { si->query_head_alloc = si->query_head_len + 2001; si->query_head = (char *) xrealloc(si->query_head, (size_t) (si->query_head_alloc)); } if (si->qseqlen + 1 > si->seq_alloc) { si->seq_alloc = si->qseqlen + 2001; si->qsequence = (char *) xrealloc(si->qsequence, (size_t) (si->seq_alloc)); } } /* plus strand: copy header and sequence */ strcpy(si_plus[t].query_head, qhead); strcpy(si_plus[t].qsequence, qseq); /* get progress as amount of input file read */ uint64_t const progress = fastx_get_position(query_fastx_h); /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* minus strand: copy header and reverse complementary sequence */ if (opt_strand > 1) { strcpy(si_minus[t].query_head, si_plus[t].query_head); reverse_complement(si_minus[t].qsequence, si_plus[t].qsequence, si_plus[t].qseqlen); } int const match = search_query(t); /* lock mutex for update of global data and output */ 
xpthread_mutex_lock(&mutex_output); /* update stats */ ++queries; queries_abundance += qsize; if (match != 0) { ++qmatches; qmatches_abundance += qsize; } /* show progress */ progress_update(progress); xpthread_mutex_unlock(&mutex_output); } else { xpthread_mutex_unlock(&mutex_input); break; } } } auto search_thread_init(struct searchinfo_s * si) -> void { /* thread specific initialiation */ si->uh = unique_init(); si->kmers = (count_t *) xmalloc((seqcount * sizeof(count_t)) + 32); si->m = minheap_init(tophits); si->hits = (struct hit *) xmalloc (sizeof(struct hit) * (tophits) * opt_strand); si->qsize = 1; si->query_head_alloc = 0; si->query_head = nullptr; si->seq_alloc = 0; si->qsequence = nullptr; si->s = search16_init(opt_match, opt_mismatch, opt_gap_open_query_left, opt_gap_open_target_left, opt_gap_open_query_interior, opt_gap_open_target_interior, opt_gap_open_query_right, opt_gap_open_target_right, opt_gap_extension_query_left, opt_gap_extension_target_left, opt_gap_extension_query_interior, opt_gap_extension_target_interior, opt_gap_extension_query_right, opt_gap_extension_target_right); } auto search_thread_exit(struct searchinfo_s * si) -> void { /* thread specific clean up */ search16_exit(si->s); unique_exit(si->uh); xfree(si->hits); minheap_exit(si->m); xfree(si->kmers); if (si->query_head != nullptr) { xfree(si->query_head); } if (si->qsequence != nullptr) { xfree(si->qsequence); } } auto search_thread_worker(void * vp) -> void * { auto t = (int64_t) vp; search_thread_run(t); return nullptr; } auto search_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { search_thread_init(si_plus + t); if (si_minus != nullptr) { search_thread_init(si_minus + t); } xpthread_create(pthread + t, &attr, search_thread_worker, (void *) 
(int64_t) t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); search_thread_exit(si_plus + t); if (si_minus != nullptr) { search_thread_exit(si_minus + t); } } xpthread_attr_destroy(&attr); } auto search_prep(char * cmdline, char * progheader) -> void { /* open output files */ if (opt_alnout != nullptr) { fp_alnout = fopen_output(opt_alnout); if (fp_alnout == nullptr) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_lcaout != nullptr) { fp_lcaout = fopen_output(opt_lcaout); if (fp_lcaout == nullptr) { fatal("Unable to open lca output file for writing"); } } if (opt_samout != nullptr) { fp_samout = fopen_output(opt_samout); if (fp_samout == nullptr) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout != nullptr) { fp_userout = fopen_output(opt_userout); if (fp_userout == nullptr) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out != nullptr) { fp_blast6out = fopen_output(opt_blast6out); if (fp_blast6out == nullptr) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_uc != nullptr) { fp_uc = fopen_output(opt_uc); if (fp_uc == nullptr) { fatal("Unable to open uc output file for writing"); } } if (opt_fastapairs != nullptr) { fp_fastapairs = fopen_output(opt_fastapairs); if (fp_fastapairs == nullptr) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout != nullptr) { fp_qsegout = fopen_output(opt_qsegout); if (fp_qsegout == nullptr) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout != nullptr) { fp_tsegout = fopen_output(opt_tsegout); if (fp_tsegout == nullptr) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched != nullptr) { fp_matched = fopen_output(opt_matched); if (fp_matched == nullptr) { fatal("Unable to open matched output file for 
writing"); } } if (opt_notmatched != nullptr) { fp_notmatched = fopen_output(opt_notmatched); if (fp_notmatched == nullptr) { fatal("Unable to open notmatched output file for writing"); } } if (opt_otutabout != nullptr) { fp_otutabout = fopen_output(opt_otutabout); if (fp_otutabout == nullptr) { fatal("Unable to open OTU table (text format) output file for writing"); } } if (opt_mothur_shared_out != nullptr) { fp_mothur_shared_out = fopen_output(opt_mothur_shared_out); if (fp_mothur_shared_out == nullptr) { fatal("Unable to open OTU table (mothur format) output file for writing"); } } if (opt_biomout != nullptr) { fp_biomout = fopen_output(opt_biomout); if (fp_biomout == nullptr) { fatal("Unable to open OTU table (biom 1.0 format) output file for writing"); } } /* check if it may be an UDB file */ bool const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); results_show_samheader(fp_samout, cmdline, opt_db); show_rusage(); seqcount = db_getsequencecount(); } else { db_read(opt_db, 0); results_show_samheader(fp_samout, cmdline, opt_db); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask != 0)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } /* tophits = the maximum number of hits we need to store */ if ((opt_maxrejects == 0) || (opt_maxrejects > seqcount)) { opt_maxrejects = seqcount; } if ((opt_maxaccepts == 0) || (opt_maxaccepts > seqcount)) { opt_maxaccepts = seqcount; } tophits = opt_maxrejects + opt_maxaccepts + MAXDELAYED; tophits = std::min(tophits, seqcount); } auto search_done() -> void { /* clean up, global */ dbindex_free(); db_free(); if (opt_lcaout != nullptr) { fclose(fp_lcaout); } if (opt_matched != nullptr) { fclose(fp_matched); } if (opt_notmatched != nullptr) { fclose(fp_notmatched); } if (opt_fastapairs != nullptr) { fclose(fp_fastapairs); } if (opt_qsegout != nullptr) { 
fclose(fp_qsegout); } if (opt_tsegout != nullptr) { fclose(fp_tsegout); } if (fp_uc != nullptr) { fclose(fp_uc); } if (fp_blast6out != nullptr) { fclose(fp_blast6out); } if (fp_userout != nullptr) { fclose(fp_userout); clean_up(); // free userfields allocation } if (fp_alnout != nullptr) { fclose(fp_alnout); } if (fp_samout != nullptr) { fclose(fp_samout); } show_rusage(); } auto usearch_global(struct Parameters const & parameters, char * cmdline, char * progheader) -> void { search_prep(cmdline, progheader); if (opt_dbmatched != nullptr) { fp_dbmatched = fopen_output(opt_dbmatched); if (fp_dbmatched == nullptr) { fatal("Unable to open dbmatched output file for writing"); } } if (opt_dbnotmatched != nullptr) { fp_dbnotmatched = fopen_output(opt_dbnotmatched); if (fp_dbnotmatched == nullptr) { fatal("Unable to open dbnotmatched output file for writing"); } } dbmatched = (uint64_t *) xmalloc(seqcount * sizeof(uint64_t *)); std::memset(dbmatched, 0, seqcount * sizeof(uint64_t *)); otutable_init(); /* prepare reading of queries */ qmatches = 0; qmatches_abundance = 0; queries = 0; queries_abundance = 0; query_fastx_h = fastx_open(parameters.opt_usearch_global); /* allocate memory for thread info */ si_plus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); if (opt_strand > 1) { si_minus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); } else { si_minus = nullptr; } pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t)); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); progress_init("Searching", fastx_get_size(query_fastx_h)); search_thread_worker_run(); progress_done(); xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); xfree(pthread); xfree(si_plus); if (si_minus != nullptr) { xfree(si_minus); } fastx_close(query_fastx_h); if (! 
opt_quiet) { fprintf(stderr, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); if (opt_sizein) { fprintf(stderr, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(stderr, "\n"); } } if (opt_log != nullptr) { fprintf(fp_log, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n"); if (opt_sizein) { fprintf(fp_log, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(fp_log, "\n"); } } // Add OTUs with no matches to OTU table if ((opt_otutabout != nullptr) || (opt_mothur_shared_out != nullptr) || (opt_biomout != nullptr)) { for (int64_t i = 0; i < seqcount; i++) { if (dbmatched[i] == 0U) { otutable_add(nullptr, db_getheader(i), 0); } } } if (opt_biomout != nullptr) { otutable_print_biomout(fp_biomout); fclose(fp_biomout); } if (opt_otutabout != nullptr) { otutable_print_otutabout(fp_otutabout); fclose(fp_otutabout); } if (opt_mothur_shared_out != nullptr) { otutable_print_mothur_shared_out(fp_mothur_shared_out); fclose(fp_mothur_shared_out); } otutable_done(); int count_dbmatched = 0; int count_dbnotmatched = 0; if ((opt_dbmatched != nullptr) || (opt_dbnotmatched != nullptr)) { for (int64_t i = 0; i < seqcount; i++) { if (dbmatched[i] != 0U) { count_dbmatched++; if (opt_dbmatched != nullptr) { fasta_print_general(fp_dbmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), dbmatched[i], count_dbmatched, -1.0, -1, -1, nullptr, 0.0); } } else { count_dbnotmatched++; if 
(opt_dbnotmatched != nullptr) { fasta_print_general(fp_dbnotmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), db_getabundance(i), count_dbnotmatched, -1.0, -1, -1, nullptr, 0.0); } } } } xfree(dbmatched); if (opt_dbmatched != nullptr) { fclose(fp_dbmatched); } if (opt_dbnotmatched != nullptr) { fclose(fp_dbnotmatched); } search_done(); } vsearch-2.30.1/src/search.h000066400000000000000000000050231506776541700154570ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto usearch_global(struct Parameters const & parameters, char * cmdline, char * progheader) -> void; vsearch-2.30.1/src/search_exact.cc000066400000000000000000000660641506776541700170150ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "mask.h" #include "otutable.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/xpthread.hpp" #include // std::min #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::strlen, std::memset, std::strcpy #include #include static struct searchinfo_s * si_plus = nullptr; static struct searchinfo_s * si_minus = nullptr; static pthread_t * pthread = nullptr; /* global constants/data, no need for synchronization */ static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; static fastx_handle query_fastx_h; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static int qmatches; static uint64_t qmatches_abundance; static int queries; static uint64_t queries_abundance; static uint64_t * dbmatched; static FILE * fp_samout = nullptr; static FILE * fp_alnout = nullptr; static FILE * fp_userout = nullptr; static FILE * fp_blast6out = nullptr; static FILE * fp_uc = nullptr; static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; static FILE * fp_dbmatched = nullptr; static FILE * fp_dbnotmatched = nullptr; static FILE * fp_otutabout = nullptr; static FILE * fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; static FILE * fp_qsegout = nullptr; static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; auto add_hit(struct searchinfo_s * si, uint64_t seqno) -> void { if (search_acceptable_unaligned(*si, seqno)) { struct hit * hp = si->hits + si->hit_count; si->hit_count++; hp->target = seqno; hp->strand = si->strand; hp->count = 0; hp->nwscore = si->qseqlen * opt_match; hp->nwdiff = 0; hp->nwgaps = 0; hp->nwindels = 0; hp->nwalignmentlength = si->qseqlen; hp->nwid = 100.0; hp->matches 
= si->qseqlen; hp->mismatches = 0; int const ret = xsprintf(&hp->nwalignment, "%dM", si->qseqlen); if ((ret == -1) || (hp->nwalignment == nullptr)) { fatal("Out of memory"); } hp->internal_alignmentlength = si->qseqlen; hp->internal_gaps = 0; hp->internal_indels = 0; hp->trim_q_left = 0; hp->trim_q_right = 0; hp->trim_t_left = 0; hp->trim_t_right = 0; hp->trim_aln_left = 0; hp->trim_aln_right = 0; hp->id = 100.0; hp->id0 = 100.0; hp->id1 = 100.0; hp->id2 = 100.0; hp->id3 = 100.0; hp->id4 = 100.0; hp->shortest = si->qseqlen; hp->longest = si->qseqlen; hp->aligned = true; hp->accepted = false; hp->rejected = false; hp->weak = false; (void) search_acceptable_aligned(*si, hp); } } auto search_exact_onequery(struct searchinfo_s * si) -> void { dbhash_search_info_s info; char const * seq = si->qsequence; uint64_t const seqlen = si->qseqlen; std::vector normalized(seqlen + 1); string_normalize(normalized.data(), seq, seqlen); si->hit_count = 0; int64_t ret = dbhash_search_first(normalized.data(), seqlen, & info); while (ret >= 0) { add_hit(si, ret); ret = dbhash_search_next(&info); } } auto search_exact_output_results(std::vector const & hits, char * query_head, int qseqlen, char * qsequence, char * qsequence_rc, int qsize) -> void { xpthread_mutex_lock(&mutex_output); /* show results */ auto const n_results_to_report = std::min(opt_maxhits, static_cast(hits.size())); if (fp_alnout != nullptr) { results_show_alnout(fp_alnout, hits.data(), n_results_to_report, query_head, qsequence, qseqlen); } if (fp_samout != nullptr) { results_show_samout(fp_samout, hits.data(), n_results_to_report, query_head, qsequence, qsequence_rc); } if (n_results_to_report != 0) { double const top_hit_id = hits[0].id; if ((opt_otutabout != nullptr) || (opt_mothur_shared_out != nullptr) || (opt_biomout != nullptr)) { otutable_add(query_head, db_getheader(hits[0].target), qsize); } for (int t = 0; t < n_results_to_report; t++) { auto const & hit = hits[t]; if ((opt_top_hits_only != 0) && (hit.id < 
top_hit_id)) { break; } if (fp_fastapairs != nullptr) { results_show_fastapairs_one(fp_fastapairs, &hit, query_head, qsequence, qsequence_rc); } if (fp_qsegout != nullptr) { results_show_qsegout_one(fp_qsegout, &hit, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_tsegout != nullptr) { results_show_tsegout_one(fp_tsegout, &hit); } if (fp_uc != nullptr) { if ((t == 0) || (opt_uc_allhits != 0)) { results_show_uc_one(fp_uc, &hit, query_head, qseqlen, hit.target); } } if (fp_userout != nullptr) { results_show_userout_one(fp_userout, &hit, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, &hit, query_head, qseqlen); } } } else { if ((opt_otutabout != nullptr) || (opt_mothur_shared_out != nullptr) || (opt_biomout != nullptr)) { otutable_add(query_head, nullptr, qsize); } if (fp_uc != nullptr) { results_show_uc_one(fp_uc, nullptr, query_head, qseqlen, 0); } if (opt_output_no_hits != 0) { if (fp_userout != nullptr) { results_show_userout_one(fp_userout, nullptr, query_head, qsequence, qseqlen, qsequence_rc); } if (fp_blast6out != nullptr) { results_show_blast6out_one(fp_blast6out, nullptr, query_head, qseqlen); } } } if (not hits.empty()) { ++count_matched; if (opt_matched != nullptr) { fasta_print_general(fp_matched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_matched, -1.0, -1, -1, nullptr, 0.0); } } else { ++count_notmatched; if (opt_notmatched != nullptr) { fasta_print_general(fp_notmatched, nullptr, qsequence, qseqlen, query_head, strlen(query_head), qsize, count_notmatched, -1.0, -1, -1, nullptr, 0.0); } } /* update matching db sequences */ for (auto const & hit : hits) { if (hit.accepted) { dbmatched[hit.target] += opt_sizein ? qsize : 1; } } xpthread_mutex_unlock(&mutex_output); } auto search_exact_query(int64_t t) -> int { for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? 
si_minus + t : si_plus + t; /* mask query */ if (opt_qmask == MASK_DUST) { dust(si->qsequence, si->qseqlen); } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask != 0)) { hardmask(si->qsequence, si->qseqlen); } /* perform search */ search_exact_onequery(si); } std::vector hits; search_joinhits(si_plus + t, opt_strand > 1 ? si_minus + t : nullptr, hits); search_exact_output_results(hits, si_plus[t].query_head, si_plus[t].qseqlen, si_plus[t].qsequence, opt_strand > 1 ? si_minus[t].qsequence : nullptr, si_plus[t].qsize); /* free memory for alignment strings */ for (auto const & hit : hits) { if (hit.aligned) { xfree(hit.nwalignment); } } return static_cast(hits.size()); } auto search_exact_thread_run(int64_t t) -> void { while (true) { xpthread_mutex_lock(&mutex_input); if (fastx_next(query_fastx_h, (opt_notrunclabels == 0), chrmap_no_change_vector.data())) { char const * qhead = fastx_get_header(query_fastx_h); int const query_head_len = fastx_get_header_length(query_fastx_h); char const * qseq = fastx_get_sequence(query_fastx_h); int const qseqlen = fastx_get_sequence_length(query_fastx_h); int const query_no = fastx_get_seqno(query_fastx_h); int const qsize = fastx_get_abundance(query_fastx_h); for (int s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? 
si_minus + t : si_plus + t; si->query_head_len = query_head_len; si->qseqlen = qseqlen; si->query_no = query_no; si->qsize = qsize; si->strand = s; /* allocate more memory for header and sequence, if necessary */ if (si->query_head_len + 1 > si->query_head_alloc) { si->query_head_alloc = si->query_head_len + 2001; si->query_head = (char *) xrealloc(si->query_head, (size_t) (si->query_head_alloc)); } if (si->qseqlen + 1 > si->seq_alloc) { si->seq_alloc = si->qseqlen + 2001; si->qsequence = (char *) xrealloc(si->qsequence, (size_t) (si->seq_alloc)); } } /* plus strand: copy header and sequence */ strcpy(si_plus[t].query_head, qhead); strcpy(si_plus[t].qsequence, qseq); /* get progress as amount of input file read */ uint64_t const progress = fastx_get_position(query_fastx_h); /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* minus strand: copy header and reverse complementary sequence */ if (opt_strand > 1) { strcpy(si_minus[t].query_head, si_plus[t].query_head); reverse_complement(si_minus[t].qsequence, si_plus[t].qsequence, si_plus[t].qseqlen); } int const match = search_exact_query(t); /* lock mutex for update of global data and output */ xpthread_mutex_lock(&mutex_output); /* update stats */ queries++; queries_abundance += qsize; if (match != 0) { qmatches++; qmatches_abundance += qsize; } /* show progress */ progress_update(progress); xpthread_mutex_unlock(&mutex_output); } else { xpthread_mutex_unlock(&mutex_input); break; } } } auto search_exact_thread_init(struct searchinfo_s * si) -> void { /* thread specific initialiation */ si->uh = nullptr; si->kmers = nullptr; si->m = nullptr; si->hits_v.resize(tophits * opt_strand); si->hits = si->hits_v.data(); si->qsize = 1; si->query_head_alloc = 0; si->query_head = nullptr; si->seq_alloc = 0; si->qsequence = nullptr; si->nw = nullptr; si->s = nullptr; } auto search_exact_thread_exit(struct searchinfo_s * si) -> void { /* thread specific clean up */ if (si->query_head != nullptr) { 
xfree(si->query_head); } if (si->qsequence != nullptr) { xfree(si->qsequence); } } auto search_exact_thread_worker(void * vp) -> void * { auto t = (int64_t) vp; search_exact_thread_run(t); return nullptr; } auto search_exact_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (int t = 0; t < opt_threads; t++) { search_exact_thread_init(si_plus + t); if (si_minus != nullptr) { search_exact_thread_init(si_minus + t); } xpthread_create(pthread + t, &attr, search_exact_thread_worker, (void *) (int64_t) t); } /* finish and clean up worker threads */ for (int t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); search_exact_thread_exit(si_plus + t); if (si_minus != nullptr) { search_exact_thread_exit(si_minus + t); } } xpthread_attr_destroy(&attr); } auto search_exact_prep(char const * cmdline, char const * progheader) -> void { /* open output files */ if (opt_alnout != nullptr) { fp_alnout = fopen_output(opt_alnout); if (fp_alnout == nullptr) { fatal("Unable to open alignment output file for writing"); } fprintf(fp_alnout, "%s\n", cmdline); fprintf(fp_alnout, "%s\n", progheader); } if (opt_samout != nullptr) { fp_samout = fopen_output(opt_samout); if (fp_samout == nullptr) { fatal("Unable to open SAM output file for writing"); } } if (opt_userout != nullptr) { fp_userout = fopen_output(opt_userout); if (fp_userout == nullptr) { fatal("Unable to open user-defined output file for writing"); } } if (opt_blast6out != nullptr) { fp_blast6out = fopen_output(opt_blast6out); if (fp_blast6out == nullptr) { fatal("Unable to open blast6-like output file for writing"); } } if (opt_uc != nullptr) { fp_uc = fopen_output(opt_uc); if (fp_uc == nullptr) { fatal("Unable to open uc output file for writing"); } } if (opt_fastapairs != nullptr) { fp_fastapairs = 
fopen_output(opt_fastapairs); if (fp_fastapairs == nullptr) { fatal("Unable to open fastapairs output file for writing"); } } if (opt_qsegout != nullptr) { fp_qsegout = fopen_output(opt_qsegout); if (fp_qsegout == nullptr) { fatal("Unable to open qsegout output file for writing"); } } if (opt_tsegout != nullptr) { fp_tsegout = fopen_output(opt_tsegout); if (fp_tsegout == nullptr) { fatal("Unable to open tsegout output file for writing"); } } if (opt_matched != nullptr) { fp_matched = fopen_output(opt_matched); if (fp_matched == nullptr) { fatal("Unable to open matched output file for writing"); } } if (opt_notmatched != nullptr) { fp_notmatched = fopen_output(opt_notmatched); if (fp_notmatched == nullptr) { fatal("Unable to open notmatched output file for writing"); } } if (opt_dbmatched != nullptr) { fp_dbmatched = fopen_output(opt_dbmatched); if (fp_dbmatched == nullptr) { fatal("Unable to open dbmatched output file for writing"); } } if (opt_dbnotmatched != nullptr) { fp_dbnotmatched = fopen_output(opt_dbnotmatched); if (fp_dbnotmatched == nullptr) { fatal("Unable to open dbnotmatched output file for writing"); } } if (opt_otutabout != nullptr) { fp_otutabout = fopen_output(opt_otutabout); if (fp_otutabout == nullptr) { fatal("Unable to open OTU table (text format) output file for writing"); } } if (opt_mothur_shared_out != nullptr) { fp_mothur_shared_out = fopen_output(opt_mothur_shared_out); if (fp_mothur_shared_out == nullptr) { fatal("Unable to open OTU table (mothur format) output file for writing"); } } if (opt_biomout != nullptr) { fp_biomout = fopen_output(opt_biomout); if (fp_biomout == nullptr) { fatal("Unable to open OTU table (biom 1.0 format) output file for writing"); } } db_read(opt_db, 0); results_show_samheader(fp_samout, cmdline, opt_db); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask != 0)) { hardmask_all(); } show_rusage(); seqcount = db_getsequencecount(); /* tophits = the maximum number of 
hits we need to store */ tophits = seqcount; dbmatched = (uint64_t *) xmalloc(seqcount * sizeof(uint64_t *)); std::memset(dbmatched, 0, seqcount * sizeof(uint64_t *)); dbhash_open(seqcount); dbhash_add_all(); } auto search_exact_done() -> void { /* clean up, global */ dbhash_close(); db_free(); xfree(dbmatched); if (opt_dbmatched != nullptr) { fclose(fp_dbmatched); } if (opt_dbnotmatched != nullptr) { fclose(fp_dbnotmatched); } if (opt_matched != nullptr) { fclose(fp_matched); } if (opt_notmatched != nullptr) { fclose(fp_notmatched); } if (opt_fastapairs != nullptr) { fclose(fp_fastapairs); } if (opt_qsegout != nullptr) { fclose(fp_qsegout); } if (opt_tsegout != nullptr) { fclose(fp_tsegout); } if (fp_uc != nullptr) { fclose(fp_uc); } if (fp_blast6out != nullptr) { fclose(fp_blast6out); } if (fp_userout != nullptr) { fclose(fp_userout); clean_up(); // free userfields allocation } if (fp_alnout != nullptr) { fclose(fp_alnout); } if (fp_samout != nullptr) { fclose(fp_samout); } show_rusage(); } auto search_exact(struct Parameters const & parameters, char const * cmdline, char const * progheader) -> void { search_exact_prep(cmdline, progheader); otutable_init(); /* prepare reading of queries */ qmatches = 0; qmatches_abundance = 0; queries = 0; queries_abundance = 0; query_fastx_h = fastx_open(parameters.opt_search_exact); /* allocate memory for thread info */ std::vector si_plus_v(parameters.opt_threads); si_plus = si_plus_v.data(); std::vector si_minus_v; if (parameters.opt_strand) { si_minus_v.resize(parameters.opt_threads); si_minus = si_minus_v.data(); } std::vector pthread_v(parameters.opt_threads); pthread = pthread_v.data(); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); progress_init("Searching", fastx_get_size(query_fastx_h)); search_exact_thread_worker_run(); progress_done(); xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); // si_plus not used below 
that point // si_minus not used below that point fastx_close(query_fastx_h); if (! parameters.opt_quiet) { fprintf(stderr, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(stderr, "\n"); if (parameters.opt_sizein) { fprintf(stderr, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(stderr, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(stderr, "\n"); } } if (parameters.opt_log != nullptr) { fprintf(fp_log, "Matching unique query sequences: %d of %d", qmatches, queries); if (queries > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries); } fprintf(fp_log, "\n"); if (parameters.opt_sizein) { fprintf(fp_log, "Matching total query sequences: %" PRIu64 " of %" PRIu64, qmatches_abundance, queries_abundance); if (queries_abundance > 0) { fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches_abundance / queries_abundance); } fprintf(fp_log, "\n"); } } // Add OTUs with no matches to OTU table if ((opt_otutabout != nullptr) || (opt_mothur_shared_out != nullptr) || (opt_biomout != nullptr)) { for (int64_t i = 0; i < seqcount; i++) { if (dbmatched[i] == 0U) { otutable_add(nullptr, db_getheader(i), 0); } } } if (fp_biomout != nullptr) { otutable_print_biomout(fp_biomout); fclose(fp_biomout); } if (fp_otutabout != nullptr) { otutable_print_otutabout(fp_otutabout); fclose(fp_otutabout); } if (fp_mothur_shared_out != nullptr) { otutable_print_mothur_shared_out(fp_mothur_shared_out); fclose(fp_mothur_shared_out); } otutable_done(); int count_dbmatched = 0; int count_dbnotmatched = 0; if ((opt_dbmatched != nullptr) || (opt_dbnotmatched != nullptr)) { for (int64_t i = 0; i < seqcount; i++) { if (dbmatched[i] != 0U) { ++count_dbmatched; if (opt_dbmatched != nullptr) { fasta_print_general(fp_dbmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), 
db_getheaderlen(i), dbmatched[i], count_dbmatched, -1.0, -1, -1, nullptr, 0.0); } } else { ++count_dbnotmatched; if (opt_dbnotmatched != nullptr) { fasta_print_general(fp_dbnotmatched, nullptr, db_getsequence(i), db_getsequencelen(i), db_getheader(i), db_getheaderlen(i), 0, count_dbnotmatched, -1.0, -1, -1, nullptr, 0.0); } } } } search_exact_done(); } vsearch-2.30.1/src/search_exact.h000066400000000000000000000050351506776541700166460ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto search_exact(struct Parameters const & parameters, char const * cmdline, char const * progheader) -> void; vsearch-2.30.1/src/searchcore.cc000066400000000000000000000630041506776541700164710ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "align_simd.h" #include "dbindex.h" #include "linmemalign.h" #include "minheap.h" #include "otutable.h" #include "unique.h" #include "utils/seqcmp.hpp" #include "utils/span.hpp" #include // std::count_if, std::min, std::max #include #include #include // macros PRIu64 and PRId64 #include // std::pow #include // int64_t, uint64_t #include // std::sscanf, std::size_t #include // std::qsort #include // std::strlen, std::memset, std::strcmp #include #include // anonymous namespace: limit visibility and usage to this translation unit namespace { auto make_hits_span(struct searchinfo_s const * search_info) -> Span { assert(search_info != nullptr); assert(search_info->hit_count >= 0); auto const length = static_cast(search_info->hit_count); return Span{search_info->hits, length}; } auto count_number_of_hits_to_keep(struct searchinfo_s const * search_info) -> std::size_t { if (search_info == nullptr) { return std::size_t{0}; } auto const hits = make_hits_span(search_info); return static_cast(std::count_if(hits.cbegin(), hits.cend(), [](struct hit const & hit) -> bool { return hit.accepted or hit.weak; })); } auto copy_over_hits_to_be_kept(std::vector & hits, struct searchinfo_s const * search_info) -> void { if (search_info == nullptr) { return; } for (auto const & hit : make_hits_span(search_info)) { if (hit.accepted or hit.weak) { hits.emplace_back(hit); } } } auto free_rejected_alignments(struct searchinfo_s const * search_info) -> void { if (search_info == nullptr) { return; } for (auto const & hit : make_hits_span(search_info)) { if (not (hit.accepted or hit.weak) and hit.aligned) { xfree(hit.nwalignment); } } } } // end of anonymous namespace /* per thread data */ inline auto hit_compare_byid_typed(struct hit const * lhs, struct hit const * rhs) -> int { /* Order: accepted, then rejected (weak) high id, then low id early target, then late target */ if (lhs->rejected < rhs->rejected) { return -1; } if (lhs->rejected > rhs->rejected) { 
return +1; } if (lhs->aligned > rhs->aligned) { return -1; } if (lhs->aligned < rhs->aligned) { return +1; } if (lhs->aligned == 0) { return 0; } if (lhs->id > rhs->id) { return -1; } if (lhs->id < rhs->id) { return +1; } if (lhs->target < rhs->target) { return -1; } if (lhs->target > rhs->target) { return +1; } return 0; } inline auto hit_compare_bysize_typed(struct hit const * lhs, struct hit const * rhs) -> int { // high abundance, then low abundance // high id, then low id // early target, then late target if (lhs->rejected < rhs->rejected) { return -1; } if (lhs->rejected > rhs->rejected) { return +1; } if (lhs->rejected == 1) { return 0; } if (lhs->aligned > rhs->aligned) { return -1; } if (lhs->aligned < rhs->aligned) { return +1; } if (lhs->aligned == 0) { return 0; } auto const lhs_abundance = db_getabundance(lhs->target); auto const rhs_abundance = db_getabundance(rhs->target); if (lhs_abundance > rhs_abundance) { return -1; } if (lhs_abundance < rhs_abundance) { return +1; } if (lhs->id > rhs->id) { return -1; } if (lhs->id < rhs->id) { return +1; } if (lhs->target < rhs->target) { return -1; } if (lhs->target > rhs->target) { return +1; } return 0; } auto hit_compare_byid(const void * lhs, const void * rhs) -> int { return hit_compare_byid_typed((struct hit *) lhs, (struct hit *) rhs); } auto hit_compare_bysize(const void * lhs, const void * rhs) -> int { return hit_compare_bysize_typed((struct hit *) lhs, (struct hit *) rhs); } auto search_enough_kmers(struct searchinfo_s const & searchinfo, unsigned int const count) -> bool { return (count >= opt_minwordmatches) or (count >= searchinfo.kmersamplecount); } auto search_topscores(struct searchinfo_s * searchinfo) -> void { /* Count the kmer hits in each database sequence and make a sorted list of a given number (th) of the database sequences with the highest number of matching kmers. These are stored in the min heap array. 
auto align_trim(struct hit * hit) -> void
{
  /*
    Trim terminal gaps off the alignment of an aligned hit and fill in
    the derived fields: the trim_* values, the internal_* counts, the
    identity values id0 to id4, and id (selected by opt_iddef).

    Assumes that the hit has been aligned: reads nwalignment (a CIGAR
    string), nwalignmentlength, nwindels, nwgaps, matches, mismatches,
    shortest and longest.
  */

  /* info for semi-global alignment (without gaps at ends) */
  hit->trim_aln_left = 0;
  hit->trim_q_left = 0;
  hit->trim_t_left = 0;
  hit->trim_aln_right = 0;
  hit->trim_q_right = 0;
  hit->trim_t_right = 0;

  /* left trim alignment */

  auto * p = hit->nwalignment;
  auto op = '\0';
  int64_t run = 0;
  if (*p != 0)
    {
      /* parse the optional run length of the first operation; if no
         number is present, sscanf stores nothing, so run stays 1 and
         scanlength stays 0 */
      run = 1;
      auto scanlength = 0;
      sscanf(p, "%" PRId64 "%n", &run, &scanlength);
      op = *(p + scanlength);
      if (op != 'M')
        {
          /* the alignment starts with a gap run: record the number of
             CIGAR characters it occupies (digits + operation) */
          hit->trim_aln_left = 1 + scanlength;
          if (op == 'D')
            {
              hit->trim_q_left = run;
            }
          else
            {
              hit->trim_t_left = run;
            }
        }
    }

  /* right trim alignment */

  auto * e = hit->nwalignment + strlen(hit->nwalignment);
  if (e > hit->nwalignment)
    {
      /* the last character is the last operation */
      p = e - 1;
      op = *p;
      if (op != 'M')
        {
          /* step back over the run length preceding the operation;
             any character <= '9' is treated as a digit.
             NOTE(review): assumes the CIGAR string holds only digits
             and operation letters - confirm */
          while ((p > hit->nwalignment) and (*(p - 1) <= '9'))
            {
              --p;
            }
          /* as above, run stays 1 if no number precedes the operation */
          run = 1;
          sscanf(p, "%" PRId64, &run);
          hit->trim_aln_right = e - p;
          if (op == 'D')
            {
              hit->trim_q_right = run;
            }
          else
            {
              hit->trim_t_right = run;
            }
        }
    }

  /* if the left gap run already spans the whole alignment, drop the
     right trim (presumably because both refer to the same run, which
     must not be subtracted twice below) */
  if (hit->trim_q_left >= hit->nwalignmentlength)
    {
      hit->trim_q_right = 0;
    }

  if (hit->trim_t_left >= hit->nwalignmentlength)
    {
      hit->trim_t_right = 0;
    }

  /* alignment length, indels and gaps with the terminal gaps removed */
  hit->internal_alignmentlength = hit->nwalignmentlength
    - hit->trim_q_left - hit->trim_t_left
    - hit->trim_q_right - hit->trim_t_right;

  hit->internal_indels = hit->nwindels
    - hit->trim_q_left - hit->trim_t_left
    - hit->trim_q_right - hit->trim_t_right;

  /* each trimmed end removes at most one gap opening */
  hit->internal_gaps = hit->nwgaps
    - ((hit->trim_q_left  + hit->trim_t_left)  > 0 ? 1 : 0)
    - ((hit->trim_q_right + hit->trim_t_right) > 0 ? 1 : 0);

  /* CD-HIT: matches relative to the shortest sequence */
  hit->id0 = hit->shortest > 0 ? 100.0 * hit->matches / hit->shortest : 0.0;
  /* all diffs: matches relative to the full alignment length */
  hit->id1 = hit->nwalignmentlength > 0 ?
    100.0 * hit->matches / hit->nwalignmentlength : 0.0;
  /* internal diffs: matches relative to the trimmed alignment length */
  hit->id2 = hit->internal_alignmentlength > 0 ?
    100.0 * hit->matches / hit->internal_alignmentlength : 0.0;
  /* Marine Biology Lab: mismatches and gaps relative to the longest
     sequence, clamped at zero */
  hit->id3 = std::max(0.0, 100.0 * (1.0 - (1.0 * (hit->mismatches + hit->nwgaps) / hit->longest)));
  /* BLAST: same formula as id1 */
  hit->id4 = hit->nwalignmentlength > 0 ?
    100.0 * hit->matches / hit->nwalignmentlength : 0.0;

  /* select the identity definition used for ranking; hit->id is not
     written for values of opt_iddef outside 0 to 4 */
  switch (opt_iddef)
    {
    case 0:
      hit->id = hit->id0;
      break;
    case 1:
      hit->id = hit->id1;
      break;
    case 2:
      hit->id = hit->id2;
      break;
    case 3:
      hit->id = hit->id3;
      break;
    case 4:
      hit->id = hit->id4;
      break;
    }
}
searchinfo.qseqlen <= opt_maxsl * dseqlen : dseqlen <= opt_maxsl * searchinfo.qseqlen) and /* idprefix */ ((searchinfo.qseqlen >= opt_idprefix) and (dseqlen >= opt_idprefix) and (seqcmp(qseq, dseq, opt_idprefix) == 0)) and /* idsuffix */ ((searchinfo.qseqlen >= opt_idsuffix) and (dseqlen >= opt_idsuffix) and (seqcmp(qseq + searchinfo.qseqlen - opt_idsuffix, dseq + dseqlen - opt_idsuffix, opt_idsuffix) == 0)) and /* self */ ((opt_self == 0) or (std::strcmp(searchinfo.query_head, dlabel) != 0)) and /* selfid */ ((opt_selfid == 0) or (searchinfo.qseqlen != dseqlen) or (seqcmp(qseq, dseq, searchinfo.qseqlen) != 0)) ); } auto search_acceptable_aligned(struct searchinfo_s const & searchinfo, struct hit * hit) -> bool { if (/* weak_id */ (hit->id >= 100.0 * opt_weak_id) and /* maxsubs */ (hit->mismatches <= opt_maxsubs) and /* maxgaps */ (hit->internal_gaps <= opt_maxgaps) and /* mincols */ (hit->internal_alignmentlength >= opt_mincols) and /* leftjust */ ((opt_leftjust == 0) or (hit->trim_q_left + hit->trim_t_left == 0)) and /* rightjust */ ((opt_rightjust == 0) or (hit->trim_q_right + hit->trim_t_right == 0)) and /* query_cov */ (hit->matches + hit->mismatches >= opt_query_cov * searchinfo.qseqlen) and /* target_cov */ (hit->matches + hit->mismatches >= opt_target_cov * db_getsequencelen(hit->target)) and /* maxid */ (hit->id <= 100.0 * opt_maxid) and /* mid */ (100.0 * hit->matches / (hit->matches + hit->mismatches) >= opt_mid) and /* maxdiffs */ (hit->mismatches + hit->internal_indels <= opt_maxdiffs)) { if (opt_cluster_unoise != nullptr) { const auto mismatches = hit->mismatches; auto const skew = 1.0 * searchinfo.qsize / db_getabundance(hit->target); auto const beta = 1.0 / std::pow(2, (1.0 * opt_unoise_alpha * mismatches) + 1); if (skew <= beta or mismatches == 0) { /* accepted */ hit->accepted = true; hit->weak = false; return true; } /* rejected, but weak hit */ hit->rejected = true; hit->weak = true; return false; } if (hit->id >= 100.0 * opt_id) { /* accepted 
*/ hit->accepted = true; hit->weak = false; return true; } /* rejected, but weak hit */ hit->rejected = true; hit->weak = true; return false; } /* rejected */ hit->rejected = true; hit->weak = false; return false; } auto align_delayed(struct searchinfo_s * searchinfo) -> void { /* compute global alignment */ std::array target_list {{}}; std::array nwscore_list {{}}; std::array nwalignmentlength_list {{}}; std::array nwmatches_list {{}}; std::array nwmismatches_list {{}}; std::array nwgaps_list {{}}; std::array nwcigar_list {{}}; int target_count = 0; for (int x = searchinfo->finalized; x < searchinfo->hit_count; x++) { struct hit const * hit = searchinfo->hits + x; if (not hit->rejected) { target_list[target_count++] = hit->target; } } if (target_count != 0) { search16(searchinfo->s, target_count, target_list.data(), nwscore_list.data(), nwalignmentlength_list.data(), nwmatches_list.data(), nwmismatches_list.data(), nwgaps_list.data(), nwcigar_list.data()); } int i = 0; for (int x = searchinfo->finalized; x < searchinfo->hit_count; x++) { /* maxrejects or maxaccepts reached - ignore remaining hits */ if ((searchinfo->rejects < opt_maxrejects) and (searchinfo->accepts < opt_maxaccepts)) { struct hit * hit = searchinfo->hits + x; if (hit->rejected) { searchinfo->rejects++; } else { int64_t const target = hit->target; int64_t nwscore = nwscore_list[i]; char * nwcigar = nullptr; int64_t nwalignmentlength = 0; int64_t nwmatches = 0; int64_t nwmismatches = 0; int64_t nwgaps = 0; int64_t const dseqlen = db_getsequencelen(target); if (nwscore == std::numeric_limits::max()) { /* In case the SIMD aligner cannot align, perform a new alignment with the linear memory aligner */ char * dseq = db_getsequence(target); if (nwcigar_list[i] != nullptr) { xfree(nwcigar_list[i]); } nwcigar = xstrdup(searchinfo->lma->align(searchinfo->qsequence, dseq, searchinfo->qseqlen, dseqlen)); searchinfo->lma->alignstats(nwcigar, searchinfo->qsequence, dseq, & nwscore, & nwalignmentlength, & 
nwmatches, & nwmismatches, & nwgaps); } else { nwalignmentlength = nwalignmentlength_list[i]; nwmatches = nwmatches_list[i]; nwmismatches = nwmismatches_list[i]; nwgaps = nwgaps_list[i]; nwcigar = nwcigar_list[i]; } hit->aligned = true; hit->shortest = std::min(searchinfo->qseqlen, static_cast(dseqlen)); hit->longest = std::max(searchinfo->qseqlen, static_cast(dseqlen)); hit->nwalignment = nwcigar; hit->nwscore = nwscore; hit->nwdiff = nwalignmentlength - nwmatches; hit->nwgaps = nwgaps; hit->nwindels = nwalignmentlength - nwmatches - nwmismatches; hit->nwalignmentlength = nwalignmentlength; hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) / nwalignmentlength; hit->matches = nwalignmentlength - hit->nwdiff; hit->mismatches = hit->nwdiff - hit->nwindels; /* trim alignment and compute numbers excluding terminal gaps */ align_trim(hit); /* test accept/reject criteria after alignment */ if (search_acceptable_aligned(*searchinfo, hit)) { searchinfo->accepts++; } else { searchinfo->rejects++; } ++i; } } } /* free ignored alignments */ while (i < target_count) { xfree(nwcigar_list[i++]); } searchinfo->finalized = searchinfo->hit_count; } auto search_onequery(struct searchinfo_s * searchinfo, int seqmask) -> void { searchinfo->hit_count = 0; search16_qprep(searchinfo->s, searchinfo->qsequence, searchinfo->qseqlen); struct Scoring scoring; scoring.match = opt_match; scoring.mismatch = opt_mismatch; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_open_query_left = opt_gap_open_query_left; scoring.gap_open_target_left = opt_gap_open_target_left; scoring.gap_open_query_interior = opt_gap_open_query_interior; scoring.gap_open_target_interior = opt_gap_open_target_interior; scoring.gap_open_query_right = opt_gap_open_query_right; scoring.gap_open_target_right = opt_gap_open_target_right; scoring.gap_extension_query_left = opt_gap_extension_query_left; 
scoring.gap_extension_target_left = opt_gap_extension_target_left; scoring.gap_extension_query_interior = opt_gap_extension_query_interior; scoring.gap_extension_target_interior = opt_gap_extension_target_interior; scoring.gap_extension_query_right = opt_gap_extension_query_right; scoring.gap_extension_target_right = opt_gap_extension_target_right; searchinfo->lma = new LinearMemoryAligner(scoring); /* extract unique kmer samples from query*/ unique_count(searchinfo->uh, opt_wordlength, searchinfo->qseqlen, searchinfo->qsequence, &searchinfo->kmersamplecount, &searchinfo->kmersample, seqmask); /* find database sequences with the most kmer hits */ search_topscores(searchinfo); /* analyse targets with the highest number of kmer hits */ searchinfo->accepts = 0; searchinfo->rejects = 0; searchinfo->finalized = 0; int delayed = 0; while ((searchinfo->finalized + delayed < opt_maxaccepts + opt_maxrejects - 1) and (searchinfo->rejects < opt_maxrejects) and (searchinfo->accepts < opt_maxaccepts) and (not minheap_isempty(searchinfo->m))) { elem_t const e = minheap_poplast(searchinfo->m); struct hit * hit = searchinfo->hits + searchinfo->hit_count; hit->target = e.seqno; hit->count = e.count; hit->strand = searchinfo->strand; hit->rejected = false; hit->accepted = false; hit->aligned = false; hit->weak = false; hit->nwalignment = nullptr; /* Test some accept/reject criteria before alignment */ if (search_acceptable_unaligned(*searchinfo, e.seqno)) { ++delayed; } else { hit->rejected = true; } searchinfo->hit_count++; if (delayed == MAXDELAYED) { align_delayed(searchinfo); delayed = 0; } } if (delayed > 0) { align_delayed(searchinfo); } delete searchinfo->lma; } auto search_findbest2_byid(struct searchinfo_s * si_p, struct searchinfo_s * si_m) -> struct hit * { struct hit * best = nullptr; for (int i = 0; i < si_p->hit_count; i++) { if ((best == nullptr) or (hit_compare_byid_typed(si_p->hits + i, best) < 0)) { best = si_p->hits + i; } } if (opt_strand > 1) { for (int i = 0; i 
< si_m->hit_count; i++) { if ((best == nullptr) or (hit_compare_byid_typed(si_m->hits + i, best) < 0)) { best = si_m->hits + i; } } } if ((best != nullptr) and not best->accepted) { best = nullptr; } return best; } auto search_findbest2_bysize(struct searchinfo_s * si_p, struct searchinfo_s * si_m) -> struct hit * { struct hit * best = nullptr; for (int i = 0; i < si_p->hit_count; i++) { if ((best == nullptr) or (hit_compare_bysize_typed(si_p->hits + i, best) < 0)) { best = si_p->hits + i; } } if (opt_strand > 1) { for (int i = 0; i < si_m->hit_count; i++) { if ((best == nullptr) or (hit_compare_bysize_typed(si_m->hits + i, best) < 0)) { best = si_m->hits + i; } } } if ((best != nullptr) and not best->accepted) { best = nullptr; } return best; } auto search_joinhits(struct searchinfo_s * si_plus, struct searchinfo_s * si_minus, std::vector & hits) -> void { /* join and sort accepted and weak hits from both strands */ /* free the remaining alignments */ auto const counter = count_number_of_hits_to_keep(si_plus) + count_number_of_hits_to_keep(si_minus); /* allocate new array of hits */ hits.reserve(counter); copy_over_hits_to_be_kept(hits, si_plus); copy_over_hits_to_be_kept(hits, si_minus); free_rejected_alignments(si_plus); free_rejected_alignments(si_minus); /* last, sort the hits */ std::qsort(hits.data(), counter, sizeof(struct hit), hit_compare_byid); } vsearch-2.30.1/src/searchcore.h000066400000000000000000000156441506776541700163420ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* One candidate database sequence (target) for a query, with the
   result of its alignment and the accept/reject decision */
struct hit
{
  int target;          /* database sequence number of the target */
  int strand;          /* strand of the query (copied from searchinfo_s::strand) */

  /* candidate info */
  unsigned int count;  /* number of unique kmers shared with query */

  bool accepted;       /* is it accepted? */
  bool rejected;       /* is it rejected? */
  bool aligned;        /* has this hit been aligned */
  bool weak;           /* weak hits are aligned with id > weak_id */

  /* info about global alignment, including terminal gaps */
  int nwscore;         /* alignment score */
  int nwdiff;          /* indels and mismatches in global alignment
                          (alignment length - matches) */
  int nwgaps;          /* gaps in global alignment */
  int nwindels;        /* indels in global alignment
                          (alignment length - matches - mismatches) */
  int nwalignmentlength; /* length of global alignment */
  double nwid;         /* percent identity of global alignment
                          (100 * matches / alignment length) */
  char * nwalignment;  /* alignment string (cigar) of global alignment;
                          nullptr until the hit has been aligned */
  int matches;         /* alignment length - nwdiff */
  int mismatches;      /* nwdiff - nwindels */

  /* info about alignment excluding terminal gaps (set by align_trim) */
  int internal_alignmentlength; /* alignment length minus all trimmed positions */
  int internal_gaps;            /* nwgaps minus one per trimmed end */
  int internal_indels;          /* nwindels minus all trimmed positions */
  int trim_q_left;     /* length of a leading gap run with cigar operation 'D' */
  int trim_q_right;    /* length of a trailing gap run with cigar operation 'D' */
  int trim_t_left;     /* length of a leading gap run with any other non-'M' operation */
  int trim_t_right;    /* length of a trailing gap run with any other non-'M' operation */
  int trim_aln_left;   /* number of cigar characters taken up by the leading gap run */
  int trim_aln_right;  /* number of cigar characters taken up by the trailing gap run */

  /* more info */
  double id;           /* identity used for ranking (one of id0 to id4,
                          selected by opt_iddef) */
  double id0;          /* CD-HIT: 100 * matches / shortest */
  double id1;          /* all diffs: 100 * matches / nwalignmentlength */
  double id2;          /* internal diffs: 100 * matches / internal_alignmentlength */
  double id3;          /* Marine Biology Lab:
                          100 * (1 - (mismatches + nwgaps) / longest), at least 0 */
  double id4;          /* BLAST: 100 * matches / nwalignmentlength */

  int shortest;        /* length of shortest of query and target */
  int longest;         /* length of longest of query and target */
};
query length */ int seq_alloc = 0; /* bytes allocated for the query sequence */ std::vector qsequence_v; /* vector of query sequence chars */ char * qsequence = nullptr; /* query sequence */ unsigned int kmersamplecount = 0; /* number of kmer samples from query */ unsigned int const * kmersample = nullptr; /* list of kmers sampled from query */ std::vector kmers_v; /* vector of kmer counts */ count_t * kmers = nullptr; /* list of kmer counts for each db seq */ std::vector hits_v; /* vector of hits */ struct hit * hits = nullptr; /* list of hits */ int hit_count = 0; /* number of hits in the above list */ struct uhandle_s * uh = nullptr; /* unique kmer finder instance */ struct s16info_s * s = nullptr; /* SIMD aligner instance */ struct nwinfo_s * nw = nullptr; /* NW aligner instance */ LinearMemoryAligner * lma = nullptr; /* Linear memory aligner instance pointer */ int accepts = 0; /* number of accepts */ int rejects = 0; /* number of rejects */ struct minheap_s * m = nullptr; /* min heap with the top kmer db seqs */ int finalized = 0; }; auto search_topscores(struct searchinfo_s * searchinfo) -> void; auto search_onequery(struct searchinfo_s * searchinfo, int seqmask) -> void; auto search_findbest2_byid(struct searchinfo_s * si_p, struct searchinfo_s * si_m) -> struct hit *; auto search_findbest2_bysize(struct searchinfo_s * si_p, struct searchinfo_s * si_m) -> struct hit *; auto search_acceptable_unaligned(struct searchinfo_s const & searchinfo, int target) -> bool; auto search_acceptable_aligned(struct searchinfo_s const & searchinfo, struct hit * hit) -> bool; auto align_trim(struct hit * hit) -> void; auto search_joinhits(struct searchinfo_s * si_p, struct searchinfo_s * si_m, std::vector & hits) -> void; auto search_enough_kmers(struct searchinfo_s const & searchinfo, unsigned int count) -> bool; vsearch-2.30.1/src/sff_convert.cc000066400000000000000000000605251506776541700166760ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for 
metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes <torognes@ifi.uio.no>, Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/fatal.hpp" #include "utils/open_file.hpp" #include "utils/os_byteswap.hpp" #include // std::min, std::max, std::transform #include #include #include // std::tolower, std::toupper #include // uint64_t, uint32_t, uint16_t, uint8_t #include // std::fprintf, std::FILE, std:fclose, std::fread, std::size_t, stderr #include // std::prev #include #include // sff file map: // - common header // - index (optional, skipped when present) // - reads (or index) // - index (optional, can be at the end) (index_offset tells us where it is) constexpr uint32_t sff_magic = 0x2e736666; // encoding the string ".sff" constexpr auto byte_size = sizeof(uint8_t); constexpr auto memory_alignment = 8U; constexpr auto max_padding_length = 7U; constexpr auto expected_version_number = 1U; constexpr auto expected_flowgram_format_code = 1U; constexpr auto expected_key_length = 4U; // key sequences always have 4 nucleotides? 
constexpr auto index_header_length = 8U; // index_magic_number (uint32_t) + index_version (char[4]) // SFF format expects the following to be true: static_assert(sizeof(uint8_t) == 1, "sff expects a uint8_t of size 1"); static_assert(sizeof(uint16_t) == 2, "sff expects a uint16_t of size 2"); static_assert(sizeof(uint32_t) == 4, "sff expects a uint32_t of size 4"); static_assert(sizeof(uint64_t) == memory_alignment, "sff expects a uint64_t of size 8"); struct sff_header_s { uint32_t magic_number = 0; /* .sff */ uint32_t version = 0; uint64_t index_offset = 0; uint32_t index_length = 0; uint32_t number_of_reads = 0; uint16_t header_length = 0; uint16_t key_length = 0; uint16_t flows_per_read = 0; uint8_t flowgram_format_code = 0; // automatic padding: +1 bytes }; constexpr std::size_t n_bytes_in_header = sizeof(struct sff_header_s); // first part of the header is 31 bytes in total + automatic padding = 32 bytes struct sff_read_header_s { uint16_t read_header_length = 0; uint16_t name_length = 0; uint32_t number_of_bases = 0; uint16_t clip_qual_left = 0; uint16_t clip_qual_right = 0; uint16_t clip_adapter_left = 0; uint16_t clip_adapter_right = 0; }; constexpr std::size_t n_bytes_in_read_header = sizeof(struct sff_read_header_s); // 16 bytes struct sff_read_stats { std::size_t total_length = 0; uint32_t minimum = std::numeric_limits::max(); uint32_t maximum = 0; }; auto fskip(std::FILE * file_handle, uint64_t length) -> uint64_t { /* read given amount of data from a stream and ignore it */ /* used instead of seeking in order to work with pipes */ static constexpr uint64_t blocksize = 4096; std::array buffer {{}}; uint64_t skipped = 0; uint64_t rest = length; while (rest > 0) { auto const want = (rest > blocksize) ? 
blocksize : rest; uint64_t const got = std::fread(buffer.data(), byte_size, want, file_handle); skipped += got; rest -= got; if (got < want) { break; } } return skipped; } auto check_sff_input(char const * filename, bool const filehandle_is_empty) -> void { assert(filename != nullptr); if ((filename != nullptr) and filehandle_is_empty) { fatal("Unable to open SFF input file for reading."); } } auto open_fastq_output(char const * filename) -> std::FILE * { if (filename == nullptr) { fatal("No output file for sff_convert specified with --fastqout."); } auto * fastq_handle = fopen_output(filename); if (fastq_handle == nullptr) { fatal("Unable to open FASTQ output file for writing."); } return fastq_handle; } auto read_sff_header(std::FILE * sff_handle) -> struct sff_header_s { assert(sff_handle != nullptr); struct sff_header_s sff_header; auto const n_bytes_read = std::fread(&sff_header, byte_size, n_bytes_in_header, sff_handle); if (n_bytes_read < n_bytes_in_header) { fatal("Unable to read from SFF file. 
File may be truncated."); } // SFF multi-byte numeric values are stored using a big-endian byte order // vsearch expects little-endian, so we need to swap bytes // refactoring: C++23 std::byteswap() sff_header.magic_number = bswap_32(sff_header.magic_number); sff_header.version = bswap_32(sff_header.version); sff_header.index_offset = bswap_64(sff_header.index_offset); sff_header.index_length = bswap_32(sff_header.index_length); sff_header.number_of_reads = bswap_32(sff_header.number_of_reads); sff_header.header_length = bswap_16(sff_header.header_length); sff_header.key_length = bswap_16(sff_header.key_length); sff_header.flows_per_read = bswap_16(sff_header.flows_per_read); return sff_header; } auto read_sff_read_header(std::FILE * sff_handle) -> struct sff_read_header_s { assert(sff_handle != nullptr); struct sff_read_header_s read_header; auto const n_bytes_read = std::fread(&read_header, byte_size, n_bytes_in_read_header, sff_handle); if (n_bytes_read < n_bytes_in_read_header) { fatal("Invalid SFF file. Unable to read read header. File may be truncated."); } // SFF multi-byte numeric values are stored using a big-endian byte order // vsearch expects little-endian, so we need to swap bytes // refactoring: C++23 std::byteswap() read_header.read_header_length = bswap_16(read_header.read_header_length); read_header.name_length = bswap_16(read_header.name_length); read_header.number_of_bases = bswap_32(read_header.number_of_bases); read_header.clip_qual_left = bswap_16(read_header.clip_qual_left); read_header.clip_qual_right = bswap_16(read_header.clip_qual_right); read_header.clip_adapter_left = bswap_16(read_header.clip_adapter_left); read_header.clip_adapter_right = bswap_16(read_header.clip_adapter_right); return read_header; } auto check_sff_header(struct sff_header_s const &sff_header) -> void { if (sff_header.magic_number != sff_magic) { fatal("Invalid SFF file. Incorrect magic number. 
Must be 0x2e736666 (.sff)."); } if (sff_header.version != expected_version_number) { fatal("Invalid SFF file. Incorrect version. Must be 1."); } if (sff_header.flowgram_format_code != expected_flowgram_format_code) { fatal("Invalid SFF file. Incorrect flowgram format code. Must be 1."); } // The header_length field should be the total number of bytes // required by this set of header fields, and should be equal to "31 // + number_of_flows_per_read + key_length" rounded up to the next // value divisible by 8 if (sff_header.header_length != memory_alignment * ((n_bytes_in_header + sff_header.flows_per_read + sff_header.key_length + max_padding_length) / memory_alignment)) { fatal("Invalid SFF file. Incorrect header length."); } if (sff_header.key_length != expected_key_length) { fatal("Invalid SFF file. Incorrect key length. Must be 4."); } if ((sff_header.index_length != 0) and (sff_header.index_length < index_header_length)) { fatal("Invalid SFF file. Incorrect index size. Must be at least 8."); } // index_length includes the bytes of index_magic_number (uint32_t), // index_version (char[4]), the 8n bytes for the indexing method, // and padding bytes. And the length of the index section is // divisible by 8. So, index_length modulo 8 should be null. // This is not the case: // assert((sff_header.index_length % memory_alignment) == 0); // fails on our test dataset } auto check_sff_read_header(struct sff_read_header_s const &read_header) -> void { // The read_header_length should be set to the length of the read // header for this read, and should be equal to "16 + name_length" // rounded up to the next value divisible by 8. if (read_header.read_header_length != memory_alignment * ((n_bytes_in_read_header + read_header.name_length + max_padding_length) / memory_alignment)) { fatal("Invalid SFF file. Incorrect read header length."); } if (read_header.clip_qual_left > read_header.number_of_bases) { fatal("Invalid SFF file. 
Incorrect clip_qual_left value."); } if (read_header.clip_adapter_left > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_adapter_left value."); } if (read_header.clip_qual_right > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_qual_right value."); } if (read_header.clip_adapter_right > read_header.number_of_bases) { fatal("Invalid SFF file. Incorrect clip_adapter_right value."); } } auto skip_sff_section(std::FILE * sff_handle, uint64_t n_bytes_to_skip, char const * const message) -> void { auto const n_bytes_skipped = fskip(sff_handle, n_bytes_to_skip); if (n_bytes_skipped < n_bytes_to_skip) { fatal("Invalid SFF file. Unable to read %s. File may be truncated.", message); } } auto read_a_string(std::FILE * sff_handle, std::size_t n_bytes_to_read, char const * const message) -> std::vector { assert(sff_handle != nullptr); assert(n_bytes_to_read < std::numeric_limits::max()); assert(message != nullptr); std::vector a_string(n_bytes_to_read + 1); auto const n_bytes_read = std::fread(a_string.data(), byte_size, n_bytes_to_read, sff_handle); if (n_bytes_read < n_bytes_to_read) { fatal("Invalid SFF file. Unable to read %s. 
File may be truncated.", message); } assert(a_string.back() == '\0'); // C-string should be null-terminated return a_string; } auto convert_quality_scores(std::vector & quality_scores, struct Parameters const & parameters) -> void { auto const qmin = static_cast(parameters.opt_fastq_qminout); auto const qmax = static_cast(parameters.opt_fastq_qmaxout); auto const offset = static_cast(parameters.opt_fastq_asciiout); auto clamp_and_offset = [&](char & quality_score) -> char { // refactoring C++17: return std::clamp(quality_score, qmin, qmax) + offset; quality_score = std::max(quality_score, qmin); quality_score = std::min(quality_score, qmax); return static_cast(quality_score + offset); }; // note: quality_scores has room for an extra null terminator at the // end, be careful to stop the conversion one position before. std::transform(quality_scores.begin(), std::prev(quality_scores.end()), quality_scores.begin(), clamp_and_offset); assert(quality_scores.back() == '\0'); } auto compute_padding_length(uint32_t const section_length) -> uint32_t { // padding_length = section_length rounded up to the next value divisible by 8. auto const remainder = section_length & max_padding_length; return remainder == 0 ? 
0U : memory_alignment - remainder; } // refactoring: C++14 tests // static_assert(compute_index_padding(0) == 0U, "error: wrong padding value (expect 0)"); // static_assert(compute_index_padding(7) == 1U, "error: wrong padding value (expect 1)"); // static_assert(compute_index_padding(8) == 0U, "error: wrong padding value (expect 0)"); // static_assert(compute_index_padding(9) == 7U, "error: wrong padding value (expect 7)"); // static_assert(compute_index_padding(87) == 1U, "error: wrong padding value (expect 1)"); // static_assert(compute_index_padding(88) == 0U, "error: wrong padding value (expect 0)"); // static_assert(compute_index_padding(89) == 7U, "error: wrong padding value (expect 7)"); // static_assert(compute_index_padding(111) == 1U, "error: wrong padding value (expect 1)"); // static_assert(compute_index_padding(112) == 0U, "error: wrong padding value (expect 0)"); // static_assert(compute_index_padding(113) == 7U, "error: wrong padding value (expect 7)"); // static_assert(compute_index_padding(std::numeric_limits::max()) == 1U, "error: wrong padding value (expect 1)"); auto warn_if(struct Parameters const & parameters, bool const condition, char const * const message) -> void { if (condition) { return; }; static_cast(std::fprintf(stderr, "WARNING: %s\n", message)); if (parameters.opt_log != nullptr) { static_cast(std::fprintf(parameters.fp_log, "WARNING: %s\n", message)); } } auto check_for_additional_tail_data(std::FILE * sff_handle, struct Parameters const & parameters) -> void { // try to read another byte auto const n_bytes_read = fskip(sff_handle, byte_size); warn_if(parameters, n_bytes_read == 0, "Additional data at end of SFF file ignored"); } auto write_report(std::FILE * output_stream, struct sff_header_s const & sff_header, struct sff_read_stats const & sff_stats, char * index_kind) -> void { if (sff_header.index_length != 0) { std::fprintf(output_stream, "Index type: %s\n", index_kind); } std::fprintf(output_stream, "\nSFF file read 
successfully.\n"); if (sff_header.number_of_reads == 0) { return; } auto const average_read_length = static_cast(sff_stats.total_length) / sff_header.number_of_reads; std::fprintf(output_stream, "Sequence length: minimum %d, average %.1f, maximum %d\n", sff_stats.minimum, average_read_length, sff_stats.maximum); } auto sff_convert(struct Parameters const & parameters) -> void { /* open input and output files */ auto fp_sff = open_input_file(parameters.opt_sff_convert); check_sff_input(parameters.opt_sff_convert, (not fp_sff)); auto * fp_fastqout = open_fastq_output(parameters.opt_fastqout); /* read and check header */ uint64_t filepos = 0; auto const sff_header = read_sff_header(fp_sff.get()); filepos += n_bytes_in_header; check_sff_header(sff_header); /* skip flow chars, read and check key, and skip padding */ skip_sff_section(fp_sff.get(), sff_header.flows_per_read, "flow characters"); filepos += sff_header.flows_per_read; auto const key_sequence = read_a_string(fp_sff.get(), sff_header.key_length, "key sequence"); filepos += sff_header.key_length; uint32_t const padding_length = sff_header.header_length - n_bytes_in_header - sff_header.flows_per_read - sff_header.key_length; skip_sff_section(fp_sff.get(), padding_length, "read padding"); filepos += padding_length; /* output common header stats */ // refactoring: see fastq_join.cc if (not parameters.opt_quiet) { fprintf(stderr, "Number of reads: %d\n", sff_header.number_of_reads); fprintf(stderr, "Flows per read: %d\n", sff_header.flows_per_read); fprintf(stderr, "Key sequence: %s\n", key_sequence.data()); } if (parameters.opt_log != nullptr) { fprintf(parameters.fp_log, "Number of reads: %d\n", sff_header.number_of_reads); fprintf(parameters.fp_log, "Flows per read: %d\n", sff_header.flows_per_read); fprintf(parameters.fp_log, "Key sequence: %s\n", key_sequence.data()); } /* prepare to parse reads or index */ struct sff_read_stats sff_stats; bool index_is_done = (sff_header.index_offset == 0) or 
(sff_header.index_length == 0); // refactoring: need a variable has_index? bool index_is_odd = false; std::array index_kind {{}}; auto const index_padding = compute_padding_length(sff_header.index_length); progress_init("Converting SFF: ", sff_header.number_of_reads); for (uint32_t read_no = 0; read_no < sff_header.number_of_reads; read_no++) { /* check if the index block is here */ if ((not index_is_done) and (filepos == sff_header.index_offset)) { if (std::fread(index_kind.data(), byte_size, index_header_length, fp_sff.get()) < index_header_length) { fatal("Invalid SFF file. Unable to read index header. File may be truncated."); } filepos += index_header_length; index_kind[index_header_length] = 0; uint64_t const index_size = sff_header.index_length - index_header_length + index_padding; // refactoring: skip index data and padding in one go? if (fskip(fp_sff.get(), index_size) != index_size) { fatal("Invalid SFF file. Unable to read entire index. File may be truncated."); } filepos += index_size; index_is_done = true; index_is_odd = true; } /* read and check each read header */ auto const read_header = read_sff_read_header(fp_sff.get()); filepos += n_bytes_in_read_header; check_sff_read_header(read_header); auto read_name = read_a_string(fp_sff.get(), read_header.name_length, "read name"); // refactoring: reserve memory only once, clear and resize if need be filepos += read_header.name_length; uint32_t const read_header_padding_length = read_header.read_header_length - read_header.name_length - n_bytes_in_read_header; skip_sff_section(fp_sff.get(), read_header_padding_length, "read header padding"); filepos += read_header_padding_length; /* read and check the flowgram and sequence */ if (fskip(fp_sff.get(), 2UL * sff_header.flows_per_read) < sff_header.flows_per_read) { fatal("Invalid SFF file. Unable to read flowgram values. 
File may be truncated."); } filepos += 2UL * sff_header.flows_per_read; skip_sff_section(fp_sff.get(), read_header.number_of_bases, "flow indices"); filepos += read_header.number_of_bases; auto bases = read_a_string(fp_sff.get(), read_header.number_of_bases, "read length"); filepos += read_header.number_of_bases; auto quality_scores = read_a_string(fp_sff.get(), read_header.number_of_bases, "quality scores"); filepos += read_header.number_of_bases; /* convert quality scores to ascii characters */ convert_quality_scores(quality_scores, parameters); uint32_t const read_data_length = ((2 * sff_header.flows_per_read) + (3 * read_header.number_of_bases)); uint32_t const read_data_padded_length = memory_alignment * ((read_data_length + max_padding_length) / memory_alignment); uint32_t const read_data_padding_length = read_data_padded_length - read_data_length; // refactoring: replace with compute_padding_length() skip_sff_section(fp_sff.get(), read_data_padding_length, "read data padding"); filepos += read_data_padding_length; // refactoring; mask_start, mask_end_5prime, mask_end_3prime, left_mask_end, right_mask_start uint32_t clip_start = std::max({uint16_t{1}, read_header.clip_qual_left, read_header.clip_adapter_left}) - 1 ; uint32_t clip_end = std::min((read_header.clip_qual_right == 0 ? read_header.number_of_bases : read_header.clip_qual_right), (read_header.clip_adapter_right == 0 ? 
read_header.number_of_bases : read_header.clip_adapter_right)); /* make the clipped bases lowercase and the rest uppercase */ // refactoring: soft_mask_read(transform(begin(), left_mask_end); transform(right_mask_start, end())) for (uint32_t i = 0; i < read_header.number_of_bases; i++) { if ((i < clip_start) or (i >= clip_end)) { bases[i] = std::tolower(bases[i]); } else { bases[i] = std::toupper(bases[i]); } } if (parameters.opt_sff_clip) { bases[clip_end] = '\0'; quality_scores[clip_end] = '\0'; } else { clip_start = 0; clip_end = read_header.number_of_bases; } uint32_t const length = clip_end - clip_start; fastq_print_general(fp_fastqout, bases.data() + clip_start, length, read_name.data(), read_name.size() - 1, quality_scores.data() + clip_start, 1, read_no + 1, -1.0); sff_stats.total_length += length; sff_stats.minimum = std::min(length, sff_stats.minimum); sff_stats.maximum = std::max(length, sff_stats.maximum); progress_update(read_no + 1); } progress_done(); /* check if the index block is here */ if (not index_is_done) { if (filepos == sff_header.index_offset) { if (std::fread(index_kind.data(), byte_size, 8, fp_sff.get()) < 8) { fatal("Invalid SFF file. Unable to read index header. File may be truncated."); } filepos += 8; index_kind[8] = 0; uint64_t const index_size = sff_header.index_length - 8; if (fskip(fp_sff.get(), index_size) != index_size) { fatal("Invalid SFF file. Unable to read entire index. File may be truncated."); } filepos += index_size; index_is_done = true; /* try to skip padding, if any */ // refactoring: should skip index_data + index padding in one go? or do not reject SFF files just because index padding is missing? if (index_padding > 0) { uint64_t const got = fskip(fp_sff.get(), index_padding); if ((got < index_padding) and (got != 0)) { fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n"); // refactoring: should be "missing padding"! 
} } } } warn_if(parameters, index_is_done, "SFF index missing"); warn_if(parameters, not index_is_odd, "Index at unusual position in file"); /* ignore the rest of file */ check_for_additional_tail_data(fp_sff.get(), parameters); // rename to warn_if_additional_tail_data()? std::fclose(fp_fastqout); if (not parameters.opt_quiet) { write_report(stderr, sff_header, sff_stats, index_kind.data()); } if (parameters.opt_log != nullptr) { write_report(parameters.fp_log, sff_header, sff_stats, index_kind.data()); } } // refactoring: // struct Read {std::vector identifier ; std::vector nucleotides ; // std::vector quality_scores } a_read; // initialize all vectors to large values (256 for name, 4096 for nucleotides // and quality_scores) // auto resize_if(std::vector & a_string, std::size_t needed_length) -> void { // if (a_string.size() >= needed_length) { return; } // a_string.resize(needed_length, '\0'); // } vsearch-2.30.1/src/sff_convert.h000066400000000000000000000047551506776541700165430ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto sff_convert(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/sha1.c000066400000000000000000000301101506776541700150340ustar00rootroot00000000000000/* refactoring: sha1.h headers are available in gcc and clang alternatively, there are C++ implementations available. OpenSSH's version is the fastest */ /* Slightly modified for vsearch by Torbjorn Rognes */ /* SHA-1 in C By Steve Reid 100% Public Domain ----------------- Modified 7/98 By James H. 
Brown Still 100% Public Domain Corrected a problem which generated improper hash values on 16 bit machines Routine SHA1Update changed from void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned int len) to void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned long len) The 'len' parameter was declared an int which works fine on 32 bit machines. However, on 16 bit machines an int is too small for the shifts being done against it. This caused the hash function to generate incorrect values if len was greater than 8191 (8K - 1) due to the 'len << 3' on line 3 of SHA1Update(). Since the file IO in main() reads 16K at a time, any file 8K or larger would be guaranteed to generate the wrong hash (e.g. Test Vector #3, a million "a"s). I also changed the declaration of variables i & j in SHA1Update to unsigned long from unsigned int for the same reason. These changes should make no difference to any 32 bit implementations since an int and a long are the same size in those environments. -- I also corrected a few compiler warnings generated by Borland C. 1. Added #include for exit() prototype 2. Removed unused variable 'j' in SHA1Final 3. Changed exit(0) to return(0) at end of main. ALL changes I made can be located by searching for comments containing 'JHB' ----------------- Modified 8/98 By Steve Reid Still 100% public domain 1- Removed #include and used return() instead of exit() 2- Fixed overwriting of finalcount in SHA1Final() (discovered by Chris Hall) 3- Changed email address from steve@edmweb.com to sreid@sea-to-sky.net ----------------- Modified 4/01 By Saul Kravitz Still 100% PD Modified to run on Compaq Alpha hardware. 
----------------- Modified 07/2002 By Ralph Giles Still 100% public domain modified for use with stdint types, autoconf code cleanup, removed attribution comments switched SHA1Final() argument order for consistency use SHA1_ prefix for public api move public api to sha1.h */ /* Test Vectors (from FIPS PUB 180-1) "abc" A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 A million repetitions of "a" 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F */ /* #define SHA1HANDSOFF */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "sha1.h" void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64]); #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ /* FIXME: can we do this in an endian-proof way? */ #ifdef WORDS_BIGENDIAN #define blk0(i) block->l[i] #else #define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ |(rol(block->l[i],8)&0x00FF00FF)) #endif #define blk(i) (block->l[(i)&15] = rol(block->l[((i)+13)&15]^block->l[((i)+8)&15] \ ^block->l[((i)+2)&15]^block->l[(i)&15],1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v,w,x,y,z,i) z+=(((w)&((x)^(y)))^(y))+blk0(i)+0x5A827999+rol(v,5);(w)=rol(w,30); #define R1(v,w,x,y,z,i) z+=(((w)&((x)^(y)))^(y))+blk(i)+0x5A827999+rol(v,5);(w)=rol(w,30); #define R2(v,w,x,y,z,i) z+=((w)^(x)^(y))+blk(i)+0x6ED9EBA1+rol(v,5);(w)=rol(w,30); #define R3(v,w,x,y,z,i) z+=((((w)|(x))&(y))|((w)&(x)))+blk(i)+0x8F1BBCDC+rol(v,5);(w)=rol(w,30); #define R4(v,w,x,y,z,i) z+=((w)^(x)^(y))+blk(i)+0xCA62C1D6+rol(v,5);(w)=rol(w,30); #ifdef VERBOSE /* SAK */ void SHAPrintContext(SHA1_CTX *context, char *msg){ printf("%s (%d,%d) %x %x %x %x %x\n", msg, context->count[0], context->count[1], context->state[0], context->state[1], 
/* Rotate a 32-bit word left by `bits` positions (0 < bits < 32). */
static uint32_t sha1_rotate_left(uint32_t value, unsigned int bits)
{
  return (value << bits) | (value >> (32U - bits));
}


/*
 * Hash a single 512-bit block. This is the core of the algorithm.
 *
 * state  : the five 32-bit chaining values, updated in place.
 * buffer : the 64 message bytes of the block; never modified.
 *
 * The sixteen message words are assembled byte by byte in big-endian
 * order into a local schedule. Consequences:
 *  - the result does not depend on the byte order of the host,
 *  - `buffer` may have any alignment,
 *  - the caller's data is left untouched (it is declared const),
 *  - no static storage is involved, so concurrent calls on distinct
 *    states do not interfere with each other.
 */
void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64])
{
  uint32_t schedule[16];
  uint32_t a = state[0];
  uint32_t b = state[1];
  uint32_t c = state[2];
  uint32_t d = state[3];
  uint32_t e = state[4];
  unsigned int t = 0;

  /* Load the block as sixteen big-endian 32-bit words */
  for (t = 0; t < 16; t++)
    {
      schedule[t] = ((uint32_t) buffer[4 * t] << 24)
        | ((uint32_t) buffer[(4 * t) + 1] << 16)
        | ((uint32_t) buffer[(4 * t) + 2] << 8)
        | ((uint32_t) buffer[(4 * t) + 3]);
    }

  /* 4 rounds of 20 operations each */
  for (t = 0; t < 80; t++)
    {
      uint32_t word = 0;
      uint32_t mix = 0;
      uint32_t constant = 0;
      uint32_t next_a = 0;

      if (t < 16)
        {
          word = schedule[t];
        }
      else
        {
          /* W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1),
             kept in a 16-word circular buffer */
          word = sha1_rotate_left(schedule[(t + 13) & 15]
                                  ^ schedule[(t + 8) & 15]
                                  ^ schedule[(t + 2) & 15]
                                  ^ schedule[t & 15], 1);
          schedule[t & 15] = word;
        }

      if (t < 20)
        {
          mix = (b & (c ^ d)) ^ d;              /* choose */
          constant = 0x5A827999;
        }
      else if (t < 40)
        {
          mix = b ^ c ^ d;                      /* parity */
          constant = 0x6ED9EBA1;
        }
      else if (t < 60)
        {
          mix = ((b | c) & d) | (b & c);        /* majority */
          constant = 0x8F1BBCDC;
        }
      else
        {
          mix = b ^ c ^ d;                      /* parity */
          constant = 0xCA62C1D6;
        }

      next_a = sha1_rotate_left(a, 5) + mix + e + constant + word;
      e = d;
      d = c;
      c = sha1_rotate_left(b, 30);
      b = a;
      a = next_a;
    }

  /* Add the working vars back into context.state[] */
  state[0] += a;
  state[1] += b;
  state[2] += c;
  state[3] += d;
  state[4] += e;

  /* Wipe variables (best effort: a compiler may drop these stores) */
  memset(schedule, 0, sizeof(schedule));
  a = b = c = d = e = 0;
}
0 : 1)] >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ } padding_buffer[0] = 0x80; SHA1_Update(context, padding_buffer, 1); padding_buffer[0] = 0x00; while ((context->count[0] & 504) != 448) { SHA1_Update(context, padding_buffer, 1); } SHA1_Update(context, finalcount, 8); /* Should cause a SHA1_Transform() */ for (i = 0; i < SHA1_DIGEST_SIZE; i++) { digest[i] = (uint8_t) ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); } /* Wipe variables */ i = 0; memset(context->buffer, 0, 64); memset(context->state, 0, 20); memset(context->count, 0, 8); memset(finalcount, 0, 8); /* SWR */ #ifdef SHA1HANDSOFF /* make SHA1Transform overwrite its own static vars */ SHA1_Transform(context->state, context->buffer); #endif } /*************************************************************/ #if 0 int main(int argc, char** argv) { int i, j; SHA1_CTX context; unsigned char digest[SHA1_DIGEST_SIZE], buffer[16384]; FILE* file; if (argc > 2) { puts("Public domain SHA-1 implementation - by Steve Reid "); puts("Modified for 16 bit environments 7/98 - by James H. 
Brown "); /* JHB */ puts("Produces the SHA-1 hash of a file, or stdin if no file is specified."); return(0); } if (argc < 2) { file = stdin; } else { if (!(file = fopen(argv[1], "rb"))) { fputs("Unable to open file.", stderr); return(-1); } } SHA1_Init(&context); while (!feof(file)) { /* note: what if ferror(file) */ i = fread(buffer, 1, 16384, file); SHA1_Update(&context, buffer, i); } SHA1_Final(&context, digest); fclose(file); for (i = 0; i < SHA1_DIGEST_SIZE/4; i++) { for (j = 0; j < 4; j++) { printf("%02X", digest[i*4+j]); } putchar(' '); } putchar('\n'); return(0); /* JHB */ } #endif /* self test */ #ifdef TEST static char *test_data[] = { "abc", "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", "A million repetitions of 'a'"}; static char *test_results[] = { "A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D", "84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1", "34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F"}; void digest_to_hex(const uint8_t digest[SHA1_DIGEST_SIZE], char *output) { int i,j; char *c = output; for (i = 0; i < SHA1_DIGEST_SIZE/4; i++) { for (j = 0; j < 4; j++) { sprintf(c,"%02X", digest[i*4+j]); c += 2; } sprintf(c, " "); c += 1; } *(c - 1) = '\0'; } int main(int argc, char** argv) { int k; SHA1_CTX context; uint8_t digest[20]; char output[80]; fprintf(stdout, "verifying SHA-1 implementation... 
"); for (k = 0; k < 2; k++){ SHA1_Init(&context); SHA1_Update(&context, (uint8_t*)test_data[k], strlen(test_data[k])); SHA1_Final(&context, digest); digest_to_hex(digest, output); if (strcmp(output, test_results[k])) { fprintf(stdout, "FAIL\n"); fprintf(stderr,"* hash of \"%s\" incorrect:\n", test_data[k]); fprintf(stderr,"\t%s returned\n", output); fprintf(stderr,"\t%s is correct\n", test_results[k]); return (1); } } /* million 'a' vector we feed separately */ SHA1_Init(&context); for (k = 0; k < 1000000; k++) SHA1_Update(&context, (uint8_t*)"a", 1); SHA1_Final(&context, digest); digest_to_hex(digest, output); if (strcmp(output, test_results[2])) { fprintf(stdout, "FAIL\n"); fprintf(stderr,"* hash of \"%s\" incorrect:\n", test_data[2]); fprintf(stderr,"\t%s returned\n", output); fprintf(stderr,"\t%s is correct\n", test_results[2]); return (1); } /* success */ fprintf(stdout, "ok\n"); return(0); } #endif /* TEST */ vsearch-2.30.1/src/sha1.h000066400000000000000000000010361506776541700150460ustar00rootroot00000000000000/* public api for steve reid's public domain SHA-1 implementation */ /* this file is in the public domain */ #ifndef __SHA1_H #define __SHA1_H #ifdef __cplusplus extern "C" { #endif typedef struct { uint32_t state[5]; uint32_t count[2]; uint8_t buffer[64]; } SHA1_CTX; #define SHA1_DIGEST_SIZE 20 void SHA1_Init(SHA1_CTX* context); void SHA1_Update(SHA1_CTX* context, const uint8_t* data, size_t len); void SHA1_Final(SHA1_CTX* context, uint8_t digest[SHA1_DIGEST_SIZE]); #ifdef __cplusplus } #endif #endif /* __SHA1_H */ vsearch-2.30.1/src/showalign.cc000066400000000000000000000267331506776541700163560ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/cigar.hpp" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/span.hpp" #include // std::copy, std::fill_n, std::min #include #include // macros PRIu64 and PRId64 #include // int64_t #include // std::FILE #include // std::strlen #include // std::next #include // anonymous namespace: limit visibility and usage to this translation unit namespace { std::vector q_line; // query std::vector a_line; // alignment symbols (|) std::vector d_line; // target struct Position { int64_t line = 0; int64_t query = 0; int64_t target = 0; int64_t query_start = 0; int64_t target_start = 0; }; struct Sequence { char const * sequence = nullptr; int64_t length = 0; int64_t offset = 0; char const * name = nullptr; }; struct Alignment { static constexpr auto poswidth_default = 3; static constexpr auto headwidth_default = 5; std::FILE * output_handle = nullptr; Sequence query; Sequence target; int64_t width = 0; int poswidth = poswidth_default; int headwidth = headwidth_default; bool is_reverse_strand = false; }; auto get_aligment_symbol(char const query_nuc, char const target_nuc) -> char { static constexpr auto is_N = 15U; auto const query_coded = map_4bit(query_nuc); auto const target_coded = map_4bit(target_nuc); if (opt_n_mismatch and ((query_coded == is_N) or (target_coded == is_N))) { return ' '; // N are mismatches } if ((query_coded == target_coded) and not is_ambiguous_4bit(query_coded)) { return '|'; // a perfect 
match } if ((query_coded & target_coded) != 0U) { return '+'; // an equivalence (ambiguous nucleotides) } return ' '; } auto get_query_nucleotide(Alignment const & alignment, Position const & position) -> char { auto const nucleotide = *std::next(alignment.query.sequence, position.query); if (alignment.is_reverse_strand) { return map_complement(nucleotide); } return nucleotide; } auto get_target_nucleotide(Alignment const & alignment, Position const & position) -> char { return *std::next(alignment.target.sequence, position.target); } auto print_alignment_block(Alignment const & alignment, Position const & position) -> void { // current query and target starting and ending positions auto const query_start = std::min(position.query_start + 1, alignment.query.length); auto const query_end = alignment.is_reverse_strand ? position.query + 2 : position.query; auto const target_start = std::min(position.target_start + 1, alignment.target.length); auto const target_end = position.target; static_cast( std::fprintf(alignment.output_handle, "\n%*s %*" PRId64 " %c %s %" PRId64 "\n", alignment.headwidth, alignment.query.name, alignment.poswidth, query_start, alignment.is_reverse_strand ? '-' : '+', q_line.data(), query_end)); static_cast( std::fprintf(alignment.output_handle, "%*s %*s %s\n", alignment.headwidth, "", alignment.poswidth, "", a_line.data())); static_cast( std::fprintf(alignment.output_handle, "%*s %*" PRId64 " %c %s %" PRId64 "\n", alignment.headwidth, alignment.target.name, alignment.poswidth, target_start, '+', d_line.data(), target_end)); } inline auto putop(Alignment const & alignment, Position & position, Operation const operation, int64_t const runlength) -> void { int64_t const delta = alignment.is_reverse_strand ? 
-1 : +1; for (auto count = runlength; count != 0; --count) { if (position.line == 0) { position.query_start = position.query; position.target_start = position.target; } auto const query_nuc = get_query_nucleotide(alignment, position); auto const target_nuc = get_target_nucleotide(alignment, position); switch (operation) { case Operation::match: position.query += delta; position.target += 1; q_line[position.line] = query_nuc; a_line[position.line] = get_aligment_symbol(query_nuc, target_nuc); d_line[position.line] = target_nuc; ++position.line; break; case Operation::deletion: // gap in target (insertion in query) position.query += delta; q_line[position.line] = query_nuc; a_line[position.line] = ' '; d_line[position.line] = '-'; ++position.line; break; case Operation::insertion: // insertion in target (gap in query) position.target += 1; q_line[position.line] = '-'; a_line[position.line] = ' '; d_line[position.line] = target_nuc; ++position.line; break; } if (position.line == alignment.width) { // maximal alignment width is reached, print alignment block q_line[position.line] = '\0'; a_line[position.line] = '\0'; d_line[position.line] = '\0'; print_alignment_block(alignment, position); position.line = 0; // needed to avoid out-of-bounds } } } auto putop_final(Alignment const & alignment, Position const & position) -> void { if (position.line == 0) { return; } // final block already printed q_line[position.line] = '\0'; a_line[position.line] = '\0'; d_line[position.line] = '\0'; print_alignment_block(alignment, position); } } // end of anonymous namespace auto align_show(std::FILE * output_handle, char const * seq1, int64_t const seq1len, int64_t const seq1off, char const * seq1name, char const * seq2, int64_t const seq2len, int64_t const seq2off, char const * seq2name, char const * cigar, int64_t const cigarlen, int const numwidth, int const namewidth, int const alignwidth, int const strand) -> void { Alignment alignment; alignment.output_handle = output_handle; 
alignment.query.sequence = seq1; alignment.query.length = seq1len; alignment.query.offset = seq1off; alignment.query.name = seq1name; alignment.target.sequence = seq2; alignment.target.length = seq2len; alignment.target.offset = seq2off; alignment.target.name = seq2name; alignment.width = alignwidth; alignment.poswidth = numwidth; alignment.headwidth = namewidth; alignment.is_reverse_strand = strand != 0; // C++14 refactoring: aggregate initialization of a struct with // default member initializers // Alignment const alignment = { // output_handle, // {seq1, seq1len, seq1off, seq1name}, // {seq2, seq2len, seq2off, seq2name}, // numwidth, // namewidth, // alignwidth, // strand // }; Position position; position.query = alignment.is_reverse_strand ? alignment.query.length - 1 - alignment.query.offset : alignment.query.offset; position.target = alignment.target.offset; position.query_start = position.query; position.target_start = position.target; q_line.resize(alignment.width + 1); a_line.resize(alignment.width + 1); d_line.resize(alignment.width + 1); // cigar string can be trimmed (left and right): cigarlen maybe != std::strlen(cigar) auto const cigar_pairs = parse_cigar_string(Span{cigar, static_cast(cigarlen)}); for (auto const & a_pair: cigar_pairs) { auto const operation = a_pair.first; auto const runlength = a_pair.second; putop(alignment, position, operation, runlength); } putop_final(alignment, position); q_line.clear(); a_line.clear(); d_line.clear(); } auto align_getrow(Span const seq_view, Span const cigar_view, int const alignlen, bool const is_target) -> std::vector { std::vector row(alignlen + 1); auto const is_query = not is_target; auto cursor = size_t{0}; for (auto const & a_pair: parse_cigar_string(cigar_view)) { auto const operation = a_pair.first; auto const runlength = a_pair.second; assert(static_cast(runlength) < row.size() - cursor); auto const is_not_a_gap = (operation == Operation::match) // a match, all good or ((operation == 
Operation::deletion) and is_query) // seq = query, insertion in seq or ((operation == Operation::insertion) and is_target); // seq = target, insertion in seq if (is_not_a_gap) { auto const subsequence = seq_view.subspan(cursor, runlength); std::copy(subsequence.cbegin(), subsequence.cend(), &row[cursor]); } else { /* deletion in sequence: insert gap symbols */ std::fill_n(&row[cursor], runlength, '-'); } cursor += runlength; } assert(row[cursor] == '\0'); return row; } vsearch-2.30.1/src/showalign.h000066400000000000000000000063161506776541700162130ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "utils/span.hpp" #include // int64_t #include // std::FILE #include auto align_getrow(Span seq_view, Span cigar_view, int alignlen, bool is_target) -> std::vector; auto align_show(std::FILE * output_handle, char const * seq1, int64_t seq1len, int64_t seq1off, char const * seq1name, char const * seq2, int64_t seq2len, int64_t seq2off, char const * seq2name, char const * cigar, int64_t cigarlen, int numwidth, int namewidth, int alignwidth, int strand) -> void; vsearch-2.30.1/src/shuffle.cc000066400000000000000000000115561506776541700160140ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "utils/check_output_filehandle.hpp" #include "utils/fatal.hpp" #include "utils/open_file.hpp" #include // std::min, std::shuffle #include // std::FILE, std::size_t #include // std::iota #include #include // anonymous namespace: limit visibility and usage to this translation unit namespace { auto create_deck() -> std::vector { auto const dbsequencecount = db_getsequencecount(); std::vector deck(dbsequencecount); std::iota(deck.begin(), deck.end(), 0); return deck; } auto generate_seed(long int const user_seed) -> unsigned int { if (user_seed != 0) { return static_cast(user_seed); } std::random_device number_generator; return number_generator(); } auto shuffle_deck(std::vector & deck, long int const user_seed) -> void { static constexpr auto one_hundred_percent = 100ULL; progress_init("Shuffling", one_hundred_percent); auto const seed = generate_seed(user_seed); std::mt19937_64 uniform_generator(seed); std::shuffle(deck.begin(), deck.end(), uniform_generator); progress_done(); } // refactoring: turn into template, remove conditional auto truncate_deck(std::vector & deck, long int const n_first_sequences) -> void { // auto const new_size = std::min(deck.size(), n_first_sequences) // deck.resize(new_size); if (deck.size() > static_cast(n_first_sequences)) { deck.resize(n_first_sequences); } } auto output_shuffled_fasta(std::vector const & deck, std::FILE * output_file) -> void { progress_init("Writing output", deck.size()); auto counter = std::size_t{0}; for (auto const sequence_id: deck) { fasta_print_db_relabel(output_file, sequence_id, counter + 1); progress_update(counter); ++counter; } progress_done(); } } // end of anonymous namespace auto shuffle(struct Parameters const & parameters) -> void { auto const output_handle = open_output_file(parameters.opt_output); check_mandatory_output_handle(parameters.opt_output, (not output_handle)); db_read(parameters.opt_shuffle, 0); show_rusage(); auto deck = create_deck(); shuffle_deck(deck, 
parameters.opt_randseed); show_rusage(); truncate_deck(deck, parameters.opt_topn); output_shuffled_fasta(deck, output_handle.get()); show_rusage(); db_free(); } vsearch-2.30.1/src/shuffle.h000066400000000000000000000047511506776541700156550ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto shuffle(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/sintax.cc000066400000000000000000000517731506776541700156730ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Implements the Sintax algorithm as described in Robert Edgar's preprint: Robert Edgar (2016) SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences BioRxiv, 074161 doi: https://doi.org/10.1101/074161 Further details: https://www.drive5.com/usearch/manual/cmd_sintax.html Note that due to the lack of details in the description, this implementation in vsearch is surely somewhat different from the one in usearch. 
*/ #include "vsearch.h" #include "bitmap.h" #include "dbindex.h" #include "mask.h" #include "minheap.h" #include "tax.h" #include "udb.h" #include "unique.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include "utils/taxonomic_fields.h" #include "utils/xpthread.hpp" #include // std::min, std::max #include #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t #include // std::memset, std::strncmp, std::strcpy #include static struct searchinfo_s * si_plus; static struct searchinfo_s * si_minus; static pthread_t * pthread; /* global constants/data, no need for synchronization */ static int tophits; /* the maximum number of hits to keep */ static int seqcount; /* number of database sequences */ static pthread_attr_t attr; static fastx_handle query_fastx_h; constexpr auto subset_size = 32; constexpr auto bootstrap_count = 100; /* global data protected by mutex */ static pthread_mutex_t mutex_input; static pthread_mutex_t mutex_output; static std::FILE * fp_tabbedout; static int queries = 0; static int classified = 0; auto sintax_analyse(char const * query_head, int const strand, int const * all_seqno, int const count) -> void { std::array level_matchcount {{}}; std::array level_best {{}}; std::array, bootstrap_count> cand_level_name_start {{}}; std::array, bootstrap_count> cand_level_name_len {{}}; /* Check number of successful bootstraps, must be at least half */ auto const is_enough = count >= (bootstrap_count + 1) / 2; if (is_enough) { /* Find the most common name at each taxonomic rank, but with the same names at higher ranks. 
*/ for (auto i = 0; i < count ; i++) { /* Split headers of all candidates by taxonomy ranks */ auto const seqno = all_seqno[i]; std::array new_level_name_start {{}}; std::array new_level_name_len {{}}; tax_split(seqno, new_level_name_start.data(), new_level_name_len.data()); cand_level_name_len[i] = new_level_name_len; for (auto k = 0; k < tax_levels; k++) { cand_level_name_start[i][k] = db_getheader(seqno) + new_level_name_start[k]; } } std::array cand_included {{}}; cand_included.fill(true); /* Count matching names among candidates */ for (auto k = 0; k < tax_levels; k++) { level_best[k] = -1; level_matchcount[k] = 0; std::array cand_match {{}}; cand_match.fill(-1); std::array cand_matchcount {{}}; for (auto i = 0; i < count ; i++) { if (cand_included[i]) { for (auto j = 0; j <= i ; j++) { if (cand_included[j]) { /* check match at current level */ if ((cand_level_name_len[i][k] == cand_level_name_len[j][k]) && (std::strncmp(cand_level_name_start[i][k], cand_level_name_start[j][k], cand_level_name_len[i][k]) == 0)) { cand_match[i] = j; cand_matchcount[j]++; break; /* stop at first match */ } } } } } for (auto i = 0; i < count ; i++) { if (cand_matchcount[i] > level_matchcount[k]) { level_best[k] = i; level_matchcount[k] = cand_matchcount[i]; } } for (auto i = 0; i < count; i++) { if (cand_match[i] != level_best[k]) { cand_included[i] = false; } } } } /* write to tabbedout file */ xpthread_mutex_lock(&mutex_output); std::fprintf(fp_tabbedout, "%s\t", query_head); queries++; if (is_enough) { classified++; auto comma = false; for (auto j = 0; j < tax_levels; j++) { auto const best = level_best[j]; if (cand_level_name_len[best][j] > 0) { std::fprintf(fp_tabbedout, "%s%c:%.*s(%.2f)", (comma ? "," : ""), taxonomic_fields[j], cand_level_name_len[best][j], cand_level_name_start[best][j], 1.0 * level_matchcount[j] / count); comma = true; } } std::fprintf(fp_tabbedout, "\t%c", (strand != 0) ? 
'-' : '+'); if (opt_sintax_cutoff > 0.0) { std::fprintf(fp_tabbedout, "\t"); auto comma = false; for (auto j = 0; j < tax_levels; j++) { auto const best = level_best[j]; if ((cand_level_name_len[best][j] > 0) && (1.0 * level_matchcount[j] / count >= opt_sintax_cutoff)) { std::fprintf(fp_tabbedout, "%s%c:%.*s", (comma ? "," : ""), taxonomic_fields[j], cand_level_name_len[best][j], cand_level_name_start[best][j]); comma = true; } } } } else { if (opt_sintax_cutoff > 0.0) { std::fprintf(fp_tabbedout, "\t\t"); } else { std::fprintf(fp_tabbedout, "\t"); } } std::fprintf(fp_tabbedout, "\n"); xpthread_mutex_unlock(&mutex_output); } auto sintax_search_topscores(struct searchinfo_s * searchinfo) -> void { /* Count the number of kmer hits in each database sequence and select the database sequence with the highest number of matching kmers. If several sequences have equally many kmer matches, choose one of them according to the following rules: By default, choose the shortest. If two are equally short, choose the one that comes first in the database. If the sintax_random option is in effect, ties will instead be chosen randomly. 
*/ /* count kmer hits in the database sequences */ const int indexed_count = dbindex_getcount(); /* zero counts */ std::memset(searchinfo->kmers, 0, indexed_count * sizeof(count_t)); for (auto i = 0U; i < searchinfo->kmersamplecount; i++) { unsigned int const kmer = searchinfo->kmersample[i]; auto * bitmap = dbindex_getbitmap(kmer); if (bitmap != nullptr) { #ifdef __x86_64__ if (ssse3_present != 0) { increment_counters_from_bitmap_ssse3(searchinfo->kmers, bitmap, indexed_count); } else { increment_counters_from_bitmap_sse2(searchinfo->kmers, bitmap, indexed_count); } #else increment_counters_from_bitmap(searchinfo->kmers, bitmap, indexed_count); #endif } else { auto * list = dbindex_getmatchlist(kmer); auto const count = dbindex_getmatchcount(kmer); for (auto j = 0U; j < count; j++) { searchinfo->kmers[list[j]]++; } } } auto tophits = 0U; elem_t best; best.count = 0; best.seqno = 0; best.length = 0; for (auto i = 0; i < indexed_count; i++) { count_t const count = searchinfo->kmers[i]; auto const seqno = dbindex_getmapping(i); unsigned int const length = db_getsequencelen(best.seqno); if (count > best.count) { best.count = count; best.seqno = seqno; best.length = length; tophits = 1; } else if (count == best.count) { if (opt_sintax_random) { tophits++; if (random_int(tophits) == 0) { best.seqno = seqno; best.length = length; } } else { if (length < best.length) { best.seqno = seqno; best.length = length; } else if (length == best.length) { best.seqno = std::min(seqno, best.seqno); } } } } minheap_clear(searchinfo->m); if (best.count > 1) { minheap_add(searchinfo->m, &best); } } auto sintax_query(int64_t const t) -> void { std::array, 2> all_seqno {{}}; std::array boot_count = {0, 0}; std::array best_count = {0, 0}; int const qseqlen = si_plus[t].qseqlen; char const * query_head = si_plus[t].query_head; auto * b = bitmap_init(qseqlen); for (auto s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? 
si_minus + t : si_plus + t; /* perform search */ auto kmersamplecount = 0U; unsigned int const * kmersample = nullptr; /* find unique kmers */ unique_count(si->uh, opt_wordlength, si->qseqlen, si->qsequence, &kmersamplecount, &kmersample, MASK_NONE); /* perform 100 bootstraps */ if (kmersamplecount >= subset_size) { for (auto i = 0; i < bootstrap_count ; i++) { /* subsample 32 kmers */ std::array kmersample_subset {{}}; auto subsamples = 0; bitmap_reset_all(b); for (auto j = 0; j < subset_size ; j++) { int64_t const x = random_int(kmersamplecount); if (bitmap_get(b, x) == 0U) { kmersample_subset[subsamples++] = kmersample[x]; bitmap_set(b, x); } } si->kmersamplecount = subsamples; si->kmersample = kmersample_subset.data(); sintax_search_topscores(si); if (! minheap_isempty(si->m)) { auto const e = minheap_poplast(si->m); all_seqno[s][boot_count[s]++] = e.seqno; best_count[s] = std::max(e.count, best_count[s]); } } } } auto best_strand = 0; if (opt_strand == 1) { best_strand = 0; } else { if (best_count[0] > best_count[1]) { best_strand = 0; } else if (best_count[1] > best_count[0]) { best_strand = 1; } else { if (boot_count[0] >= boot_count[1]) { best_strand = 0; } else { best_strand = 1; } } } sintax_analyse(query_head, best_strand, all_seqno[best_strand].data(), boot_count[best_strand]); bitmap_free(b); } auto sintax_thread_run(int64_t const t) -> void { while (true) { xpthread_mutex_lock(&mutex_input); if (fastx_next(query_fastx_h, opt_notrunclabels == 0, chrmap_no_change_vector.data())) { auto const * qhead = fastx_get_header(query_fastx_h); int const query_head_len = fastx_get_header_length(query_fastx_h); auto const * qseq = fastx_get_sequence(query_fastx_h); int const qseqlen = fastx_get_sequence_length(query_fastx_h); int const query_no = fastx_get_seqno(query_fastx_h); int const qsize = fastx_get_abundance(query_fastx_h); for (auto s = 0; s < opt_strand; s++) { struct searchinfo_s * si = (s != 0) ? 
si_minus + t : si_plus + t; si->query_head_len = query_head_len; si->qseqlen = qseqlen; si->query_no = query_no; si->qsize = qsize; si->strand = s; /* allocate more memory for header and sequence, if necessary */ if (si->query_head_len + 1 > si->query_head_alloc) { si->query_head_alloc = si->query_head_len + 2001; si->query_head = (char *) xrealloc(si->query_head, (size_t)(si->query_head_alloc)); } if (si->qseqlen + 1 > si->seq_alloc) { si->seq_alloc = si->qseqlen + 2001; si->qsequence = (char *) xrealloc(si->qsequence, (size_t)(si->seq_alloc)); } } /* plus strand: copy header and sequence */ std::strcpy(si_plus[t].query_head, qhead); std::strcpy(si_plus[t].qsequence, qseq); /* get progress as amount of input file read */ auto const progress = fastx_get_position(query_fastx_h); /* let other threads read input */ xpthread_mutex_unlock(&mutex_input); /* minus strand: copy header and reverse complementary sequence */ if (opt_strand > 1) { std::strcpy(si_minus[t].query_head, si_plus[t].query_head); reverse_complement(si_minus[t].qsequence, si_plus[t].qsequence, si_plus[t].qseqlen); } sintax_query(t); /* lock mutex for update of global data and output */ xpthread_mutex_lock(&mutex_output); /* show progress */ progress_update(progress); xpthread_mutex_unlock(&mutex_output); } else { xpthread_mutex_unlock(&mutex_input); break; } } } auto sintax_thread_init(struct searchinfo_s * si) -> void { /* thread specific initialiation */ si->uh = unique_init(); si->kmers = (count_t *) xmalloc((seqcount * sizeof(count_t)) + 32); si->m = minheap_init(tophits); si->hits = nullptr; si->qsize = 1; si->query_head_alloc = 0; si->query_head = nullptr; si->seq_alloc = 0; si->qsequence = nullptr; si->nw = nullptr; si->s = nullptr; } auto sintax_thread_exit(struct searchinfo_s * searchinfo) -> void { /* thread specific clean up */ unique_exit(searchinfo->uh); minheap_exit(searchinfo->m); xfree(searchinfo->kmers); if (searchinfo->query_head != nullptr) { xfree(searchinfo->query_head); } if 
(searchinfo->qsequence != nullptr) { xfree(searchinfo->qsequence); } } auto sintax_thread_worker(void * void_ptr) -> void * { auto t = (int64_t) void_ptr; sintax_thread_run(t); return nullptr; } auto sintax_thread_worker_run() -> void { /* initialize threads, start them, join them and return */ xpthread_attr_init(&attr); xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); /* init and create worker threads, put them into stand-by mode */ for (auto t = 0; t < opt_threads; t++) { sintax_thread_init(si_plus + t); if (si_minus != nullptr) { sintax_thread_init(si_minus + t); } xpthread_create(pthread + t, &attr, sintax_thread_worker, (void *) (int64_t)t); } /* finish and clean up worker threads */ for (auto t = 0; t < opt_threads; t++) { xpthread_join(pthread[t], nullptr); sintax_thread_exit(si_plus + t); if (si_minus != nullptr) { sintax_thread_exit(si_minus + t); } } xpthread_attr_destroy(&attr); } auto sintax(struct Parameters const & parameters) -> void { /* tophits = the maximum number of hits we need to store */ tophits = 1; /* open output files */ if (opt_db == nullptr) { fatal("No database file specified with --db"); } if (opt_tabbedout != nullptr) { fp_tabbedout = fopen_output(opt_tabbedout); if (fp_tabbedout == nullptr) { fatal("Unable to open tabbedout output file for writing"); } } else { fatal("No output file specified with --tabbedout"); } /* check if db may be an UDB file */ auto const is_udb = udb_detect_isudb(opt_db); if (is_udb) { udb_read(opt_db, true, true); } else { db_read(opt_db, 0); } seqcount = db_getsequencecount(); if (! 
is_udb) { dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); } /* prepare reading of queries */ query_fastx_h = fastx_open(parameters.opt_sintax); /* allocate memory for thread info */ si_plus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); if (opt_strand > 1) { si_minus = (struct searchinfo_s *) xmalloc(opt_threads * sizeof(struct searchinfo_s)); } else { si_minus = nullptr; } pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t)); /* init mutexes for input and output */ xpthread_mutex_init(&mutex_input, nullptr); xpthread_mutex_init(&mutex_output, nullptr); /* run */ progress_init("Classifying sequences", fastx_get_size(query_fastx_h)); sintax_thread_worker_run(); progress_done(); if (! opt_quiet) { std::fprintf(stderr, "Classified %d of %d sequences", classified, queries); if (queries > 0) { std::fprintf(stderr, " (%.2f%%)", 100.0 * classified / queries); } std::fprintf(stderr, "\n"); } if (opt_log != nullptr) { std::fprintf(fp_log, "Classified %d of %d sequences", classified, queries); if (queries > 0) { std::fprintf(fp_log, " (%.2f%%)", 100.0 * classified / queries); } std::fprintf(fp_log, "\n"); } /* clean up */ xpthread_mutex_destroy(&mutex_output); xpthread_mutex_destroy(&mutex_input); xfree(pthread); xfree(si_plus); if (si_minus != nullptr) { xfree(si_minus); } fastx_close(query_fastx_h); std::fclose(fp_tabbedout); dbindex_free(); db_free(); } vsearch-2.30.1/src/sintax.h000066400000000000000000000047501506776541700155260ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ auto sintax(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/sortbylength.cc000066400000000000000000000166421506776541700171050ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/check_output_filehandle.hpp" #include "utils/fatal.hpp" #include "utils/open_file.hpp" #include "utils/progress.hpp" #include // std::sort, std::min #include #include // std::FILE, std::fprintf, std::size_t #include // std::ldiv #include // std::strcmp #include #ifndef NDEBUG #include #endif // anonymous namespace: limit visibility and usage to this translation unit namespace { struct sortinfo_length_s { unsigned int length = 0; unsigned int size = 0; unsigned int seqno = 0; }; auto create_deck(struct Parameters const & parameters) -> std::vector { auto const dbsequencecount = db_getsequencecount(); assert(dbsequencecount < std::numeric_limits::max()); std::vector deck(dbsequencecount); Progress progress("Getting lengths", deck.size(), parameters); auto counter = std::size_t{0}; for (auto & sequence: deck) { sequence.seqno = counter; sequence.length = db_getsequencelen(counter); sequence.size = db_getabundance(counter); progress.update(counter); ++counter; } return deck; } auto sort_deck(std::vector & deck, struct Parameters const & parameters) -> void { auto compare_sequences = [](struct sortinfo_length_s const & lhs, struct sortinfo_length_s const & rhs) -> bool { // longest first... if (lhs.length < rhs.length) { return false; } if (lhs.length > rhs.length) { return true; } // ... then ties are sorted by decreasing abundance values... 
if (lhs.size < rhs.size) { return false; } if (lhs.size > rhs.size) { return true; } // ...then ties are sorted by sequence labels (alpha-numerical ordering), // preserve input order auto const result = std::strcmp(db_getheader(lhs.seqno), db_getheader(rhs.seqno)); return result < 0; }; static constexpr auto one_hundred_percent = 100ULL; Progress const progress("Sorting", one_hundred_percent, parameters); std::stable_sort(deck.begin(), deck.end(), compare_sequences); } // refactoring C++17 [[nodiscard]] auto find_median_length(std::vector const & deck) -> double { // function returns a round value or a value with a remainder of 0.5 static constexpr double half = 0.5; if (deck.empty()) { return 0.0; } // refactoring C++11: use const& std::vector.size() auto const midarray = std::ldiv(static_cast(deck.size()), 2L); // odd number of valid amplicons if (deck.size() % 2 != 0) { return deck[midarray.quot].length * 1.0; // a round value } // even number of valid amplicons // (average of two ints is either round or has a remainder of .5) // avoid risk of silent overflow for large abundance values: // a >= b ; (a + b) / 2 == b + (a - b) / 2 return deck[midarray.quot].length + ((deck[midarray.quot - 1].length - deck[midarray.quot].length) * half); } auto output_median_length(std::vector const & deck, struct Parameters const & parameters) -> void { // Banker's rounding (round half to even) auto const median = find_median_length(deck); if (not parameters.opt_quiet) { std::fprintf(stderr, "Median length: %.0f\n", median); } if (parameters.opt_log != nullptr) { std::fprintf(fp_log, "Median length: %.0f\n", median); } } // refactoring: extract as a template auto truncate_deck(std::vector & deck, long int const n_first_sequences) -> void { if (deck.size() > static_cast(n_first_sequences)) { deck.resize(n_first_sequences); } } // refactoring: extract as a template auto output_sorted_fasta(std::vector const & deck, std::FILE * output_file, struct Parameters const & parameters) -> 
void { Progress progress("Writing output", deck.size(), parameters); auto counter = std::size_t{0}; for (auto const & sequence: deck) { fasta_print_db_relabel(output_file, sequence.seqno, counter + 1); progress.update(counter); ++counter; } } } // end of anonymous namespace auto sortbylength(struct Parameters const & parameters) -> void { auto const output_handle = open_output_file(parameters.opt_output); check_mandatory_output_handle(parameters.opt_output, (not output_handle)); db_read(parameters.opt_sortbylength, 0); show_rusage(); auto deck = create_deck(parameters); show_rusage(); sort_deck(deck, parameters); output_median_length(deck, parameters); show_rusage(); truncate_deck(deck, parameters.opt_topn); output_sorted_fasta(deck, output_handle.get(), parameters); show_rusage(); db_free(); } vsearch-2.30.1/src/sortbylength.h000066400000000000000000000047561506776541700167520ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto sortbylength(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/sortbysize.cc000066400000000000000000000221101506776541700165610ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "vsearch.h" #include "utils/check_output_filehandle.hpp" #include "utils/fatal.hpp" #include "utils/open_file.hpp" #include // std::min, std::sort #include #include // int64_t #include // std::FILE, std::fprintf, std::size_t #include // std::ldiv #include // std::strcmp #include #ifndef NDEBUG #include #endif // anonymous namespace: limit visibility and usage to this translation unit namespace { struct sortinfo_size_s { unsigned int size = 0; unsigned int seqno = 0; }; auto create_deck(struct Parameters const & parameters) -> std::vector { auto const dbsequencecount = db_getsequencecount(); assert(dbsequencecount < std::numeric_limits::max()); std::vector deck(dbsequencecount); progress_init("Getting sizes", deck.size()); auto counter = std::size_t{0}; for (auto seqno = 0U; seqno < dbsequencecount; ++seqno) { auto const size = static_cast(db_getabundance(seqno)); if ((size < parameters.opt_minsize) or (size > parameters.opt_maxsize)) { continue; } deck[counter].seqno = seqno; deck[counter].size = static_cast(size); progress_update(seqno); ++counter; } progress_done(); deck.resize(counter); return deck; } auto sort_deck(std::vector & deck) -> void { auto compare_sequences = [](struct sortinfo_size_s const & lhs, struct sortinfo_size_s const & rhs) -> bool { // highest abundance first... 
if (lhs.size < rhs.size) { return false; } if (lhs.size > rhs.size) { return true; } // ...then ties are sorted by sequence labels (alpha-numerical ordering), // preserve input order auto const result = std::strcmp(db_getheader(lhs.seqno), db_getheader(rhs.seqno)); return result < 0; }; static constexpr auto one_hundred_percent = 100ULL; progress_init("Sorting", one_hundred_percent); std::stable_sort(deck.begin(), deck.end(), compare_sequences); progress_done(); } // refactoring C++17 [[nodiscard]] auto find_median_abundance(std::vector const & deck) -> double { // function returns a round value or a value with a remainder of 0.5 static constexpr double half = 0.5; if (deck.empty()) { return 0.0; } // refactoring C++11: use const& std::vector.size() auto const midarray = std::ldiv(static_cast(deck.size()), 2L); // odd number of valid amplicons if (deck.size() % 2 != 0) { return deck[midarray.quot].size * 1.0; // a round value } // even number of valid amplicons // (average of two ints is either round or has a remainder of .5) // avoid risk of silent overflow for large abundance values: // a >= b ; (a + b) / 2 == b + (a - b) / 2 return deck[midarray.quot].size + ((deck[midarray.quot - 1].size - deck[midarray.quot].size) * half); } auto output_median_abundance(std::vector const & deck, struct Parameters const & parameters) -> void { // Banker's rounding (round half to even) auto const median = find_median_abundance(deck); if (not parameters.opt_quiet) { static_cast(fprintf(stderr, "Median abundance: %.0f\n", median)); } if (parameters.opt_log != nullptr) { static_cast(fprintf(fp_log, "Median abundance: %.0f\n", median)); } } // auto trim_deck(std::vector & deck) // -> std::vector { // // assume deck is sorted by decreasing abundance // // - opt_minsize = 0 by default // // - opt_maxsize = LONG_MAX by default // // - size is unsigned int // auto begin = std::upper_bound(deck.begin(), deck.end(), opt_maxsize, // [](int64_t maxsize, struct sortinfo_size_s & seq) -> bool 
{ // return seq.size > maxsize; // }); // auto end = std::lower_bound(deck.begin(), deck.end(), opt_minsize, // [](int64_t minsize, struct sortinfo_size_s & seq) -> bool { // return seq.size <= minsize; // }); // return std::vector{begin, end}; // } auto truncate_deck(std::vector & deck, long int const n_first_sequences) -> void { if (deck.size() > static_cast(n_first_sequences)) { deck.resize(n_first_sequences); } } // refactoring: extract as a template auto output_sorted_fasta(std::vector const & deck, std::FILE * output_file) -> void { progress_init("Writing output", deck.size()); auto counter = std::size_t{0}; for (auto const & sequence: deck) { fasta_print_db_relabel(output_file, sequence.seqno, counter + 1); progress_update(counter); ++counter; } progress_done(); } // refactoring: trim misize and maxsize with a free function // https://stackoverflow.com/questions/26719144/how-to-erase-a-value-efficiently-from-a-sorted-vector // auto erase_high_abundances(std::vector & vec, int value) -> void // { // auto lb = std::lower_bound(std::begin(vec), std::end(vec), value); // if (lb != std::end(vec) and *lb == value) { // auto ub = std::upper_bound(lb, std::end(vec), value); // vec.erase(lb, ub); // } // } } // end of anonymous namespace // refactoring: // - create vector (no branch) // - stable_sort vector (by increasing size, then label) // - find lower_bound(comp(opt_minsize)), // - deck.resize() // - find upper_bound(comp(opt_maxsize)), // - std::vector subdeck = {deck.begin() + upper_bound, deck.end()}; // view? // - opt_minsize = 0 by default // - opt_maxsize = LONG_MAX by default // - top_n = LONG_MAX by default // - mediane, etc... 
// - std::min(subdeck.size(), topn); auto sortbysize(struct Parameters const & parameters) -> void { auto const output_handle = open_output_file(parameters.opt_output); check_mandatory_output_handle(parameters.opt_output, (not output_handle)); db_read(parameters.opt_sortbysize, 0); show_rusage(); auto deck = create_deck(parameters); show_rusage(); sort_deck(deck); output_median_abundance(deck, parameters); show_rusage(); truncate_deck(deck, parameters.opt_topn); output_sorted_fasta(deck, output_handle.get()); show_rusage(); // refactoring: why three calls to show_rusage()? db_free(); } vsearch-2.30.1/src/sortbysize.h000066400000000000000000000047541506776541700164410ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto sortbysize(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/subsample.cc000066400000000000000000000342251506776541700163510ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "vsearch.h"
#include "utils/fatal.hpp"
// NOTE(review): the targets of the standard includes below are missing in
// this copy of the file (only the trailing comments survive) -- restore
// them from the upstream source.
#include // std::count_if
#include
#include // macros PRIu64 and PRId64
#include // std::floor
#include // int64_t
#include // std::FILE, std::fprintf, std::fclose
#include // std::minus
#include // std::fill
#include


#ifndef NDEBUG
// all contiguous integers from 0 to 2^53 can be represented in the
// mantissa of a double
// (only defined in debug builds; used by the assert() in
// number_of_reads_to_sample())
constexpr uint64_t contiguous_mantissa = 9007199254740992; // 9 x 10^15 reads
#endif


// One output file: the name requested by the user (nullptr when the output
// was not requested) and the handle obtained once the file is opened.
struct a_file {
  char * name = nullptr;
  std::FILE * handle = nullptr;
};


// The two destinations for a given format: 'kept' receives the subsampled
// reads, 'lost' receives the discarded ones.
struct file_purposes {
  a_file kept;
  a_file lost;
};


// All output files of the subsample command, grouped by format.
struct file_types {
  file_purposes fasta;
  file_purposes fastq;
};


// refactoring:
// - accept sample_size = 0 and sample_pct = 0.0?
// - fastaout should be empty, all reads should be in fastaout_discarded

// refactoring:
// - store parameters in a struct, pass struct to reduce list of arguments
// - use uint64_t for abundance values?
// - create discarded vector only if needed
//
// discrete_distribution
// #include
// #include
// #include
// int main()
// {
// const auto nreads = 500ULL; // target number of reads
// std::random_device r;
// std::default_random_engine generator(r());
// std::vector weights = {1000, 100, 10, 1};
// std::discrete_distribution distribution(weights.begin(), weights.end());
// std::vector v2(weights.size());
// for (auto i = 0ULL; i < nreads; ++i) {
// auto const number = distribution(generator);
// ++v2[number];
// }
// for (auto i = 0UL; i < v2.size(); ++i)
// std::cout << i << ": " << v1[i] << " " << v2[i] << "\n";
// return 0;
// }

// refactoring: "RTK: efficient rarefaction analysis of large datasets"
// a fast C++11 rarefaction tool https://github.com/hildebra/Rarefaction
// maybe they use a better approach than the std::discrete_distribution?
auto open_output_files(struct file_types & ouput_files) -> void { if (ouput_files.fasta.kept.name != nullptr) { ouput_files.fasta.kept.handle = fopen_output(ouput_files.fasta.kept.name); } if (ouput_files.fasta.lost.name != nullptr) { ouput_files.fasta.lost.handle = fopen_output(ouput_files.fasta.lost.name); } if (ouput_files.fastq.kept.name != nullptr) { ouput_files.fastq.kept.handle = fopen_output(ouput_files.fastq.kept.name); } if (ouput_files.fastq.lost.name != nullptr) { ouput_files.fastq.lost.handle = fopen_output(ouput_files.fastq.lost.name); } } auto abort_if_fastq_out_of_fasta(struct file_types const & ouput_files) -> void { auto const output_is_fastq = (ouput_files.fastq.kept.handle != nullptr or ouput_files.fastq.lost.handle != nullptr); auto const input_is_fasta = not db_is_fastq(); if (input_is_fasta and output_is_fastq) { fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); } } auto check_output_files(struct file_types const & ouput_files) -> void { if (ouput_files.fasta.kept.name != nullptr) { if (ouput_files.fasta.kept.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (ouput_files.fasta.lost.name != nullptr) { if (ouput_files.fasta.lost.handle == nullptr) { fatal("Unable to open FASTA output file for writing"); } } if (ouput_files.fastq.kept.name != nullptr) { if (ouput_files.fastq.kept.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } if (ouput_files.fastq.lost.name != nullptr) { if (ouput_files.fastq.lost.handle == nullptr) { fatal("Unable to open FASTQ output file for writing"); } } } namespace { // anonymous namespace to avoid linker error (multiple definitions // of function with identical names and parameters) auto create_deck(bool const sizein_requested) -> std::vector { auto const dbsequencecount = db_getsequencecount(); std::vector deck(dbsequencecount, 1); if (sizein_requested) { auto counter = std::size_t{0}; for (auto & abundance : deck) { 
abundance = db_getabundance(counter); ++counter; } } return deck; } } auto write_original_stats(std::vector const & deck, uint64_t const mass_total, struct Parameters const & parameters) -> void { if (not parameters.opt_quiet) { std::fprintf(stderr, "Got %" PRIu64 " reads from %d amplicons\n", mass_total, static_cast(deck.size())); } if (parameters.opt_log != nullptr) { std::fprintf(parameters.fp_log, "Got %" PRIu64 " reads from %d amplicons\n", mass_total, static_cast(deck.size())); } } auto number_of_reads_to_sample(struct Parameters const & parameters, uint64_t const mass_total) -> uint64_t { assert(mass_total <= contiguous_mantissa); if (parameters.opt_sample_size != 0) { return static_cast(parameters.opt_sample_size); } return static_cast(std::floor(static_cast(mass_total) * parameters.opt_sample_pct / 100.0)); } auto write_subsampling_stats(std::vector const &deck, uint64_t const n_reads, struct Parameters const & parameters) -> void { int const samples = std::count_if(deck.begin(), deck.end(), [](int abundance) -> bool { return abundance != 0; }); if (not parameters.opt_quiet) { std::fprintf(stderr, "Subsampled %" PRIu64 " reads from %d amplicons\n", n_reads, samples); } if (parameters.opt_log != nullptr) { std::fprintf(parameters.fp_log, "Subsampled %" PRIu64 " reads from %d amplicons\n", n_reads, samples); } } auto random_subsampling(std::vector & deck, uint64_t const mass_total, uint64_t const n_reads, bool const sizein_requested) -> void { auto n_reads_left = n_reads; auto amplicon_number = 0; uint64_t n_read_being_checked = 0; uint64_t accumulated_mass = 0; auto amplicon_mass = sizein_requested ? 
db_getabundance(0) : 1; // refactoring C++17: std::sample() progress_init("Subsampling", mass_total); while (n_reads_left > 0) { auto const random = random_ulong(mass_total - n_read_being_checked); if (random < n_reads_left) { /* selected read r from amplicon a */ ++deck[amplicon_number]; --n_reads_left; } ++n_read_being_checked; ++accumulated_mass; if (accumulated_mass >= amplicon_mass) { /* next amplicon */ ++amplicon_number; amplicon_mass = sizein_requested ? db_getabundance(amplicon_number) : 1; accumulated_mass = 0; } progress_update(n_read_being_checked); } progress_done(); } auto substract_two_decks(std::vector const & original_deck, std::vector const & subsampled_deck) -> std::vector { std::vector difference_deck(original_deck.size()); std::transform(original_deck.cbegin(), original_deck.cend(), subsampled_deck.cbegin(), difference_deck.begin(), std::minus()); return difference_deck; } auto writing_fasta_output(std::vector const & deck, struct a_file const & fasta_file) -> void { if (fasta_file.name == nullptr) { return; } int amplicons_printed = 0; progress_init("Writing fasta output", deck.size()); auto counter = 0U; for (auto const abundance_value : deck) { int64_t const new_abundance = abundance_value; if (new_abundance == 0) { ++counter; continue; } ++amplicons_printed; fasta_print_general(fasta_file.handle, nullptr, db_getsequence(counter), static_cast(db_getsequencelen(counter)), db_getheader(counter), static_cast(db_getheaderlen(counter)), new_abundance, amplicons_printed, -1.0, -1, -1, nullptr, 0.0); progress_update(counter); ++counter; } progress_done(); } auto writing_fastq_output(std::vector const & deck, struct a_file const & fastq_file) -> void { if (fastq_file.name == nullptr) { return; } int amplicons_printed = 0; progress_init("Writing fastq output", deck.size()); auto counter = 0U; for (auto const abundance_value : deck) { int64_t const new_abundance = abundance_value; if (new_abundance == 0) { ++counter; continue; } ++amplicons_printed; 
fastq_print_general(fastq_file.handle, db_getsequence(counter), static_cast(db_getsequencelen(counter)), db_getheader(counter), static_cast(db_getheaderlen(counter)), db_getquality(counter), static_cast(new_abundance), amplicons_printed, -1.0); progress_update(counter); ++counter; } progress_done(); } auto close_output_files(struct file_types const & ouput_files) -> void { for (auto * fp_outputfile : { ouput_files.fasta.kept.handle, ouput_files.fastq.kept.handle, ouput_files.fasta.lost.handle, ouput_files.fastq.lost.handle}) { if (fp_outputfile != nullptr) { static_cast(std::fclose(fp_outputfile)); } } } auto subsample(struct Parameters const & parameters) -> void { struct file_types ouput_files; // refactoring: direct initialization {}; then struct can be const ouput_files.fasta.kept.name = parameters.opt_fastaout; ouput_files.fasta.lost.name = parameters.opt_fastaout_discarded; ouput_files.fastq.kept.name = parameters.opt_fastqout; ouput_files.fastq.lost.name = parameters.opt_fastqout_discarded; open_output_files(ouput_files); check_output_files(ouput_files); db_read(parameters.opt_fastx_subsample, 0); show_rusage(); abort_if_fastq_out_of_fasta(ouput_files); // subsampling auto const original_abundances = create_deck(parameters.opt_sizein); auto const mass_total = std::accumulate(original_abundances.cbegin(), original_abundances.cend(), uint64_t{0}); auto subsampled_abundances = original_abundances; std::fill(subsampled_abundances.begin(), subsampled_abundances.end(), 0); // temporary fix: reset vector to zero write_original_stats(original_abundances, mass_total, parameters); // refactoring: move up? 
auto const n_reads = number_of_reads_to_sample(parameters, mass_total); if (n_reads > mass_total) { fatal("Cannot subsample more reads than in the original sample"); } random_subsampling(subsampled_abundances, mass_total, n_reads, parameters.opt_sizein); // refactoring: pass & original, copy, subsample, return new (const) vector // write output files writing_fasta_output(subsampled_abundances, ouput_files.fasta.kept); writing_fastq_output(subsampled_abundances, ouput_files.fastq.kept); auto const discarded_output_is_requested = (ouput_files.fasta.lost.handle != nullptr) or (ouput_files.fastq.lost.handle != nullptr); if (discarded_output_is_requested) { auto const discarded_abundances = substract_two_decks(original_abundances, subsampled_abundances); writing_fasta_output(discarded_abundances, ouput_files.fasta.lost); writing_fastq_output(discarded_abundances, ouput_files.fastq.lost); } write_subsampling_stats(subsampled_abundances, n_reads, parameters); // clean up db_free(); close_output_files(ouput_files); } vsearch-2.30.1/src/subsample.h000066400000000000000000000047531506776541700162160ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto subsample(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/tax.cc000066400000000000000000000134011506776541700151430ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "utils/taxonomic_fields.h" #include // std::find #include // std::tolower #include // std::strlen, std::strstr, std::strchr #include // std::distance // very similar to header_find_attribute() in attributes.cc auto tax_parse(char const * header, int const header_length, int * tax_start, int * tax_end) -> bool { /* Identify the first occurence of the pattern (^|;)tax=([^;]*)(;|$) */ if (header == nullptr) { return false; } auto const * attribute = "tax="; auto const attribute_length = static_cast(std::strlen(attribute)); auto offset = 0; while (offset < header_length - attribute_length) { auto const * first_occurence = std::strstr(header + offset, attribute); /* no match */ if (first_occurence == nullptr) { break; } offset = std::distance(header, first_occurence); /* check for ';' in front */ if ((offset > 0) and (header[offset - 1] != ';')) { offset += attribute_length + 1; continue; } *tax_start = offset; /* find end (semicolon or end of header) */ auto const * terminus = std::strchr(header + offset + attribute_length, ';'); if (terminus == nullptr) { *tax_end = header_length; } else { *tax_end = std::distance(header, terminus); } return true; } return false; } auto tax_split(int seqno, int * level_start, int * level_len) -> void { /* Parse taxonomy string into the following 9 parts d domain k kingdom p phylum c class o order f family g genus s species t strain */ static constexpr auto length_of_attribute_name = 4; // "tax=" 
-> 4 letters auto tax_start = 0; auto tax_end = 0; auto const * const header = db_getheader(seqno); auto const header_length = static_cast(db_getheaderlen(seqno)); auto const attribute_is_present = tax_parse(header, header_length, & tax_start, & tax_end); if (not attribute_is_present) { return; } auto offset = tax_start + length_of_attribute_name; while (offset < tax_end) { /* Is the next char a recognized tax level letter? */ auto const * next_level = std::find(taxonomic_fields.begin(), taxonomic_fields.end(), std::tolower(header[offset])); if (next_level != taxonomic_fields.end()) { int const level = std::distance(taxonomic_fields.data(), next_level); /* Is there a colon after it? */ if (header[offset + 1] == ':') { level_start[level] = offset + 2; auto const * next_comma = std::strchr(header + offset + 2, ','); if (next_comma != nullptr) { level_len[level] = std::distance(header, next_comma) - offset - 2; } else { level_len[level] = tax_end - offset - 2; } } } /* skip past next comma */ auto const * next_comma_bis = std::strchr(header + offset, ','); if (next_comma_bis != nullptr) { offset = std::distance(header, next_comma_bis) + 1; } else { offset = tax_end; } } } vsearch-2.30.1/src/tax.h000066400000000000000000000047651506776541700150220ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto tax_split(int seqno, int * level_start, int * level_len) -> void; vsearch-2.30.1/src/udb.cc000066400000000000000000000665701506776541700151400ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "vsearch.h"
#include "attributes.h"
#include "bitmap.h"
#include "dbindex.h"
#include "mask.h"
#include "unique.h"
#include "utils/fatal.hpp"
// NOTE(review): the targets of the standard includes below are missing in
// this copy of the file (only the trailing comments survive) -- restore
// them from the upstream source.
#include // std::min, std::max
#include
#include // macros PRIu64 and PRId64
#include
#include // uint64_t
#include // std::FILE, std::fprintf, std::size_t
#include // std::qsort
#include // std::memset, std::memmove
#include
#include


// maximal number of bytes transferred by a single read() or write() call
// in largeread() and largewrite(): 4096 * 4096 bytes
constexpr auto blocksize = uint64_t{4096UL * 4096UL};

// value of header word 6 of the last UDB file read by udb_read()
// (printed as "DBAccel" by udb_info())
static unsigned int udb_dbaccel = 0;

// a kmer and its number of occurrences
struct wordfreq
{
  unsigned int kmer;
  unsigned int count;
};

using wordfreq_t = struct wordfreq;


// qsort-style comparison of two wordfreq_t: by increasing count, then by
// decreasing kmer value; returns 0 when both fields are equal.
auto wc_compare(const void * a, const void * b) -> int
{
  auto * lhs = (wordfreq_t *) a;
  auto * rhs = (wordfreq_t *) b;
  if (lhs->count < rhs->count)
    {
      return -1;
    }
  if (lhs->count > rhs->count)
    {
      return +1;
    }
  // same count: the larger kmer comes first
  if (lhs->kmer < rhs->kmer)
    {
      return +1;
    }
  if (lhs->kmer > rhs->kmer)
    {
      return -1;
    }
  return 0;
}


// Read nbyte bytes into buf, starting at position offset in the file, in
// chunks of at most blocksize bytes. The progress indicator is updated
// after each chunk. A failed seek or a short read is fatal. Returns nbyte.
auto largeread(int file_descriptor, void * buf, uint64_t nbyte, uint64_t offset) -> uint64_t
{
  /* seek and call read multiple times, and update progress */

  auto progress = offset;
  for (uint64_t i = 0; i < nbyte; i += blocksize)
    {
      auto const res = xlseek(file_descriptor, offset + i, SEEK_SET);
      if (res != offset + i)
        {
          fatal("Unable to seek in UDB file or invalid UDB file");
        }

      // the last chunk can be shorter than blocksize
      auto const rem = std::min(blocksize, nbyte - i);
      uint64_t const bytesread = read(file_descriptor, ((char *) buf) + i, rem);
      if (bytesread != rem)
        {
          fatal("Unable to read from UDB file or invalid UDB file");
        }
      progress += rem;
      progress_update(progress);
    }
  return nbyte;
}


// Write nbyte bytes from buf, starting at position offset in the file, in
// chunks of at most blocksize bytes. The progress indicator is updated
// after each chunk. A failed seek or a short write is fatal. Returns nbyte.
auto largewrite(int file_descriptor, void * buf, uint64_t nbyte, uint64_t offset) -> uint64_t
{
  /* call write multiple times and update progress */

  auto progress = offset;
  for (uint64_t i = 0; i < nbyte; i += blocksize)
    {
      auto const res = xlseek(file_descriptor, offset + i, SEEK_SET);
      if (res != offset + i)
        {
          fatal("Unable to seek in UDB file or invalid UDB file");
        }

      // the last chunk can be shorter than blocksize
      auto const rem = std::min(blocksize, nbyte - i);
      uint64_t const byteswritten = write(file_descriptor, ((char *) buf) + i, rem);
      if (byteswritten != rem)
        {
          fatal("Unable to write to UDB file");
        }
      progress += rem;
      progress_update(progress);
    }
  return nbyte;
}


// Return true when the first four bytes of the file are the UDB signature.
// Pipes are never considered to be UDB files (they are not read at all).
// Failing to stat or to open the file is fatal.
auto udb_detect_isudb(const char * filename) -> bool
{
  /*
    Detect whether the given filename seems to refer to an UDB file.
    It must be an uncompressed regular file, not a pipe.
  */

  constexpr static uint32_t udb_file_signature {0x55444246}; // 'FBDU UDBF'
  constexpr static uint64_t expected_n_bytes {sizeof(uint32_t)};

  xstat_t fs;

  if (xstat(filename, & fs) != 0)
    {
      fatal("Unable to get status for input file (%s)", filename);
    }

  auto const is_pipe = S_ISFIFO(fs.st_mode);
  if (is_pipe)
    {
      return false;
    }

  int file_descriptor = 0;

  file_descriptor = xopen_read(filename);
  // NOTE(review): assumes xopen_read() returns 0 on failure; POSIX open()
  // returns -1 -- confirm against the definition of xopen_read()
  if (file_descriptor == 0)
    {
      fatal("Unable to open input file for reading (%s)", filename);
    }

  unsigned int magic = 0;

  uint64_t const bytesread = read(file_descriptor, & magic, expected_n_bytes);

  close(file_descriptor);

  // a short read (file smaller than four bytes) means 'not a UDB file'
  if ((bytesread == expected_n_bytes) and (magic == udb_file_signature))
    {
      return true;
    }

  return false;
}


// Read the first 4 * 50 bytes of the UDB file named by opt_udbinfo, check
// the header and print some of its fields to stderr (unless opt_quiet) and
// to the log file (if opt_log). An unreadable or invalid file is fatal.
//
// Header words printed: [13] Seqs, [2] SeqIx bits, [4] Word width,
// [11] Slots, [5] DBstep, [6] DBAccel.
auto udb_info(struct Parameters const & parameters) -> void
{
  /* Read UDB header and show basic info */

  // NOTE(review): the template arguments of std::array are missing in this
  // copy of the file; the 4 * 50 byte read below suggests 50 words of 32
  // bits -- confirm against the upstream source
  std::array buffer {{}};

  int fd_udbinfo = 0;

  fd_udbinfo = xopen_read(parameters.opt_udbinfo);
  // NOTE(review): same assumption as in udb_detect_isudb() on the value
  // returned by xopen_read() on failure -- confirm
  if (fd_udbinfo == 0)
    {
      fatal("Unable to open UDB file for reading");
    }

  uint64_t const bytesread = read(fd_udbinfo, buffer.data(), 4 * 50);
  if (bytesread != 4 * 50)
    {
      fatal("Unable to read from UDB file or invalid UDB file");
    }

  // header sanity checks: signatures in words 0 and 49, fixed values in
  // words 2 and 17, word width (word 4) in [3, 15], non-zero number of
  // sequences (word 13)
  if ((buffer[0] != 0x55444246) or
      (buffer[2] != 32) or
      (buffer[4] < 3) or
      (buffer[4] > 15) or
      (buffer[13] == 0) or
      (buffer[17] != 0x0000746e) or
      (buffer[49] != 0x55444266))
    {
      fatal("Invalid UDB file");
    }

  // note: the global option variables are used below, not 'parameters'
  if (not opt_quiet)
    {
      fprintf(stderr, " Seqs %u\n", buffer[13]);
      fprintf(stderr, " SeqIx bits %u\n", buffer[2]);
      fprintf(stderr, " Alpha nt (4)\n");
      fprintf(stderr, " Word width %u\n", buffer[4]);
      fprintf(stderr, " Slots %u\n", buffer[11]);
      // dictionary size: 4^(word width)
      fprintf(stderr, " Dict size %u (%.1fk)\n", (1U << (2 * buffer[4])), (1U << (2 * buffer[4])) * 1.0 / 1000.0);
      fprintf(stderr, " DBstep %u\n", buffer[5]);
      fprintf(stderr, " DBAccel %u%%\n", buffer[6]);
    }

  if (opt_log != nullptr)
    {
      fprintf(fp_log, " Seqs %u\n", buffer[13]);
      fprintf(fp_log, " SeqIx bits %u\n", buffer[2]);
      fprintf(fp_log, " Alpha nt (4)\n");
      fprintf(fp_log, " Word width %u\n", buffer[4]);
      fprintf(fp_log, " Slots %u\n", buffer[11]);
      // dictionary size: 4^(word width)
      fprintf(fp_log, " Dict size %u (%.1fk)\n", (1U << (2 * buffer[4])), (1U << (2 * buffer[4])) * 1.0 / 1000.0);
      fprintf(fp_log, " DBstep %u\n", buffer[5]);
      fprintf(fp_log, " DBAccel %u%%\n", buffer[6]);
    }

  close(fd_udbinfo);
}


auto udb_read(const char * filename,
              bool create_bitmaps,
              bool parse_abundances) -> void
{
  /* read UDB as indexed database */

  auto seqcount = 0U;
  auto udb_wordlength = 0U;
  uint64_t nucleotides = 0;

  xstat_t fs;

  if (xstat(filename, & fs) != 0)
    {
      fatal("Unable to get status for input file (%s)", filename);
    }

  auto const is_pipe = S_ISFIFO(fs.st_mode);
  if (is_pipe)
    {
      fatal("Cannot read UDB file from a pipe");
    }

  /* get file size */

  uint64_t const filesize = fs.st_size;

  /* open UDB file */

  auto fd_udb = 0;

  fd_udb = xopen_read(filename);
  if (fd_udb == 0)
    {
      fatal("Unable to open UDB file for reading");
    }

  char * prompt = nullptr;
  if (xsprintf(&prompt, "Reading UDB file %s", filename) == -1)
    {
      fatal("Out of memory");
    }

  progress_init(prompt, filesize);

  /* header */

  std::array buffer {{}};
  uint64_t pos = 0;

  pos += largeread(fd_udb, buffer.data(), 4 * 50, pos);

  if ((buffer[0] != 0x55444246) or (buffer[2]
!= 32) or (buffer[4] < 3) or (buffer[4] > 15) or (buffer[13] == 0) or (buffer[17] != 0x0000746e) or (buffer[49] != 0x55444266)) { fatal("Invalid UDB file"); } udb_wordlength = buffer[4]; seqcount = buffer[13]; udb_dbaccel = buffer[6]; if (udb_wordlength != opt_wordlength) { fprintf(stderr, "\nWARNING: Wordlength adjusted to %u as indicated in UDB file\n", udb_wordlength); opt_wordlength = udb_wordlength; } /* word match counts */ kmerhashsize = 1U << (2 * udb_wordlength); kmercount = (unsigned int *) xmalloc(kmerhashsize * sizeof(unsigned int)); kmerhash = (uint64_t *) xmalloc(kmerhashsize * sizeof(uint64_t)); kmerbitmap = (struct bitmap_s * *) xmalloc(kmerhashsize * sizeof(struct bitmap_s **)); std::memset(kmerbitmap, 0, kmerhashsize * sizeof(struct bitmap_s **)); pos += largeread(fd_udb, kmercount, 4 * kmerhashsize, pos); kmerindexsize = 0; for (uint64_t i = 0; i < kmerhashsize; i++) { kmerhash[i] = kmerindexsize; kmerindexsize += kmercount[i]; } /* signature */ pos += largeread(fd_udb, buffer.data(), 4, pos); if (buffer[0] != 0x55444233) { fatal("Invalid UDB file"); } /* sequence numbers for word matches */ kmerindex = (unsigned int *) xmalloc(kmerindexsize * 4); pos += largeread(fd_udb, kmerindex, 4 * kmerindexsize, pos); /* new header */ pos += largeread(fd_udb, buffer.data(), 4 * 8, pos); if ((buffer[0] != 0x55444234) or (buffer[1] != 0x005e0db3) or (buffer[2] != seqcount) or (buffer[7] != 0x005e0db4)) { fatal("Invalid UDB file"); } nucleotides = (((uint64_t) buffer[4]) << 32U) | buffer[3]; auto const udb_headerchars = (((uint64_t) buffer[6]) << 32U) | buffer[5]; /* header index */ seqindex = (seqinfo_t *) xmalloc(seqcount * sizeof(seqinfo_t)); std::vector header_index(seqcount + 1); pos += largeread(fd_udb, header_index.data(), 4 * seqcount, pos); header_index[seqcount] = udb_headerchars; auto last = 0U; for (auto i = 0U; i < seqcount; i++) { unsigned int const current_index = header_index[i]; if ((current_index < last) or (current_index >= udb_headerchars)) 
{ fatal("Invalid UDB file"); } seqindex[i].header_p = current_index; seqindex[i].headerlen = header_index[i + 1] - current_index - 1; seqindex[i].size = 1; last = current_index; } /* headers */ datap = (char *) xmalloc(udb_headerchars + nucleotides + seqcount); pos += largeread(fd_udb, datap, udb_headerchars, pos); uint64_t longestheader = 0; for (auto i = 0U; i < seqcount; i++) { longestheader = std::max(seqindex[i].headerlen, longestheader); } /* sequence lengths */ std::vector sequence_lengths(seqcount); pos += largeread(fd_udb, sequence_lengths.data(), 4 * seqcount, pos); uint64_t sum = 0; auto shortest = std::numeric_limits::max(); auto longest = 0U; for (auto i = 0U; i < seqcount; i++) { unsigned int const sequence_length = sequence_lengths[i]; seqindex[i].seq_p = udb_headerchars + sum; seqindex[i].seqlen = sequence_length; seqindex[i].qual_p = 0; shortest = std::min(sequence_length, shortest); longest = std::max(sequence_length, longest); sum += sequence_length; if (sum > nucleotides) { fatal("Invalid UDB file"); } } if (sum != nucleotides) { fatal("Invalid UDB file"); } /* sequences */ pos += largeread(fd_udb, datap + udb_headerchars, nucleotides, pos); if (pos != filesize) { fatal("Incorrect UDB file size"); } /* close UDB file */ close(fd_udb); progress_done(); xfree(prompt); /* move sequences and insert zero at end of each sequence */ progress_init("Reorganizing data in memory", seqcount); for (auto i = seqcount - 1; i > 0; i--) { auto const old_p = seqindex[i].seq_p; auto const new_p = seqindex[i].seq_p + i; auto const len = seqindex[i].seqlen; std::memmove(datap + new_p, datap + old_p, len); *(datap + new_p + len) = 0; seqindex[i].seq_p = new_p; progress_update(seqcount - i); } *(datap + seqindex[0].seq_p + seqindex[0].seqlen) = 0; progress_done(); /* Create bitmaps for the most frequent words */ if (create_bitmaps) { progress_init("Creating bitmaps", kmerhashsize); auto const bitmap_mincount = seqcount / 8; for (auto i = 0U; i < kmerhashsize; i++) { 
if (kmercount[i] >= bitmap_mincount) { kmerbitmap[i] = bitmap_init(seqcount + 127); // pad for xmm bitmap_reset_all(kmerbitmap[i]); for (auto j = 0U; j < kmercount[i]; j++) { bitmap_set(kmerbitmap[i], kmerindex[kmerhash[i]+j]); } } progress_update(i + 1); } progress_done(); } /* get abundances and longest header */ if (parse_abundances) { progress_init("Parsing abundances", seqcount); for (auto i = 0U; i < seqcount; i++) { auto const size = header_get_size(datap + seqindex[i].header_p, seqindex[i].headerlen); if (size > 0) { seqindex[i].size = size; } else { seqindex[i].size = 1; } progress_update(i + 1); } progress_done(); } /* set database info */ dbindex_uh = unique_init(); db_setinfo(false, seqcount, nucleotides, longest, shortest, longestheader); /* make mapping from indexno to seqno */ dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int)); dbindex_count = seqcount; for (auto i = 0U; i < seqcount; i++) { dbindex_map[i] = i; } /* done */ /* some stats */ if (not opt_quiet) { if (seqcount > 0) { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(stderr, "%" PRIu64 " nt in %" PRIu64 " seqs\n", db_getnucleotidecount(), db_getsequencecount()); } } if (opt_log != nullptr) { if (seqcount > 0) { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs, min %" PRIu64 ", max %" PRIu64 ", avg %.0f\n\n", db_getnucleotidecount(), db_getsequencecount(), db_getshortestsequence(), db_getlongestsequence(), db_getnucleotidecount() * 1.0 / db_getsequencecount()); } else { fprintf(fp_log, "%" PRIu64 " nt in %" PRIu64 " seqs\n\n", db_getnucleotidecount(), db_getsequencecount()); } } } auto udb_fasta(struct Parameters const & parameters) -> void { if (opt_output == nullptr) { fatal("FASTA output file must be specified with --output"); } /* open FASTA 
file for writing */ auto * fp_output = fopen_output(opt_output); if (fp_output == nullptr) { fatal("Unable to open FASTA output file for writing"); } /* read UDB file */ udb_read(parameters.opt_udb2fasta, false, false); /* dump fasta */ auto const seqcount = db_getsequencecount(); progress_init("Writing FASTA file", seqcount); for (std::size_t i = 0; i < seqcount; i++) { fasta_print_db_relabel(fp_output, i, i + 1); progress_update(i + 1); } progress_done(); fclose(fp_output); dbindex_free(); db_free(); } auto udb_stats(struct Parameters const & parameters) -> void { /* show word statistics for an UDB file */ /* read UDB file */ udb_read(parameters.opt_udbstats, false, false); /* analyze word counts */ std::vector freqtable(kmerhashsize); for (auto i = 0U; i < kmerhashsize; i++) { freqtable[i].kmer = i; freqtable[i].count = kmercount[i]; } qsort(freqtable.data(), kmerhashsize, sizeof(wordfreq_t), wc_compare); auto const wcmax = freqtable[kmerhashsize-1].count; auto const wcmedian = ( freqtable[(kmerhashsize / 2) - 1].count + freqtable[kmerhashsize / 2].count ) / 2; unsigned int const seqcount = db_getsequencecount(); auto const nt = db_getnucleotidecount(); /* show stats */ if (opt_log != nullptr) { fprintf(fp_log, " Alphabet nt\n"); fprintf(fp_log, " Word width %" PRIu64 "\n", opt_wordlength); fprintf(fp_log, " Word ones %" PRIu64 "\n", opt_wordlength); fprintf(fp_log, " Spaced No\n"); fprintf(fp_log, " Hashed No\n"); fprintf(fp_log, " Coded No\n"); fprintf(fp_log, " Stepped No\n"); fprintf(fp_log, " Slots %u (%.1fk)\n", kmerhashsize, 1.0 * kmerhashsize / 1000.0); fprintf(fp_log, " DBAccel %u%%\n", udb_dbaccel); fprintf(fp_log, "\n"); fprintf(fp_log, "%10" PRIu64 " DB size (%.1fk)\n", nt, 1.0 * nt / 1000.0); fprintf(fp_log, "%10" PRIu64 " Words\n", kmerindexsize); fprintf(fp_log, "%10u Median size\n", wcmedian); fprintf(fp_log, "%10.1f Mean size\n", 1.0 * kmerindexsize / kmerhashsize); fprintf(fp_log, "\n"); fprintf(fp_log, " iWord sWord Cap Size Row\n"); 
fprintf(fp_log, "---------- ------------ ---------- ---------- ---\n"); for (auto i = 0U; i < kmerhashsize; i++) { fprintf(fp_log, "%10u ", freqtable[kmerhashsize - 1 - i].kmer); fprintf(fp_log, "%.*s", std::max(12 - (int)(opt_wordlength), 0), " "); fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize - 1 - i].kmer); fprintf(fp_log, " %10u %10u", 0, freqtable[kmerhashsize - 1 - i].count); fprintf(fp_log, " "); for (auto j = 0U; j < freqtable[kmerhashsize - 1 - i].count; j++) { fprintf(fp_log, " %u", kmerindex[kmerhash[freqtable[kmerhashsize - 1 - i].kmer] + j]); if (j == 7) { break; } } if (freqtable[kmerhashsize-1-i].count > 8) { fprintf(fp_log, "..."); } fprintf(fp_log, "\n"); if (i == 10) { break; } } fprintf(fp_log, "\n\n"); fprintf(fp_log, "Word width %" PRIu64 "\n", opt_wordlength); fprintf(fp_log, "Slots %u\n", kmerhashsize); fprintf(fp_log, "Words %" PRIu64 "\n", kmerindexsize); fprintf(fp_log, "Max size %u (", wcmax); fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize - 1].kmer); fprintf(fp_log, ")\n\n"); fprintf(fp_log, " Size lo Size hi Total size Nr. 
Words Pct TotPct\n"); fprintf(fp_log, "---------- ---------- ---------- ---------- ------ ------\n"); auto size_lo = 0U; auto size_hi = 0U; auto x = 0U; auto totpct = 0.0; while (size_lo < seqcount) { auto count = 0; auto size = 0; while ((x < kmerhashsize) and (freqtable[x].count <= size_hi)) { count++; size += freqtable[x].count; x++; } auto const pct = 100.0 * count / kmerhashsize; totpct += pct; if (size_lo < size_hi) { fprintf(fp_log, "%10u", size_lo); } else { fprintf(fp_log, " "); } fprintf(fp_log, " %10u", size_hi); if (size >= 10000) { fprintf(fp_log, " %9.1fk", size * 0.001); } else { fprintf(fp_log, " %10.1f", size * 1.0); } if (count >= 10000) { fprintf(fp_log, " %9.1fk", count * 0.001); } else { fprintf(fp_log, " %10.1f", count * 1.0); } fprintf(fp_log, " %5.1f%% %5.1f%%", pct, totpct); static constexpr auto divider = 3.0; const auto dots = std::lround(pct / divider); if (dots > 0) { fprintf(fp_log, " "); } for (auto i = 0L; i < dots ; i++) { fprintf(fp_log, "*"); } fprintf(fp_log, "\n"); size_lo = size_hi + 1; if (size_hi > 0) { size_hi *= 2; } else { size_hi = 1; } size_hi = std::min(size_hi, seqcount); } fprintf(fp_log, "---------- ---------- ---------- ----------\n"); fprintf(fp_log, " "); if (kmerindexsize >= 10000) { fprintf(fp_log, " %9.1fk", kmerindexsize * 0.001); } else { fprintf(fp_log, " %10.1f", kmerindexsize * 1.0); } if (kmerhashsize >= 10000) { fprintf(fp_log, " %9.1fk", kmerhashsize * 0.001); } else { fprintf(fp_log, " %10.1f", kmerhashsize * 1.0); } fprintf(fp_log, "\n\n"); fprintf(fp_log, "%10" PRIu64 " Upper\n", nt); fprintf(fp_log, "%10u Lower (%.1f%%)\n", 0, 0.0); fprintf(fp_log, "%10" PRIu64 " Total\n", nt); fprintf(fp_log, "%10" PRIu64 " Indexed words\n", kmerindexsize); } dbindex_free(); db_free(); } auto udb_make(struct Parameters const & parameters) -> void { if (opt_output == nullptr) { fatal("UDB output file must be specified with --output"); } auto fd_output = 0; fd_output = xopen_write(opt_output); if (fd_output == 0) { 
fatal("Unable to open output file for writing"); } db_read(parameters.opt_makeudb_usearch, 1); if (opt_dbmask == MASK_DUST) { dust_all(); } else if ((opt_dbmask == MASK_SOFT) and (opt_hardmask != 0)) { hardmask_all(); } dbindex_prepare(1, opt_dbmask); dbindex_addallsequences(opt_dbmask); unsigned int const seqcount = db_getsequencecount(); auto const ntcount = db_getnucleotidecount(); uint64_t header_characters = 0; for (auto i = 0U; i < seqcount; i++) { header_characters += db_getheaderlen(i) + 1; } uint64_t const kmerhashsize = 1U << (2 * static_cast(opt_wordlength)); /* count word matches */ uint64_t wordmatches = 0; for (auto i = 0U; i < kmerhashsize; i++) { wordmatches += kmercount[i]; } uint64_t pos = 0; uint64_t const progress_all = (4 * 50) + (4 * kmerhashsize) + (4 * 1) + (4 * wordmatches) + (4 * 8) + (4 * seqcount) + header_characters + (4 * seqcount) + ntcount; progress_init("Writing UDB file", progress_all); uint64_t const buffersize = std::max(50U, seqcount); std::vector buffer(buffersize); /* Header */ buffer[0] = 0x55444246; /* FBDU UDBF */ buffer[2] = 32; /* bits */ buffer[4] = opt_wordlength; /* default 8 */ buffer[5] = 1; /* dbstep */ buffer[6] = 100; /* dbaccelpct % */ buffer[11] = 0; /* slots */ buffer[13] = seqcount; /* number of sequences */ buffer[17] = 0x0000746e; /* alphabet: "nt" */ buffer[49] = 0x55444266; /* fBDU UDBf */ pos += largewrite(fd_output, buffer.data(), 50 * 4, 0); /* write 4^wordlength uint32_t's with word match counts */ pos += largewrite(fd_output, kmercount, 4 * kmerhashsize, pos); /* 3BDU */ buffer[0] = 0x55444233; /* 3BDU UDB3 */ pos += largewrite(fd_output, buffer.data(), 1 * 4, pos); /* lists of sequence no's with matches for all words */ for (auto i = 0U; i < kmerhashsize; i++) { if (kmerbitmap[i] != nullptr) { std::memset(buffer.data(), 0, 4 * kmercount[i]); auto elements = 0U; for (auto j = 0U; j < seqcount; j++) { if (bitmap_get(kmerbitmap[i], j) != 0U) { buffer[elements++] = j; } } pos += largewrite(fd_output, 
buffer.data(), 4 * elements, pos); } else { if (kmercount[i] > 0) { pos += largewrite(fd_output, kmerindex + kmerhash[i], 4 * kmercount[i], pos); } } } /* New header */ buffer[0] = 0x55444234; /* 4BDU UDB4 */ /* 0x005e0db3 */ buffer[1] = 0x005e0db3; /* number of sequences, uint32_t */ buffer[2] = seqcount; /* total number of nucleotides, uint64_t */ buffer[3] = (unsigned int) (ntcount & 0xffffffff); buffer[4] = (unsigned int) (ntcount >> 32U); /* total number of header characters, incl zero-terminator, uint64_t */ buffer[5] = (unsigned int) (header_characters & 0xffffffff); buffer[6] = (unsigned int) (header_characters >> 32U); /* 0x005e0db4 */ buffer[7] = 0x005e0db4; pos += largewrite(fd_output, buffer.data(), 4 * 8, pos); /* indices to headers (uint32_t) */ auto sum = 0U; for (auto i = 0U; i < seqcount; i++) { buffer[i] = sum; sum += db_getheaderlen(i) + 1; } pos += largewrite(fd_output, buffer.data(), 4 * seqcount, pos); /* headers (ascii, zero terminated, not padded) */ for (auto i = 0U; i < seqcount; i++) { unsigned int const len = db_getheaderlen(i); pos += largewrite(fd_output, db_getheader(i), len + 1, pos); } /* sequence lengths (uint32_t) */ for (auto i = 0U; i < seqcount; i++) { buffer[i] = db_getsequencelen(i); } pos += largewrite(fd_output, buffer.data(), 4 * seqcount, pos); /* sequences (ascii, no term, no pad) */ for (auto i = 0U; i < seqcount; i++) { unsigned int const len = db_getsequencelen(i); pos += largewrite(fd_output, db_getsequence(i), len, pos); } if (close(fd_output) != 0) { fatal("Unable to close UDB file"); } progress_done(); dbindex_free(); db_free(); } vsearch-2.30.1/src/udb.h000066400000000000000000000055171506776541700147740ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ auto udb_detect_isudb(const char * filename) -> bool; auto udb_read(const char * filename, bool create_bitmaps, bool parse_abundances) -> void; auto udb_fasta(struct Parameters const & parameters) -> void; auto udb_info(struct Parameters const & parameters) -> void; auto udb_make(struct Parameters const & parameters) -> void; auto udb_stats(struct Parameters const & parameters) -> void; vsearch-2.30.1/src/unique.cc000066400000000000000000000264501506776541700156650ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "city.h" #include "mask.h" #include "utils/maps.hpp" #include // std::min #include // uint64_t #include // std::memset #include // std::function /* Find the unique kmers or words in a given sequence. Unique is now defined as all different words occurring at least once. Earlier it was defined as those words occurring exactly once, but that caused a problem when searching for sequences with many repeats.
auto unique_compare(const void * a, const void * b) -> int
{
  /*
    qsort-style three-way comparison of two unsigned ints.

    a, b   pointers to the unsigned int values to compare
    returns -1, 0 or +1 when *a is smaller than, equal to or larger
    than *b

    The values pointed to are compared, not the pointers: comparing
    the addresses would order elements by their position in memory
    and make any sort based on this function meaningless.
  */
  auto const lhs = *static_cast<unsigned int const *>(a);
  auto const rhs = *static_cast<unsigned int const *>(b);

  if (lhs < rhs)
    {
      return -1;
    }
  if (lhs > rhs)
    {
      return +1;
    }
  return 0;
}
listlen, unsigned int const * * list, int const seqmask) -> void { /* if necessary, reallocate list of unique kmers */ if (unique_handle->alloc < seqlen) { while (unique_handle->alloc < seqlen) { unique_handle->alloc *= 2; } // failed refactoring 2025-08-19: unique_handle is passed as // pointer to another struct, ownership is not clear, // multithreading fails unique_handle->list = (unsigned int *) xrealloc(unique_handle->list, sizeof(unsigned int) * unique_handle->alloc); } uint64_t const size = 1ULL << (wordlength << 1ULL); /* reallocate bitmap arrays if necessary */ if (unique_handle->bitmap_size < size) { unique_handle->bitmap = (uint64_t *) xrealloc(unique_handle->bitmap, size >> 3ULL); unique_handle->bitmap_size = size; } std::memset(unique_handle->bitmap, 0, size >> 3ULL); uint64_t bad = 0; uint64_t kmer = 0; uint64_t const mask = size - 1ULL; auto const * s = seq; auto const * e1 = s + wordlength - 1; auto const * e2 = s + seqlen; e1 = std::min(e2, e1); std::function const maskmap = (seqmask != MASK_NONE) ? 
map_mask_lower : map_mask_ambig; while (s < e1) { bad <<= 2ULL; bad |= maskmap(*s); kmer <<= 2ULL; kmer |= map_2bit(*s); ++s; } auto unique = 0; while (s < e2) { bad <<= 2ULL; bad |= maskmap(*s); bad &= mask; kmer <<= 2ULL; kmer |= map_2bit(*s); ++s; kmer &= mask; if (bad == 0U) { uint64_t const x = kmer >> 6ULL; uint64_t const y = 1ULL << (kmer & 63ULL); if ((unique_handle->bitmap[x] & y) == 0U) { /* not seen before */ unique_handle->list[unique] = kmer; ++unique; unique_handle->bitmap[x] |= y; } } } *listlen = unique; *list = unique_handle->list; } auto unique_count_hash(struct uhandle_s * unique_handle, int const wordlength, int const seqlen, char const * seq, unsigned int * listlen, unsigned int const * * list, int const seqmask) -> void { /* if necessary, reallocate hash table and list of unique kmers */ if (unique_handle->alloc < 2 * seqlen) { while (unique_handle->alloc < 2 * seqlen) { unique_handle->alloc *= 2; } unique_handle->hash = (struct bucket_s *) xrealloc(unique_handle->hash, sizeof(struct bucket_s) * unique_handle->alloc); unique_handle->list = (unsigned int *) xrealloc(unique_handle->list, sizeof(unsigned int) * unique_handle->alloc); } /* hashtable variant */ unique_handle->size = 1; while (unique_handle->size < 2 * seqlen) { unique_handle->size *= 2; } unique_handle->hash_mask = unique_handle->size - 1; std::memset(unique_handle->hash, 0, sizeof(struct bucket_s) * unique_handle->size); uint64_t bad = 0; uint64_t j = 0; auto kmer = 0U; unsigned int const mask = (1ULL << (2ULL * wordlength)) - 1ULL; auto const * s = seq; auto const * e1 = s + wordlength - 1; auto const * e2 = s + seqlen; e1 = std::min(e2, e1); std::function const maskmap = (seqmask != MASK_NONE) ? 
map_mask_lower : map_mask_ambig; while (s < e1) { bad <<= 2ULL; bad |= maskmap(*s); kmer <<= 2ULL; kmer |= map_2bit(*s); ++s; } uint64_t unique = 0; while (s < e2) { bad <<= 2ULL; bad |= maskmap(*s); bad &= mask; kmer <<= 2ULL; kmer |= map_2bit(*s); ++s; kmer &= mask; if (bad == 0U) { /* find free appropriate bucket in hash */ j = hash_function((char *) &kmer, (wordlength + 3) / 4) & unique_handle->hash_mask; while ((unique_handle->hash[j].count != 0U) && (unique_handle->hash[j].kmer != kmer)) { j = (j + 1) & unique_handle->hash_mask; } if (unique_handle->hash[j].count == 0U) { /* not seen before */ unique_handle->list[unique] = kmer; ++unique; unique_handle->hash[j].kmer = kmer; unique_handle->hash[j].count = 1; } } } *listlen = unique; *list = unique_handle->list; } auto unique_count(struct uhandle_s * unique_handle, int const wordlength, int const seqlen, char const * seq, unsigned int * listlen, unsigned int const * * list, int const seqmask) -> void { if (wordlength < 10) { unique_count_bitmap(unique_handle, wordlength, seqlen, seq, listlen, list, seqmask); } else { unique_count_hash(unique_handle, wordlength, seqlen, seq, listlen, list, seqmask); } } auto unique_count_shared(struct uhandle_s const & unique_handle, int const wordlength, int const listlen, unsigned int const * list) -> unsigned int { /* counts how many of the kmers in list are present in the (already computed) hash or bitmap */ auto count = 0U; if (wordlength < 10) { for (auto i = 0; i < listlen; i++) { auto const kmer = list[i]; uint64_t const x = kmer >> 6ULL; uint64_t const y = 1ULL << (kmer & 63ULL); if ((unique_handle.bitmap[x] & y) != 0U) { ++count; } } } else { for (auto i = 0; i < listlen; i++) { auto kmer = list[i]; uint64_t j = hash_function((char *) &kmer, (wordlength + 3) / 4) & unique_handle.hash_mask; while ((unique_handle.hash[j].count != 0U) && (unique_handle.hash[j].kmer != kmer)) { j = (j + 1) & unique_handle.hash_mask; } if (unique_handle.hash[j].count != 0U) { ++count; } } } 
return count; } vsearch-2.30.1/src/unique.h000066400000000000000000000060501506776541700155210ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct bucket_s; struct uhandle_s; auto unique_init() -> struct uhandle_s *; auto unique_exit(struct uhandle_s * unique_handle) -> void; auto unique_count(struct uhandle_s * unique_handle, int wordlength, int seqlen, char const * seq, unsigned int * listlen, unsigned int const * * list, int seqmask) -> void; auto unique_count_shared(struct uhandle_s const & unique_handle, int wordlength, int listlen, unsigned int const * list) -> unsigned int; vsearch-2.30.1/src/userfields.cc000066400000000000000000000120051506776541700165130ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include // std::count #include // uint64_t #include // std::strcmp, std::strchr, std::strlen // refactoring: C++11 std::array does not allow conversion from litteral string to char * static const char * userfields_names[] = { "query", // 0 "target", // 1 "evalue", // 2 "id", // 3 "pctpv", "pctgaps", "pairs", "gaps", "qlo", "qhi", "tlo", "thi", "pv", "ql", "tl", "qs", "ts", "alnlen", "opens", "exts", "raw", "bits", "aln", "caln", "qstrand", "tstrand", "qrow", "trow", "qframe", "tframe", "mism", "ids", "qcov", "tcov", // 33 "id0", "id1", "id2", "id3", "id4", // 38 "qilo", // 39 "qihi", "tilo", "tihi", // 42 nullptr }; int * userfields_requested = nullptr; int userfields_requested_count = 0; auto parse_userfields_arg(char const * arg) -> int { // Parses the userfields option argument, e.g. query+target+id+alnlen+mism // and returns 1 if it is ok or 0 if not. static constexpr auto separator = '+'; char const * ptr = arg; char const * end_of_string = ptr + std::strlen(ptr); // pointer to end of string userfields_requested_count = std::count(ptr, end_of_string, separator) + 1; userfields_requested = static_cast(xmalloc(sizeof(int) * (uint64_t) userfields_requested_count)); ptr = arg; // reset to the start of the string char const * next_separator = nullptr; auto nth_field = 0; while (true) { next_separator = std::strchr(ptr, separator); if (next_separator == nullptr) { next_separator = end_of_string; } auto const field_length = static_cast(next_separator - ptr); char ** valid_userfield = (char **) userfields_names; while (*valid_userfield != nullptr) { if ((std::strncmp(ptr, *valid_userfield, field_length) == 0) and (std::strlen(*valid_userfield) == field_length)) { break; } ++valid_userfield; } if (*valid_userfield == nullptr) { // reached end of list -> unrecognized field return 0; // bad argument } auto const nth_valid_userfield = static_cast((((const char **) valid_userfield) - userfields_names)); userfields_requested[nth_field] = 
nth_valid_userfield; ++nth_field; ptr = next_separator; if (ptr == end_of_string) { // reached end of argument return 1; } ++ptr; } } vsearch-2.30.1/src/userfields.h000066400000000000000000000050541506776541700163630ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ extern int * userfields_requested; extern int userfields_requested_count; auto parse_userfields_arg(char const * arg) -> int; vsearch-2.30.1/src/util.cc000066400000000000000000000304341506776541700153310ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include "city.h" #include "md5.h" #include "utils/fatal.hpp" #include "utils/maps.hpp" #include #include // macros PRIu64 and PRId64 #include // va_list #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::fclose, std::size_t, std::vsnprintf, std::fopen #include // std::exit, EXIT_FAILURE, RAND_MAX #include // std::strlen, std::strcmp, std::strcpy, std::strchr #include // timeval, gettimeofday #include // std::next #include #include constexpr auto one_hundred_percent = 100UL; constexpr auto nighty_nine_percent = 99UL; static const char * progress_prompt; static uint64_t progress_next; static uint64_t progress_size; static uint64_t progress_pct; static bool progress_show; // refactoring: make a progress object, with an .update() method and automatic closure? auto progress_init(char const * prompt, uint64_t const size) -> void { progress_show = (isatty(fileno(stderr)) != 0) and (not opt_quiet) and (not opt_no_progress); progress_prompt = prompt; progress_size = size; progress_pct = 0; progress_next = ((progress_pct + 1) * progress_size + nighty_nine_percent) / one_hundred_percent; if (opt_quiet) { return; } std::fprintf(stderr, "%s", prompt); if (not progress_show) { return; } std::fprintf(stderr, " %d%%", 0); } auto progress_update(uint64_t const progress) -> void { if ((progress < progress_next) or not progress_show) { return; } if (progress_size == 0) { std::fprintf(stderr, " \r%s 0%%", progress_prompt); return; } progress_pct = one_hundred_percent * progress / progress_size; std::fprintf(stderr, " \r%s %" PRIu64 "%%", progress_prompt, progress_pct); progress_next = ((progress_pct + 1) * progress_size + nighty_nine_percent) / one_hundred_percent; } auto progress_done() -> void { if (opt_quiet) { return; } if (progress_show) { std::fprintf(stderr, " \r%s", progress_prompt); } std::fprintf(stderr, " %lu%%\n", one_hundred_percent); } auto xstrdup(char const * src) -> char * { auto const len = std::strlen(src); auto * dest = 
static_cast(xmalloc(len + 1)); return std::strcpy(dest, src); } auto xsprintf(char * * ret, char const * format, ...) -> int { // refactoring: build string with std::string? // refactoring: C variadic function, replace with template variadic function? // Only used with one or two extra arguments, it could be a simple overload std::va_list args; va_start(args, format); auto len = std::vsnprintf(nullptr, 0, format, args); va_end(args); if (len < 0) { fatal("Error with vsnprintf in xsprintf"); } auto * buffer = static_cast(xmalloc(len + 1)); if (buffer == nullptr) { fatal("Out of memory"); } va_start(args, format); len = std::vsnprintf(buffer, len + 1, format, args); va_end(args); *ret = buffer; return len; } auto hash_cityhash64(char const * sequence, uint64_t const length) -> uint64_t { return CityHash64(sequence, length); } auto hash_cityhash128(char const * sequence, uint64_t const length) -> uint128 { return CityHash128(sequence, length); } auto show_rusage() -> void { #ifdef SHOW_RUSAGE static constexpr auto a_megabyte = 1024.0 * 1024.0; double user_time = 0.0; double system_time = 0.0; arch_get_user_system_time(&user_time, &system_time); double const megabytes = arch_get_memused() / a_megabyte; std::fprintf(stderr, "Time: %.3fs (user) %.3fs (sys) Memory: %.0lfMB\n", user_time, system_time, megabytes); if (opt_log) std::fprintf(fp_log, "Time: %.3fs (user) %.3fs (sys) Memory: %.0lfMB\n", user_time, system_time, megabytes); #endif } // refactoring: create reverse_complement.hpp, progressive migration // write overloads for span? // assert(destination.size() > source.size()); // std::reverse_copy(source.begin(), source.end(), destination.begin()); // auto complement = [](char nucleotide) -> char { ... }; // std::transform(destination.begin(), destination.end(), destination.begin(), complement) // destination[length] = '\0'; auto reverse_complement(char * rc_seq, char const * seq, int64_t const len) -> void { /* Write the reverse complementary sequence to rc_seq. 
The memory for rc_seq must be long enough for the rc_seq of the sequence (identical to the length of seq + 1). */ for (auto i = 0LL; i < len; ++i) { auto const nucleotide = *std::next(seq, len - 1 - i); auto const complement_char = map_complement(nucleotide); *std::next(rc_seq, i) = complement_char; } *std::next(rc_seq, len) = '\0'; } auto random_init() -> void { arch_srandom(); } auto random_int(int64_t const upper_limit) -> int64_t { /* Generate a random integer in the range 0 to n-1, inclusive. n must be > 0 The random() function returns a random number in the range 0 to 2147483647 (=2^31-1=RAND_MAX), inclusive. We should avoid some of the upper generated numbers to avoid modulo bias. */ assert(upper_limit != 0); int64_t const random_max = RAND_MAX; int64_t const limit = random_max - ((random_max + 1) % upper_limit); auto random_value = static_cast(arch_random()); while (random_value > limit) { random_value = static_cast(arch_random()); } return random_value % upper_limit; } auto random_ulong(uint64_t const upper_limit) -> uint64_t { /* Generate a random integer in the range 0 to n-1, inclusive, n must be > 0 */ assert(upper_limit != 0U); static constexpr auto shift_16_bits = 16U; static constexpr auto shift_32_bits = 32U; static constexpr auto shift_48_bits = 48U; auto const random_max = std::numeric_limits::max(); auto const limit = random_max - ((random_max - upper_limit + 1) % upper_limit); auto random_value = ((arch_random() << shift_48_bits) ^ (arch_random() << shift_32_bits) ^ (arch_random() << shift_16_bits) ^ (arch_random())); while (random_value > limit) { random_value = ((arch_random() << shift_48_bits) ^ (arch_random() << shift_32_bits) ^ (arch_random() << shift_16_bits) ^ (arch_random())); } return random_value % upper_limit; } auto string_normalize(char * normalized, char const * raw_seq, unsigned int const len) -> void { /* convert string to upper case and replace U by T */ for (auto i = 0U; i < len; ++i) { auto const unsigned_char = 
static_cast(*raw_seq); auto const normalized_char = chrmap_normalize_vector[unsigned_char]; *normalized = static_cast(normalized_char); std::advance(normalized, 1); std::advance(raw_seq, 1); } *normalized = '\0'; } auto SHA1(unsigned char const * data, unsigned long const len, unsigned char * digest) -> void { if (digest == nullptr) { fatal("Error in computing SHA1 digest"); } SHA1_CTX a_context; SHA1_Init(&a_context); SHA1_Update(&a_context, data, len); SHA1_Final(&a_context, digest); } auto MD5(void * data, unsigned long const len, unsigned char * digest) -> void { if (digest == nullptr) { fatal("Error in computing MD5 digest"); } MD5_CTX a_context; MD5_Init(&a_context); MD5_Update(&a_context, data, len); MD5_Final(digest, &a_context); } constexpr auto drop_lower_nibble = 4U; constexpr auto mask_upper_nibble = 15U; const std::vector hexdigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; auto get_hex_seq_digest_sha1(char * hex, char const * seq, int const seqlen) -> void { /* Save hexadecimal representation of the SHA1 hash of the sequence. The string array digest must be large enough (len_hex_dig_sha1). First normalize string by uppercasing it and replacing U's with T's. */ std::vector normalized(seqlen + 1); string_normalize(normalized.data(), seq, seqlen); std::vector digest(sha1_digest_length); SHA1((const unsigned char *) normalized.data(), static_cast(seqlen), digest.data()); for (auto const & element: digest) { *hex = hexdigits[element >> drop_lower_nibble]; std::advance(hex, 1); *hex = hexdigits[element & mask_upper_nibble]; std::advance(hex, 1); } *hex = '\0'; } auto get_hex_seq_digest_md5(char * hex, char const * seq, int const seqlen) -> void { /* Save hexadecimal representation of the MD5 hash of the sequence. The string array digest must be large enough (len_hex_dig_md5). First normalize string by uppercasing it and replacing U's with T's. 
*/ std::vector normalized(seqlen + 1); string_normalize(normalized.data(), seq, seqlen); std::vector digest(md5_digest_length); MD5(normalized.data(), static_cast(seqlen), digest.data()); for (auto const & element: digest) { *hex = hexdigits[element >> drop_lower_nibble]; std::advance(hex, 1); *hex = hexdigits[element & mask_upper_nibble]; std::advance(hex, 1); } *hex = '\0'; } auto fprint_seq_digest_sha1(std::FILE * output_handle, char const * seq, int const seqlen) -> void { std::vector hex_digest(len_hex_dig_sha1); get_hex_seq_digest_sha1(hex_digest.data(), seq, seqlen); std::fprintf(output_handle, "%s", hex_digest.data()); } auto fprint_seq_digest_md5(std::FILE * output_handle, char const * seq, int const seqlen) -> void { std::vector hex_digest(len_hex_dig_md5); get_hex_seq_digest_md5(hex_digest.data(), seq, seqlen); std::fprintf(output_handle, "%s", hex_digest.data()); } auto fopen_output(char const * filename) -> std::FILE * { /* open the output stream given by filename, but use stdout if name is - */ if (std::strcmp(filename, "-") == 0) { auto const file_descriptor = dup(STDOUT_FILENO); if (file_descriptor < 0) { return nullptr; } return fdopen(file_descriptor, "w"); } return std::fopen(filename, "w"); } vsearch-2.30.1/src/util.h000066400000000000000000000074651506776541700152030ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include // uint64_t #include // std::FILE, std::size_t constexpr auto md5_digest_length = 16; constexpr auto sha1_digest_length = 20; constexpr auto len_hex_dig_md5 = (2 * md5_digest_length) + 1; constexpr auto len_hex_dig_sha1 = (2 * sha1_digest_length) + 1; auto xstrdup(char const * src) -> char *; auto xsprintf(char * * ret, char const * format, ...) -> int; auto hash_cityhash64(char const * sequence, uint64_t length) -> uint64_t; auto hash_cityhash128(char const * sequence, uint64_t length) -> uint128; auto show_rusage() -> void; auto progress_init(char const * prompt, uint64_t size) -> void; auto progress_update(uint64_t progress) -> void; auto progress_done() -> void; auto random_init() -> void; auto random_int(int64_t upper_limit) -> int64_t; auto random_ulong(uint64_t upper_limit) -> uint64_t; auto string_normalize(char * normalized, char const * raw_seq, unsigned int len) -> void; auto reverse_complement(char * rc_seq, char const * seq, int64_t len) -> void; auto get_hex_seq_digest_sha1(char * hex, char const * seq, int seqlen) -> void; auto get_hex_seq_digest_md5(char * hex, char const * seq, int seqlen) -> void; auto fprint_seq_digest_sha1(std::FILE * output_handle, char const * seq, int seqlen) -> void; auto fprint_seq_digest_md5(std::FILE * output_handle, char const * seq, int seqlen) -> void; auto fopen_output(char const * filename) -> std::FILE *; vsearch-2.30.1/src/utils/000077500000000000000000000000001506776541700152015ustar00rootroot00000000000000vsearch-2.30.1/src/utils/check_output_filehandle.cpp000066400000000000000000000065041506776541700225620ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "fatal.hpp" auto check_mandatory_fastq_output_handle(char const * filename, bool const filehandle_is_empty) -> void { if (filename == nullptr) { fatal("output file must be specified with --fastqout"); } if (filehandle_is_empty) { fatal("unable to open output file %s", filename); } } auto check_mandatory_output_handle(char const * filename, bool const filehandle_is_empty) -> void { if (filename == nullptr) { fatal("output file must be specified with --output"); } if (filehandle_is_empty) { fatal("unable to open output file %s", filename); } } auto check_optional_output_handle(char const * filename, bool const filehandle_is_empty) -> void { if ((filename != nullptr) and filehandle_is_empty) { fatal("unable to open output file %s", filename); } } vsearch-2.30.1/src/utils/check_output_filehandle.hpp000066400000000000000000000054721506776541700225720ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ auto check_mandatory_fastq_output_handle(char const * filename, bool filehandle_is_empty) -> void; auto check_mandatory_output_handle(char const * filename, bool filehandle_is_empty) -> void; auto check_optional_output_handle(char const * filename, bool filehandle_is_empty) -> void; vsearch-2.30.1/src/utils/cigar.cpp000066400000000000000000000200351506776541700167720ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cigar_operations.hpp" #include "fatal.hpp" #include "span.hpp" #include // std::max #include #include // std::size_t #include // std::strtoll #include // std::next #include #include // std::pair #ifndef NDEBUG #include #endif // refactoring: as a Cigar class (pivate data: Span, or // View const), constructor from a char const *, or from a // View. That would eliminate the // bugprone-easily-swappable-parameters warning. 
Merge with // cigar_operations.hpp // CIGAR string example: 3M2I3MD // document the format here, and in vsearch-cigar(5) // also 'X' (mismatch) and 'N' // anonymous namespace: limit visibility and usage to this translation unit namespace { auto convert_to_operation(char const operation) -> Operation { assert(operation == 'M' or operation == 'I' or operation == 'D'); if (operation == 'I') { return Operation::insertion; } if (operation == 'D') { return Operation::deletion; } return Operation::match; } auto convert_from_operation(Operation const operation) -> char { switch (operation) { case Operation::match: return 'M'; break; case Operation::deletion: return 'D'; break; case Operation::insertion: return 'I'; break; } // C++23 refactoring: default: std::unreachable(); __builtin_unreachable(); } } // end of anonymous namespace // duplicate: msa.cc auto find_runlength_of_leftmost_operation(char const * first_character, char ** first_non_digit) -> long long { // std::strtoll: // - start from the 'first_character' pointed to, // - consume as many characters as possible to form a valid integer, // - advance pointer to the first non-digit character, // - return the valid integer // - if there is no valid integer: pointer is not advanced and strtoll() returns zero static constexpr auto decimal_base = 10; auto const runlength = std::strtoll(first_character, first_non_digit, decimal_base); assert(runlength <= std::numeric_limits::max()); // in cigar strings, runlength of 1 are implicit (no digit) return std::max(runlength, 1LL); // is in [1, INT_MAX] } // refactoring C++23: std::pair generator auto parse_cigar_string(Span const cigar_string) -> std::vector> { std::vector> parsed_cigar; auto * position = cigar_string.begin(); auto * cigar_end = cigar_string.end(); while (position < cigar_end) { // Consume digits (if any), return the position of the // first char (M, D, or I), store it, move cursor to the next byte. 
auto ** next_operation = &position; auto const run = find_runlength_of_leftmost_operation(position, next_operation); // do not dereference if outside of cigar_string! (= missing operation!) if (*next_operation >= cigar_end) { // fail if ill-formed (ex: '12M1'), we could also silently skip over fatal("ill-formed CIGAR string"); } // operations: match (M), insertion (I), or deletion (D) auto const operation = **next_operation; position = std::next(*next_operation); parsed_cigar.emplace_back(convert_to_operation(operation), run); } return parsed_cigar; } auto print_uncompressed_cigar(std::FILE * output_handle, Span const cigar_string) -> void { auto const cigar_pairs = parse_cigar_string(cigar_string); for (auto const & a_pair: cigar_pairs) { auto const operation = convert_from_operation(a_pair.first); auto const runlength = a_pair.second; // refactoring? std::fprintf("%s", std::string(runlength, operation).c_str()); for (auto i = 0LL; i < runlength; ++i) { static_cast(std::fprintf(output_handle, "%c", operation)); } } } // compiler explorer tests: // auto main() -> int { // // basic example // std::vector ex1 = {'2', 'I'}; // auto res1 = parse_cigar_string(ex1); // for (auto const & pair: res1) { // std::cout << pair.first << '\t' << pair.second << '\n'; // } // std::cout << '\n'; // // empty cigar // std::vector ex2 = {}; // auto res2 = parse_cigar_string(ex2); // if (res2.empty()) { // std::cout << "empty vector\n"; // } // std::cout << '\n'; // // ommitted run // std::vector ex3 = {'M'}; // auto res3 = parse_cigar_string(ex3); // for (auto const & pair: res3) { // std::cout << pair.first << '\t' << pair.second << '\n'; // } // std::cout << '\n'; // // long run // std::vector ex4 = {'1', '2', '3', '4', 'M'}; // auto res4 = parse_cigar_string(ex4); // for (auto const & pair: res4) { // std::cout << pair.first << '\t' << pair.second << '\n'; // } // std::cout << '\n'; // // null run // std::vector ex5 = {'0', 'M'}; // auto res5 = parse_cigar_string(ex5); // for (auto 
const & pair: res5) { // std::cout << pair.first << '\t' << pair.second << '\n'; // } // std::cout << '\n'; // // chained operations // std::vector ex6 = {'M', 'I', 'D'}; // auto res6 = parse_cigar_string(ex6); // for (auto const & pair: res6) { // std::cout << pair.first << '\t' << pair.second << '\n'; // } // std::cout << '\n'; // // no operation? // std::vector ex7 = {'1'}; // auto res7 = parse_cigar_string(ex7); // if (res7.empty()) { // std::cout << "empty vector\n"; // } // std::cout << '\n'; // } vsearch-2.30.1/src/utils/cigar.hpp000066400000000000000000000055561506776541700170120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cigar_operations.hpp" #include "span.hpp" #include #include // std::pair auto find_runlength_of_leftmost_operation(char const * first_character, char ** first_non_digit) -> long long; auto parse_cigar_string(Span cigar_string) -> std::vector>; auto print_uncompressed_cigar(std::FILE * output_handle, Span cigar_string) -> void; vsearch-2.30.1/src/utils/cigar_operations.hpp000066400000000000000000000051631506776541700212470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // CIGAR operations: match (M), insertion (I), or deletion (D) // unused in vsearch: mismatch (X) and gap (N) enum struct Operation : char { match = 'M', insertion = 'I', deletion = 'D', }; vsearch-2.30.1/src/utils/compare_strings_nocase.cpp000066400000000000000000000105171506776541700224400ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "span.hpp" #include // std::search, std::equal #include #include // std::toupper #include // EOF #include // std::strlen #include // anonymous namespace: limit visibility and usage to this translation unit namespace { auto compare_chars = [](char const lhs, char const rhs) -> bool { assert((lhs >= 0) or (lhs == EOF)); auto const lhs_unsigned = static_cast(lhs); auto const rhs_unsigned = static_cast(rhs); return std::toupper(lhs_unsigned) == std::toupper(rhs_unsigned); }; } // end of anonymous namespace auto contains_substring(Span const haystack, Span const needle) -> bool { // case insensitive auto * const hit = std::search(haystack.begin(), haystack.end(), needle.begin(), needle.end(), compare_chars); return (hit != haystack.end()); } auto are_same_string(Span const haystack, std::vector const & needle) -> bool { // case insensitive if (haystack.size() != needle.size()) { return false; } return std::equal(haystack.begin(), haystack.end(), needle.begin(), compare_chars); } auto are_same_string(Span const haystack, Span const needle) -> bool { // case insensitive if (haystack.size() != needle.size()) { return false; } return std::equal(haystack.begin(), haystack.end(), needle.begin(), compare_chars); } auto 
are_same_string(char const * haystack_str, char const * needle_str) -> bool { // case insensitive auto const haystack = Span{haystack_str, std::strlen(haystack_str)}; auto const needle = Span{needle_str, std::strlen(needle_str)}; if (haystack.size() != needle.size()) { return false; } return std::equal(haystack.begin(), haystack.end(), needle.begin(), compare_chars); } vsearch-2.30.1/src/utils/compare_strings_nocase.hpp000066400000000000000000000054161506776541700224470ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "span.hpp" #include auto contains_substring(Span haystack, Span needle) -> bool; auto are_same_string(Span haystack, std::vector const & needle) -> bool; auto are_same_string(Span haystack, Span needle) -> bool; auto are_same_string(char const * haystack_str, char const * needle_str) -> bool; vsearch-2.30.1/src/utils/fatal.cpp000066400000000000000000000073361506776541700170050ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "vsearch.h" #include // uint64_t #include // std::fprintf #include // std::exit, EXIT_FAILURE __attribute__((noreturn)) auto fatal(char const * message) -> void { std::fprintf(stderr, "\n\n"); std::fprintf(stderr, "Fatal error: %s\n", message); if (fp_log != nullptr) { std::fprintf(fp_log, "\n\n"); std::fprintf(fp_log, "Fatal error: %s\n", message); } std::exit(EXIT_FAILURE); } __attribute__((noreturn)) auto fatal(char const * format, char const * message) -> void { std::fprintf(stderr, "\n\nFatal error: "); std::fprintf(stderr, format, message); std::fprintf(stderr, "\n"); if (fp_log != nullptr) { std::fprintf(fp_log, "\n\nFatal error: "); std::fprintf(fp_log, format, message); std::fprintf(fp_log, "\n"); } std::exit(EXIT_FAILURE); } // used in fastx.cc __attribute__((noreturn)) auto fatal(char const * format, char const symbol, uint64_t const line_number) -> void { std::fprintf(stderr, "\n\nFatal error: "); std::fprintf(stderr, format, symbol, line_number); std::fprintf(stderr, "\n"); if (fp_log != nullptr) { std::fprintf(fp_log, "\n\nFatal error: "); std::fprintf(fp_log, format, symbol, line_number); std::fprintf(fp_log, "\n"); } std::exit(EXIT_FAILURE); } vsearch-2.30.1/src/utils/fatal.hpp000066400000000000000000000054431506776541700170070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // uint64_t // parameters must be marked as const! 
// ISO C++ forbids converting a string constant to 'char *' // error: invalid conversion from 'const char *' to 'char *' auto fatal(char const * message) -> void; auto fatal(char const * format, char const * message) -> void; auto fatal(char const * format, char symbol, uint64_t line_number) -> void; vsearch-2.30.1/src/utils/kmer_hash_struct.hpp000066400000000000000000000056161506776541700212670ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include struct kh_bucket_s { unsigned int kmer = 0; unsigned int pos = 0; /* 1-based position, 0 = empty */ }; struct kh_handle_s { static constexpr auto kmer_hash_allocation = 256; static constexpr auto kmer_hash_mask = kmer_hash_allocation - 1U; std::vector hash = std::vector(kmer_hash_allocation); unsigned int hash_mask = kmer_hash_mask; int size = 0; int alloc = kmer_hash_allocation; int maxpos = 0; }; vsearch-2.30.1/src/utils/maps.cpp000066400000000000000000000403751506776541700166560ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "maps.hpp" #include #include // anonymous namespace: limit visibility and usage to this translation unit namespace { auto to_uchar(char const nucleotide) -> unsigned char { assert(nucleotide >= 0); return static_cast(nucleotide); } const std::vector chrmap_4bit = { /* Map from ascii to 4-bit nucleotide code Aa: 1 0001 Bb: 14 1110 (not A) ex: 'B' & 'A' == 0000 while 'B' & anyother != 0000 Cc: 2 0010 Dd: 13 1101 (not C) Gg: 4 0100 Hh: 11 1011 (not G) Kk: 12 1100 (G or T) Mm: 3 0011 (A or C) Nn: 15 1111 ex: 'N' & any != 0000 Rr: 5 0101 (A or G) Ss: 6 0110 (S or G) Tt: 8 1000 Uu: 8 1000 Vv: 7 0111 (not T) Ww: 9 1001 (A or T) Yy: 10 1010 (C or T) ex: 'Y' & 'C' or 'T' == 0000 Others: 0 @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0, 0, 0, 5, 6, 8, 8, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0, 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 0, 0, 0, 5, 6, 8, 8, 7, 9, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; const std::vector chrmap_complement = { /* Map from ascii to ascii, complementary nucleotide @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 
'N','T','V','G','H','N','N','C','D','N','N','M','N','K','N','N', 'N','N','Y','S','A','A','B','W','N','R','N','N','N','N','N','N', 'N','t','v','g','h','N','N','c','d','N','N','m','N','k','n','N', 'N','N','y','s','a','a','b','w','N','r','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N' }; const std::vector chrmap_2bit = { /* Map from ascii to 2-bit nucleotide code Aa: 0 Cc: 1 Gg: 2 TtUu: 3 All others: 0 @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; const std::vector chrmap_ambiguous_4bit = { true, false, // Aa false, // Cc true, false, // Gg true, true, true, false, // TtUu true, true, true, true, true, true, true }; const std::vector chrmap_mask_ambig = { /* Should character be masked and not used for 
search ? Mask everything but A, C, G, T and U. Lower case letters are NOT masked. @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; const std::vector chrmap_mask_lower = { /* Should character be masked and not used for search ? Mask everything but A, C, G, T and U. All lower case letters are masked (soft masking). 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; } // end of anonymous namespace const std::vector chrmap_no_change_vector = { /* Map from ascii to ascii - no change @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'N', 'N', 'N', 'N', 'N', 'N', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; const std::vector chrmap_normalize_vector = { /* Map from ascii to ascii Convert to upper case nucleotide, and replace U by T @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','A','B','C','D','N','N','G','H','N','N','K','N','M','N','N', 'N','N','R','S','T','T','V','W','N','Y','N','N','N','N','N','N', 'N','A','B','C','D','N','N','G','H','N','N','K','N','M','N','N', 'N','N','R','S','T','T','V','W','N','Y','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N' }; const std::vector chrmap_upcase_vector = { /* Map from ascii to ascii Convert to upper case nucleotide @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 
'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N', 'N','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 'P','Q','R','S','T','U','V','W','X','Y','Z','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N' }; auto map_uppercase(char const nucleotide) -> char { return static_cast(chrmap_upcase_vector[to_uchar(nucleotide)]); } auto map_2bit(char const nucleotide) -> unsigned int { return chrmap_2bit[to_uchar(nucleotide)]; } auto map_4bit(char const nucleotide) -> unsigned char { return chrmap_4bit[to_uchar(nucleotide)]; } auto map_complement(char const nucleotide) -> char { return static_cast(chrmap_complement[to_uchar(nucleotide)]); } auto map_mask_ambig(char const nucleotide) -> unsigned int { return chrmap_mask_ambig[to_uchar(nucleotide)]; } auto map_mask_lower(char const nucleotide) -> unsigned int { return chrmap_mask_lower[to_uchar(nucleotide)]; } auto is_equivalent_4bit(char const lhs, char const rhs) -> bool { auto const lhs_unsigned = map_4bit(lhs); auto const rhs_unsigned = map_4bit(rhs); return ((lhs_unsigned & rhs_unsigned) != 0); } auto is_equivalent_4bit_rhs(char const lhs, char const rhs) -> bool { auto const lhs_unsigned = to_uchar(lhs); auto const rhs_unsigned = map_4bit(rhs); return ((lhs_unsigned & rhs_unsigned) != 0); } auto is_ambiguous_4bit(unsigned char const nucleotide) -> bool { return chrmap_ambiguous_4bit[nucleotide]; } auto is_same_4bit(char const lhs, char const rhs) -> bool { 
return map_4bit(lhs) == map_4bit(rhs); } vsearch-2.30.1/src/utils/maps.hpp000066400000000000000000000063031506776541700166540ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef MAPS_HPP #define MAPS_HPP #include extern const std::vector chrmap_no_change_vector; extern const std::vector chrmap_normalize_vector; extern const std::vector chrmap_upcase_vector; auto map_uppercase(char nucleotide) -> char; auto map_2bit(char nucleotide) -> unsigned int; auto map_4bit(char nucleotide) -> unsigned char; auto map_complement(char nucleotide) -> char; auto map_mask_ambig(char nucleotide) -> unsigned int; auto map_mask_lower(char nucleotide) -> unsigned int; auto is_equivalent_4bit(char lhs, char rhs) -> bool; auto is_equivalent_4bit_rhs(char lhs, char rhs) -> bool; auto is_ambiguous_4bit(unsigned char nucleotide) -> bool; auto is_same_4bit(char lhs, char rhs) -> bool; #endif // MAPS_HPP vsearch-2.30.1/src/utils/open_file.cpp000066400000000000000000000123251506776541700176500ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. 
GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "fatal.hpp" #include "open_file.hpp" #include // dup, STDIN_FILENO, STDOUT_FILENO #include #include // errno #include // std::fopen, fdopen #include // std::strcmp // anonymous namespace: limit visibility and usage to this translation unit namespace { // C++17 refactoring: // constexpr std::string_view a_dash = "-"; // simpler string comparisons: if (filename == a_dash) { // type-safe string mode wrapper struct ModeString { explicit constexpr ModeString(char const * str) noexcept : mode(str) { } char const * mode; }; // Safely wrapping fopen() auto open_file(char const * filename, ModeString const & mode) -> FileHandle { assert(filename != nullptr); assert(mode.mode != nullptr); return FileHandle{std::fopen(filename, mode.mode)}; } auto open_file_descriptor(int const file_descriptor, ModeString const & mode) -> FileHandle { assert(file_descriptor >= 0); assert(mode.mode != nullptr); return FileHandle{fdopen(file_descriptor, mode.mode)}; } auto check_file_descriptor(int const file_descriptor) -> void { assert(file_descriptor >= -1); if (file_descriptor != -1) { return; } if (errno == EBADF) { fatal("original fd is not an open file descriptor."); } if (errno == EMFILE) { fatal("too many open file descriptors."); } fatal("cannot duplicate input or output stream."); } } // end of anonymous namespace // read_file, file to read, open_input_file, open_istream auto open_input_file(char const * filename) -> FileHandle { if (filename == nullptr) { std::FILE * empty = nullptr; return FileHandle{empty}; } auto const mode = ModeString{"rb"}; // r: reading, b: non-UNIX environments /* open the input stream given by filename, but if name is '-' then use a duplicate of stdin (fd = STDIN_FILENO = 0) */ if (std::strcmp(filename, "-") == 0) { auto const file_descriptor = dup(STDIN_FILENO); check_file_descriptor(file_descriptor); return open_file_descriptor(file_descriptor, mode); } return open_file(filename, mode); } // write_file, file to write, open_output_file, 
open_ostream auto open_output_file(char const * filename) -> FileHandle { if (filename == nullptr) { std::FILE * empty = nullptr; return FileHandle{empty}; } auto const mode = ModeString{"w"}; // w: writing, no b? /* open the output stream given by filename, but if name is '-' then use a duplicate of stdout (fd = STDOUT_FILENO = 1) */ if (std::strcmp(filename, "-") == 0) { auto const file_descriptor = dup(STDOUT_FILENO); check_file_descriptor(file_descriptor); return open_file_descriptor(file_descriptor, mode); } return open_file(filename, mode); } vsearch-2.30.1/src/utils/open_file.hpp000066400000000000000000000063641506776541700176630ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // std::FILE, std:fclose, std::fopen #include // std::unique_ptr // Make sure std::FILE * cannot leak // for reference, see: // https://stackoverflow.com/questions/26360916/using-custom-deleter-with-unique-ptr // note: taking a reference to a standard library function (such as // &std::fclose) is unspecified behavior (or ill-formed): // // using FileHandle = std::unique_ptr; UB! 
// prefer a deleter struct with an operator() to call std::fclose struct CloseFileHandle { auto operator()(std::FILE * file_handle) -> void { static_cast(std::fclose(file_handle)); } }; using FileHandle = std::unique_ptr; auto open_input_file(char const * filename) -> FileHandle; auto open_output_file(char const * filename) -> FileHandle; vsearch-2.30.1/src/utils/os_byteswap.cpp000066400000000000000000000077631506776541700202610ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Operating System specific commands to swap bytes // C++23 refactoring: replace with std::byteswap() #if defined(_MSC_VER) || defined(_WIN32) #include // uint16_t, uint32_t, uint64_t #include auto bswap_16(uint16_t bsx) noexcept -> uint16_t { return _byteswap_ushort(bsx); } auto bswap_32(uint32_t bsx) noexcept -> uint32_t { return _byteswap_ulong(bsx); } auto bswap_64(uint64_t bsx) noexcept -> uint64_t { return _byteswap_uint64(bsx); } #elif defined(__APPLE__) // Mac OS X / Darwin features #include // uint16_t, uint32_t, uint64_t #include auto bswap_16(uint16_t bsx) noexcept -> uint16_t { return OSSwapInt16(bsx); } auto bswap_32(uint32_t bsx) noexcept -> uint32_t { return OSSwapInt32(bsx); } auto bswap_64(uint64_t bsx) noexcept -> uint64_t { return OSSwapInt64(bsx); } #elif defined(__FreeBSD__) #include // uint16_t, uint32_t, uint64_t #include auto bswap_16(uint16_t bsx) noexcept -> uint16_t { return bswap16(bsx); } auto bswap_32(uint32_t bsx) noexcept -> uint32_t { return bswap32(bsx); } auto bswap_64(uint64_t bsx) 
noexcept -> uint64_t { return bswap64(bsx); } #elif defined(__NetBSD__) #include // uint16_t, uint32_t, uint64_t #include #include auto bswap_16(uint16_t bsx) noexcept -> uint16_t { return bswap16(bsx); } auto bswap_32(uint32_t bsx) noexcept -> uint32_t { return bswap32(bsx); } auto bswap_64(uint64_t bsx) noexcept -> uint64_t { return bswap64(bsx); } #else // other operating systems? #endif vsearch-2.30.1/src/utils/os_byteswap.hpp000066400000000000000000000063031506776541700202530ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Operating System specific commands to swap bytes // C++23 refactoring: replace with std::byteswap() #ifndef OS_BYTESWAP_HPP #define OS_BYTESWAP_HPP #if defined(_MSC_VER) || defined(_WIN32) #include // uint16_t, uint32_t, uint64_t auto bswap_16(uint16_t bsx) noexcept -> uint16_t; auto bswap_32(uint32_t bsx) noexcept -> uint32_t; auto bswap_64(uint64_t bsx) noexcept -> uint64_t; #elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) #include // uint16_t, uint32_t, uint64_t auto bswap_16(uint16_t bsx) noexcept -> uint16_t; auto bswap_32(uint32_t bsx) noexcept -> uint32_t; auto bswap_64(uint64_t bsx) noexcept -> uint64_t; #else // __linux__ and other operating systems #include #endif #endif // OS_BYTESWAP_HPP vsearch-2.30.1/src/utils/progress.hpp000066400000000000000000000126451506776541700175660ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include // macros PRIu64 and PRId64 #include // int64_t, uint64_t #include // std::fprintf class Progress { public: explicit Progress(char const * prompt, std::uint64_t const max_size, struct Parameters const & parameters) : prompt_{prompt}, max_size_{max_size}, stderr_is_tty_(parameters.opt_stderr_is_tty), is_quiet_(parameters.opt_quiet), no_progress_(parameters.opt_no_progress) { assert(prompt != nullptr); is_visible_ = check_if_visible(); if (is_quiet_) { return; } static_cast(std::fprintf(stderr, "%s", prompt)); if (not is_visible_) { return; } static_cast(std::fprintf(stderr, " %d%%", 0)); if (max_size_ == 0) { static_cast(std::fprintf(stderr, " \r%s 0%%", prompt_)); return; } current_percentage_ = calculate_percentage(); next_threshold_ = calculate_next_threshold(); } Progress(Progress const &) = delete; // copy constructor: no copies auto operator=(Progress const &) -> Progress & = delete; // assignment operator: no self-assignment Progress(Progress&&) = delete; // no move constructor auto operator=(Progress&&) -> Progress & = delete; // no move assignment operator auto update(std::uint64_t const counter) -> void { counter_ = counter; if ((not is_visible_) or (counter_ < next_threshold_)) { return; } current_percentage_ = calculate_percentage(); static_cast(std::fprintf(stderr, " \r%s %" PRIu64 "%%", prompt_, current_percentage_)); next_threshold_ = calculate_next_threshold(); }; auto update() -> void { ++counter_; update(counter_); } 
~Progress() { done(); } private: static constexpr auto one_hundred_percent = 100UL; // Construction-time parameters char const * prompt_ {}; std::uint64_t max_size_ {}; bool stderr_is_tty_ {}; bool is_quiet_ {}; bool no_progress_ {}; // Internal parameters std::uint64_t counter_ {}; std::uint64_t current_percentage_ {}; // integer, and not a double std::uint64_t next_threshold_ {}; bool is_visible_ {}; // Helper functions auto check_if_visible() const -> bool { return (stderr_is_tty_) and (not is_quiet_) and (not no_progress_); }; auto calculate_percentage() const -> std::uint64_t { if (max_size_ == 0) { return 0; } // when reading from a pipe return counter_ * one_hundred_percent / max_size_; }; auto calculate_next_threshold() const -> std::uint64_t { static constexpr auto nighty_nine_percent = 99UL; return ((current_percentage_ + 1) * max_size_ + nighty_nine_percent) / one_hundred_percent; }; auto done() const -> void { if (is_quiet_) { return; } if (is_visible_) { static_cast(std::fprintf(stderr, " \r%s", prompt_)); } static_cast(std::fprintf(stderr, " %lu%%\n", one_hundred_percent)); } }; vsearch-2.30.1/src/utils/seqcmp.cpp000066400000000000000000000074511506776541700172040ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "maps.hpp" #include "span.hpp" #include #include // uint64_t // refactoring: with std::mismatch()? having to check if == '\0' breaks the flow // Find first position with a difference, if any. Return 0 for // identical sequences, -1 if lhs is sorted first (lower // alpha-sorting), +1 if rhs is sorted first. 
auto seqcmp(char const * lhs, char const * rhs, uint64_t const length) -> int { auto const lhs_seq = Span{lhs, length}; auto const rhs_seq = Span{rhs, length}; for (auto lhs_it = lhs_seq.cbegin(), rhs_it = rhs_seq.cbegin(); lhs_it < lhs_seq.cend() and rhs_it < rhs_seq.cend(); ++lhs_it, ++rhs_it) { if ((*lhs_it == '\0') or (*rhs_it == '\0')) { break; } auto const lhs_nuc = map_4bit(*lhs_it); auto const rhs_nuc = map_4bit(*rhs_it); if (lhs_nuc < rhs_nuc) { return -1; } if (lhs_nuc > rhs_nuc) { return +1; } } return 0; } auto seqcmp(char const * lhs, char const * rhs, unsigned int length) -> int { return seqcmp(lhs, rhs, static_cast(length)); } auto seqcmp(char const * lhs, char const * rhs, int length) -> int { assert(length >= 0); return seqcmp(lhs, rhs, static_cast(length)); } auto seqcmp(char const * lhs, char const * rhs, int64_t length) -> int { assert(length >= 0); return seqcmp(lhs, rhs, static_cast(length)); } vsearch-2.30.1/src/utils/seqcmp.hpp000066400000000000000000000053641506776541700172120ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include // uint64_t auto seqcmp(char const * lhs, char const * rhs, uint64_t length) -> int; auto seqcmp(char const * lhs, char const * rhs, unsigned int length) -> int; auto seqcmp(char const * lhs, char const * rhs, int64_t length) -> int; auto seqcmp(char const * lhs, char const * rhs, int length) -> int; vsearch-2.30.1/src/utils/span.hpp000066400000000000000000000174561506776541700166700ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef SPAN_HPP #define SPAN_HPP #include // std::equal, std::lexicographical_compare, std::min #include #include // std::ptrdiff_t #include // std::size_t #include // std::prev, std::next #ifndef NDEBUG #include // C++17 refactoring: [[maybe_unused]] constexpr auto max_ptrdiff = std::numeric_limits::max(); constexpr auto max_size = std::numeric_limits::max(); #endif // TODO: // - turn into a View (not allowed to modify the underlying data), not possible because ot strtoll() // - add friend function for custom hashing, // - goal is to be able to build std::map and std::set of Span // simple version of std::span (C++20) // // only valid for contiguous sequences of elements (vectors or arrays) // of any type Type (except std::vector?) 
template class Span { public: explicit Span(Type * start, std::size_t const length) : start_ {start}, length_ {length} { assert(start != nullptr); assert(length <= max_ptrdiff); } explicit Span(Type const * start, std::size_t const length) : start_ {const_cast(start)}, length_ {length} { assert(start != nullptr); assert(length <= max_ptrdiff); } // Operators auto operator==(Span const & other) const -> bool { return size() == other.size() and std::equal(cbegin(), cend(), other.cbegin()); } auto operator<(Span const & other) const -> bool { return std::lexicographical_compare(cbegin(), cend(), other.cbegin(), other.cend()); } // Iterators auto begin() const -> Type * { return data(); } auto cbegin() const -> Type const * { return data(); } auto end() const -> Type * { auto const distance = static_cast(size()); return std::next(data(), distance); } auto cend() const -> Type const * { return end(); } auto rbegin() const -> std::reverse_iterator { return std::reverse_iterator(end()); } auto crbegin() const -> std::reverse_iterator { return std::reverse_iterator(cend()); } auto rend() const -> std::reverse_iterator { return std::reverse_iterator(begin()); } auto crend() const -> std::reverse_iterator { return std::reverse_iterator(cbegin()); } // Element access // C++17 refactoring: [[nodiscard]] auto front() const -> Type const & { assert(not empty()); return *data(); } auto back() const -> Type const & { assert(not empty()); return *std::prev(end()); } auto data() const -> Type * { return start_; } auto operator[](std::size_t const index) const -> Type & { assert(index < size()); auto const distance = static_cast(index); return *std::next(data(), distance); } // Observers auto size() const -> std::size_t { return length_; } auto size_bytes() const -> std::size_t { assert(size() <= (max_size / sizeof(Type))); return size() * sizeof(Type); } auto empty() const -> bool { return size() == 0; } // Subviews auto subspan(std::size_t const offset, std::size_t const count) 
const -> Span { assert(offset <= size()); assert(count <= size() - offset); auto const distance = static_cast(offset); auto * new_start = std::next(data(), distance); return Span{new_start, count}; } auto first(std::size_t const count) const -> Span { return subspan(0, count); } auto last(std::size_t const count) const -> Span { assert(count <= size()); return subspan(size() - count, count); } auto drop(std::size_t const count) const -> Span { // drop n first items, return empty if n is >= size() auto const offset = std::min(count, size()); assert(offset <= size()); return subspan(offset, size() - offset); } private: Type * start_ {}; std::size_t length_ {}; }; // tests: // #include // #include // auto main() -> int { // std::vector v = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; // auto s = Span{v.data(), 5}; // assert(s.size() == 5); // assert(s.size_bytes() == 5); // assert(! s.empty()); // assert(s.front() == 'a'); // assert(s.back() == 'e'); // assert(s[1] == 'b'); // for (auto c: s) { // printf("%c\n", c); // } // printf("\n"); // auto report = [](char const &c) -> void { printf("%c\n", c); }; // std::for_each(s.begin(), s.end(), report); // printf("\n"); // // assert(s[5] == 'f'); // auto s1 = Span{v.data(), 0}; // std::for_each(s1.begin(), s1.end(), report); // printf("\n"); // auto s2 = Span{v.data(), 10}; // auto s3 = s2.first(2); // std::for_each(s3.begin(), s3.end(), report); // printf("\n"); // auto s4 = s2.first(10); // std::for_each(s4.begin(), s4.end(), report); // printf("\n"); // auto s5 = s2.first(0); // std::for_each(s5.begin(), s5.end(), report); // printf("\n"); // auto s6 = s2.last(0); // std::for_each(s6.begin(), s6.end(), report); // printf("\n"); // auto s7 = s2.last(2); // std::for_each(s7.begin(), s7.end(), report); // printf("\n"); // std::for_each(s7.rbegin(), s7.rend(), report); // printf("\n"); // std::for_each(s7.crbegin(), s7.crend(), report); 
// printf("\n"); // } #endif // SPAN_HPP vsearch-2.30.1/src/utils/taxonomic_fields.h000066400000000000000000000054071506776541700207070ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include /* shorthands for nine taxonomic levels d domain k kingdom p phylum c class o order f family g genus s species t strain */ constexpr auto tax_levels = 9; constexpr std::array taxonomic_fields = {'d', 'k', 'p', 'c', 'o', 'f', 'g', 's', 't'}; // refactoring C++17: inline constexpr vsearch-2.30.1/src/utils/xpthread.cpp000066400000000000000000000116151506776541700175300ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . 
The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "utils/fatal.hpp" #include auto xpthread_attr_init(pthread_attr_t * attr) -> void { if (pthread_attr_init(attr) != 0) { fatal("Unable to init thread attributes"); } } auto xpthread_attr_destroy(pthread_attr_t * attr) -> void { if (pthread_attr_destroy(attr) != 0) { fatal("Unable to destroy thread attributes"); } } auto xpthread_attr_setdetachstate(pthread_attr_t * attr, int detachstate) -> void { if (pthread_attr_setdetachstate(attr, detachstate) != 0) { fatal("Unable to set thread attributes detach state"); } } auto xpthread_create(pthread_t * thread, pthread_attr_t const * attr, void * (*start_routine)(void *), void * arg) -> void { if (pthread_create(thread, attr, start_routine, arg) != 0) { fatal("Unable to create thread"); } } auto xpthread_join(pthread_t thread, void ** value_ptr) -> void { if (pthread_join(thread, value_ptr) != 0) { fatal("Unable to join thread"); } } auto xpthread_mutex_init(pthread_mutex_t * mutex, pthread_mutexattr_t const * attr) -> void { if (pthread_mutex_init(mutex, attr) != 0) { fatal("Unable to init mutex"); } } auto xpthread_mutex_destroy(pthread_mutex_t * mutex) -> void { if (pthread_mutex_destroy(mutex) != 0) { fatal("Unable to destroy mutex"); } } auto xpthread_mutex_lock(pthread_mutex_t * mutex) -> void { if (pthread_mutex_lock(mutex) != 0) { fatal("Unable to lock mutex"); } } auto xpthread_mutex_unlock(pthread_mutex_t * mutex) -> void { if (pthread_mutex_unlock(mutex) != 0) { fatal("Unable to unlock mutex"); } } auto xpthread_cond_init(pthread_cond_t * cond, pthread_condattr_t const * attr) -> void { if (pthread_cond_init(cond, attr) != 0) { fatal("Unable to init condition variable"); } } auto xpthread_cond_destroy(pthread_cond_t * cond) -> void { if (pthread_cond_destroy(cond) != 0) { fatal("Unable to destroy condition variable"); } } auto xpthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) -> void { if (pthread_cond_wait(cond, mutex) != 0) { fatal("Unable to wait on condition variable"); } } auto 
xpthread_cond_signal(pthread_cond_t * cond) -> void { if (pthread_cond_signal(cond) != 0) { fatal("Unable to signal condition variable"); } } auto xpthread_cond_broadcast(pthread_cond_t * cond) -> void { if (pthread_cond_broadcast(cond) != 0) { fatal("Unable to broadcast condition variable"); } } vsearch-2.30.1/src/utils/xpthread.hpp000066400000000000000000000070411506776541700175330ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include auto xpthread_attr_init(pthread_attr_t * attr) -> void; auto xpthread_attr_destroy(pthread_attr_t * attr) -> void; auto xpthread_attr_setdetachstate(pthread_attr_t * attr, int detachstate) -> void; auto xpthread_create(pthread_t * thread, pthread_attr_t const * attr, void * (*start_routine)(void *), void * arg) -> void; auto xpthread_join(pthread_t thread, void ** value_ptr) -> void; auto xpthread_mutex_init(pthread_mutex_t * mutex, pthread_mutexattr_t const * attr) -> void; auto xpthread_mutex_destroy(pthread_mutex_t * mutex) -> void; auto xpthread_mutex_lock(pthread_mutex_t * mutex) -> void; auto xpthread_mutex_unlock(pthread_mutex_t * mutex) -> void; auto xpthread_cond_init(pthread_cond_t * cond, pthread_condattr_t const * attr) -> void; auto xpthread_cond_destroy(pthread_cond_t * cond) -> void; auto xpthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) -> void; auto xpthread_cond_signal(pthread_cond_t * cond) -> void; auto xpthread_cond_broadcast(pthread_cond_t * cond) -> void; vsearch-2.30.1/src/vsearch.cc000066400000000000000000005723211506776541700160150ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic 
Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vsearch.h" #include "allpairs.h" #include "chimera.h" #include "cluster.h" #include "cut.h" #include "derep.h" #include "derep_prefix.h" #include "derep_smallmem.h" #include "dynlibs.h" #include "eestats.h" #include "fasta2fastq.h" #include "fastq_chars.h" #include "fastq_join.h" #include "fastq_mergepairs.h" #include "fastq_stats.h" #include "fastqops.h" #include "filter.h" #include "getseq.h" #include "mask.h" #include "orient.h" #include "rereplicate.h" #include "search.h" #include "search_exact.h" #include "sff_convert.h" #include "shuffle.h" #include "sintax.h" #include "sortbylength.h" #include "sortbysize.h" #include "subsample.h" #include "udb.h" #include "userfields.h" #include "utils/compare_strings_nocase.hpp" #include "utils/fatal.hpp" #include // std::count, std::any_of #include #include // macros PRIu64 and PRId64 #include // std::floor #include // std::strftime, std::localtime, std::time, std::time_t, std::tm, std::difftime #include // int64_t, uint64_t #include // std::FILE, std::fprintf, std::size_t, std::sscanf, std::fclose, std::snprintf, std::printf, std::strcat #include // std::exit, EXIT_FAILURE #include // std::strlen, std::memset #include // getopt_long_only, optarg, optind, opterr, struct // option (no_argument, required_argument) #include #include #include constexpr int64_t n_threads_max = 1024; constexpr auto max_line_length = std::size_t{80}; /* options */ bool opt_bzip2_decompress = false; bool opt_clusterout_id = 
false; bool opt_clusterout_sort = false; bool opt_eeout; bool opt_fasta_score; bool opt_fastq_allowmergestagger; bool opt_fastq_eeout; bool opt_fastq_nostagger; bool opt_gzip_decompress; bool opt_label_substr_match; bool opt_lengthout; bool opt_n_mismatch; bool opt_no_progress; bool opt_quiet; bool opt_relabel_keep; bool opt_relabel_md5; bool opt_relabel_self; bool opt_relabel_sha1; bool opt_samheader; bool opt_sintax_random; bool opt_sizein; bool opt_sizeorder; bool opt_sizeout; bool opt_xee; bool opt_xlength; bool opt_xsize; char * opt_alnout; char * opt_biomout; char * opt_blast6out; char * opt_borderline; char * opt_centroids; char * opt_chimeras; char * opt_chimeras_alnout; char * opt_chimeras_denovo; char * opt_cluster_fast; char * opt_cluster_size; char * opt_cluster_smallmem; char * opt_cluster_unoise; char * opt_clusters; char * opt_consout; char * opt_db; char * opt_dbmatched; char * opt_dbnotmatched; char * opt_eetabbedout; char * opt_fastaout; char * opt_fastaout_discarded; char * opt_fastaout_discarded_rev; char * opt_fastaout_notmerged_fwd; char * opt_fastaout_notmerged_rev; char * opt_fastaout_rev; char * opt_fastapairs; char * opt_fastq_convert; char * opt_fastqout; char * opt_fastqout_discarded; char * opt_fastqout_discarded_rev; char * opt_fastqout_notmerged_fwd; char * opt_fastqout_notmerged_rev; char * opt_fastqout_rev; char * opt_label; char * opt_label_field; char * opt_label_suffix; char * opt_label_word; char * opt_label_words; char * opt_labels; char * opt_lcaout; char * opt_log; char * opt_matched; char * opt_mothur_shared_out; char * opt_msaout; char * opt_nonchimeras; char * opt_notmatched; char * opt_notmatchedfq; char * opt_otutabout; char * opt_output; char * opt_pattern; char * opt_profile; char * opt_qsegout; char * opt_relabel; char * opt_reverse; char * opt_samout; char * opt_sample; char * opt_tabbedout; char * opt_tsegout; char * opt_uc; char * opt_uchime2_denovo; char * opt_uchime3_denovo; char * opt_uchime_denovo; char * 
opt_uchime_ref; char * opt_uchimealns; char * opt_uchimeout; char * opt_userout; double * opt_ee_cutoffs_values; double opt_abskew; double opt_chimeras_diff_pct; double opt_dn; double opt_fastq_maxdiffpct; double opt_fastq_maxee; double opt_fastq_maxee_rate; double opt_fastq_truncee; double opt_fastq_truncee_rate; double opt_id; double opt_lca_cutoff; double opt_maxid; double opt_maxqt; double opt_maxsizeratio; double opt_maxsl; double opt_mid; double opt_mindiv; double opt_minh; double opt_minqt; double opt_minsizeratio; double opt_minsl; double opt_query_cov; double opt_sample_pct; double opt_sintax_cutoff; double opt_target_cov; double opt_unoise_alpha; double opt_weak_id; double opt_xn; int opt_acceptall; int opt_alignwidth; int opt_chimeras_length_min; int opt_chimeras_parents_max; int opt_chimeras_parts; int opt_cons_truncate; int opt_ee_cutoffs_count; int opt_gap_extension_query_interior; int opt_gap_extension_query_left; int opt_gap_extension_query_right; int opt_gap_extension_target_interior; int opt_gap_extension_target_left; int opt_gap_extension_target_right; int opt_gap_open_query_interior; int opt_gap_open_query_left; int opt_gap_open_query_right; int opt_gap_open_target_interior; int opt_gap_open_target_left; int opt_gap_open_target_right; int opt_length_cutoffs_increment; int opt_length_cutoffs_longest; int opt_length_cutoffs_shortest; int opt_mindiffs; int opt_slots; int opt_uchimeout5; int opt_usersort; int64_t opt_dbmask; int64_t opt_fasta_width; int64_t opt_fastq_ascii; int64_t opt_fastq_asciiout; int64_t opt_fastq_maxdiffs; int64_t opt_fastq_maxlen; int64_t opt_fastq_maxmergelen; int64_t opt_fastq_maxns; int64_t opt_fastq_minlen; int64_t opt_fastq_minmergelen; int64_t opt_fastq_minovlen; int64_t opt_fastq_minqual; int64_t opt_fastq_qmax; int64_t opt_fastq_qmaxout; int64_t opt_fastq_qmin; int64_t opt_fastq_qminout; int64_t opt_fastq_stripleft; int64_t opt_fastq_stripright; int64_t opt_fastq_trunclen; int64_t opt_fastq_trunclen_keep; int64_t 
opt_fastq_truncqual; int64_t opt_fulldp; int64_t opt_hardmask; int64_t opt_iddef; int64_t opt_idprefix; int64_t opt_idsuffix; int64_t opt_leftjust; int64_t opt_match; int64_t opt_maxaccepts; int64_t opt_maxdiffs; int64_t opt_maxgaps; int64_t opt_maxhits; int64_t opt_maxqsize; int64_t opt_maxrejects; int64_t opt_maxseqlength; int64_t opt_maxsize; int64_t opt_maxsubs; int64_t opt_maxuniquesize; int64_t opt_mincols; int64_t opt_minseqlength; int64_t opt_minsize; int64_t opt_mintsize; int64_t opt_minuniquesize; int64_t opt_minwordmatches; int64_t opt_mismatch; int64_t opt_notrunclabels; int64_t opt_output_no_hits; int64_t opt_qmask; int64_t opt_randseed; int64_t opt_rightjust; int64_t opt_rowlen; int64_t opt_sample_size; int64_t opt_self; int64_t opt_selfid; int64_t opt_strand; int64_t opt_subseq_end; int64_t opt_subseq_start; int64_t opt_threads; int64_t opt_top_hits_only; int64_t opt_topn; int64_t opt_uc_allhits; int64_t opt_wordlength; /* Other variables */ /* cpu features available */ int64_t altivec_present = 0; int64_t neon_present = 0; int64_t mmx_present = 0; int64_t sse_present = 0; int64_t sse2_present = 0; int64_t sse3_present = 0; int64_t ssse3_present = 0; int64_t sse41_present = 0; int64_t sse42_present = 0; int64_t popcnt_present = 0; int64_t avx_present = 0; int64_t avx2_present = 0; static std::array prog_header {{}}; static char * cmdline; static time_t time_start; static time_t time_finish; std::FILE * fp_log = nullptr; // anonymous namespace: limit visibility and usage to this translation unit namespace { // performance: lambda compiles to a single x86-64 instruction (shr al, 7), // equivalent to (c & ~0x7F) == 0 auto is_not_ASCII(std::string const & user_string) -> bool { static constexpr auto ascii_max = static_cast(std::numeric_limits::max()); auto is_not_in_range = [](char const user_char) -> bool { auto const unsigned_user_char = static_cast(user_char); return (unsigned_user_char > ascii_max); }; return std::any_of(user_string.begin(), 
user_string.end(), is_not_in_range); } } // end of anonymous namespace #ifdef __x86_64__ #define cpuid(f1, f2, a, b, c, d) \ __asm__ __volatile__ ("cpuid" \ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ : "a" (f1), "c" (f2)); #endif auto cpu_features_detect() -> void { #ifdef __aarch64__ #ifdef __ARM_NEON /* may check /proc/cpuinfo for asimd or neon */ neon_present = 1; #else #error ARM Neon not present #endif #elif __PPC__ altivec_present = 1; #elif __x86_64__ unsigned int a = 0; unsigned int b = 0; unsigned int c = 0; unsigned int d = 0; cpuid(0, 0, a, b, c, d); unsigned int const maxlevel = a & 0xff; if (maxlevel >= 1) { cpuid(1, 0, a, b, c, d); mmx_present = (d >> 23U) & 1U; sse_present = (d >> 25U) & 1U; sse2_present = (d >> 26U) & 1U; sse3_present = (c >> 0U) & 1U; ssse3_present = (c >> 9U) & 1U; sse41_present = (c >> 19U) & 1U; sse42_present = (c >> 20U) & 1U; popcnt_present = (c >> 23U) & 1U; avx_present = (c >> 28U) & 1U; if (maxlevel >= 7) { cpuid(7, 0, a, b, c, d); avx2_present = (b >> 5U) & 1U; } } #else // simde #endif } auto cpu_features_show() -> void { fprintf(stderr, "CPU features:"); if (neon_present != 0) { fprintf(stderr, " neon"); } if (altivec_present != 0) { fprintf(stderr, " altivec"); } if (mmx_present != 0) { fprintf(stderr, " mmx"); } if (sse_present != 0) { fprintf(stderr, " sse"); } if (sse2_present != 0) { fprintf(stderr, " sse2"); } if (sse3_present != 0) { fprintf(stderr, " sse3"); } if (ssse3_present != 0) { fprintf(stderr, " ssse3"); } if (sse41_present != 0) { fprintf(stderr, " sse4.1"); } if (sse42_present != 0) { fprintf(stderr, " sse4.2"); } if (popcnt_present != 0) { fprintf(stderr, " popcnt"); } if (avx_present != 0) { fprintf(stderr, " avx"); } if (avx2_present != 0) { fprintf(stderr, " avx2"); } fprintf(stderr, "\n"); } auto args_get_ee_cutoffs(char * arg) -> void { /* get comma-separated list of floating point numbers */ /* save in ee_cutoffs_count and ee_cutoffs_values */ auto const commas = std::count(arg, arg + 
std::strlen(arg), ','); opt_ee_cutoffs_count = 0; opt_ee_cutoffs_values = (double *) xrealloc(opt_ee_cutoffs_values, (commas + 1) * sizeof(double)); char const * cursor = arg; while (true) { double val = 0; int skip = 0; if ((std::sscanf(cursor, "%lf%n", &val, &skip) != 1) or (val <= 0.0)) { fatal("Invalid arguments to ee_cutoffs"); } opt_ee_cutoffs_values[opt_ee_cutoffs_count] = val; ++opt_ee_cutoffs_count; cursor += skip; if (*cursor == ',') { ++cursor; } else if (*cursor == '\0') { break; } else { fatal("Invalid arguments to ee_cutoffs"); } } } auto args_get_length_cutoffs(char * arg) -> void { /* get comma-separated list of 3 integers: */ /* smallest, largest and increment. */ /* second value may be * indicating no limit */ /* save in length_cutoffs_{smallest,largest,increment} */ // refactoring: std::stoi(), faster than sscanf() static constexpr auto n_of_expected_assignments= 3; int skip = 0; // receives the number of characters read so far ('%n') if (std::sscanf(arg, "%d,%d,%d%n", &opt_length_cutoffs_shortest, &opt_length_cutoffs_longest, &opt_length_cutoffs_increment, & skip) == n_of_expected_assignments) { if ((size_t) skip < std::strlen(arg)) { fatal("Invalid arguments to length_cutoffs"); } } else if (std::sscanf(arg, "%d,*,%d%n", &opt_length_cutoffs_shortest, &opt_length_cutoffs_increment, &skip) == 2) { if ((size_t) skip < std::strlen(arg)) { fatal("Invalid arguments to length_cutoffs"); } opt_length_cutoffs_longest = std::numeric_limits::max(); } else { fatal("Invalid arguments to length_cutoffs"); } if ((opt_length_cutoffs_shortest < 1) or (opt_length_cutoffs_shortest > opt_length_cutoffs_longest) or (opt_length_cutoffs_increment < 1)) { fatal("Invalid arguments to length_cutoffs"); } } auto args_get_gap_penalty_string(char * arg, bool const is_open) -> void { /* See http://www.drive5.com/usearch/manual/aln_params.html --gapopen *E/10I/1E/2L/3RQ/4RT/1IQ --gapext *E/10I/1E/2L/3RQ/4RT/1IQ integer or * followed by I, E, L, R, Q or T characters separated 
by / * means infinitely high (disallow) E=end I=interior L=left R=right Q=query T=target E cannot be combined with L or R We do not support floating point values. Therefore, all default score and penalties are multiplied by 2. */ static constexpr auto max_penality = std::numeric_limits::max(); char * cursor = arg; while (*cursor != '\0') { int skip = 0; int pen = 0; if (std::sscanf(cursor, "%d%n", &pen, &skip) == 1) { cursor += skip; } else if (*cursor == '*') { pen = max_penality; ++cursor; } else { fatal("Invalid gap penalty argument (%s)", cursor); } char const * q = cursor; int set_E = 0; int set_I = 0; int set_L = 0; int set_R = 0; int set_Q = 0; int set_T = 0; while ((*cursor != '\0') and (*cursor != '/')) { switch (*cursor) { case 'E': set_E = 1; break; case 'I': set_I = 1; break; case 'L': set_L = 1; break; case 'R': set_R = 1; break; case 'Q': set_Q = 1; break; case 'T': set_T = 1; break; default: fatal("Invalid char '%.1s' in gap penalty string", cursor); break; } ++cursor; } if (*cursor == '/') { ++cursor; } if ((set_E != 0) and ((set_L != 0) or (set_R != 0))) { fatal("Invalid gap penalty string (E and L or R) '%s'", q); } if (set_E != 0) { set_L = 1; set_R = 1; } /* if neither L, I, R nor E is specified, it applies to all */ if ((set_L == 0) and (set_I == 0) and (set_R == 0)) { set_L = 1; set_I = 1; set_R = 1; } /* if neither Q nor T is specified, it applies to both */ if ((set_Q == 0) and (set_T == 0)) { set_Q = 1; set_T = 1; } if (is_open) { if (set_Q != 0) { if (set_L != 0) { opt_gap_open_query_left = pen; } if (set_I != 0) { opt_gap_open_query_interior = pen; } if (set_R != 0) { opt_gap_open_query_right = pen; } } if (set_T != 0) { if (set_L != 0) { opt_gap_open_target_left = pen; } if (set_I != 0) { opt_gap_open_target_interior = pen; } if (set_R != 0) { opt_gap_open_target_right = pen; } } } else { if (set_Q != 0) { if (set_L != 0) { opt_gap_extension_query_left = pen; } if (set_I != 0) { opt_gap_extension_query_interior = pen; } if (set_R != 0) { 
opt_gap_extension_query_right = pen; } } if (set_T != 0) { if (set_L != 0) { opt_gap_extension_target_left = pen; } if (set_I != 0) { opt_gap_extension_target_interior = pen; } if (set_R != 0) { opt_gap_extension_target_right = pen; } } } } } auto args_getlong(char * arg) -> int64_t { int len = 0; int64_t temp = 0; auto const ret = std::sscanf(arg, "%" PRId64 "%n", &temp, &len); if ((ret == 0) or (((unsigned int) (len)) < std::strlen(arg))) { fatal("Illegal option argument"); } return temp; } auto args_getdouble(char * arg) -> double { int len = 0; double temp = 0; auto const ret = std::sscanf(arg, "%lf%n", &temp, &len); if ((ret == 0) or (((unsigned int) (len)) < std::strlen(arg))) { fatal("Illegal option argument"); } return temp; } auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void { /* Set defaults */ static constexpr auto int_max = std::numeric_limits::max(); static constexpr auto long_min = std::numeric_limits::min(); static constexpr auto number_of_commands = std::size_t{50}; static constexpr auto number_of_options = std::size_t{247}; static constexpr auto max_number_of_options_per_command = std::size_t{99}; parameters.progname = argv[0]; opt_abskew = 0.0; opt_acceptall = 0; opt_alignwidth = 80; opt_alnout = nullptr; opt_biomout = nullptr; opt_blast6out = nullptr; opt_borderline = nullptr; opt_centroids = nullptr; opt_chimeras = nullptr; opt_chimeras_denovo = nullptr; opt_chimeras_diff_pct = 0.0; opt_chimeras_length_min = 10; opt_chimeras_parents_max = 3; opt_chimeras_parts = 0; opt_cluster_fast = nullptr; opt_cluster_size = nullptr; opt_cluster_smallmem = nullptr; opt_cluster_unoise = nullptr; opt_clusters = nullptr; opt_cons_truncate = 0; opt_consout = nullptr; opt_db = nullptr; opt_dbmask = MASK_DUST; opt_dbmatched = nullptr; opt_dbnotmatched = nullptr; opt_dn = 1.4; opt_ee_cutoffs_count = 3; opt_ee_cutoffs_values = (double *) xmalloc(opt_ee_cutoffs_count * sizeof(double)); opt_ee_cutoffs_values[0] = 0.5; 
opt_ee_cutoffs_values[1] = 1.0; opt_ee_cutoffs_values[2] = 2.0; opt_eeout = false; opt_eetabbedout = nullptr; opt_fasta_score = false; opt_fasta_width = 80; opt_fastaout = nullptr; opt_fastaout_discarded = nullptr; opt_fastaout_discarded_rev = nullptr; opt_fastaout_notmerged_fwd = nullptr; opt_fastaout_notmerged_rev = nullptr; opt_fastaout_rev = nullptr; opt_fastapairs = nullptr; opt_fastq_allowmergestagger = false; opt_fastq_ascii = 33; opt_fastq_asciiout = 33; opt_fastq_convert = nullptr; opt_fastq_eeout = false; opt_fastq_maxdiffpct = 100.0; opt_fastq_maxdiffs = 10; opt_fastq_maxee = dbl_max; opt_fastq_maxee_rate = dbl_max; opt_fastq_maxlen = int64_max; opt_fastq_maxmergelen = 1000000; opt_fastq_maxns = int64_max; opt_fastq_minlen = 1; opt_fastq_minmergelen = 0; opt_fastq_minovlen = 10; opt_fastq_minqual = 0; opt_fastq_nostagger = true; opt_fastq_qmax = 41; opt_fastq_qmaxout = 41; opt_fastq_qmin = 0; opt_fastq_qminout = 0; opt_fastq_stripleft = 0; opt_fastq_stripright = 0; opt_fastq_truncee = dbl_max; opt_fastq_truncee_rate = dbl_max; opt_fastq_trunclen = -1; opt_fastq_trunclen_keep = -1; opt_fastq_truncqual = long_min; opt_fastqout = nullptr; opt_fastqout_discarded = nullptr; opt_fastqout_discarded_rev = nullptr; opt_fastqout_notmerged_fwd = nullptr; opt_fastqout_notmerged_rev = nullptr; opt_fastqout_rev = nullptr; opt_fulldp = 0; opt_gap_extension_query_interior = 2; opt_gap_extension_query_left = 1; opt_gap_extension_query_right = 1; opt_gap_extension_target_interior = 2; opt_gap_extension_target_left = 1; opt_gap_extension_target_right = 1; opt_gap_open_query_interior = 20; opt_gap_open_query_left = 2; opt_gap_open_query_right = 2; opt_gap_open_target_interior = 20; opt_gap_open_target_left = 2; opt_gap_open_target_right = 2; opt_gzip_decompress = false; opt_hardmask = 0; opt_id = -1.0; opt_iddef = 2; opt_idprefix = 0; opt_idsuffix = 0; opt_label = nullptr; opt_label_field = nullptr; opt_label_substr_match = false; opt_label_suffix = nullptr; opt_label_word 
= nullptr; opt_label_words = nullptr; opt_labels = nullptr; opt_lca_cutoff = 1.0; opt_lcaout = nullptr; opt_leftjust = 0; opt_length_cutoffs_increment = 50; opt_length_cutoffs_longest = int_max; opt_length_cutoffs_shortest = 50; opt_lengthout = false; opt_log = nullptr; opt_match = 2; opt_matched = nullptr; opt_maxaccepts = 1; opt_maxdiffs = int_max; opt_maxgaps = int_max; opt_maxhits = 0; opt_maxid = 1.0; opt_maxqsize = int_max; opt_maxqt = dbl_max; opt_maxrejects = -1; opt_maxseqlength = default_maxseqlength; opt_maxsize = int64_max; opt_maxsizeratio = dbl_max; opt_maxsl = dbl_max; opt_maxsubs = int_max; opt_maxuniquesize = int64_max; opt_mid = 0.0; opt_mincols = 0; opt_mindiffs = 3; opt_mindiv = 0.8; opt_minh = 0.28; opt_minqt = 0.0; opt_minseqlength = -1; opt_minsize = 0; opt_minsizeratio = 0.0; opt_minsl = 0.0; opt_mintsize = 0; opt_minuniquesize = 1; opt_minwordmatches = -1; opt_mismatch = -4; opt_mothur_shared_out = nullptr; opt_msaout = nullptr; opt_n_mismatch = false; opt_no_progress = false; opt_nonchimeras = nullptr; opt_notmatched = nullptr; opt_notmatched = nullptr; opt_notrunclabels = 0; opt_otutabout = nullptr; opt_output = nullptr; opt_output_no_hits = 0; opt_pattern = nullptr; opt_profile = nullptr; opt_qmask = MASK_DUST; opt_qsegout = nullptr; opt_query_cov = 0.0; opt_quiet = false; opt_randseed = 0; opt_relabel = nullptr; opt_relabel_keep = false; opt_relabel_md5 = false; opt_relabel_self = false; opt_relabel_sha1 = false; opt_reverse = nullptr; opt_rightjust = 0; opt_rowlen = 64; opt_samheader = false; opt_samout = nullptr; opt_sample = nullptr; opt_sample_pct = 0; opt_sample_size = 0; opt_self = 0; opt_selfid = 0; opt_sintax_cutoff = 0.0; opt_sintax_random = false; opt_sizein = false; opt_sizeorder = false; opt_sizeout = false; opt_slots = 0; opt_strand = 1; opt_subseq_end = int64_max; opt_subseq_start = 1; opt_tabbedout = nullptr; opt_target_cov = 0.0; opt_threads = 0; opt_top_hits_only = 0; opt_topn = int64_max; opt_tsegout = nullptr; opt_uc 
= nullptr; opt_uc_allhits = 0; opt_uchime2_denovo = nullptr; opt_uchime3_denovo = nullptr; opt_uchime_denovo = nullptr; opt_uchime_ref = nullptr; opt_uchimealns = nullptr; opt_uchimeout = nullptr; opt_uchimeout5 = 0; opt_unoise_alpha = 2.0; opt_userout = nullptr; opt_usersort = 0; opt_weak_id = 10.0; opt_wordlength = 0; opt_xee = false; opt_xlength = false; opt_xn = 8.0; opt_xsize = false; opterr = 1; enum { option_abskew, option_acceptall, option_alignwidth, option_allpairs_global, option_alnout, option_band, option_biomout, option_blast6out, option_borderline, option_bzip2_decompress, option_centroids, option_chimeras, option_chimeras_denovo, option_chimeras_diff_pct, option_chimeras_length_min, option_chimeras_parents_max, option_chimeras_parts, option_cluster_fast, option_cluster_size, option_cluster_smallmem, option_cluster_unoise, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_cut, option_cut_pattern, option_db, option_dbmask, option_dbmatched, option_dbnotmatched, option_derep_fulllength, option_derep_id, option_derep_prefix, option_derep_smallmem, option_dn, option_ee_cutoffs, option_eeout, option_eetabbedout, option_fasta2fastq, option_fasta_score, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_notmerged_fwd, option_fastaout_notmerged_rev, option_fastaout_rev, option_fastapairs, option_fastq_allowmergestagger, option_fastq_ascii, option_fastq_asciiout, option_fastq_chars, option_fastq_convert, option_fastq_eeout, option_fastq_eestats, option_fastq_eestats2, option_fastq_filter, option_fastq_join, option_fastq_maxdiffpct, option_fastq_maxdiffs, option_fastq_maxee, option_fastq_maxee_rate, option_fastq_maxlen, option_fastq_maxmergelen, option_fastq_maxns, option_fastq_mergepairs, option_fastq_minlen, option_fastq_minmergelen, option_fastq_minovlen, option_fastq_minqual, option_fastq_nostagger, option_fastq_qmax, option_fastq_qmaxout, 
option_fastq_qmin, option_fastq_qminout, option_fastq_qout_max, option_fastq_stats, option_fastq_stripleft, option_fastq_stripright, option_fastq_tail, option_fastq_truncee, option_fastq_truncee_rate, option_fastq_trunclen, option_fastq_trunclen_keep, option_fastq_truncqual, option_fastqout, option_fastqout_discarded, option_fastqout_discarded_rev, option_fastqout_notmerged_fwd, option_fastqout_notmerged_rev, option_fastqout_rev, option_fastx_filter, option_fastx_getseq, option_fastx_getseqs, option_fastx_getsubseq, option_fastx_mask, option_fastx_revcomp, option_fastx_subsample, option_fastx_uniques, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_h, option_hardmask, option_help, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_join_padgap, option_join_padgapq, option_label, option_label_field, option_label_substr_match, option_label_suffix, option_label_word, option_label_words, option_labels, option_lca_cutoff, option_lcaout, option_leftjust, option_length_cutoffs, option_lengthout, option_log, option_makeudb_usearch, option_maskfasta, option_match, option_matched, option_max_unmasked_pct, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsize, option_maxsizeratio, option_maxsl, option_maxsubs, option_maxuniquesize, option_mid, option_min_unmasked_pct, option_mincols, option_mindiffs, option_mindiv, option_minh, option_minhsp, option_minqt, option_minseqlength, option_minsize, option_minsizeratio, option_minsl, option_mintsize, option_minuniquesize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_nonchimeras, option_notmatched, option_notmatchedfq, option_notrunclabels, option_orient, option_otutabout, option_output, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, 
option_query_cov, option_quiet, option_randseed, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rereplicate, option_reverse, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_sample_pct, option_sample_size, option_search_exact, option_self, option_selfid, option_sff_clip, option_sff_convert, option_shuffle, option_sintax, option_sintax_cutoff, option_sintax_random, option_sizein, option_sizeorder, option_sizeout, option_slots, option_sortbylength, option_sortbysize, option_strand, option_subseq_end, option_subseq_start, option_tabbedout, option_target_cov, option_threads, option_top_hits_only, option_topn, option_tsegout, option_uc, option_uc_allhits, option_uchime2_denovo, option_uchime3_denovo, option_uchime_denovo, option_uchime_ref, option_uchimealns, option_uchimeout, option_uchimeout5, option_udb2fasta, option_udbinfo, option_udbstats, option_unoise_alpha, option_usearch_global, option_userfields, option_userout, option_usersort, option_v, option_version, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xn, option_xsize }; static constexpr std::array long_options = {{ {"abskew", required_argument, nullptr, 0 }, {"acceptall", no_argument, nullptr, 0 }, {"alignwidth", required_argument, nullptr, 0 }, {"allpairs_global", required_argument, nullptr, 0 }, {"alnout", required_argument, nullptr, 0 }, {"band", required_argument, nullptr, 0 }, {"biomout", required_argument, nullptr, 0 }, {"blast6out", required_argument, nullptr, 0 }, {"borderline", required_argument, nullptr, 0 }, {"bzip2_decompress", no_argument, nullptr, 0 }, {"centroids", required_argument, nullptr, 0 }, {"chimeras", required_argument, nullptr, 0 }, {"chimeras_denovo", required_argument, nullptr, 0 }, {"chimeras_diff_pct", required_argument, nullptr, 0 }, {"chimeras_length_min", required_argument, nullptr, 0 }, {"chimeras_parents_max", required_argument, nullptr, 0 
}, {"chimeras_parts", required_argument, nullptr, 0 }, {"cluster_fast", required_argument, nullptr, 0 }, {"cluster_size", required_argument, nullptr, 0 }, {"cluster_smallmem", required_argument, nullptr, 0 }, {"cluster_unoise", required_argument, nullptr, 0 }, {"clusterout_id", no_argument, nullptr, 0 }, {"clusterout_sort", no_argument, nullptr, 0 }, {"clusters", required_argument, nullptr, 0 }, {"cons_truncate", no_argument, nullptr, 0 }, {"consout", required_argument, nullptr, 0 }, {"cut", required_argument, nullptr, 0 }, {"cut_pattern", required_argument, nullptr, 0 }, {"db", required_argument, nullptr, 0 }, {"dbmask", required_argument, nullptr, 0 }, {"dbmatched", required_argument, nullptr, 0 }, {"dbnotmatched", required_argument, nullptr, 0 }, {"derep_fulllength", required_argument, nullptr, 0 }, {"derep_id", required_argument, nullptr, 0 }, {"derep_prefix", required_argument, nullptr, 0 }, {"derep_smallmem", required_argument, nullptr, 0 }, {"dn", required_argument, nullptr, 0 }, {"ee_cutoffs", required_argument, nullptr, 0 }, {"eeout", no_argument, nullptr, 0 }, {"eetabbedout", required_argument, nullptr, 0 }, {"fasta2fastq", required_argument, nullptr, 0 }, {"fasta_score", no_argument, nullptr, 0 }, {"fasta_width", required_argument, nullptr, 0 }, {"fastaout", required_argument, nullptr, 0 }, {"fastaout_discarded", required_argument, nullptr, 0 }, {"fastaout_discarded_rev",required_argument, nullptr, 0 }, {"fastaout_notmerged_fwd",required_argument, nullptr, 0 }, {"fastaout_notmerged_rev",required_argument, nullptr, 0 }, {"fastaout_rev", required_argument, nullptr, 0 }, {"fastapairs", required_argument, nullptr, 0 }, {"fastq_allowmergestagger", no_argument, nullptr, 0 }, {"fastq_ascii", required_argument, nullptr, 0 }, {"fastq_asciiout", required_argument, nullptr, 0 }, {"fastq_chars", required_argument, nullptr, 0 }, {"fastq_convert", required_argument, nullptr, 0 }, {"fastq_eeout", no_argument, nullptr, 0 }, {"fastq_eestats", required_argument, nullptr, 
0 }, {"fastq_eestats2", required_argument, nullptr, 0 }, {"fastq_filter", required_argument, nullptr, 0 }, {"fastq_join", required_argument, nullptr, 0 }, {"fastq_maxdiffpct", required_argument, nullptr, 0 }, {"fastq_maxdiffs", required_argument, nullptr, 0 }, {"fastq_maxee", required_argument, nullptr, 0 }, {"fastq_maxee_rate", required_argument, nullptr, 0 }, {"fastq_maxlen", required_argument, nullptr, 0 }, {"fastq_maxmergelen", required_argument, nullptr, 0 }, {"fastq_maxns", required_argument, nullptr, 0 }, {"fastq_mergepairs", required_argument, nullptr, 0 }, {"fastq_minlen", required_argument, nullptr, 0 }, {"fastq_minmergelen", required_argument, nullptr, 0 }, {"fastq_minovlen", required_argument, nullptr, 0 }, {"fastq_minqual", required_argument, nullptr, 0 }, {"fastq_nostagger", no_argument, nullptr, 0 }, {"fastq_qmax", required_argument, nullptr, 0 }, {"fastq_qmaxout", required_argument, nullptr, 0 }, {"fastq_qmin", required_argument, nullptr, 0 }, {"fastq_qminout", required_argument, nullptr, 0 }, {"fastq_qout_max", no_argument, nullptr, 0 }, {"fastq_stats", required_argument, nullptr, 0 }, {"fastq_stripleft", required_argument, nullptr, 0 }, {"fastq_stripright", required_argument, nullptr, 0 }, {"fastq_tail", required_argument, nullptr, 0 }, {"fastq_truncee", required_argument, nullptr, 0 }, {"fastq_truncee_rate", required_argument, nullptr, 0 }, {"fastq_trunclen", required_argument, nullptr, 0 }, {"fastq_trunclen_keep", required_argument, nullptr, 0 }, {"fastq_truncqual", required_argument, nullptr, 0 }, {"fastqout", required_argument, nullptr, 0 }, {"fastqout_discarded", required_argument, nullptr, 0 }, {"fastqout_discarded_rev",required_argument, nullptr, 0 }, {"fastqout_notmerged_fwd",required_argument, nullptr, 0 }, {"fastqout_notmerged_rev",required_argument, nullptr, 0 }, {"fastqout_rev", required_argument, nullptr, 0 }, {"fastx_filter", required_argument, nullptr, 0 }, {"fastx_getseq", required_argument, nullptr, 0 }, {"fastx_getseqs", 
required_argument, nullptr, 0 }, {"fastx_getsubseq", required_argument, nullptr, 0 }, {"fastx_mask", required_argument, nullptr, 0 }, {"fastx_revcomp", required_argument, nullptr, 0 }, {"fastx_subsample", required_argument, nullptr, 0 }, {"fastx_uniques", required_argument, nullptr, 0 }, {"fulldp", no_argument, nullptr, 0 }, {"gapext", required_argument, nullptr, 0 }, {"gapopen", required_argument, nullptr, 0 }, {"gzip_decompress", no_argument, nullptr, 0 }, {"h", no_argument, nullptr, 0 }, {"hardmask", no_argument, nullptr, 0 }, {"help", no_argument, nullptr, 0 }, {"hspw", required_argument, nullptr, 0 }, {"id", required_argument, nullptr, 0 }, {"iddef", required_argument, nullptr, 0 }, {"idprefix", required_argument, nullptr, 0 }, {"idsuffix", required_argument, nullptr, 0 }, {"join_padgap", required_argument, nullptr, 0 }, {"join_padgapq", required_argument, nullptr, 0 }, {"label", required_argument, nullptr, 0 }, {"label_field", required_argument, nullptr, 0 }, {"label_substr_match", no_argument, nullptr, 0 }, {"label_suffix", required_argument, nullptr, 0 }, {"label_word", required_argument, nullptr, 0 }, {"label_words", required_argument, nullptr, 0 }, {"labels", required_argument, nullptr, 0 }, {"lca_cutoff", required_argument, nullptr, 0 }, {"lcaout", required_argument, nullptr, 0 }, {"leftjust", no_argument, nullptr, 0 }, {"length_cutoffs", required_argument, nullptr, 0 }, {"lengthout", no_argument, nullptr, 0 }, {"log", required_argument, nullptr, 0 }, {"makeudb_usearch", required_argument, nullptr, 0 }, {"maskfasta", required_argument, nullptr, 0 }, {"match", required_argument, nullptr, 0 }, {"matched", required_argument, nullptr, 0 }, {"max_unmasked_pct", required_argument, nullptr, 0 }, {"maxaccepts", required_argument, nullptr, 0 }, {"maxdiffs", required_argument, nullptr, 0 }, {"maxgaps", required_argument, nullptr, 0 }, {"maxhits", required_argument, nullptr, 0 }, {"maxid", required_argument, nullptr, 0 }, {"maxqsize", required_argument, nullptr, 0 
}, {"maxqt", required_argument, nullptr, 0 }, {"maxrejects", required_argument, nullptr, 0 }, {"maxseqlength", required_argument, nullptr, 0 }, {"maxsize", required_argument, nullptr, 0 }, {"maxsizeratio", required_argument, nullptr, 0 }, {"maxsl", required_argument, nullptr, 0 }, {"maxsubs", required_argument, nullptr, 0 }, {"maxuniquesize", required_argument, nullptr, 0 }, {"mid", required_argument, nullptr, 0 }, {"min_unmasked_pct", required_argument, nullptr, 0 }, {"mincols", required_argument, nullptr, 0 }, {"mindiffs", required_argument, nullptr, 0 }, {"mindiv", required_argument, nullptr, 0 }, {"minh", required_argument, nullptr, 0 }, {"minhsp", required_argument, nullptr, 0 }, {"minqt", required_argument, nullptr, 0 }, {"minseqlength", required_argument, nullptr, 0 }, {"minsize", required_argument, nullptr, 0 }, {"minsizeratio", required_argument, nullptr, 0 }, {"minsl", required_argument, nullptr, 0 }, {"mintsize", required_argument, nullptr, 0 }, {"minuniquesize", required_argument, nullptr, 0 }, {"minwordmatches", required_argument, nullptr, 0 }, {"mismatch", required_argument, nullptr, 0 }, {"mothur_shared_out", required_argument, nullptr, 0 }, {"msaout", required_argument, nullptr, 0 }, {"n_mismatch", no_argument, nullptr, 0 }, {"no_progress", no_argument, nullptr, 0 }, {"nonchimeras", required_argument, nullptr, 0 }, {"notmatched", required_argument, nullptr, 0 }, {"notmatchedfq", required_argument, nullptr, 0 }, {"notrunclabels", no_argument, nullptr, 0 }, {"orient", required_argument, nullptr, 0 }, {"otutabout", required_argument, nullptr, 0 }, {"output", required_argument, nullptr, 0 }, {"output_no_hits", no_argument, nullptr, 0 }, {"pattern", required_argument, nullptr, 0 }, {"profile", required_argument, nullptr, 0 }, {"qmask", required_argument, nullptr, 0 }, {"qsegout", required_argument, nullptr, 0 }, {"query_cov", required_argument, nullptr, 0 }, {"quiet", no_argument, nullptr, 0 }, {"randseed", required_argument, nullptr, 0 }, {"relabel", 
required_argument, nullptr, 0 }, {"relabel_keep", no_argument, nullptr, 0 }, {"relabel_md5", no_argument, nullptr, 0 }, {"relabel_self", no_argument, nullptr, 0 }, {"relabel_sha1", no_argument, nullptr, 0 }, {"rereplicate", required_argument, nullptr, 0 }, {"reverse", required_argument, nullptr, 0 }, {"rightjust", no_argument, nullptr, 0 }, {"rowlen", required_argument, nullptr, 0 }, {"samheader", no_argument, nullptr, 0 }, {"samout", required_argument, nullptr, 0 }, {"sample", required_argument, nullptr, 0 }, {"sample_pct", required_argument, nullptr, 0 }, {"sample_size", required_argument, nullptr, 0 }, {"search_exact", required_argument, nullptr, 0 }, {"self", no_argument, nullptr, 0 }, {"selfid", no_argument, nullptr, 0 }, {"sff_clip", no_argument, nullptr, 0 }, {"sff_convert", required_argument, nullptr, 0 }, {"shuffle", required_argument, nullptr, 0 }, {"sintax", required_argument, nullptr, 0 }, {"sintax_cutoff", required_argument, nullptr, 0 }, {"sintax_random", no_argument, nullptr, 0 }, {"sizein", no_argument, nullptr, 0 }, {"sizeorder", no_argument, nullptr, 0 }, {"sizeout", no_argument, nullptr, 0 }, {"slots", required_argument, nullptr, 0 }, {"sortbylength", required_argument, nullptr, 0 }, {"sortbysize", required_argument, nullptr, 0 }, {"strand", required_argument, nullptr, 0 }, {"subseq_end", required_argument, nullptr, 0 }, {"subseq_start", required_argument, nullptr, 0 }, {"tabbedout", required_argument, nullptr, 0 }, {"target_cov", required_argument, nullptr, 0 }, {"threads", required_argument, nullptr, 0 }, {"top_hits_only", no_argument, nullptr, 0 }, {"topn", required_argument, nullptr, 0 }, {"tsegout", required_argument, nullptr, 0 }, {"uc", required_argument, nullptr, 0 }, {"uc_allhits", no_argument, nullptr, 0 }, {"uchime2_denovo", required_argument, nullptr, 0 }, {"uchime3_denovo", required_argument, nullptr, 0 }, {"uchime_denovo", required_argument, nullptr, 0 }, {"uchime_ref", required_argument, nullptr, 0 }, {"uchimealns", 
required_argument, nullptr, 0 }, {"uchimeout", required_argument, nullptr, 0 }, {"uchimeout5", no_argument, nullptr, 0 }, {"udb2fasta", required_argument, nullptr, 0 }, {"udbinfo", required_argument, nullptr, 0 }, {"udbstats", required_argument, nullptr, 0 }, {"unoise_alpha", required_argument, nullptr, 0 }, {"usearch_global", required_argument, nullptr, 0 }, {"userfields", required_argument, nullptr, 0 }, {"userout", required_argument, nullptr, 0 }, {"usersort", no_argument, nullptr, 0 }, {"v", no_argument, nullptr, 0 }, {"version", no_argument, nullptr, 0 }, {"weak_id", required_argument, nullptr, 0 }, {"wordlength", required_argument, nullptr, 0 }, {"xdrop_nw", required_argument, nullptr, 0 }, {"xee", no_argument, nullptr, 0 }, {"xlength", no_argument, nullptr, 0 }, {"xn", required_argument, nullptr, 0 }, {"xsize", no_argument, nullptr, 0 }, { nullptr, 0, nullptr, 0 } }}; constexpr int options_count = (sizeof(long_options) / sizeof(struct option)) - 1; std::vector options_selected(options_count); int options_index = 0; int val = 0; // recognized long option: return 'val' if 'flag' is nullptr while ((val = getopt_long_only(argc, argv, "", long_options.data(), &options_index)) == 0) { if (options_index < options_count) { options_selected[options_index] = true; } switch (options_index) { case option_help: parameters.opt_help = true; break; case option_version: parameters.opt_version = true; break; case option_alnout: opt_alnout = optarg; break; case option_usearch_global: parameters.opt_usearch_global = optarg; break; case option_db: opt_db = optarg; parameters.opt_db = optarg; break; case option_id: opt_id = args_getdouble(optarg); break; case option_maxaccepts: opt_maxaccepts = args_getlong(optarg); break; case option_maxrejects: opt_maxrejects = args_getlong(optarg); break; case option_wordlength: opt_wordlength = args_getlong(optarg); break; case option_match: opt_match = args_getlong(optarg); break; case option_mismatch: opt_mismatch = args_getlong(optarg); 
break; case option_fulldp: opt_fulldp = 1; fprintf(stderr, "WARNING: Option --fulldp is ignored\n"); break; case option_strand: if (are_same_string(optarg, "plus")) { opt_strand = 1; parameters.opt_strand = false; } else if (are_same_string(optarg, "both")) { opt_strand = 2; parameters.opt_strand = true; } else { fatal("The argument to --strand must be plus or both"); } break; case option_threads: opt_threads = static_cast(args_getdouble(optarg)); parameters.opt_threads = static_cast(args_getdouble(optarg)); break; case option_gapopen: args_get_gap_penalty_string(optarg, true); break; case option_gapext: args_get_gap_penalty_string(optarg, false); break; case option_rowlen: opt_rowlen = args_getlong(optarg); break; case option_userfields: if (parse_userfields_arg(optarg) == 0) { fatal("Unrecognized userfield argument"); } break; case option_userout: opt_userout = optarg; break; case option_self: opt_self = 1; break; case option_blast6out: opt_blast6out = optarg; break; case option_uc: opt_uc = optarg; parameters.opt_uc = optarg; break; case option_weak_id: opt_weak_id = args_getdouble(optarg); break; case option_uc_allhits: opt_uc_allhits = 1; parameters.opt_uc_allhits = true; break; case option_notrunclabels: opt_notrunclabels = 1; parameters.opt_notrunclabels = true; break; case option_sortbysize: parameters.opt_sortbysize = optarg; break; case option_output: opt_output = optarg; parameters.opt_output = optarg; break; case option_minsize: opt_minsize = args_getlong(optarg); parameters.opt_minsize = args_getlong(optarg); if (parameters.opt_minsize <= 0) { fatal("The argument to --minsize must be at least 1"); } break; case option_maxsize: opt_maxsize = args_getlong(optarg); parameters.opt_maxsize = args_getlong(optarg); break; case option_relabel: opt_relabel = optarg; parameters.opt_relabel = optarg; break; case option_sizeout: opt_sizeout = true; parameters.opt_sizeout = true; break; case option_derep_fulllength: parameters.opt_derep_fulllength = optarg; break; 
case option_minseqlength: opt_minseqlength = args_getlong(optarg); parameters.opt_minseqlength = args_getlong(optarg); if (parameters.opt_minseqlength < 0) { fatal("The argument to --minseqlength must not be negative"); } break; case option_minuniquesize: opt_minuniquesize = args_getlong(optarg); parameters.opt_minuniquesize = args_getlong(optarg); break; case option_topn: opt_topn = args_getlong(optarg); parameters.opt_topn = args_getlong(optarg); if (parameters.opt_topn == 0) { fatal("The argument to --topn must be greater than zero"); } break; case option_maxseqlength: opt_maxseqlength = args_getlong(optarg); parameters.opt_maxseqlength = args_getlong(optarg); break; case option_sizein: opt_sizein = true; parameters.opt_sizein = true; break; case option_sortbylength: parameters.opt_sortbylength = optarg; break; case option_matched: opt_matched = optarg; break; case option_notmatched: opt_notmatched = optarg; break; case option_dbmatched: opt_dbmatched = optarg; parameters.opt_dbmatched = optarg; break; case option_dbnotmatched: opt_dbnotmatched = optarg; parameters.opt_dbnotmatched = optarg; break; case option_fastapairs: opt_fastapairs = optarg; break; case option_output_no_hits: opt_output_no_hits = 1; break; case option_maxhits: opt_maxhits = args_getlong(optarg); break; case option_top_hits_only: opt_top_hits_only = 1; break; case option_fasta_width: opt_fasta_width = args_getlong(optarg); parameters.opt_fasta_width = args_getlong(optarg); break; case option_query_cov: opt_query_cov = args_getdouble(optarg); break; case option_target_cov: opt_target_cov = args_getdouble(optarg); break; case option_idprefix: opt_idprefix = args_getlong(optarg); break; case option_idsuffix: opt_idsuffix = args_getlong(optarg); break; case option_minqt: opt_minqt = args_getdouble(optarg); break; case option_maxqt: opt_maxqt = args_getdouble(optarg); break; case option_minsl: opt_minsl = args_getdouble(optarg); break; case option_maxsl: opt_maxsl = args_getdouble(optarg); break; 
case option_leftjust: opt_leftjust = 1; break; case option_rightjust: opt_rightjust = 1; break; case option_selfid: opt_selfid = 1; break; case option_maxid: opt_maxid = args_getdouble(optarg); break; case option_minsizeratio: opt_minsizeratio = args_getdouble(optarg); break; case option_maxsizeratio: opt_maxsizeratio = args_getdouble(optarg); break; case option_maxdiffs: opt_maxdiffs = args_getlong(optarg); break; case option_maxsubs: opt_maxsubs = args_getlong(optarg); break; case option_maxgaps: opt_maxgaps = args_getlong(optarg); break; case option_mincols: opt_mincols = args_getlong(optarg); break; case option_maxqsize: opt_maxqsize = args_getlong(optarg); break; case option_mintsize: opt_mintsize = args_getlong(optarg); break; case option_mid: opt_mid = args_getdouble(optarg); break; case option_shuffle: parameters.opt_shuffle = optarg; break; case option_randseed: opt_randseed = args_getlong(optarg); parameters.opt_randseed = args_getlong(optarg); break; case option_maskfasta: parameters.opt_maskfasta = optarg; break; case option_hardmask: opt_hardmask = 1; parameters.opt_hardmask = true; break; case option_qmask: if (are_same_string(optarg, "none")) { opt_qmask = MASK_NONE; parameters.opt_qmask = MASK_NONE; } else if (are_same_string(optarg, "dust")) { opt_qmask = MASK_DUST; parameters.opt_qmask = MASK_DUST; } else if (are_same_string(optarg, "soft")) { opt_qmask = MASK_SOFT; parameters.opt_qmask = MASK_SOFT; } else { opt_qmask = MASK_ERROR; parameters.opt_qmask = MASK_ERROR; } break; case option_dbmask: if (are_same_string(optarg, "none")) { opt_dbmask = MASK_NONE; } else if (are_same_string(optarg, "dust")) { opt_dbmask = MASK_DUST; } else if (are_same_string(optarg, "soft")) { opt_dbmask = MASK_SOFT; } else { opt_dbmask = MASK_ERROR; } break; case option_cluster_smallmem: opt_cluster_smallmem = optarg; parameters.opt_cluster_smallmem = optarg; break; case option_cluster_fast: opt_cluster_fast = optarg; parameters.opt_cluster_fast = optarg; break; case 
option_centroids: opt_centroids = optarg; break; case option_clusters: opt_clusters = optarg; break; case option_consout: opt_consout = optarg; break; case option_cons_truncate: fprintf(stderr, "WARNING: Option --cons_truncate is ignored\n"); opt_cons_truncate = 1; break; case option_msaout: opt_msaout = optarg; break; case option_usersort: opt_usersort = 1; break; case option_xn: opt_xn = args_getdouble(optarg); break; case option_iddef: opt_iddef = args_getlong(optarg); break; case option_slots: fprintf(stderr, "WARNING: Option --slots is ignored\n"); opt_slots = args_getlong(optarg); break; case option_pattern: fprintf(stderr, "WARNING: Option --pattern is ignored\n"); opt_pattern = optarg; break; case option_maxuniquesize: opt_maxuniquesize = args_getlong(optarg); parameters.opt_maxuniquesize = args_getlong(optarg); break; case option_abskew: opt_abskew = args_getdouble(optarg); break; case option_chimeras: opt_chimeras = optarg; break; case option_dn: opt_dn = args_getdouble(optarg); break; case option_mindiffs: opt_mindiffs = args_getlong(optarg); break; case option_mindiv: opt_mindiv = args_getdouble(optarg); break; case option_minh: opt_minh = args_getdouble(optarg); break; case option_nonchimeras: opt_nonchimeras = optarg; break; case option_uchime_denovo: opt_uchime_denovo = optarg; parameters.opt_uchime_denovo = optarg; break; case option_uchime_ref: opt_uchime_ref = optarg; parameters.opt_uchime_ref = optarg; break; case option_uchimealns: opt_uchimealns = optarg; break; case option_uchimeout: opt_uchimeout = optarg; break; case option_uchimeout5: opt_uchimeout5 = 1; break; case option_alignwidth: opt_alignwidth = args_getlong(optarg); break; case option_allpairs_global: parameters.opt_allpairs_global = optarg; break; case option_acceptall: opt_acceptall = 1; break; case option_cluster_size: opt_cluster_size = optarg; parameters.opt_cluster_size = optarg; break; case option_samout: opt_samout = optarg; break; case option_log: opt_log = optarg; 
parameters.opt_log = optarg; break; case option_quiet: opt_quiet = true; parameters.opt_quiet = true; break; case option_fastx_subsample: parameters.opt_fastx_subsample = optarg; break; case option_sample_pct: opt_sample_pct = args_getdouble(optarg); parameters.opt_sample_pct = args_getdouble(optarg); break; case option_fastq_chars: parameters.opt_fastq_chars = optarg; break; case option_profile: opt_profile = optarg; break; case option_sample_size: opt_sample_size = args_getlong(optarg); parameters.opt_sample_size = args_getlong(optarg); break; case option_fastaout: opt_fastaout = optarg; parameters.opt_fastaout = optarg; break; case option_xsize: opt_xsize = true; parameters.opt_xsize = true; break; case option_clusterout_id: opt_clusterout_id = true; parameters.opt_clusterout_id = true; break; case option_clusterout_sort: opt_clusterout_sort = true; parameters.opt_clusterout_sort = true; break; case option_borderline: opt_borderline = optarg; break; case option_relabel_sha1: opt_relabel_sha1 = true; parameters.opt_relabel_sha1 = true; break; case option_relabel_md5: opt_relabel_md5 = true; parameters.opt_relabel_md5 = true; break; case option_derep_prefix: parameters.opt_derep_prefix = optarg; break; case option_fastq_filter: parameters.opt_fastq_filter = optarg; break; case option_fastqout: opt_fastqout = optarg; parameters.opt_fastqout = optarg; break; case option_fastaout_discarded: opt_fastaout_discarded = optarg; parameters.opt_fastaout_discarded = optarg; break; case option_fastqout_discarded: opt_fastqout_discarded = optarg; parameters.opt_fastqout_discarded = optarg; break; case option_fastq_truncqual: opt_fastq_truncqual = args_getlong(optarg); break; case option_fastq_maxee: opt_fastq_maxee = args_getdouble(optarg); break; case option_fastq_trunclen: opt_fastq_trunclen = args_getlong(optarg); break; case option_fastq_minlen: opt_fastq_minlen = args_getlong(optarg); break; case option_fastq_stripleft: opt_fastq_stripleft = args_getlong(optarg); break; 
case option_fastq_maxee_rate: opt_fastq_maxee_rate = args_getdouble(optarg); break; case option_fastq_maxns: opt_fastq_maxns = args_getlong(optarg); break; case option_eeout: opt_eeout = true; parameters.opt_eeout = true; break; case option_fastq_ascii: opt_fastq_ascii = args_getlong(optarg); parameters.opt_fastq_ascii = args_getlong(optarg); break; case option_fastq_qmin: opt_fastq_qmin = args_getlong(optarg); parameters.opt_fastq_qmin = args_getlong(optarg); break; case option_fastq_qmax: opt_fastq_qmax = args_getlong(optarg); parameters.opt_fastq_qmax = args_getlong(optarg); break; case option_fastq_qmaxout: opt_fastq_qmaxout = args_getlong(optarg); parameters.opt_fastq_qmaxout = args_getlong(optarg); break; case option_fastq_stats: parameters.opt_fastq_stats = optarg; break; case option_fastq_tail: parameters.opt_fastq_tail = args_getlong(optarg); break; case option_fastx_revcomp: parameters.opt_fastx_revcomp = optarg; break; case option_label_suffix: opt_label_suffix = optarg; parameters.opt_label_suffix = optarg; break; case option_h: parameters.opt_help = true; break; case option_samheader: opt_samheader = true; parameters.opt_samheader = true; break; case option_sizeorder: opt_sizeorder = true; parameters.opt_sizeorder = true; break; case option_minwordmatches: opt_minwordmatches = args_getlong(optarg); if (opt_minwordmatches < 0) { fatal("The argument to --minwordmatches must not be negative"); } break; case option_v: parameters.opt_version = true; break; case option_relabel_keep: opt_relabel_keep = true; parameters.opt_relabel_keep = true; break; case option_search_exact: parameters.opt_search_exact = optarg; break; case option_fastx_mask: parameters.opt_fastx_mask = optarg; break; case option_min_unmasked_pct: parameters.opt_min_unmasked_pct = args_getdouble(optarg); break; case option_max_unmasked_pct: parameters.opt_max_unmasked_pct = args_getdouble(optarg); break; case option_fastq_convert: opt_fastq_convert = optarg; parameters.opt_fastq_convert = 
optarg; break; case option_fastq_asciiout: opt_fastq_asciiout = args_getlong(optarg); parameters.opt_fastq_asciiout = args_getlong(optarg); break; case option_fastq_qminout: opt_fastq_qminout = args_getlong(optarg); parameters.opt_fastq_qminout = args_getlong(optarg); break; case option_fastq_mergepairs: parameters.opt_fastq_mergepairs = optarg; break; case option_fastq_eeout: opt_fastq_eeout = true; parameters.opt_fastq_eeout = true; break; case option_fastqout_notmerged_fwd: opt_fastqout_notmerged_fwd = optarg; break; case option_fastqout_notmerged_rev: opt_fastqout_notmerged_rev = optarg; break; case option_fastq_minovlen: opt_fastq_minovlen = args_getlong(optarg); break; case option_fastq_minmergelen: opt_fastq_minmergelen = args_getlong(optarg); break; case option_fastq_maxmergelen: opt_fastq_maxmergelen = args_getlong(optarg); break; case option_fastq_nostagger: opt_fastq_nostagger = true; parameters.opt_fastq_nostagger = true; break; case option_fastq_allowmergestagger: opt_fastq_allowmergestagger = true; parameters.opt_fastq_allowmergestagger = true; break; case option_fastq_maxdiffs: opt_fastq_maxdiffs = args_getlong(optarg); break; case option_fastaout_notmerged_fwd: opt_fastaout_notmerged_fwd = optarg; break; case option_fastaout_notmerged_rev: opt_fastaout_notmerged_rev = optarg; break; case option_reverse: opt_reverse = optarg; parameters.opt_reverse = optarg; break; case option_eetabbedout: opt_eetabbedout = optarg; break; case option_fasta_score: opt_fasta_score = true; parameters.opt_fasta_score = true; break; case option_fastq_eestats: parameters.opt_fastq_eestats = optarg; break; case option_rereplicate: parameters.opt_rereplicate = optarg; break; case option_xdrop_nw: /* xdrop_nw ignored */ fprintf(stderr, "WARNING: Option --xdrop_nw is ignored\n"); break; case option_minhsp: /* minhsp ignored */ fprintf(stderr, "WARNING: Option --minhsp is ignored\n"); break; case option_band: /* band ignored */ fprintf(stderr, "WARNING: Option --band is 
ignored\n"); break; case option_hspw: /* hspw ignored */ fprintf(stderr, "WARNING: Option --hspw is ignored\n"); break; case option_gzip_decompress: opt_gzip_decompress = true; parameters.opt_gzip_decompress = true; break; case option_bzip2_decompress: opt_bzip2_decompress = true; parameters.opt_bzip2_decompress = true; break; case option_fastq_maxlen: opt_fastq_maxlen = args_getlong(optarg); break; case option_fastq_truncee: opt_fastq_truncee = args_getdouble(optarg); break; case option_fastx_filter: parameters.opt_fastx_filter = optarg; break; case option_otutabout: opt_otutabout = optarg; break; case option_mothur_shared_out: opt_mothur_shared_out = optarg; break; case option_biomout: opt_biomout = optarg; break; case option_fastq_trunclen_keep: opt_fastq_trunclen_keep = args_getlong(optarg); break; case option_fastq_stripright: opt_fastq_stripright = args_getlong(optarg); break; case option_no_progress: opt_no_progress = true; parameters.opt_no_progress = true; break; case option_fastq_eestats2: parameters.opt_fastq_eestats2 = optarg; break; case option_ee_cutoffs: args_get_ee_cutoffs(optarg); break; case option_length_cutoffs: args_get_length_cutoffs(optarg); break; case option_makeudb_usearch: parameters.opt_makeudb_usearch = optarg; break; case option_udb2fasta: parameters.opt_udb2fasta = optarg; break; case option_udbinfo: parameters.opt_udbinfo = optarg; break; case option_udbstats: parameters.opt_udbstats = optarg; break; case option_cluster_unoise: opt_cluster_unoise = optarg; parameters.opt_cluster_unoise = optarg; break; case option_unoise_alpha: opt_unoise_alpha = args_getdouble(optarg); break; case option_uchime2_denovo: opt_uchime2_denovo = optarg; parameters.opt_uchime2_denovo = optarg; break; case option_uchime3_denovo: opt_uchime3_denovo = optarg; parameters.opt_uchime3_denovo = optarg; break; case option_sintax: parameters.opt_sintax = optarg; break; case option_sintax_cutoff: opt_sintax_cutoff = args_getdouble(optarg); break; case 
option_tabbedout: opt_tabbedout = optarg; parameters.opt_tabbedout = optarg; break; case option_fastq_maxdiffpct: opt_fastq_maxdiffpct = args_getdouble(optarg); break; case option_fastq_join: parameters.opt_fastq_join = optarg; break; case option_join_padgap: parameters.opt_join_padgap = optarg; break; case option_join_padgapq: parameters.opt_join_padgapq = optarg; parameters.opt_join_padgapq_set_by_user = true; break; case option_sff_convert: parameters.opt_sff_convert = optarg; break; case option_sff_clip: parameters.opt_sff_clip = true; break; case option_fastaout_rev: opt_fastaout_rev = optarg; parameters.opt_fastaout_rev = optarg; break; case option_fastaout_discarded_rev: opt_fastaout_discarded_rev = optarg; parameters.opt_fastaout_discarded_rev = optarg; break; case option_fastqout_rev: opt_fastqout_rev = optarg; parameters.opt_fastqout_rev = optarg; break; case option_fastqout_discarded_rev: opt_fastqout_discarded_rev = optarg; parameters.opt_fastqout_discarded_rev = optarg; break; case option_xee: opt_xee = true; parameters.opt_xee = true; break; case option_fastx_getseq: parameters.opt_fastx_getseq = optarg; break; case option_fastx_getseqs: parameters.opt_fastx_getseqs = optarg; break; case option_fastx_getsubseq: parameters.opt_fastx_getsubseq = optarg; break; case option_label_substr_match: opt_label_substr_match = true; parameters.opt_label_substr_match = true; break; case option_label: opt_label = optarg; break; case option_subseq_start: opt_subseq_start = args_getlong(optarg); break; case option_subseq_end: opt_subseq_end = args_getlong(optarg); break; case option_notmatchedfq: opt_notmatchedfq = optarg; break; case option_label_field: opt_label_field = optarg; break; case option_label_word: opt_label_word = optarg; break; case option_label_words: opt_label_words = optarg; break; case option_labels: opt_labels = optarg; break; case option_cut: parameters.opt_cut = optarg; break; case option_cut_pattern: parameters.opt_cut_pattern = optarg; break; 
case option_relabel_self: opt_relabel_self = true; parameters.opt_relabel_self = true; break; case option_derep_id: parameters.opt_derep_id = optarg; break; case option_orient: parameters.opt_orient = optarg; break; case option_fasta2fastq: parameters.opt_fasta2fastq = optarg; break; case option_lcaout: opt_lcaout = optarg; break; case option_lca_cutoff: opt_lca_cutoff = args_getdouble(optarg); break; case option_fastx_uniques: parameters.opt_fastx_uniques = optarg; break; case option_fastq_qout_max: parameters.opt_fastq_qout_max = true; break; case option_sample: opt_sample = optarg; parameters.opt_sample = optarg; break; case option_qsegout: opt_qsegout = optarg; break; case option_tsegout: opt_tsegout = optarg; break; case option_derep_smallmem: parameters.opt_derep_smallmem = optarg; break; case option_lengthout: opt_lengthout = true; parameters.opt_lengthout = true; break; case option_xlength: opt_xlength = true; parameters.opt_xlength = true; break; case option_chimeras_denovo: opt_chimeras_denovo = optarg; parameters.opt_chimeras_denovo = optarg; break; case option_chimeras_length_min: opt_chimeras_length_min = args_getlong(optarg); break; case option_chimeras_parts: opt_chimeras_parts = args_getlong(optarg); break; case option_chimeras_parents_max: opt_chimeras_parents_max = args_getlong(optarg); break; case option_chimeras_diff_pct: opt_chimeras_diff_pct = args_getdouble(optarg); break; case option_sintax_random: opt_sintax_random = true; parameters.opt_sintax_random = true; break; case option_n_mismatch: opt_n_mismatch = true; break; case option_fastq_minqual: opt_fastq_minqual = args_getlong(optarg); parameters.opt_fastq_minqual = args_getlong(optarg); break; case option_fastq_truncee_rate: opt_fastq_truncee_rate = args_getdouble(optarg); parameters.opt_fastq_truncee_rate = args_getdouble(optarg); break; default: fatal("Internal error in option parsing"); } } /* Terminate if ambiguous or illegal options have been detected */ if (val != -1) { 
exit(EXIT_FAILURE); } /* Terminate after reporting any extra non-option arguments */ if (optind < argc) { fatal("Unrecognized string on command line (%s)", argv[optind]); } /* Below is a list of all command names, in alphabetical order. */ static constexpr std::array command_options = { option_allpairs_global, option_chimeras_denovo, option_cluster_fast, option_cluster_size, option_cluster_smallmem, option_cluster_unoise, option_cut, option_derep_fulllength, option_derep_id, option_derep_prefix, option_derep_smallmem, option_fasta2fastq, option_fastq_chars, option_fastq_convert, option_fastq_eestats, option_fastq_eestats2, option_fastq_filter, option_fastq_join, option_fastq_mergepairs, option_fastq_stats, option_fastx_filter, option_fastx_getseq, option_fastx_getseqs, option_fastx_getsubseq, option_fastx_mask, option_fastx_revcomp, option_fastx_subsample, option_fastx_uniques, option_h, option_help, option_makeudb_usearch, option_maskfasta, option_orient, option_rereplicate, option_search_exact, option_sff_convert, option_shuffle, option_sintax, option_sortbylength, option_sortbysize, option_uchime2_denovo, option_uchime3_denovo, option_uchime_denovo, option_uchime_ref, option_udb2fasta, option_udbinfo, option_udbstats, option_usearch_global, option_v, option_version }; const int commands_count = sizeof(command_options) / sizeof(int); /* Below is a list of all the options that are valid for each command. The first line is the command and the lines below are the valid options. 
*/ static constexpr std::array, number_of_commands> valid_options = {{ { option_allpairs_global, option_acceptall, option_alnout, option_band, option_blast6out, option_bzip2_decompress, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_output_no_hits, option_pattern, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeout, option_slots, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_chimeras_denovo, option_abskew, option_alignwidth, option_alnout, option_chimeras, option_chimeras_diff_pct, option_chimeras_length_min, option_chimeras_parents_max, option_chimeras_parts, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, 
option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_tabbedout, option_threads, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_cluster_fast, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cluster_size, option_alnout, option_band, option_biomout, option_blast6out, 
option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cluster_smallmem, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, 
option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_profile, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_userfields, option_userout, option_usersort, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cluster_unoise, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_centroids, option_clusterout_id, option_clusterout_sort, option_clusters, option_cons_truncate, option_consout, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, 
option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsize, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_msaout, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_qsegout, option_pattern, option_profile, option_qmask, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeorder, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_unoise_alpha, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_cut, option_bzip2_decompress, option_cut_pattern, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_xee, option_xlength, option_xsize, -1 }, { option_derep_fulllength, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { 
option_derep_id, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_derep_prefix, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_derep_smallmem, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fasta2fastq, option_bzip2_decompress, option_fastq_asciiout, option_fastq_qmaxout, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, 
option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_chars, option_bzip2_decompress, option_fastq_tail, option_gzip_decompress, option_log, option_no_progress, option_quiet, option_threads, -1 }, { option_fastq_convert, option_bzip2_decompress, option_fastq_ascii, option_fastq_asciiout, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_eestats, option_bzip2_decompress, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_log, option_no_progress, option_output, option_quiet, option_threads, -1 }, { option_fastq_eestats2, option_bzip2_decompress, option_ee_cutoffs, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_length_cutoffs, option_log, option_no_progress, option_output, option_quiet, option_threads, -1 }, { option_fastq_filter, option_bzip2_decompress, option_eeout, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_rev, option_fastq_ascii, option_fastq_eeout, option_fastq_maxee, option_fastq_maxee_rate, option_fastq_maxlen, option_fastq_maxns, option_fastq_minlen, option_fastq_minqual, option_fastq_qmax, option_fastq_qmin, option_fastq_stripleft, option_fastq_stripright, option_fastq_truncee, option_fastq_truncee_rate, option_fastq_trunclen, option_fastq_trunclen_keep, option_fastq_truncqual, option_fastqout, option_fastqout_discarded, option_fastqout_discarded_rev, option_fastqout_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxsize, 
option_minsize, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_join, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_join_padgap, option_join_padgapq, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_mergepairs, option_bzip2_decompress, option_eeout, option_eetabbedout, option_fasta_width, option_fastaout, option_fastaout_notmerged_fwd, option_fastaout_notmerged_rev, option_fastq_allowmergestagger, option_fastq_ascii, option_fastq_eeout, option_fastq_maxdiffpct, option_fastq_maxdiffs, option_fastq_maxee, option_fastq_maxlen, option_fastq_maxmergelen, option_fastq_maxns, option_fastq_minlen, option_fastq_minmergelen, option_fastq_minovlen, option_fastq_nostagger, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, option_fastq_truncqual, option_fastqout, option_fastqout_notmerged_fwd, option_fastqout_notmerged_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastq_stats, option_bzip2_decompress, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_log, option_no_progress, option_quiet, option_threads, -1 }, { 
option_fastx_filter, option_bzip2_decompress, option_eeout, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastaout_discarded_rev, option_fastaout_rev, option_fastq_ascii, option_fastq_eeout, option_fastq_maxee, option_fastq_maxee_rate, option_fastq_maxlen, option_fastq_maxns, option_fastq_minlen, option_fastq_minqual, option_fastq_qmax, option_fastq_qmin, option_fastq_stripleft, option_fastq_stripright, option_fastq_truncee, option_fastq_truncee_rate, option_fastq_trunclen, option_fastq_trunclen_keep, option_fastq_truncqual, option_fastqout, option_fastqout_discarded, option_fastqout_discarded_rev, option_fastqout_rev, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxsize, option_minsize, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_reverse, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_getseq, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label, option_label_substr_match, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notmatched, option_notmatchedfq, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_getseqs, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label, option_label_field, option_label_substr_match, option_label_suffix, option_label_word, option_label_words, option_labels, option_lengthout, option_log, option_no_progress, 
option_notmatched, option_notmatchedfq, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_getsubseq, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label, option_label_substr_match, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notmatched, option_notmatchedfq, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_subseq_end, option_subseq_start, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_mask, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_hardmask, option_label_suffix, option_lengthout, option_log, option_max_unmasked_pct, option_min_unmasked_pct, option_no_progress, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_revcomp, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { 
option_fastx_subsample, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastaout_discarded, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_fastqout, option_fastqout_discarded, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_quiet, option_randseed, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sample_pct, option_sample_size, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_fastx_uniques, option_bzip2_decompress, option_fasta_width, option_fastaout, option_fastq_ascii, option_fastq_asciiout, option_fastq_qmax, option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, option_fastq_qout_max, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxuniquesize, option_minseqlength, option_minuniquesize, option_no_progress, option_notrunclabels, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_strand, option_tabbedout, option_threads, option_topn, option_uc, option_xee, option_xlength, option_xsize, -1 }, { option_h, option_log, option_quiet, option_threads, -1 }, { option_help, option_log, option_quiet, option_threads, -1 }, { option_makeudb_usearch, option_bzip2_decompress, option_dbmask, option_gzip_decompress, option_hardmask, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_output, option_quiet, option_threads, option_wordlength, -1 }, { option_maskfasta, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_hardmask, option_label_suffix, option_lengthout, option_log, option_max_unmasked_pct, option_maxseqlength, option_min_unmasked_pct, option_minseqlength, 
option_no_progress, option_notrunclabels, option_output, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_orient, option_bzip2_decompress, option_db, option_dbmask, option_fasta_width, option_fastaout, option_fastqout, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notmatched, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_tabbedout, option_threads, option_wordlength, option_xee, option_xlength, option_xsize, -1 }, { option_rereplicate, option_bzip2_decompress, option_fasta_width, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_search_exact, option_alnout, option_biomout, option_blast6out, option_bzip2_decompress, option_db, option_dbmask, option_dbmatched, option_dbnotmatched, option_fasta_width, option_fastapairs, option_gzip_decompress, option_hardmask, option_label_suffix, option_lca_cutoff, option_lcaout, option_lengthout, option_log, option_match, option_matched, option_maxhits, option_maxqsize, option_maxqt, option_maxseqlength, option_maxsizeratio, option_maxsl, option_mincols, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_mismatch, option_mothur_shared_out, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_qmask, option_qsegout, option_quiet, 
option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_sizein, option_sizeout, option_strand, option_threads, option_top_hits_only, option_tsegout, option_uc, option_uc_allhits, option_userfields, option_userout, option_xee, option_xlength, option_xsize, -1 }, { option_sff_convert, option_fastq_asciiout, option_fastq_qmaxout, option_fastq_qminout, option_fastqout, option_label_suffix, option_lengthout, option_log, option_no_progress, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sff_clip, option_sizeout, option_threads, -1 }, { option_shuffle, option_bzip2_decompress, option_fasta_width, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_output, option_quiet, option_randseed, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_topn, option_xee, option_xlength, option_xsize, -1 }, { option_sintax, option_bzip2_decompress, option_db, option_dbmask, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, option_quiet, option_randseed, option_sintax_cutoff, option_sintax_random, option_strand, option_tabbedout, option_threads, option_wordlength, -1 }, { option_sortbylength, option_bzip2_decompress, option_fasta_width, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_minseqlength, option_no_progress, option_notrunclabels, 
option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_topn, option_xee, option_xlength, option_xsize, -1 }, { option_sortbysize, option_bzip2_decompress, option_fasta_width, option_fastq_ascii, option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, option_label_suffix, option_lengthout, option_log, option_maxseqlength, option_maxsize, option_minseqlength, option_minsize, option_no_progress, option_notrunclabels, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_topn, option_xee, option_xlength, option_xsize, -1 }, { option_uchime2_denovo, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_uchime3_denovo, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, 
option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_uchime_denovo, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_uchime_ref, option_abskew, option_alignwidth, option_borderline, option_chimeras, option_db, option_dbmask, option_dn, option_fasta_score, option_fasta_width, option_gapext, option_gapopen, option_hardmask, option_label_suffix, option_lengthout, option_log, option_match, option_maxseqlength, option_mindiffs, option_mindiv, option_minh, option_minseqlength, option_mismatch, option_no_progress, option_nonchimeras, option_notrunclabels, option_qmask, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_sample, option_self, option_selfid, option_sizein, option_sizeout, option_strand, option_threads, option_uchimealns, option_uchimeout, option_uchimeout5, option_xee, option_xlength, option_xn, option_xsize, -1 }, { option_udb2fasta, option_fasta_width, option_label_suffix, option_lengthout, option_log, option_no_progress, option_output, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, 
option_relabel_self, option_relabel_sha1, option_sample, option_sizein, option_sizeout, option_threads, option_xee, option_xlength, option_xsize, -1 }, { option_udbinfo, option_log, option_quiet, option_threads, -1 }, { option_udbstats, option_log, option_no_progress, option_quiet, option_threads, -1 }, { option_usearch_global, option_alnout, option_band, option_biomout, option_blast6out, option_bzip2_decompress, option_db, option_dbmask, option_dbmatched, option_dbnotmatched, option_fasta_width, option_fastapairs, option_fulldp, option_gapext, option_gapopen, option_gzip_decompress, option_hardmask, option_hspw, option_id, option_iddef, option_idprefix, option_idsuffix, option_label_suffix, option_lca_cutoff, option_lcaout, option_leftjust, option_lengthout, option_log, option_match, option_matched, option_maxaccepts, option_maxdiffs, option_maxgaps, option_maxhits, option_maxid, option_maxqsize, option_maxqt, option_maxrejects, option_maxseqlength, option_maxsizeratio, option_maxsl, option_maxsubs, option_mid, option_mincols, option_minhsp, option_minqt, option_minseqlength, option_minsizeratio, option_minsl, option_mintsize, option_minwordmatches, option_mismatch, option_mothur_shared_out, option_n_mismatch, option_no_progress, option_notmatched, option_notrunclabels, option_otutabout, option_output_no_hits, option_pattern, option_qmask, option_qsegout, option_query_cov, option_quiet, option_relabel, option_relabel_keep, option_relabel_md5, option_relabel_self, option_relabel_sha1, option_rightjust, option_rowlen, option_samheader, option_samout, option_sample, option_self, option_selfid, option_sizein, option_sizeout, option_slots, option_strand, option_target_cov, option_threads, option_top_hits_only, option_tsegout, option_uc, option_uc_allhits, option_userfields, option_userout, option_weak_id, option_wordlength, option_xdrop_nw, option_xee, option_xlength, option_xsize, -1 }, { option_v, option_log, option_quiet, option_threads, -1 }, { option_version, 
option_log, option_quiet, option_threads, -1 } }}; /* check that only one commmand is specified */ int commands = 0; int k = -1; for (int i = 0; i < commands_count; i++) { if (options_selected[command_options[i]]) { ++commands; k = i; } } if (commands > 1) { fatal("More than one command specified"); } /* check that only valid options are specified */ int invalid_options = 0; if (commands == 0) { /* check if any options are specified */ bool any_options = false; for (bool const i: options_selected) { if (i) { any_options = true; } } if (any_options) { fprintf(stderr, "WARNING: Options given, but no valid command specified.\n"); } } else { for (int i = 0; i < options_count; i++) { if (options_selected[i]) { int j = 0; bool option_is_valid = false; while (valid_options[k][j] >= 0) { if (valid_options[k][j] == i) { option_is_valid = true; break; } ++j; } if (not option_is_valid) { ++invalid_options; if (invalid_options == 1) { fprintf(stderr, "Fatal error: Invalid options to command %s\n", long_options[command_options[k]].name); fprintf(stderr, "Invalid option(s):"); } fprintf(stderr, " --%s", long_options[i].name); } } } if (invalid_options > 0) { fprintf(stderr, "\nThe valid options for the %s command are:", long_options[command_options[k]].name); int count = 0; for (int j = 1; valid_options[k][j] >= 0; j++) { fprintf(stderr, " --%s", long_options[valid_options[k][j]].name); ++count; } if (count == 0) { fprintf(stderr, " (none)"); } fprintf(stderr, "\n"); exit(EXIT_FAILURE); } } /* multi-threaded commands */ if ((opt_threads < 0) or (opt_threads > n_threads_max)) { fatal("The argument to --threads must be in the range 0 (default) to 1024"); } if ((parameters.opt_allpairs_global != nullptr) or (parameters.opt_cluster_fast != nullptr) or (parameters.opt_cluster_size != nullptr) or (parameters.opt_cluster_smallmem != nullptr) or (parameters.opt_cluster_unoise != nullptr) or (parameters.opt_fastq_mergepairs != nullptr) or (parameters.opt_fastx_mask != nullptr) or 
(parameters.opt_maskfasta != nullptr) or (parameters.opt_search_exact != nullptr) or (parameters.opt_sintax != nullptr) or (parameters.opt_uchime_ref != nullptr) or (parameters.opt_usearch_global != nullptr)) { if (parameters.opt_threads == 0) { opt_threads = arch_get_cores(); parameters.opt_threads = arch_get_cores(); } } else { if (parameters.opt_threads > 1) { fprintf(stderr, "WARNING: The %s command does not support multithreading.\nOnly 1 thread used.\n", long_options[command_options[k]].name); } opt_threads = 1; parameters.opt_threads = 1; } if ((parameters.opt_sintax != nullptr) and (parameters.opt_randseed != 0) and (parameters.opt_threads > 1)) { fprintf(stderr, "WARNING: Using the --sintax command with the --randseed option may not work as intended with multiple threads. Use a single thread (--threads 1) to ensure reproducible results.\n"); } if (parameters.opt_cluster_unoise != nullptr) { opt_weak_id = 0.90; } else if (opt_weak_id > opt_id) { opt_weak_id = opt_id; } if (opt_maxrejects == -1) { if (parameters.opt_cluster_fast != nullptr) { opt_maxrejects = 8; } else { opt_maxrejects = 32; } } if (opt_maxaccepts < 0) { fatal("The argument to --maxaccepts must not be negative"); } if (opt_maxrejects < 0) { fatal("The argument to --maxrejects must not be negative"); } if (opt_wordlength == 0) { /* set default word length */ if (parameters.opt_orient != nullptr) { opt_wordlength = 12; } else { opt_wordlength = 8; } } if ((opt_wordlength < 3) or (opt_wordlength > 15)) { fatal("The argument to --wordlength must be in the range 3 to 15"); } if ((opt_iddef < 0) or (opt_iddef > 4)) { fatal("The argument to --iddef must in the range 0 to 4"); } #if 0 if (opt_match <= 0) fatal("The argument to --match must be positive"); if (opt_mismatch >= 0) fatal("The argument to --mismatch must be negative"); #endif if (opt_alignwidth < 0) { fatal("The argument to --alignwidth must not be negative"); } if (opt_rowlen < 0) { fatal("The argument to --rowlen must not be negative"); 
} if (parameters.opt_qmask == MASK_ERROR) { fatal("The argument to --qmask must be none, dust or soft"); } if (opt_dbmask == MASK_ERROR) { fatal("The argument to --dbmask must be none, dust or soft"); } if ((opt_sample_pct < 0.0) or (opt_sample_pct > 100.0)) { fatal("The argument to --sample_pct must be in the range 0.0 to 100.0"); } if (opt_sample_size < 0) { fatal("The argument to --sample_size must not be negative"); } if ((((parameters.opt_relabel != nullptr) ? 1 : 0) + static_cast(parameters.opt_relabel_md5) + static_cast(parameters.opt_relabel_self) + static_cast(parameters.opt_relabel_sha1)) > 1) { fatal("Specify only one of --relabel, --relabel_self, --relabel_sha1, or --relabel_md5"); } if (parameters.opt_fastq_tail < 1) { fatal("The argument to --fastq_tail must be greater than zero"); } if ((parameters.opt_min_unmasked_pct < 0.0) and (parameters.opt_min_unmasked_pct > 100.0)) { fatal("The argument to --min_unmasked_pct must be between 0.0 and 100.0"); } if ((parameters.opt_max_unmasked_pct < 0.0) and (parameters.opt_max_unmasked_pct > 100.0)) { fatal("The argument to --max_unmasked_pct must be between 0.0 and 100.0"); } if (parameters.opt_min_unmasked_pct > parameters.opt_max_unmasked_pct) { fatal("The argument to --min_unmasked_pct cannot be larger than --max_unmasked_pct"); } if ((parameters.opt_fastq_ascii != 33) and (parameters.opt_fastq_ascii != 64)) { fatal("The argument to --fastq_ascii must be 33 or 64"); } if (opt_fastq_qmin > opt_fastq_qmax) { fatal("The argument to --fastq_qmin cannot be greater than --fastq_qmax"); } if (parameters.opt_fastq_ascii + opt_fastq_qmin < 33) { fatal("Sum of arguments to --fastq_ascii and --fastq_qmin must be no less than 33"); } if (parameters.opt_fastq_ascii + opt_fastq_qmax > 126) { fatal("Sum of arguments to --fastq_ascii and --fastq_qmax must be no more than 126"); } if (parameters.opt_fastq_qminout > parameters.opt_fastq_qmaxout) { fatal("The argument to --fastq_qminout cannot be larger than 
--fastq_qmaxout"); } if ((parameters.opt_fastq_asciiout != 33) and (parameters.opt_fastq_asciiout != 64)) { fatal("The argument to --fastq_asciiout must be 33 or 64"); } if (parameters.opt_fastq_asciiout + parameters.opt_fastq_qminout < 33) { fatal("Sum of arguments to --fastq_asciiout and --fastq_qminout must be no less than 33"); } if (parameters.opt_fastq_asciiout + parameters.opt_fastq_qmaxout > 126) { fatal("Sum of arguments to --fastq_asciiout and --fastq_qmaxout must be no more than 126"); } if (parameters.opt_gzip_decompress and parameters.opt_bzip2_decompress) { fatal("Specify either --gzip_decompress or --bzip2_decompress, not both"); } if ((opt_sintax_cutoff < 0.0) or (opt_sintax_cutoff > 1.0)) { fatal("The argument to sintax_cutoff must be in the range 0.0 to 1.0"); } if ((opt_lca_cutoff <= 0.5) or (opt_lca_cutoff > 1.0)) { fatal("The argument to lca_cutoff must be larger than 0.5, but not larger than 1.0"); } if (parameters.opt_minuniquesize < 1) { fatal("The argument to minuniquesize must be at least 1"); } if (parameters.opt_maxuniquesize < 1) { fatal("The argument to maxuniquesize must be at least 1"); } if (parameters.opt_maxsize < 1) { fatal("The argument to maxsize must be at least 1"); } if (opt_maxhits < 0) { fatal("The argument to maxhits cannot be negative"); } if (opt_chimeras_length_min < 1) { fatal("The argument to chimeras_length_min must be at least 1"); } if ((opt_chimeras_parents_max < 2) or (opt_chimeras_parents_max > maxparents)) { std::array maxparents_string {{}}; snprintf(maxparents_string.data(), maxparents_string.size(), "%d", maxparents); fatal("The argument to chimeras_parents_max must be in the range 2 to %s.\n", maxparents_string.data()); } if ((opt_chimeras_diff_pct < 0.0) or (opt_chimeras_diff_pct > 50.0)) { fatal("The argument to chimeras_diff_pct must be in the range 0.0 to 50.0"); } if (options_selected[option_chimeras_parts] and ((opt_chimeras_parts < 2) or (opt_chimeras_parts > 100))) { fatal("The argument to 
chimeras_parts must be in the range 2 to 100"); } if (parameters.opt_chimeras_denovo != nullptr) { if (not options_selected[option_alignwidth]) { opt_alignwidth = 60; } } /* TODO: check valid range of gap penalties */ /* adapt/adjust parameters */ #if 1 /* Adjust gap open penalty according to convention. The specified gap open penalties include the penalty for a single nucleotide gap: gap penalty = gap open penalty + (gap length - 1) * gap extension penalty The rest of the code assumes the first nucleotide gap penalty is not included in the gap opening penalty. */ opt_gap_open_query_left -= opt_gap_extension_query_left; opt_gap_open_target_left -= opt_gap_extension_target_left; opt_gap_open_query_interior -= opt_gap_extension_query_interior; opt_gap_open_target_interior -= opt_gap_extension_target_interior; opt_gap_open_query_right -= opt_gap_extension_query_right; opt_gap_open_target_right -= opt_gap_extension_target_right; #endif /* set defaults parameters, if not specified */ if (opt_maxhits == 0) { opt_maxhits = int64_max; } if (opt_minwordmatches < 0) { opt_minwordmatches = minwordmatches_defaults[opt_wordlength]; } /* set default opt_minsize depending on command */ if (parameters.opt_minsize == 0) { if (parameters.opt_cluster_unoise != nullptr) { opt_minsize = 8; parameters.opt_minsize = 8; } else { opt_minsize = 1; parameters.opt_minsize = 1; } } /* set default opt_abskew depending on command */ if (not options_selected[option_abskew]) { if (parameters.opt_chimeras_denovo != nullptr) { opt_abskew = 1.0; } else if (opt_uchime3_denovo != nullptr) { opt_abskew = 16.0; } else { opt_abskew = 2.0; } } /* set default opt_minseqlength depending on command */ if (parameters.opt_minseqlength < 0) { if ((parameters.opt_cluster_fast != nullptr) or (parameters.opt_cluster_size != nullptr) or (parameters.opt_cluster_smallmem != nullptr) or (parameters.opt_cluster_unoise != nullptr) or (parameters.opt_derep_fulllength != nullptr) or (parameters.opt_derep_id != nullptr) or 
(parameters.opt_derep_prefix != nullptr) or (parameters.opt_makeudb_usearch != nullptr) or (parameters.opt_sintax != nullptr) or (parameters.opt_usearch_global != nullptr)) { opt_minseqlength = 32; parameters.opt_minseqlength = 32; } else { opt_minseqlength = 1; parameters.opt_minseqlength = 1; } } if (parameters.opt_sintax != nullptr) { opt_notrunclabels = 1; parameters.opt_notrunclabels = true; } // refactoring: C++17 std::filesystem::is_regular_file // check if stderr is referring to a terminal // - fileno() returns a file descriptor (fd) // - isatty() returns 1 if a file descriptor refers to a terminal parameters.opt_stderr_is_tty = (isatty(fileno(stderr)) == 1); } auto show_publication() -> void { fprintf(stdout, "Rognes T, Flouri T, Nichols B, Quince C, Mahe F (2016)\n" "VSEARCH: a versatile open source tool for metagenomics\n" "PeerJ 4:e2584 doi: 10.7717/peerj.2584 https://doi.org/10.7717/peerj.2584\n" "\n"); } auto cmd_version(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } show_publication(); #ifdef HAVE_ZLIB_H printf("Compiled with support for gzip-compressed files,"); if (gz_lib != nullptr) { printf(" and the library is loaded.\n"); char * (*zlibVersion_p)(); zlibVersion_p = (char * (*)()) arch_dlsym(gz_lib, "zlibVersion"); char const * gz_version = (*zlibVersion_p)(); long unsigned int (*zlibCompileFlags_p)(); zlibCompileFlags_p = (long unsigned int (*)()) arch_dlsym(gz_lib, "zlibCompileFlags"); long unsigned int const flags = (*zlibCompileFlags_p)(); printf("zlib version %s, compile flags %lx", gz_version, flags); static constexpr auto check_10th_bit = 1024U; // 0x0400 if ((flags & check_10th_bit) != 0U) { printf(" (ZLIB_WINAPI)"); } printf("\n"); } else { printf(" but the library was not found.\n"); } #else printf("Compiled without support for gzip-compressed files.\n"); #endif #ifdef HAVE_BZLIB_H printf("Compiled with support for bzip2-compressed files,"); if (bz2_lib != nullptr) { printf(" and the library is 
loaded.\n"); } else { printf(" but the library was not found.\n"); } #else printf("Compiled without support for bzip2-compressed files.\n"); #endif } auto cmd_help(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } show_publication(); /* 0 1 2 3 4 5 6 7 */ /* 01234567890123456789012345678901234567890123456789012345678901234567890123456789 */ fprintf(stdout, "Usage: %s [OPTIONS]\n", parameters.progname); fprintf(stdout, "\n" "For further details, please consult the manual by entering: man vsearch\n" "\n" "General options\n" " --bzip2_decompress decompress input with bzip2 (required if pipe)\n" " --fasta_width INT width of FASTA seq lines, 0 for no wrap (80)\n" " --gzip_decompress decompress input with gzip (required if pipe)\n" " --help | -h display help information\n" " --log FILENAME write messages, timing and memory info to file\n" " --maxseqlength INT maximum sequence length (50000)\n" " --minseqlength INT min seq length (clust/derep/search: 32, other:1)\n" " --no_progress do not show progress indicator\n" " --notrunclabels do not truncate labels at first space\n" " --quiet output just warnings and fatal errors to stderr\n" " --threads INT number of threads to use, zero for all cores (0)\n" " --version | -v display version information\n" "\n" "Chimera detection with new algorithm\n" " --chimeras_denovo FILENAME detect chimeras de novo in long exact sequences\n" " Parameters\n" " --abskew REAL minimum abundance ratio (1.0)\n" " --chimeras_diff_pct REAL mismatch %% allowed in each chimeric region (0.0)\n" " --chimeras_length_min INT minimum length of each chimeric region (10)\n" " --chimeras_parents_max INT maximum number of parent sequences (3)\n" " --chimeras_parts INT number of parts to divide sequences (length/100)\n" " --sizein propagate abundance annotation from input\n" " Output\n" " --alignwidth INT width of alignments in alignment output file (60)\n" " --alnout FILENAME output chimera alignments to file\n" " --chimeras 
FILENAME output chimeric sequences to file\n" " --nonchimeras FILENAME output non-chimeric sequences to file\n" " --relabel STRING relabel nonchimeras with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout include abundance information when relabelling\n" " --tabbedout FILENAME output chimera info to tab-separated file\n" " --xsize strip abundance information in output\n" "\n" "Chimera detection with UCHIME algorithms\n" " --uchime_denovo FILENAME detect chimeras de novo\n" " --uchime2_denovo FILENAME detect chimeras de novo in denoised amplicons\n" " --uchime3_denovo FILENAME detect chimeras de novo in denoised amplicons\n" " --uchime_ref FILENAME detect chimeras using a reference database\n" " Data\n" " --db FILENAME reference database for --uchime_ref\n" " Parameters\n" " --abskew REAL minimum abundance ratio (2.0, 16.0 for uchime3)\n" " --dn REAL 'no' vote pseudo-count (1.4)\n" " --mindiffs INT minimum number of differences in segment (3) *\n" " --mindiv REAL minimum divergence from closest parent (0.8) *\n" " --minh REAL minimum score (0.28) * ignored in uchime2/3\n" " --sizein propagate abundance annotation from input\n" " --self exclude identical labels for --uchime_ref\n" " --selfid exclude identical sequences for --uchime_ref\n" " --xn REAL 'no' vote weight (8.0)\n" " Output\n" " --alignwidth INT width of alignment in uchimealn output (80)\n" " --borderline FILENAME output borderline chimeric sequences to file\n" " --chimeras FILENAME output chimeric sequences to file\n" " --fasta_score include chimera score in FASTA output\n" " --nonchimeras FILENAME output non-chimeric sequences to file\n" " --relabel STRING relabel nonchimeras with this prefix string\n" " --relabel_keep keep the old label after the new when 
relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout include abundance information when relabelling\n" " --uchimealns FILENAME output chimera alignments to file\n" " --uchimeout FILENAME output to chimera info to tab-separated file\n" " --uchimeout5 make output compatible with uchime version 5\n" " --xsize strip abundance information in output\n" "\n" "Clustering\n" " --cluster_fast FILENAME cluster sequences after sorting by length\n" " --cluster_size FILENAME cluster sequences after sorting by abundance\n" " --cluster_smallmem FILENAME cluster already sorted sequences (see -usersort)\n" " --cluster_unoise FILENAME denoise Illumina amplicon reads\n" " Parameters (most searching options also apply)\n" " --cons_truncate do not ignore terminal gaps in MSA for consensus\n" " --id REAL reject if identity lower, accepted values: 0-1.0\n" " --iddef INT id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n" " --qmask none|dust|soft mask seqs with dust, soft or no method (dust)\n" " --sizein propagate abundance annotation from input\n" " --strand plus|both cluster using plus or both strands (plus)\n" " --usersort indicate sequences not pre-sorted by length\n" " --minsize INT minimum abundance (unoise only) (8)\n" " --unoise_alpha REAL alpha parameter (unoise only) (2.0)\n" " Output\n" " --biomout FILENAME filename for OTU table output in biom 1.0 format\n" " --centroids FILENAME output centroid sequences to FASTA file\n" " --clusterout_id add cluster id info to consout and profile files\n" " --clusterout_sort order msaout, consout, profile by decr abundance\n" " --clusters STRING output each cluster to a separate FASTA file\n" " --consout FILENAME output cluster consensus sequences to FASTA file\n" " --mothur_shared_out FN filename for OTU table output in mothur format\n" " --msaout FILENAME output multiple 
seq. alignments to FASTA file\n" " --otutabout FILENAME filename for OTU table output in classic format\n" " --profile FILENAME output sequence profile of each cluster to file\n" " --relabel STRING relabel centroids with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeorder sort accepted centroids by abundance, AGC\n" " --sizeout write cluster abundances to centroid file\n" " --uc FILENAME specify filename for UCLUST-like output\n" " --xsize strip abundance information in output\n" "\n" "Convert SFF to FASTQ\n" " --sff_convert FILENAME convert given SFF file to FASTQ format\n" " Parameters\n" " --sff_clip clip ends of sequences as indicated in file (no)\n" " --fastq_asciiout INT FASTQ output quality score ASCII base char (33)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " Output\n" " --fastqout FILENAME output converted sequences to given FASTQ file\n" "\n" "Dereplication and rereplication\n" " --derep_fulllength FILENAME dereplicate sequences in the given FASTA file\n" " --derep_id FILENAME dereplicate using both identifiers and sequences\n" " --derep_prefix FILENAME dereplicate sequences in file based on prefixes\n" " --derep_smallmem FILENAME dereplicate sequences in file using less memory\n" " --fastx_uniques FILENAME dereplicate sequences in the FASTA/FASTQ file\n" " --rereplicate FILENAME rereplicate sequences in the given FASTA file\n" " Parameters\n" " --maxuniquesize INT maximum abundance for output from dereplication\n" " --minuniquesize INT minimum abundance for output from dereplication\n" " --sizein propagate abundance annotation from input\n" " --strand plus|both dereplicate plus or both strands (plus)\n" 
" Output\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " --fastaout FILENAME output FASTA file (for fastx_uniques)\n" " --fastqout FILENAME output FASTQ file (for fastx_uniques)\n" " --output FILENAME output FASTA file (not for fastx_uniques)\n" " --relabel STRING relabel with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout write abundance annotation to output\n" " --tabbedout FILENAME write cluster info to tsv file for fastx_uniques\n" " --topn INT output only n most abundant sequences after derep\n" " --uc FILENAME filename for UCLUST-like dereplication output\n" " --xsize strip abundance information in derep output\n" "\n" "FASTA to FASTQ conversion\n" " --fasta2fastq FILENAME convert from FASTA to FASTQ, fake quality scores\n" " Parameters\n" " --fastq_asciiout INT FASTQ output quality score ASCII base char (33)\n" " --fastq_qmaxout INT fake quality score for FASTQ output (41)\n" " Output\n" " --fastqout FILENAME FASTQ output filename for converted sequences\n" "\n" "FASTQ format conversion\n" " --fastq_convert FILENAME convert between FASTQ file formats\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_asciiout INT FASTQ output quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qmin INT minimum base quality value for 
FASTQ input (0)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " Output\n" " --fastqout FILENAME FASTQ output filename for converted sequences\n" "\n" "FASTQ format detection and quality analysis\n" " --fastq_chars FILENAME analyse FASTQ file for version and quality range\n" " Parameters\n" " --fastq_tail INT min length of tails to count for fastq_chars (4)\n" "\n" "FASTQ quality statistics\n" " --fastq_stats FILENAME report statistics on FASTQ file\n" " --fastq_eestats FILENAME quality score and expected error statistics\n" " --fastq_eestats2 FILENAME expected error and length cutoff statistics\n" " Parameters\n" " --ee_cutoffs REAL,... fastq_eestats2 expected error cutoffs (0.5,1,2)\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --length_cutoffs INT,INT,INT fastq_eestats2 length (min,max,incr) (50,*,50)\n" " Output\n" " --log FILENAME output file for fastq_stats statistics\n" " --output FILENAME output file for fastq_eestats(2) statistics\n" "\n" "Masking (new)\n" " --fastx_mask FILENAME mask sequences in the given FASTA or FASTQ file\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --hardmask mask by replacing with N instead of lower case\n" " --max_unmasked_pct max unmasked %% of sequences to keep (100.0)\n" " --min_unmasked_pct min unmasked %% of sequences to keep (0.0)\n" " --qmask none|dust|soft mask seqs with dust, soft or no method (dust)\n" " Output\n" " --fastaout FILENAME output to specified FASTA file\n" " --fastqout FILENAME output to specified FASTQ file\n" "\n" "Masking (old)\n" " --maskfasta FILENAME mask sequences in the given FASTA file\n" " Parameters\n" " --hardmask mask by 
replacing with N instead of lower case\n" " --qmask none|dust|soft mask seqs with dust, soft or no method (dust)\n" " Output\n" " --output FILENAME output to specified FASTA file\n" "\n" "Orient sequences in forward or reverse direction\n" " --orient FILENAME orient sequences in given FASTA/FASTQ file\n" " Data\n" " --db FILENAME database of sequences in correct orientation\n" " --dbmask none|dust|soft mask db seqs with dust, soft or no method (dust)\n" " --qmask none|dust|soft mask query with dust, soft or no method (dust)\n" " --wordlength INT length of words used for matching 3-15 (12)\n" " Output\n" " --fastaout FILENAME FASTA output filename for oriented sequences\n" " --fastqout FILENAME FASTQ output filenamr for oriented sequences\n" " --notmatched FILENAME output filename for undetermined sequences\n" " --tabbedout FILENAME output filename for result information\n" "\n" "Paired-end reads joining\n" " --fastq_join FILENAME join paired-end reads into one sequence with gap\n" " Data\n" " --reverse FILENAME specify FASTQ file with reverse reads\n" " --join_padgap STRING sequence string used for padding (NNNNNNNN)\n" " --join_padgapq STRING quality string used for padding (IIIIIIII)\n" " Output\n" " --fastaout FILENAME FASTA output filename for joined sequences\n" " --fastqout FILENAME FASTQ output filename for joined sequences\n" "\n" "Paired-end reads merging\n" " --fastq_mergepairs FILENAME merge paired-end reads into one sequence\n" " Data\n" " --reverse FILENAME specify FASTQ file with reverse reads\n" " Parameters\n" " --fastq_allowmergestagger allow merging of staggered reads\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_maxdiffpct REAL maximum percentage diff. 
bases in overlap (100.0)\n" " --fastq_maxdiffs INT maximum number of different bases in overlap (10)\n" " --fastq_maxee REAL maximum expected error value for merged sequence\n" " --fastq_maxmergelen maximum length of entire merged sequence\n" " --fastq_maxns INT maximum number of N's\n" " --fastq_minlen INT minimum input read length after truncation (1)\n" " --fastq_minmergelen minimum length of entire merged sequence\n" " --fastq_minovlen minimum length of overlap between reads (10)\n" " --fastq_nostagger disallow merging of staggered reads (default)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" " --fastq_truncqual INT base quality value for truncation\n" " Output\n" " --eetabbedout FILENAME output error statistics to specified file\n" " --fastaout FILENAME FASTA output filename for merged sequences\n" " --fastaout_notmerged_fwd FN FASTA filename for non-merged forward sequences\n" " --fastaout_notmerged_rev FN FASTA filename for non-merged reverse sequences\n" " --fastq_eeout include expected errors (ee) in FASTQ output\n" " --fastqout FILENAME FASTQ output filename for merged sequences\n" " --fastqout_notmerged_fwd FN FASTQ filename for non-merged forward sequences\n" " --fastqout_notmerged_rev FN FASTQ filename for non-merged reverse sequences\n" " --label_suffix STRING suffix to append to label of merged sequences\n" " --xee remove expected errors (ee) info from output\n" "\n" "Pairwise alignment\n" " --allpairs_global FILENAME perform global alignment of all sequence pairs\n" " Output (most searching options also apply)\n" " --alnout FILENAME filename for human-readable alignment output\n" " --acceptall output all pairwise alignments\n" "\n" "Restriction site cutting\n" " --cut FILENAME filename of FASTA formatted input 
sequences\n" " Parameters\n" " --cut_pattern STRING pattern to match with ^ and _ at cut sites\n" " Output\n" " --fastaout FILENAME FASTA filename for fragments on forward strand\n" " --fastaout_rev FILENAME FASTA filename for fragments on reverse strand\n" " --fastaout_discarded FN FASTA filename for non-matching sequences\n" " --fastaout_discarded_rev FN FASTA filename for non-matching, reverse compl.\n" "\n" "Reverse complementation\n" " --fastx_revcomp FILENAME reverse-complement seqs in FASTA or FASTQ file\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " Output\n" " --fastaout FILENAME FASTA output filename\n" " --fastqout FILENAME FASTQ output filename\n" " --label_suffix STRING label to append to identifier in the output\n" "\n" "Searching\n" " --search_exact FILENAME filename of queries for exact match search\n" " --usearch_global FILENAME filename of queries for global alignment search\n" " Data\n" " --db FILENAME FASTA or UDB database (only FASTA for search_exact)\n" " Parameters\n" " --dbmask none|dust|soft mask db with dust, soft or no method (dust)\n" " --fulldp full dynamic programming alignment (always on)\n" " --gapext STRING penalties for gap extension (2I/1E)\n" " --gapopen STRING penalties for gap opening (20I/2E)\n" " --hardmask mask by replacing with N instead of lower case\n" " --id REAL reject if identity lower\n" " --iddef INT id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n" " --idprefix INT reject if first n nucleotides do not match\n" " --idsuffix INT reject if last n nucleotides do not match\n" " --lca_cutoff REAL fraction of matching hits required for LCA (1.0)\n" " --leftjust reject if terminal gaps at alignment left end\n" " --match INT score for match (2)\n" " --maxaccepts INT number of hits to accept and show per strand (1)\n" " --maxdiffs INT reject if 
more substitutions or indels\n" " --maxgaps INT reject if more indels\n" " --maxhits INT maximum number of hits to show (unlimited)\n" " --maxid REAL reject if identity higher\n" " --maxqsize INT reject if query abundance larger\n" " --maxqt REAL reject if query/target length ratio higher\n" " --maxrejects INT number of non-matching hits to consider (32)\n" " --maxsizeratio REAL reject if query/target abundance ratio higher\n" " --maxsl REAL reject if shorter/longer length ratio higher\n" " --maxsubs INT reject if more substitutions\n" " --mid REAL reject if percent identity lower, ignoring gaps\n" " --mincols INT reject if alignment length shorter\n" " --minqt REAL reject if query/target length ratio lower\n" " --minsizeratio REAL reject if query/target abundance ratio lower\n" " --minsl REAL reject if shorter/longer length ratio lower\n" " --mintsize INT reject if target abundance lower\n" " --minwordmatches INT minimum number of word matches required (12)\n" " --mismatch INT score for mismatch (-4)\n" " --n_mismatch consider aligning with N's as mismatches\n" " --pattern STRING option is ignored\n" " --qmask none|dust|soft mask query with dust, soft or no method (dust)\n" " --query_cov REAL reject if fraction of query seq. aligned lower\n" " --rightjust reject if terminal gaps at alignment right end\n" " --sizein propagate abundance annotation from input\n" " --self reject if labels identical\n" " --selfid reject if sequences identical\n" " --slots INT option is ignored\n" " --strand plus|both search plus or both strands (plus)\n" " --target_cov REAL reject if fraction of target seq. 
aligned lower\n" " --weak_id REAL include aligned hits with >= id; continue search\n" " --wordlength INT length of words for database index 3-15 (8)\n" " Output\n" " --alnout FILENAME filename for human-readable alignment output\n" " --biomout FILENAME filename for OTU table output in biom 1.0 format\n" " --blast6out FILENAME filename for blast-like tab-separated output\n" " --dbmatched FILENAME FASTA file for matching database sequences\n" " --dbnotmatched FILENAME FASTA file for non-matching database sequences\n" " --fastapairs FILENAME FASTA file with pairs of query and target\n" " --lcaout FILENAME output LCA of matching sequences to file\n" " --matched FILENAME FASTA file for matching query sequences\n" " --mothur_shared_out FN filename for OTU table output in mothur format\n" " --notmatched FILENAME FASTA file for non-matching query sequences\n" " --otutabout FILENAME filename for OTU table output in classic format\n" " --output_no_hits output non-matching queries to output files\n" " --rowlen INT width of alignment lines in alnout output (64)\n" " --samheader include a header in the SAM output file\n" " --samout FILENAME filename for SAM format output\n" " --sizeout write abundance annotation to dbmatched file\n" " --top_hits_only output only hits with identity equal to the best\n" " --uc FILENAME filename for UCLUST-like output\n" " --uc_allhits show all, not just top hit with uc output\n" " --userfields STRING fields to output in userout file\n" " --userout FILENAME filename for user-defined tab-separated output\n" "\n" "Shuffling and sorting\n" " --shuffle FILENAME shuffle order of sequences in FASTA file randomly\n" " --sortbylength FILENAME sort sequences by length in given FASTA file\n" " --sortbysize FILENAME abundance sort sequences in given FASTA file\n" " Parameters\n" " --maxsize INT maximum abundance for sortbysize\n" " --minsize INT minimum abundance for sortbysize\n" " --randseed INT seed for PRNG, zero to use random data source (0)\n" " 
--sizein propagate abundance annotation from input\n" " Output\n" " --output FILENAME output to specified FASTA file\n" " --relabel STRING relabel sequences with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout include abundance information when relabelling\n" " --topn INT output just first n sequences\n" " --xsize strip abundance information in output\n" "\n" "Subsampling\n" " --fastx_subsample FILENAME subsample sequences from given FASTA/FASTQ file\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --randseed INT seed for PRNG, zero to use random data source (0)\n" " --sample_pct REAL sampling percentage between 0.0 and 100.0\n" " --sample_size INT sampling size\n" " --sizein consider abundance info from input, do not ignore\n" " Output\n" " --fastaout FILENAME output subsampled sequences to FASTA file\n" " --fastaout_discarded FILE output non-subsampled sequences to FASTA file\n" " --fastqout FILENAME output subsampled sequences to FASTQ file\n" " --fastqout_discarded output non-subsampled sequences to FASTQ file\n" " --relabel STRING relabel sequences with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout update abundance information in output\n" " --xsize strip abundance information in output\n" "\n" "Taxonomic classification\n" " --sintax FILENAME classify sequences in given FASTA/FASTQ 
file\n" " Parameters\n" " --db FILENAME taxonomic reference db in given FASTA or UDB file\n" " --randseed INT seed for PRNG, zero to use random data source (0)\n" " --sintax_cutoff REAL confidence value cutoff level (0.0)\n" " --sintax_random use random sequence, not shortest, if equal match\n" " Output\n" " --tabbedout FILENAME write results to given tab-delimited file\n" "\n" "Trimming and filtering\n" " --fastx_filter FILENAME trim and filter sequences in FASTA/FASTQ file\n" " --fastq_filter FILENAME trim and filter sequences in FASTQ file\n" " --reverse FILENAME FASTQ file with other end of paired-end reads\n" " Parameters\n" " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" " --fastq_maxee REAL discard if expected error value is higher\n" " --fastq_maxee_rate REAL discard if expected error rate is higher\n" " --fastq_maxlen INT discard if length of sequence is longer\n" " --fastq_maxns INT discard if number of N's is higher\n" " --fastq_minlen INT discard if length of sequence is shorter\n" " --fastq_minqual INT discard if any base quality value lower (0)\n" " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" " --fastq_stripleft INT delete given number of bases from the 5' end\n" " --fastq_stripright INT delete given number of bases from the 3' end\n" " --fastq_truncee REAL truncate to given maximum expected error\n" " --fastq_truncee_rate REAL truncate to given maximum expected error rate\n" " --fastq_trunclen INT truncate to given length (discard if shorter)\n" " --fastq_trunclen_keep INT truncate to given length (keep if shorter)\n" " --fastq_truncqual INT truncate to given minimum base quality\n" " --maxsize INT discard if abundance of sequence is above\n" " --minsize INT discard if abundance of sequence is below\n" " Output\n" " --eeout include expected errors in output\n" " --fastaout FN FASTA filename for passed sequences\n" " --fastaout_discarded 
FN FASTA filename for discarded sequences\n" " --fastaout_discarded_rev FN FASTA filename for discarded reverse sequences\n" " --fastaout_rev FN FASTA filename for passed reverse sequences\n" " --fastqout FN FASTQ filename for passed sequences\n" " --fastqout_discarded FN FASTQ filename for discarded sequences\n" " --fastqout_discarded_rev FN FASTQ filename for discarded reverse sequences\n" " --fastqout_rev FN FASTQ filename for passed reverse sequences\n" " --relabel STRING relabel filtered sequences with given prefix\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel filtered sequences with md5 digest\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel filtered sequences with sha1 digest\n" " --sizeout include abundance information when relabelling\n" " --xee remove expected errors (ee) info from output\n" " --xsize strip abundance information in output\n" "\n" "UDB files\n" " --makeudb_usearch FILENAME make UDB file from given FASTA file\n" " --udb2fasta FILENAME output FASTA file from given UDB file\n" " --udbinfo FILENAME show information about UDB file\n" " --udbstats FILENAME report statistics about indexed words in UDB file\n" " Parameters\n" " --dbmask none|dust|soft mask db with dust, soft or no method (dust)\n" " --hardmask mask by replacing with N instead of lower case\n" " --wordlength INT length of words for database index 3-15 (8)\n" " Output\n" " --output FILENAME UDB or FASTA output file\n" ); } auto cmd_allpairs_global(struct Parameters const & parameters) -> void { /* check options */ if ((opt_alnout == nullptr) and (opt_userout == nullptr) and (parameters.opt_uc == nullptr) and (opt_blast6out == nullptr) and (opt_matched == nullptr) and (opt_notmatched == nullptr) and (opt_samout == nullptr) and (opt_fastapairs == nullptr)) { fatal("No output files specified"); } if (not ((opt_acceptall != 0) or ((opt_id >= 0.0) and (opt_id <= 1.0)))) { fatal("Specify either 
--acceptall or --id with an identity from 0.0 to 1.0"); } allpairs_global(parameters, cmdline, prog_header.data()); } auto cmd_usearch_global(struct Parameters const & parameters) -> void { /* check options */ if ((opt_alnout == nullptr) and (opt_userout == nullptr) and (parameters.opt_uc == nullptr) and (opt_blast6out == nullptr) and (opt_matched == nullptr) and (opt_notmatched == nullptr) and (parameters.opt_dbmatched == nullptr) and (parameters.opt_dbnotmatched == nullptr) and (opt_samout == nullptr) and (opt_otutabout == nullptr) and (opt_biomout == nullptr) and (opt_mothur_shared_out == nullptr) and (opt_fastapairs == nullptr) and (opt_lcaout == nullptr)) { fatal("No output files specified"); } if (parameters.opt_db == nullptr) { fatal("Database filename not specified with --db"); } if ((opt_id < 0.0) or (opt_id > 1.0)) { fatal("Identity between 0.0 and 1.0 must be specified with --id"); } usearch_global(parameters, cmdline, prog_header.data()); } auto cmd_search_exact(struct Parameters const & parameters) -> void { /* check options */ if ((opt_alnout == nullptr) and (opt_userout == nullptr) and (parameters.opt_uc == nullptr) and (opt_blast6out == nullptr) and (opt_matched == nullptr) and (opt_notmatched == nullptr) and (parameters.opt_dbmatched == nullptr) and (parameters.opt_dbnotmatched == nullptr) and (opt_samout == nullptr) and (opt_otutabout == nullptr) and (opt_biomout == nullptr) and (opt_mothur_shared_out == nullptr) and (opt_fastapairs == nullptr) and (opt_lcaout == nullptr)) { fatal("No output files specified"); } if (parameters.opt_db == nullptr) { fatal("Database filename not specified with --db"); } search_exact(parameters, cmdline, prog_header.data()); } auto cmd_subsample(struct Parameters const & parameters) -> void { if ((opt_fastaout == nullptr) and (opt_fastqout == nullptr)) { fatal("Specify output files for subsampling with --fastaout and/or --fastqout"); } if ((opt_sample_pct > 0) == (opt_sample_size > 0)) { fatal("Specify either 
--sample_pct or --sample_size"); } subsample(parameters); } auto cmd_none(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } fprintf(stderr, "For more help, please enter: %s --help\n" "For further details, please consult the manual by entering: man vsearch\n" "\n" "Selected command examples:\n" "\n" "vsearch --allpairs_global FILENAME --id 0.5 --alnout FILENAME\n" "vsearch --cluster_size FILENAME --id 0.97 --centroids FILENAME\n" "vsearch --cut FILENAME --cut_pattern G^AATT_C --fastaout FILENAME\n" "vsearch --fastq_chars FILENAME\n" "vsearch --fastq_convert FILENAME --fastqout FILENAME --fastq_ascii 64\n" "vsearch --fastq_eestats FILENAME --output FILENAME\n" "vsearch --fastq_eestats2 FILENAME --output FILENAME\n" "vsearch --fastq_mergepairs FILENAME --reverse FILENAME --fastqout FILENAME\n" "vsearch --fastq_stats FILENAME --log FILENAME\n" "vsearch --fastx_filter FILENAME --fastaout FILENAME --fastq_trunclen 100\n" "vsearch --fastx_getseq FILENAME --label LABEL --fastaout FILENAME\n" "vsearch --fastx_mask FILENAME --fastaout FILENAME\n" "vsearch --fastx_revcomp FILENAME --fastqout FILENAME\n" "vsearch --fastx_subsample FILENAME --fastaout FILENAME --sample_pct 1\n" "vsearch --fastx_uniques FILENAME --fastaout FILENAME\n" "vsearch --makeudb_usearch FILENAME --output FILENAME\n" "vsearch --search_exact FILENAME --db FILENAME --alnout FILENAME\n" "vsearch --sff_convert FILENAME --output FILENAME --sff_clip\n" "vsearch --shuffle FILENAME --output FILENAME\n" "vsearch --sintax FILENAME --db FILENAME --tabbedout FILENAME\n" "vsearch --sortbylength FILENAME --output FILENAME\n" "vsearch --sortbysize FILENAME --output FILENAME\n" "vsearch --uchime_denovo FILENAME --nonchimeras FILENAME\n" "vsearch --uchime_ref FILENAME --db FILENAME --nonchimeras FILENAME\n" "vsearch --usearch_global FILENAME --db FILENAME --id 0.97 --alnout FILENAME\n" "\n" "Other commands: cluster_fast, cluster_smallmem, cluster_unoise, cut,\n" " derep_id, 
derep_fulllength, derep_prefix, derep_smallmem,\n" " fasta2fastq, fastq_filter, fastq_join, fastx_getseqs,\n" " fastx_getsubseq, maskfasta, orient, rereplicate, uchime2_denovo,\n" " uchime3_denovo, udb2fasta, udbinfo, udbstats, version\n" "\n", parameters.progname); } auto cmd_cluster(struct Parameters const & parameters) -> void { if ((opt_alnout == nullptr) and (opt_userout == nullptr) and (parameters.opt_uc == nullptr) and (opt_blast6out == nullptr) and (opt_matched == nullptr) and (opt_notmatched == nullptr) and (opt_centroids == nullptr) and (opt_clusters == nullptr) and (opt_consout == nullptr) and (opt_msaout == nullptr) and (opt_samout == nullptr) and (opt_profile == nullptr) and (opt_otutabout == nullptr) and (opt_biomout == nullptr) and (opt_mothur_shared_out == nullptr)) { fatal("No output files specified"); } if (parameters.opt_cluster_unoise == nullptr) { if ((opt_id < 0.0) or (opt_id > 1.0)) { fatal("Identity between 0.0 and 1.0 must be specified with --id"); } } if (parameters.opt_cluster_fast != nullptr) { cluster_fast(cmdline, prog_header.data()); } else if (parameters.opt_cluster_smallmem != nullptr) { cluster_smallmem(cmdline, prog_header.data()); } else if (parameters.opt_cluster_size != nullptr) { cluster_size(cmdline, prog_header.data()); } else if (parameters.opt_cluster_unoise != nullptr) { cluster_unoise(cmdline, prog_header.data()); } } auto cmd_chimera(struct Parameters const & parameters) -> void { if ((opt_chimeras == nullptr) and (opt_nonchimeras == nullptr) and (opt_uchimeout == nullptr) and (opt_uchimealns == nullptr) and (opt_tabbedout == nullptr) and (opt_alnout == nullptr)) { fatal("No output files specified"); } if ((parameters.opt_uchime_ref != nullptr) and (parameters.opt_db == nullptr)) { fatal("Database filename not specified with --db"); } if (opt_abskew < 1.0) { fatal("Argument to --abskew must be >= 1.0"); } if (opt_xn <= 1.0) { fatal("Argument to --xn must be > 1"); } if (opt_dn <= 0.0) { fatal("Argument to --dn must be > 
0"); } if ((parameters.opt_uchime2_denovo == nullptr) and (parameters.opt_uchime3_denovo == nullptr)) { if (opt_mindiffs <= 0) { fatal("Argument to --mindiffs must be > 0"); } if (opt_mindiv <= 0.0) { fatal("Argument to --mindiv must be > 0"); } if (opt_minh <= 0.0) { fatal("Argument to --minh must be > 0"); } } chimera(parameters); } auto cmd_fastq_join(struct Parameters & parameters) -> void { if (is_not_ASCII(parameters.opt_join_padgap)) { fatal("Option --join_padgap contains non-ASCII characters"); } if (is_not_ASCII(parameters.opt_join_padgapq)) { fatal("Option --join_padgapq contains non-ASCII characters"); } if ((not parameters.opt_join_padgapq_set_by_user) and (parameters.opt_fastq_ascii != default_ascii_offset)) { parameters.opt_join_padgapq = alternative_quality_padding; } fastq_join(parameters); } auto cmd_fastq_mergepairs(struct Parameters const & parameters) -> void { if (parameters.opt_reverse == nullptr) { fatal("No reverse reads file specified with --reverse"); } if ((parameters.opt_fastqout == nullptr) and (parameters.opt_fastaout == nullptr) and (opt_fastqout_notmerged_fwd == nullptr) and (opt_fastqout_notmerged_rev == nullptr) and (opt_fastaout_notmerged_fwd == nullptr) and (opt_fastaout_notmerged_rev == nullptr) and (opt_eetabbedout == nullptr)) { fatal("No output files specified"); } if (opt_fastq_maxdiffs < 0) { fatal("Argument to --fastq_maxdiffs must be positive"); } fastq_mergepairs(parameters); } auto fill_prog_header() -> void { static constexpr auto one_gigabyte = double{1024 * 1024 * 1024}; auto const * const format = "%s v%s_%s, %.1fGB RAM, %ld cores"; static_cast(snprintf( prog_header.data(), max_line_length, format, PROG_NAME, PROG_VERSION, PROG_ARCH, static_cast(arch_get_memtotal()) / one_gigabyte, arch_get_cores())); } auto getentirecommandline(int argc, char** argv) -> void { int len = 0; for (int i = 0; i < argc; i++) { len += std::strlen(argv[i]); } cmdline = (char *) xmalloc(len + argc); cmdline[0] = 0; for (int i = 0; i < 
argc; i++) { if (i > 0) { std::strcat(cmdline, " "); } std::strcat(cmdline, argv[i]); } } auto show_header(struct Parameters const & parameters) -> void { if (parameters.opt_quiet) { return ; } fprintf(stderr, "%s\n", prog_header.data()); fprintf(stderr, "https://github.com/torognes/vsearch\n"); fprintf(stderr, "\n"); } auto main(int argc, char** argv) -> int { struct Parameters parameters; fill_prog_header(); getentirecommandline(argc, argv); parameters.command_line = std::string{cmdline, std::strlen(cmdline)}; cpu_features_detect(); args_init(argc, argv, parameters); if (parameters.opt_log != nullptr) { fp_log = fopen_output(opt_log); parameters.fp_log = fp_log; if (fp_log == nullptr) { fatal("Unable to open log file for writing"); } fprintf(fp_log, "%s\n", prog_header.data()); fprintf(fp_log, "%s\n", cmdline); std::array time_string {{}}; time_start = time(nullptr); struct tm const * tm_start = localtime(& time_start); strftime(time_string.data(), time_string.size(), "%Y-%m-%dT%H:%M:%S", tm_start); fprintf(fp_log, "Started %s\n", time_string.data()); } random_init(); show_header(parameters); dynlibs_open(); #ifdef __x86_64__ if (sse2_present == 0) { fatal("Sorry, this program requires a cpu with SSE2."); } #endif if (parameters.opt_help) { cmd_help(parameters); } else if (parameters.opt_allpairs_global != nullptr) { opt_strand = 1; parameters.opt_strand = false; opt_uc_allhits = 1; parameters.opt_uc_allhits = true; cmd_allpairs_global(parameters); } else if (parameters.opt_usearch_global != nullptr) { cmd_usearch_global(parameters); } else if (parameters.opt_sortbysize != nullptr) { sortbysize(parameters); } else if (parameters.opt_sortbylength != nullptr) { sortbylength(parameters); } else if (parameters.opt_derep_fulllength != nullptr) { derep(parameters, parameters.opt_derep_fulllength, false); } else if (parameters.opt_derep_prefix != nullptr) { derep_prefix(parameters); } else if (parameters.opt_derep_smallmem != nullptr) { derep_smallmem(parameters); } 
else if (parameters.opt_derep_id != nullptr) { derep(parameters, parameters.opt_derep_id, true); } else if (parameters.opt_shuffle != nullptr) { shuffle(parameters); } else if (parameters.opt_fastx_subsample != nullptr) { cmd_subsample(parameters); } else if (parameters.opt_maskfasta != nullptr) { maskfasta(parameters); } else if ((parameters.opt_cluster_smallmem != nullptr) or (parameters.opt_cluster_fast != nullptr) or (parameters.opt_cluster_size != nullptr) or (parameters.opt_cluster_unoise != nullptr)) { cmd_cluster(parameters); } else if ((parameters.opt_uchime_denovo != nullptr) or (parameters.opt_uchime_ref != nullptr) or (parameters.opt_uchime2_denovo != nullptr) or (parameters.opt_uchime3_denovo != nullptr) or (parameters.opt_chimeras_denovo != nullptr)) { cmd_chimera(parameters); } else if (parameters.opt_fastq_chars != nullptr) { fastq_chars(parameters); } else if (parameters.opt_fastq_stats != nullptr) { fastq_stats(parameters); } else if (parameters.opt_fastq_filter != nullptr) { fastq_filter(parameters); } else if (parameters.opt_fastx_filter != nullptr) { fastx_filter(parameters); } else if (parameters.opt_fastx_revcomp != nullptr) { fastx_revcomp(parameters); } else if (parameters.opt_search_exact != nullptr) { opt_id = 1.0; cmd_search_exact(parameters); } else if (parameters.opt_fastx_mask != nullptr) { fastx_mask(parameters); } else if (parameters.opt_fastq_convert != nullptr) { fastq_convert(parameters); } else if (parameters.opt_fastq_mergepairs != nullptr) { cmd_fastq_mergepairs(parameters); } else if (parameters.opt_fastq_eestats != nullptr) { fastq_eestats(parameters); } else if (parameters.opt_fastq_eestats2 != nullptr) { fastq_eestats2(parameters); } else if (parameters.opt_fastq_join != nullptr) { cmd_fastq_join(parameters); } else if (parameters.opt_rereplicate != nullptr) { opt_xsize = true; parameters.opt_xsize = true; rereplicate(parameters); } else if (parameters.opt_version) { cmd_version(parameters); } else if 
(parameters.opt_makeudb_usearch != nullptr) { udb_make(parameters); } else if (parameters.opt_udb2fasta != nullptr) { udb_fasta(parameters); } else if (parameters.opt_udbinfo != nullptr) { udb_info(parameters); } else if (parameters.opt_udbstats != nullptr) { udb_stats(parameters); } else if (parameters.opt_sintax != nullptr) { sintax(parameters); } else if (parameters.opt_sff_convert != nullptr) { sff_convert(parameters); } else if (parameters.opt_fastx_getseq != nullptr) { fastx_getseq(parameters); } else if (parameters.opt_fastx_getseqs != nullptr) { fastx_getseqs(parameters); } else if (parameters.opt_fastx_getsubseq != nullptr) { fastx_getsubseq(parameters); } else if (parameters.opt_cut != nullptr) { cut(parameters); } else if (parameters.opt_orient != nullptr) { orient(parameters); } else if (parameters.opt_fasta2fastq != nullptr) { fasta2fastq(parameters); } else if (parameters.opt_fastx_uniques != nullptr) { derep(parameters, parameters.opt_fastx_uniques, false); } else { cmd_none(parameters); } if (parameters.opt_log != nullptr) { time_finish = time(nullptr); struct tm const * tm_finish = localtime(& time_finish); std::array time_string {{}}; strftime(time_string.data(), time_string.size(), "%Y-%m-%dT%H:%M:%S", tm_finish); fprintf(fp_log, "\n"); fprintf(fp_log, "Finished %s", time_string.data()); double const time_diff = difftime(time_finish, time_start); fprintf(fp_log, "\n"); fprintf(fp_log, "Elapsed time %02.0lf:%02.0lf\n", floor(time_diff / 60.0), floor(time_diff - (60.0 * floor(time_diff / 60.0)))); double const maxmem = arch_get_memused() / 1048576.0; if (maxmem < 1024.0) { fprintf(fp_log, "Max memory %.1lfMB\n", maxmem); } else { fprintf(fp_log, "Max memory %.1lfGB\n", maxmem / 1024.0); } fclose(fp_log); } if (opt_ee_cutoffs_values != nullptr) { xfree(opt_ee_cutoffs_values); } opt_ee_cutoffs_values = nullptr; xfree(cmdline); dynlibs_close(); } 
vsearch-2.30.1/src/vsearch.h000066400000000000000000000407701506776541700156550ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define _GNU_SOURCE 1 #define __STDC_CONSTANT_MACROS 1 #define __STDC_FORMAT_MACROS 1 #define __STDC_LIMIT_MACROS 1 #define __restrict #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include // std::size_t #include #include #include // replace with std::chrono #include #include #include #include #include #include #include #include // refactoring: redundant with ? #include #include #include #include #define PROG_NAME PACKAGE #define PROG_VERSION PACKAGE_VERSION #ifdef __x86_64__ #define PROG_CPU "x86_64" #include #elif __PPC__ #ifdef __LITTLE_ENDIAN__ #define PROG_CPU "ppc64le" #include #undef bool #else #error Big endian ppc64 CPUs not supported #endif #elif __aarch64__ #define PROG_CPU "aarch64" #include #else #define PROG_CPU "simde" #define SIMDE_ENABLE_NATIVE_ALIASES #include #endif #ifdef _WIN32 #define PROG_OS "win" #include #include #include #elif __APPLE__ #define PROG_OS "macos" #include #include #elif __linux__ #define PROG_OS "linux" #include #include #elif __FreeBSD__ #define PROG_OS "freebsd" #include #include #elif __NetBSD__ #define PROG_OS "netbsd" #include #include /* Alters behavior, but NetBSD 7 does not have getopt_long_only() */ #define getopt_long_only getopt_long #else #define PROG_OS "unknown" #include #include #endif #define PROG_ARCH PROG_OS "_" PROG_CPU #ifdef HAVE_DLFCN_H #include #endif #ifdef HAVE_ZLIB_H #include #endif #ifdef HAVE_BZLIB_H #include #endif #include 
"city.h" #include "sha1.h" #include "arch.h" // TODO: rename to operating_systems.h #include "util.h" #include "db.h" #include "searchcore.h" #include "results.h" #include "cpu.h" #include "fastx.h" #include "fasta.h" #include "fastq.h" #include "dbhash.h" /* options */ extern bool opt_bzip2_decompress; extern bool opt_clusterout_id; extern bool opt_clusterout_sort; extern bool opt_eeout; extern bool opt_fasta_score; extern bool opt_fastq_allowmergestagger; extern bool opt_fastq_eeout; extern bool opt_fastq_nostagger; extern bool opt_gzip_decompress; extern bool opt_label_substr_match; extern bool opt_lengthout; extern bool opt_n_mismatch; extern bool opt_no_progress; extern bool opt_quiet; extern bool opt_relabel_keep; extern bool opt_relabel_md5; extern bool opt_relabel_self; extern bool opt_relabel_sha1; extern bool opt_samheader; extern bool opt_sintax_random; extern bool opt_sizein; extern bool opt_sizeorder; extern bool opt_sizeout; extern bool opt_xee; extern bool opt_xlength; extern bool opt_xsize; extern char * opt_alnout; extern char * opt_biomout; extern char * opt_blast6out; extern char * opt_borderline; extern char * opt_centroids; extern char * opt_chimeras; extern char * opt_chimeras_denovo; extern char * opt_cluster_fast; extern char * opt_cluster_size; extern char * opt_cluster_smallmem; extern char * opt_cluster_unoise; extern char * opt_clusters; extern char * opt_consout; extern char * opt_db; extern char * opt_dbmatched; extern char * opt_dbnotmatched; extern char * opt_eetabbedout; extern char * opt_fastaout; extern char * opt_fastaout_discarded; extern char * opt_fastaout_discarded_rev; extern char * opt_fastaout_notmerged_fwd; extern char * opt_fastaout_notmerged_rev; extern char * opt_fastaout_rev; extern char * opt_fastapairs; extern char * opt_fastq_convert; extern char * opt_fastqout; extern char * opt_fastqout_discarded; extern char * opt_fastqout_discarded_rev; extern char * opt_fastqout_rev; extern char * opt_fastqout_notmerged_fwd; 
extern char * opt_fastqout_notmerged_rev; extern char * opt_label; extern char * opt_label_suffix; extern char * opt_labels; extern char * opt_label_word; extern char * opt_label_words; extern char * opt_label_field; extern char * opt_lcaout; extern char * opt_log; extern char * opt_matched; extern char * opt_mothur_shared_out; extern char * opt_msaout; extern char * opt_nonchimeras; extern char * opt_notmatched; extern char * opt_notmatchedfq; extern char * opt_otutabout; extern char * opt_output; extern char * opt_pattern; extern char * opt_profile; extern char * opt_qsegout; extern char * opt_relabel; extern char * opt_reverse; extern char * opt_samout; extern char * opt_sample; extern char * opt_tabbedout; extern char * opt_tsegout; extern char * opt_uc; extern char * opt_uchime2_denovo; extern char * opt_uchime3_denovo; extern char * opt_uchime_denovo; extern char * opt_uchime_ref; extern char * opt_uchimealns; extern char * opt_uchimeout; extern char * opt_userout; extern double * opt_ee_cutoffs_values; extern double opt_abskew; extern double opt_chimeras_diff_pct; extern double opt_dn; extern double opt_fastq_maxdiffpct; extern double opt_fastq_maxee; extern double opt_fastq_maxee_rate; extern double opt_fastq_truncee; extern double opt_fastq_truncee_rate; extern double opt_id; extern double opt_lca_cutoff; extern double opt_maxid; extern double opt_maxqt; extern double opt_maxsizeratio; extern double opt_maxsl; extern double opt_mid; extern double opt_mindiv; extern double opt_minh; extern double opt_minqt; extern double opt_minsizeratio; extern double opt_minsl; extern double opt_query_cov; extern double opt_sample_pct; extern double opt_sintax_cutoff; extern double opt_target_cov; extern double opt_unoise_alpha; extern double opt_weak_id; extern double opt_xn; extern int opt_acceptall; extern int opt_alignwidth; extern int opt_chimeras_length_min; extern int opt_chimeras_parents_max; extern int opt_chimeras_parts; extern int opt_cons_truncate; extern int 
opt_ee_cutoffs_count; extern int opt_gap_extension_query_interior; extern int opt_gap_extension_query_left; extern int opt_gap_extension_query_right; extern int opt_gap_extension_target_interior; extern int opt_gap_extension_target_left; extern int opt_gap_extension_target_right; extern int opt_gap_open_query_interior; extern int opt_gap_open_query_left; extern int opt_gap_open_query_right; extern int opt_gap_open_target_interior; extern int opt_gap_open_target_left; extern int opt_gap_open_target_right; extern int opt_length_cutoffs_increment; extern int opt_length_cutoffs_longest; extern int opt_length_cutoffs_shortest; extern int opt_mindiffs; extern int opt_slots; extern int opt_uchimeout5; extern int opt_usersort; extern int64_t opt_dbmask; extern int64_t opt_fasta_width; extern int64_t opt_fastq_ascii; extern int64_t opt_fastq_asciiout; extern int64_t opt_fastq_maxdiffs; extern int64_t opt_fastq_maxlen; extern int64_t opt_fastq_maxmergelen; extern int64_t opt_fastq_maxns; extern int64_t opt_fastq_minlen; extern int64_t opt_fastq_minmergelen; extern int64_t opt_fastq_minovlen; extern int64_t opt_fastq_minqual; extern int64_t opt_fastq_qmax; extern int64_t opt_fastq_qmaxout; extern int64_t opt_fastq_qmin; extern int64_t opt_fastq_qminout; extern int64_t opt_fastq_stripleft; extern int64_t opt_fastq_stripright; extern int64_t opt_fastq_trunclen; extern int64_t opt_fastq_trunclen_keep; extern int64_t opt_fastq_truncqual; extern int64_t opt_fulldp; extern int64_t opt_hardmask; extern int64_t opt_iddef; extern int64_t opt_idprefix; extern int64_t opt_idsuffix; extern int64_t opt_leftjust; extern int64_t opt_match; extern int64_t opt_maxaccepts; extern int64_t opt_maxdiffs; extern int64_t opt_maxgaps; extern int64_t opt_maxhits; extern int64_t opt_maxqsize; extern int64_t opt_maxrejects; extern int64_t opt_maxseqlength; extern int64_t opt_maxsize; extern int64_t opt_maxsubs; extern int64_t opt_maxuniquesize; extern int64_t opt_mincols; extern int64_t 
opt_minseqlength; extern int64_t opt_minsize; extern int64_t opt_mintsize; extern int64_t opt_minuniquesize; extern int64_t opt_minwordmatches; extern int64_t opt_mismatch; extern int64_t opt_notrunclabels; extern int64_t opt_output_no_hits; extern int64_t opt_qmask; extern int64_t opt_randseed; extern int64_t opt_rightjust; extern int64_t opt_rowlen; extern int64_t opt_sample_size; extern int64_t opt_self; extern int64_t opt_selfid; extern int64_t opt_strand; // used in pthread functions extern int64_t opt_subseq_start; extern int64_t opt_subseq_end; extern int64_t opt_threads; extern int64_t opt_top_hits_only; extern int64_t opt_topn; extern int64_t opt_uc_allhits; extern int64_t opt_wordlength; extern int64_t altivec_present; extern int64_t mmx_present; extern int64_t sse_present; extern int64_t sse2_present; extern int64_t sse3_present; extern int64_t ssse3_present; extern int64_t sse41_present; extern int64_t sse42_present; extern int64_t popcnt_present; extern int64_t avx_present; extern int64_t avx2_present; extern std::FILE * fp_log; constexpr int64_t default_fasta_width = 80; constexpr int64_t default_fastq_tail = 4; constexpr int64_t default_maxseqlength = 50000; constexpr int64_t default_ascii_offset = 33; constexpr char alternative_ascii_offset = 64; constexpr auto dbl_max = std::numeric_limits::max(); constexpr int64_t default_max_quality = 41; constexpr auto int64_max = std::numeric_limits::max(); std::string const default_quality_padding = "IIIIIIII"; // Q40 with an offset of 33 std::string const alternative_quality_padding = "hhhhhhhh"; // Q40 with an offset of 64 std::string const default_sequence_padding = "NNNNNNNN"; struct Parameters { std::string prog_header; std::string command_line; char * opt_allpairs_global = nullptr; char * opt_chimeras_denovo = nullptr; char * opt_cluster_fast = nullptr; char * opt_cluster_size = nullptr; char * opt_cluster_smallmem = nullptr; char * opt_cluster_unoise = nullptr; char * opt_cut = nullptr; std::string 
opt_cut_pattern; char * opt_db = nullptr; char * opt_dbmatched = nullptr; char * opt_dbnotmatched = nullptr; char * opt_derep_fulllength = nullptr; char * opt_derep_id = nullptr; char * opt_derep_prefix = nullptr; char * opt_derep_smallmem = nullptr; char * opt_fasta2fastq = nullptr; char * opt_fastaout = nullptr; char * opt_fastaout_rev = nullptr; char * opt_fastaout_discarded = nullptr; char * opt_fastaout_discarded_rev = nullptr; char * opt_fastq_chars = nullptr; char * opt_fastq_convert = nullptr; char * opt_fastq_eestats2 = nullptr; char * opt_fastq_eestats = nullptr; char * opt_fastq_filter = nullptr; char * opt_fastq_join = nullptr; char * opt_fastq_mergepairs = nullptr; char * opt_fastq_stats = nullptr; char * opt_fastqout = nullptr; char * opt_fastqout_rev = nullptr; char * opt_fastqout_discarded = nullptr; char * opt_fastqout_discarded_rev = nullptr; char * opt_fastx_filter = nullptr; char * opt_fastx_getseq = nullptr; char * opt_fastx_getseqs = nullptr; char * opt_fastx_getsubseq = nullptr; char * opt_fastx_mask = nullptr; char * opt_fastx_revcomp = nullptr; char * opt_fastx_subsample = nullptr; char * opt_fastx_uniques = nullptr; std::string opt_join_padgap = default_sequence_padding; std::string opt_join_padgapq = default_quality_padding; char * opt_label_suffix = nullptr; char * opt_log = nullptr; char * opt_makeudb_usearch = nullptr; char * opt_maskfasta = nullptr; char * opt_orient = nullptr; char * opt_output = nullptr; char * opt_relabel = nullptr; char * opt_rereplicate = nullptr; char * opt_reverse = nullptr; char * opt_sample = nullptr; char * opt_search_exact = nullptr; char * opt_sff_convert = nullptr; char * opt_shuffle = nullptr; char * opt_sintax = nullptr; char * opt_sortbylength = nullptr; char * opt_sortbysize = nullptr; char * opt_tabbedout = nullptr; char * opt_uc = nullptr; char * opt_uchime2_denovo = nullptr; char * opt_uchime3_denovo = nullptr; char * opt_uchime_denovo = nullptr; char * opt_uchime_ref = nullptr; char * 
opt_udb2fasta = nullptr; char * opt_udbinfo = nullptr; char * opt_udbstats = nullptr; char * opt_usearch_global = nullptr; char * progname = nullptr; // refactoring: unused? std::FILE * fp_log = nullptr; double opt_fastq_truncee_rate = dbl_max; double opt_max_unmasked_pct = 100.0; double opt_min_unmasked_pct = 0; double opt_sample_pct = 0; int64_t opt_fasta_width = default_fasta_width; int64_t opt_fastq_ascii = default_ascii_offset; int64_t opt_fastq_asciiout = default_ascii_offset; int64_t opt_fastq_qmax = default_max_quality; int64_t opt_fastq_qmaxout = default_max_quality; int64_t opt_fastq_qmin = 0; int64_t opt_fastq_qminout = 0; int64_t opt_fastq_minqual = 0; int64_t opt_fastq_tail = default_fastq_tail; int64_t opt_maxseqlength = default_maxseqlength; int64_t opt_maxsize = int64_max; int64_t opt_maxuniquesize = int64_max; int64_t opt_minseqlength = -1; int64_t opt_minsize = 0; int64_t opt_minuniquesize = 1; int64_t opt_qmask = 1; // MASK_DUST int64_t opt_randseed = 0; int64_t opt_sample_size = 0; int64_t opt_threads = 0; int64_t opt_topn = int64_max; bool opt_bzip2_decompress = false; bool opt_clusterout_id = false; bool opt_clusterout_sort = false; bool opt_eeout = false; bool opt_fasta_score = false; bool opt_fastq_allowmergestagger = false; bool opt_fastq_eeout = false; bool opt_fastq_nostagger = true; bool opt_fastq_qout_max = false; bool opt_gzip_decompress = false; bool opt_hardmask = false; bool opt_label_substr_match = false; bool opt_lengthout = false; bool opt_no_progress = false; bool opt_help = false; bool opt_join_padgapq_set_by_user = false; bool opt_notrunclabels = false; bool opt_quiet = false; bool opt_relabel_keep = false; bool opt_relabel_md5 = false; bool opt_relabel_self = false; bool opt_relabel_sha1 = false; bool opt_samheader = false; bool opt_sff_clip = false; bool opt_sintax_random = false; bool opt_sizein = false; bool opt_sizeorder = false; bool opt_sizeout = false; bool opt_stderr_is_tty = false; bool opt_strand = false; bool 
opt_uc_allhits = false; bool opt_version = false; bool opt_xee = false; bool opt_xlength = false; bool opt_xsize = false; }; vsearch-2.30.1/src/xstring.h000066400000000000000000000056411506776541700157160ustar00rootroot00000000000000/* VSEARCH: a versatile open source tool for metagenomics Copyright (C) 2014-2025, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , Department of Informatics, University of Oslo, PO Box 1080 Blindern, NO-0316 Oslo, Norway This software is dual-licensed and available under a choice of one of two licenses, either under the terms of the GNU General Public License version 3 or the BSD 2-Clause License. GNU General Public License version 3 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// */
// (The marker on the line above ends the license block comment that is
// opened on the preceding line of this file.)

#include <string>  // std::string, std::to_string


// xstring: a small append-only text buffer built on std::string.
//
// Text is accumulated piecewise: run lengths as decimal digits
// (add_d) and single operation characters (add_c, used for CIGAR or
// MD characters). The result is read back as a C string with c_str().
class xstring {
private:
  std::string string_;  // text accumulated so far

public:
  xstring() = default;

  // Append the decimal representation of 'runlength'.
  auto add_d(int const runlength) -> void {
    string_ += std::to_string(runlength);
  }

  // Append one character (a CIGAR or MD character).
  auto add_c(char const operation) -> void {
    string_.push_back(operation);
  }

  // Discard all accumulated text; the buffer is empty afterwards.
  auto clear() -> void {
    string_.clear();
  }

  // Return a pointer to the null-terminated accumulated text.
  //
  // The pointer refers to the internal std::string, so it is only
  // valid until the next call to add_d(), add_c() or clear().
  // const-qualified: reading the buffer does not modify it, and the
  // accessor is thereby usable through a 'const xstring'.
  auto c_str() const -> char const * {
    return string_.c_str();
  }
};