pax_global_header00006660000000000000000000000064130256603040014511gustar00rootroot0000000000000052 comment=b779a4ef1b499f88fdbffa5a4133c2e4488c6123 bioperl-run-release-1-7-1/000077500000000000000000000000001302566030400153715ustar00rootroot00000000000000bioperl-run-release-1-7-1/.gitignore000066400000000000000000000002431302566030400173600ustar00rootroot00000000000000*~ .tmp *# .#* .*.swp *(Autosaved)blib* Build Build.bat _build* pm_to_blib* *.tar.gz .lwpcookies cover_db pod2htm*.tmp .emacs.* blib* *.bak MYMETA.yml MYMETA.json bioperl-run-release-1-7-1/.travis.yml000066400000000000000000000021201302566030400174750ustar00rootroot00000000000000language: perl perl: #- "5.24" - "5.20" #- "5.18" - "5.16" #- "5.14" sudo: false env: PERL_CPANM_OPT="--notest --force --skip-satisfied" addons: apt: packages: - clustalw - bedtools - bwa - ncbi-blast+ - ncbi-blast+-legacy - muscle - probcons - hmmer - mafft - emboss - samtools - wise install: #These are recommended or required Perl libraries - "cpanm CJFIELDS/BioPerl-1.007000_005.tar.gz 2>&1 | tail -n 1" - "cpanm Bio::FeatureIO 2>&1 | tail -n 1" - "cpanm IPC::Run XML::Twig 2>&1 | tail -n 1" - "cpanm Config::Any 2>&1 | tail -n 1" #- "SAMTOOLS=( dirname `which samtools` ) cpanm Bio::DB::Sam 2>&1 | tail -n 1" script: - "perl ./Build.PL --accept" - "./Build test" #after_success: # - ./travis_scripts/trigger-dockerhub.sh #TODO - send emails to bioperl-guts-l notifications: email: recipients: #- bioperl-guts-l@lists.open-bio.org - cjfields1@gmail.com on_success: change on_failure: change # whitelist branches branches: only: - master bioperl-run-release-1-7-1/AUTHORS000066400000000000000000000022221302566030400164370ustar00rootroot00000000000000=head1 CONTRIBUTORS TO BIOPERL-RUN =over =item * Sendu Bala =item * Jer-Ming Chia =item * Rob Edwards =item * Mauricio Herrera Cuadra =item * Shawn Hoon =item * Donald Jackson =item * Keith James =item * Ratnapu Kiran Kumar =item * Balamurugan Kumarasamy =item * Catherine Letondal =item * Heikki Lehvaslaiho =item * Stephen Montgomery =item * Brian Osborne =item * Tania Oh =item * Peter Schattner =item * Martin Senger =item * Marc Sohrmann =item * Jason Stajich =item * Elia Stupka =item * David Vilanova =item * Albert Vilella =item * Tiequan Zhang =item * Juguang Xiao =back bioperl-run-release-1-7-1/Build.PL000077500000000000000000000113321302566030400166700ustar00rootroot00000000000000#!/usr/bin/perl -w # This is a Module::Build script for BioPerl-Run installation. # See http://search.cpan.org/~kwilliams/Module-Build/lib/Module/Build.pm use strict; use warnings; use Module::Build; my $build = Module::Build->subclass( code => q( # add dist version to META files sub get_metadata { my ($self, %args) = @_; my $metadata = $self->SUPER::get_metadata(%args); if (exists $metadata->{provides}) { my $ver = $self->dist_version; my $pkgs = $metadata->{provides}; for my $p (keys %{$pkgs}) { if (!exists($pkgs->{$p}->{'version'})) { $pkgs->{$p}->{'version'} = $ver; } else { $self->log_warn("Note: Module $p has a set version: ".$pkgs->{$p}->{'version'}."\n") if $pkgs->{$p}->{'version'} ne $ver; } } } return $metadata; } ) )->new( dist_name => 'BioPerl-Run', dist_version => '1.007001', module_name => 'Bio::Run', dist_author => 'BioPerl Team ', dist_abstract => 'BioPerl-Run - wrapper toolkit', license => 'perl', config_requires => { 'Module::Build' => 0, }, build_requires => { 'Bio::Root::Version' => '1.007000', 'Bio::Root::Test' => 0, }, requires => { 'perl' => '5.6.1', 'Bio::Root::Version' => '1.007000', 'Bio::Root::Root' => 0, }, recommends => { 'Algorithm::Diff' => 0, # generating consensus protein family descriptions: Bio::Tools::Run::TribeMCL 'IPC::Run' => 0, # Glimmer and Genemark application wrappers: Bio::Tools::Run::Glimmer Bio::Tools::Run::Genemark 'IO::String' => 0, # generating Bio::Tree::Tree from strings: Bio::Tools::Run::Phylo::Phylip::Consense 'XML::Twig' => 0, # processing XML data: Bio::Tools::Run::EMBOSSacd 'File::Sort' => 0, # BEDTools 'Config::Any' => 0, # MCS, Match #'SOAP::Lite' => 0.716, # A bug that affects SoapEU-unit.t tests was fixed in this version (many levels deep object throws error) }, get_options => { accept => { }, network => { }, install_scripts => { } }, auto_features => { Network => { description => "Enable tests that need an internet connection", requires => { 'LWP::UserAgent' => 0 } } }, dynamic_config => 1, #create_makefile_pl => 'passthrough' ); my $accept = $build->args->{accept}; # Optionally have script files installed. if ($build->args('install_scripts') or $accept ? 0 : $build->y_n("Install scripts? y/n", 'n')) { my $files = $build->_find_file_by_type('PLS', 'scripts'); my $script_build = File::Spec->catdir($build->blib, 'script'); my @tobp; while (my ($file, $dest) = each %$files) { $dest = 'bp_'.File::Basename::basename($dest); $dest =~ s/PLS$/pl/; $dest = File::Spec->catfile($script_build, $dest); $build->copy_if_modified(from => $file, to => $dest); push @tobp, $dest; } $build->script_files(\@tobp); } # Do network tests? my $do_network_tests = 0; if ($build->args('network')) { $do_network_tests = $build->feature('Network'); } elsif ($build->feature('Network')) { $do_network_tests = $accept ? 0 : $build->y_n("Do you want to run tests that require connection to servers across the internet\n(likely to cause some failures)? y/n", 'n'); } if ($do_network_tests) { $build->notes(network => 1); $build->log_info(" - will run internet-requiring tests\n"); my $use_email = $build->y_n("Do you want to run tests requiring a valid email address? y/n",'n'); if ($use_email) { my $address = $build->prompt("Enter email address:"); $build->notes(email => $address); } } else { $build->notes(network => 0); $build->log_info(" - will not run internet-requiring tests\n"); } # Create the build script and exit $build->create_build_script; bioperl-run-release-1-7-1/Changes000066400000000000000000000133171302566030400166710ustar00rootroot00000000000000Revision history for bioperl-run modules 1.7.000 * Bio::Tools::Run::WrapperBase moved from bioperl core to bioperl-run * Updaed Samtools wrapper, minimal support for samtools > v.1 added [cjfields] * Minor updates to sync with BioPerl v. 1.7.x release series 1.6.901 * added run support for MSAProbs [Jessen Bredeson] 1.6.900 * Bowtie and BWA wrappers for NGS [maj, Ben Bimber, Dan Kortschak] * ClustalW v2 support [cjfields] * tRNAscanSE support [Mark Johnson, cjfields] * Glimmer v2 updates [Mark Johnson, cjfields] * PAML codeml wrapper updated to work with PAML 4.4d [DaveMessina] * Phyml updates [hyphaltip] * Repeatmasker updates [cjfields] * Initial BLAST+ modules (Bio::Tools::Run::BlastPlus/StandAloneBlastPlus) [maj] * Improved Bio::Tools::Run::AssemblerBase module and update of the wrappers that use it [fangly, maj] * Support for running new de novo and comparative assemblers: 454 Newbler [fangly], Minimo [fangly], Maq [maj], Samtools [maj], Bowtie [maj] * [bug 2728] add support to Bio::Tools::Run::Alignment::ClustalW for ClustalW2 [cjfields] * [RT 50363] make a bit more Windows friendly with file paths * [bug 2713] - Bio::Tools::Run::Infernal now works with Infernal 1.0 (older versions deprecated) [cjfields] * Bio::Tools::Run::Alignment::Gmap added [hartzell] * [bug 2798] - patch to fix clustalw premature file unlinking error [Wei Zhou] 1.6.0 Release * All Pise and Pise-related modules and scripts have been moved to the new bioperl-pise repository. The Pise service is no longer available and has been replaced by Mobyle. They have been retained as one can still install a Pise server, and as these modules can possibly be used to create a new BioPerl API for Mobyle. 1.5.2 Release in sync with bioperl core * Several wrappers updated for newer versions of the programs. 1.5.1 Release in sync with bioperl core o First major release in a while, so lots of things in this release o PHYLIP wrappers are updated for PHYLIP 3.6, some programs will no longer work (DrawTree and DrawGram specifically) for 3.5 at ths point. It will depend on whether or not anyone really wants this if we'll add in the necessary stuf to support 3.5. It isn't hard, just requires some stuff in th PhylipConf.pm modules. o Bio::Tools::Run::Alignment::Muscle added o PAML wrapper for Yn00 and Codeml are more forgiving about the argument validation. o Several wrappers updated for newer versions of the programs. TribeMCL, Genewise, RepeatMasker 1.2.2 Release update in sync with bioperl core o Soaplab - API changes - binary input added o Pise - changes affecting most Bio::Tools::Run:PiseApplication modules - Numerous documentation fixes in almost all modules - Added code in the SYNOPSIS, as well as the FEEDBACK, COPYRIGHT and SEE ALSO parts. - the DESCRIPTION section now contains *only* the parameters that can be set by the client. - remote parameter to -location to conform to Bio::Tools::Run::AnalysisFactory interface - new programs sirna, tranalign, twofeat (from EMBOSS 2.6). o Bio::Tools::Run::Eponine - More standardized way of running o Bio::Tools::Run::FootPrinter - Write the files properly - Mark Wagner's enhancements bug #1399 o Bio::Tools::Run::Genewise - more options o Bio::Tools::Run::Genscan - doc fix o Bio::Tools::Run::Hmmpfam - Updated to set params properly and return a SearchIO object o Bio::Tools::Run::Mdust - new location - Modified to inherit Bio::Tools::Run::WrapperBase - use Bio::Root::IO to build up paths - Modified documentation to conform to bioperl format o Bio::Tools::Run::Signalp - uniform sequence truncation lenght o Bio::Tools::Run::Vista - new module - Support more options - More documentation - fix reverse sequence bug o Bio::Tools::Run::Phylo::Phylip::SeqBoot - Allow more than one alignment o Bio::Tools::Run::Phylo::Phylip::Neighbor - Check for multiple data sets and set parameter accordingly o Bio::Tools::Run::Alignment::Blat - moved from Bio::Tools::Run name space - some code cleanup to avoid warnings and insure filehandles are properly closed, etc o Bio::Tools::Run::Alignment::Lagan - program name included - small fixes and addition of options - added the right credits. - Bio::Tools::Run::Alignment::DBA and Bio::Tools::Run::Alignment::Sim4 - Quiet declaration warnings 1.2 Developer release o Analysis Factory framework- currently providing SOAP access to EMBOSS applications o Support for FootPrinter, Genewise, Hmmpfam, Primate, Prints, Profile, Promoterwise, Pseudowise, Seg, Signalp, Tmhmm,TribeMCL, Blat,DBA,Lagan,Sim4,Fasta,ProtML,Vista o Added support for PHYLIP apps: Consense, DrawGram, DrawTree, SeqBoot o Added INSTALL.PROGRAMS providing references to download the program binaries. o Bug Fixes that hopefully solves the 'too many open files' problem 0.01 Initial release o Package is broken off from bioperl-live to support just runnable wrapper modules. o Support for PAML codeml tested, aaml still waiting o Support for Molphy protml, nucml to come o Support for EMBOSS pkg - still need to move component from bioperl-live Bio::Factory::EMBOSS to this package and rename it Bio::Tools::Run::EMBOSSFactory or something equivalent. o Support for Clustalw, TCoffee, Local NCBI BLAST. o RepeatMasker, Genscan, Pseudowise, TribeMCL, Primate, Eponine. o Support for remote analysis through Pise and NCBI Web Blast queue. o Select PHYLIP apps: Neighbor, ProtDist, and ProtPars. bioperl-run-release-1-7-1/DEPENDENCIES000066400000000000000000000137271302566030400171540ustar00rootroot00000000000000BioPerl-run Dependencies NOTE : This file was auto-generated by the core helper script maintenance/dependencies.pl. Do not edit directly! The following packages are used by BioPerl. While not all are required for BioPerl to operate properly, some functionality will be missing without them. You can easily choose to install all of these during the normal installation process. Note that the PPM version of the BioPerl packages always tries to install all dependencies. NB: This list of packages is not authoritative. See the 'requires', 'build_requires' and 'recommends' sections of Build.PL instead. ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | IO-String | * IO::String - IO::File interface | None | | | for in-core strings | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::Phylo::Phylip::Consense - IO::String | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | IPC-Run | * IPC::Run - Child procs w/ piping, | None | | | redir and psuedo-ttys | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::Genemark - IPC::Run | | * Bio::Tools::Run::Glimmer - IPC::Run | | * Bio::Tools::Run::TigrAssembler - IPC::Run | | * Bio::Tools::Run::tRNAscanSE - IPC::Run | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | XML-Twig | * XML::Twig - A module for easy | None | | | processing of XML | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::EMBOSSacd - XML::Twig | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | bioperl | * Bio::Seq - NA | None | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Tools::Run::Cap3 - Bio::Seq | | * Bio::Tools::Run::Genscan - Bio::Seq | | * Bio::Tools::Run::TribeMCL - Bio::Seq | | * Bio::Tools::Run::Vista - Bio::Seq | | * Bio::Tools::Run::Alignment::Amap - Bio::Seq | | * Bio::Tools::Run::Alignment::Clustalw - Bio::Seq | | * Bio::Tools::Run::Alignment::Kalign - Bio::Seq | | * Bio::Tools::Run::Alignment::Lagan - Bio::Seq | | * Bio::Tools::Run::Alignment::MAFFT - Bio::Seq | | * Bio::Tools::Run::Alignment::Muscle - Bio::Seq | | * Bio::Tools::Run::Alignment::Probalign - Bio::Seq | | * Bio::Tools::Run::Alignment::Probcons - Bio::Seq | | * Bio::Tools::Run::Alignment::Proda - Bio::Seq | | * Bio::Tools::Run::Alignment::StandAloneFasta - Bio::Seq | | * Bio::Tools::Run::Alignment::TCoffee - Bio::Seq | ============================================================================== ============================================================================== | Distribution | Module used - Description | Min. ver. | |---------------------------+--------------------------------------+-----------| | libwww-perl | * HTTP::Request::Common - Functions | None | | | that generate HTTP::Requests | | | | * LWP - Libwww-perl | | |==============================================================================| | Used by: | |------------------------------------------------------------------------------| | * Bio::Installer::Generic - HTTP::Request::Common | | * Bio::Installer::Generic - LWP | ============================================================================== bioperl-run-release-1-7-1/INSTALL000077500000000000000000000072361302566030400164350ustar00rootroot00000000000000bioperl-run INSTALLATION INSTALL THE RIGHT BIOPERL You need at least the corresponding version of Bioperl. Since this is BioPerl-run 1.6.9, you should use BioPerl 1.6.9. INSTALLATION Installation instructions at the following address apply here: http://www.bioperl.org/wiki/Installing_Bioperl_for_Unix The next 2 sections summarize the essential points from there. CPAN INSTALLATION To install using CPAN you will need a recent version (v1.8802 has been tested) of it and your prefer_installer conf set to 'MB': >cpan cpan>o conf prefer_installer MB cpan>o conf commit cpan>q Find the name of the bioperl-run version you want: >cpan cpan>d /bioperl-run/ Database was generated on Mon, 20 Nov 2006 05:24:36 GMT Distribution C/CJ/CJFIELDS/BioPerl-run-1.6.9.tar.gz Now install: cpan>install C/CJ/CJFIELDS/BioPerl-run-1.6.9.tar.gz If you've installed everything perfectly then you may pass all the tests run in the './Build test' phase. It's also possible that you may fail some tests. Possible explanations: problems with local Perl installation, previously undetected bug in Bioperl, flawed test script and so on. A few failed tests may not affect your usage of bioperl-run. If you decide that the failed tests will not affect how you intend to use bioperl-run and you'd like to install anyway do: cpan>force install C/CJ/CJFIELDS/BioPerl-run-1.6.9.tar.gz This is what most experienced Bioperl users would do. However, if you're concerned about a failed test and need assistance or advice then contact bioperl-l@bioperl.org. MANUAL INSTALLATION Download the bioperl-run archive, then extract its contents. Example: >gunzip bioperl-run-.tar.gz >tar xvf bioperl-run-.tar >cd bioperl-run where is the current release. Issue the following command from within bioperl-run/: >perl Build.PL You can run regression tests and install bioperl-run using the following commands: >./Build test >./Build install NOTE: many tests will be skipped without the necessary environment variables set to tell Bioperl where your programs are installed. INSTALLING bioperl-run ON WINDOWS The following page on the BioPerl website has up-to-date instructions on how to install bioperl-run on Windows: http://www.bioperl.org/wiki/Installing_Bioperl_on_Windows (the instructions are aimed at bioperl-core, but apply equally to bioperl-run) ENVIRONMENT VARIABLES Some important environment variables you need to be aware of. Variable Values Comment -------------------------------------------------------------------- PHYLIPVERSION 3.5, 3.6 If you want to run Phylip3.6 you need to set this env variable to 3.6 BLASTDIR DIR PATH Point to the directory where BLAST is installed GENSCAN_DIR DIR PATH Point to the directory where HumanIso.smat file is installed EPONINEDIR DIR PATH Point to the directory where eponine_scan.jar is installed PAMLDIR DIR PATH Point to directory where PAML is installed Generally the all-caps program name concatenated to 'DIR' patten for environment variable names is followed for most programs. However there are some exceptions (some require an underscore between the program name and 'DIR'), so check the documentation for the module you're interested in using. Note that for some programs, having the executables in your path is not enough - the correct environment variable still needs to be set, and sometimes it shouldn't point to the executable location, but somewhere else - again, check the documentation. bioperl-run-release-1-7-1/INSTALL.PROGRAMS000066400000000000000000000106601302566030400176560ustar00rootroot00000000000000INSTALL.PROGRAMS: The Bioperl-run package has (Bio)perl wrappers written for the following applications: 1. Coils - Prediction of Coiled Coil Regions in Proteins Bio::Tools::Run::Coil http://www.ch.embnet.org/software/COILS_form.html 2. EMBOSS Applications - European Molecular Biology Open Software Suite Bio::Tools::Run::EMBOSSApplication http://www.hgmp.mrc.ac.uk/Software/EMBOSS/ 3. Eponine - Transcription Start Site finder Bio::Tools::Run::Eponine http://www.sanger.ac.uk/Software/analysis/eponine/ 4. FootPrinter - Program that performs phylogenetic footprinting. Bio::Tools::Run::FootPrinter http://abstract.cs.washington.edu/~blanchem/FootPrinterWeb/FootPrinterInput.pl 5. Genewise - Gene prediction program Bio::Tools::Run::Genewise http://www.sanger.ac.uk/software/wise2 6. Genscan - Identification of complete gene structures in genomic DNA Bio::Tools::Run::Genscan http://genes.mit.edu/GENSCAN.html 7. Hmmpfam - search a single sequence against an HMM database Bio::Tools::Run::Hmmpfam http://hmmer.wustl.edu/ 8. PISE - Web interfaces for Biological Programs Bio::Tools::Run::PiseApplication http://www-alt.pasteur.fr/~letondal/Pise/ 9. Primate - Near exact match finder for short sequence tags. Bio::Tools::Run:::Primate http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/ensembl-nci/?cvsroot=Ensembl 10. FingerPRINTScan - identify the closest matching PRINTS sequence motif fingerprints in a protein sequence Bio::Tools::Run::Prints http://www.bioinf.man.ac.uk/fingerPRINTScan/ 11. pfscan - scan a protein or DNA sequence with a profile library Bio::Tools::Run::Profile http://www.isrec.isb-sib.ch/software/software.html 12. Pseudowise - a pseudogene precdiction program, part of the wise2 package Bio::Tools::Run::Pseudowise http://www.sanger.ac.uk/software/wise2 13. RepeatMasker - screens DNA sequences in fasta format against a library of repetitive elements Bio::Tools::Run::RepeatMasker http://repeatmasker.genome.washington.edu 14. Seg - Identify low-complexity regions in protein sequences Bio::Tools::Run::Seg ftp://ftp.ncbi.nih.gov/pub/seg/ 15. Signalp - predicts the presence and location of signal peptide cleavage sites in amino acid sequences Bio::Tools::Run::Signalp http://www.cbs.dtu.dk/services/SignalP/ 16. Tmhmm - Prediction of transmembrane helices in proteins Bio::Tools::Run::Tmhmm http://www.cbs.dtu.dk/services/TMHMM/ 17. TribeMCL - Method for clustering proteins into related groups. Bio::Tools::Run::TribeMCL http://www.ebi.ac.uk/research/cgg/tribe/ 18. PAML - Phylogenetic Analysis by Maximum Likelihood package Bio::Tools::Run::PAML http://abacus.gene.ucl.ac.uk/software/paml.html 19. Molphy - MOLecular PHYlogenetics Package Bio::Tools::Run::Molphy http://www.ism.ac.jp/software/ismlib/softother.e.html 20. Phylip - Suite of Phylogenetics programs (Version 3.6) Bio::Tools::Run::Phylip http://evolution.genetics.washington.edu/phylip.html 21. Clustalw - general purpose multiple sequence alignment program for DNA or proteins Bio::Tools::Run::Alignment::Clustalw http://www.ebi.ac.uk/clustalw/ 22. DBA - DNA Block Aligner Bio::Tools::Run::Alignment::DBA http://www.sanger.ac.uk/software/wise2 23. Sim4 - Align CDNA to genomic sequences Bio::Tools::Run::Alignment::Sim4 http://globin.cse.psu.edu/ 24. Tcoffee - Multiple Sequence Alignment Package Bio::Tools::Run::Alignment::Tcoffee http://igs-server.cnrs-mrs.fr/~cnotred/Projects_home_page/t_coffee_home_page.html 25. BLAST - Basic Local Alignment Search Tool Bio::Tools::Run::StandAloneBlast (in bioperl-live CVS repository) ftp://ftp.ncbi.nih.gov/blast/executables 26. FASTA,SSEARCH - Pairwise sequence alignment Bio::Tools::Run::StandAloneFasta ftp://ftp.virginia.edu/pub/fasta 27. Promoterwise - Sequence alignment designed for promoter sequences http://www.sanger.ac.uk/software/wise2 28. Lagan - Lagan suite of tools including MLAGAN http://lagan.stanford.edu/ 29. Vista - Visualizing global DNA sequence alignments of arbitrary length http://www-gsd.lbl.gov/vista/VISTAdownload2.html 30. Exonerate - A package of alignment tools for protein and EST to genome/DNA alignments http://www.ebi.ac.uk/~guy/exonerate/ 31. AMAP- Protein multiple alignment based sequence annealing http://bio.math.berkeley.edu/amap/ 32. Blat - An alignment tool like BLAST, but structured differently http://genome.ucsc.edu/cgi-bin/hgBlat bioperl-run-release-1-7-1/INSTALL.SKIP000066400000000000000000000000211302566030400171600ustar00rootroot00000000000000ConfigData\.\S+$ bioperl-run-release-1-7-1/LICENSE000066400000000000000000001212701302566030400164010ustar00rootroot00000000000000BioPerl is licensed under the same terms as Perl itself, which means it is dually-licensed under either the Artistic or GPL licenses. Below are details of the Artistic License and, following it, the GPL. The "Artistic License" Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder as specified below. "Copyright Holder" is whoever is named in the copyright or copyrights for the package. "You" is you, if you're thinking about copying or distributing this Package. "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as uunet.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) give non-standard executables non-standard names, and clearly document the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. You may embed this Package's interpreter within an executable of yours (by linking); this shall be construed as a mere form of aggregation, provided that the complete Standard Version of the interpreter is so embedded. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whoever generated them, and may be sold commercially, and may be aggregated with this Package. If such scripts or library files are aggregated with this Package via the so-called "undump" or "unexec" methods of producing a binary executable image, then distribution of such an image shall neither be construed as a distribution of this Package nor shall it fall under the restrictions of Paragraphs 3 and 4, provided that you do not represent such an executable image as a Standard Version of this Package. 7. C subroutines (or comparably compiled subroutines in other languages) supplied by you and linked into this Package in order to emulate subroutines and variables of the language defined by this Package shall not be considered part of this Package, but are the equivalent of input as in Paragraph 6, provided these subroutines do not change the language in any way that would cause it to fail the regression tests for the language. 8. Aggregation of this Package with a commercial distribution is always permitted provided that the use of this Package is embedded; that is, when no overt attempt is made to make this Package's interfaces visible to the end user of the commercial distribution. Such use shall not be construed as a distribution of this Package. 9. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. The End GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . bioperl-run-release-1-7-1/README.md000066400000000000000000000010621302566030400166470ustar00rootroot00000000000000# Description This is the home for *bioperl-run*, which contain modules that provides a Perl interface to various bioinformatics applications. This allows various applications to be used with common BioPerl objects. See the *Changes* file for more information about what is contained in here. # Installation See the accompanying *INSTALL* file for details on installing bioperl-run. # Feedback Write down any problems or praise and send them to bioperl-l@bioperl.org. # Bugs Bug reports can be made using the GitHub Issues tracker for this distribution. bioperl-run-release-1-7-1/lib/000077500000000000000000000000001302566030400161375ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/000077500000000000000000000000001302566030400166505ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/DB/000077500000000000000000000000001302566030400171355ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/DB/ESoap.pm000077500000000000000000000244111302566030400205070ustar00rootroot00000000000000# # BioPerl module for Bio::DB::ESoap # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::ESoap - Client for the NCBI Entrez EUtilities SOAP server =head1 SYNOPSIS $fac = Bio::DB::ESoap->new( -util => 'esearch' ); $som = $fac->run( -db => 'prot', -term => 'HIV and gp120' ); $fac->set_parameters( -term => 'HIV2 and gp160' ); # accessors corresponding to valid parameters are also created: $fac->db('nuccore'); $som = $fac->run; # more later. =head1 DESCRIPTION C provides a basic SOAP interface to the NCBI Entrez Utilities Web Service (L). L handles the SOAP calls. Higher level access, pipelines, BioPerl object I/O and such are provided by L. C complies with L. It depends explicitly on NCBI web service description language files to inform the C method. WSDLs are parsed by a relative lightweight, Entrez-specific module L. The C method returns L (SOAP Message) objects. No fault checking or other parsing is performed in this module. =head1 SEE ALSO L, L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::ESoap; use strict; use warnings; use Bio::Root::Root; use Bio::DB::ESoap::WSDL; use SOAP::Lite; use base qw(Bio::Root::Root Bio::ParameterBaseI); =head2 new Title : new Usage : my $obj = new Bio::DB::ESoap(); Function: Builds a new Bio::DB::ESoap factory Returns : an instance of Bio::DB::ESoap Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($util, $fetch_db, $wsdl) = $self->_rearrange( [qw( UTIL FETCH_DB WSDL_FILE )], @args ); $self->throw("Argument -util must be specified") unless $util; my @wsdl_pms; if ($wsdl) { @wsdl_pms = ( '-wsdl' => $wsdl ); } else { $fetch_db ||= 'seq'; my $url = ($util =~ /fetch/ ? 'f_'.$fetch_db : 'eutils'); $url = $NCBI_BASEURL.$WSDL{$url}; @wsdl_pms = ( '-url' => $url ); } $self->_wsdl(Bio::DB::ESoap::WSDL->new(@wsdl_pms)); $self->_operation($util); $self->_init_parameters; $self->_client( SOAP::Lite->new( proxy => $self->_wsdl->service ) ); return $self; } =head2 _wsdl() Title : _wsdl Usage : $obj->_wsdl($newval) Function: Bio::DB::ESoap::WSDL object associated with this factory Example : Returns : value of _wsdl (object) Args : on set, new value (object or undef, optional) =cut sub _wsdl { my $self = shift; return $self->{'_wsdl'} = shift if @_; return $self->{'_wsdl'}; } =head2 _client() Title : _client Usage : $obj->_client($newval) Function: holds a SOAP::Lite object Example : Returns : value of _client (a SOAP::Lite object) Args : on set, new value (a SOAP::Lite object or undef, optional) =cut sub _client { my $self = shift; return $self->{'_client'} = shift if @_; return $self->{'_client'}; } =head2 _operation() Title : _operation Alias : util Usage : Function: check and convert the requested operation based on the wsdl Returns : Args : operation (scalar string) =cut sub _operation { my $self = shift; my $util = shift; return $self->{'_operation'} unless $util; $self->throw("WSDL not yet initialized") unless $self->_wsdl; my $opn = $self->_wsdl->operations; if ( grep /^$util$/, keys %$opn ) { return $self->{'_operation'} = $util; } elsif ( grep /^$util$/, values %$opn ) { my @a = grep { $$opn{$_} eq $util } keys %$opn; return $self->{'_operation'} = $a[0]; } else { $self->throw("Utility '$util' is not recognized"); } } sub util { shift->_operation(@_) } =head2 action() Title : action Usage : Function: return the soapAction associated with the factory's utility Returns : scalar string Args : none =cut sub action { my $self = shift; return $self->{_action} if $self->{_action}; return $self->{_action} = ${$self->_wsdl->operations}{$self->util}; } =head2 wsdl_file() Title : wsdl_file Usage : Function: get filename of the local WSDL XML copy Returns : filename (scalar string) Args : none =cut sub wsdl_file { my $self = shift; if (ref ($self->_wsdl->wsdl) eq 'File::Temp') { return $self->_wsdl->wsdl->filename; } return $self->_wsdl->wsdl; } =head2 run() Title : _run Usage : $som = $self->_run(@optional_setting_args) Function: Call the SOAP service with the factory-associated utility and parameters Returns : SOAP::SOM (SOAP Message) object Args : named parameters appropriate for the utility Note : no fault checking here =cut sub run { my $self = shift; my @args = @_; $self->throw("SOAP::Lite client not initialized") unless $self->_client; $self->throw("run requires named args") if @args % 2; $self->set_parameters(@args) if scalar @args; my %args = $self->get_parameters; my @soap_data; for my $k (keys %args) { ## kludges for NCBI inconsistencies: my $k_ncbi; for ($k) { /QueryKey/ && do { $k_ncbi = 'query_key'; last; }; /RetMax/ && do { $k_ncbi = 'retmax'; last; }; $k_ncbi = $k; } my $data = $args{$k}; next unless defined $data; for (ref $data) { /^$/ && do { push @soap_data, SOAP::Data->name($k_ncbi)->value($data); last; }; /ARRAY/ && do { push @soap_data, SOAP::Data->name($k_ncbi)->value(join(',',@$data)); last; }; /HASH/ && do { # for adding multiple data items with the same message # key (id lists for elink, e.g.) # see ...::SoapEUtilities, c. line 151 push @soap_data, map { SOAP::Data->name($k_ncbi)->value($_) } keys %$data; }; } } $self->_client->on_action( sub { $self->action } ); my $som = $self->_client->call( $self->util, @soap_data ); return $som; } sub _result_elt_name { my $s=shift; (keys %{$s->_wsdl->response_parameters($s->util)})[0] }; sub _response_elt_name { shift->_result_elt_name } sub _request_elt_name { my $s=shift; (keys %{$s->_wsdl->request_parameters($s->util)})[0] }; =head2 Bio::ParameterBaseI compliance =cut sub available_parameters { my $self = shift; my @args = @_; return @{$self->_init_parameters}; } sub set_parameters { my $self = shift; my @args = @_; $self->throw("set_parameters requires named args") if @args % 2; ($_%2 ? 1 : $args[$_] =~ s/^-//) for (0..$#args); my %args = @args; # special translations : if ( defined $args{'usehistory'} ) { $args{'usehistory'} = ($args{'usehistory'} ? 'y' : undef); } $self->_set_from_args(\%args, -methods=>$self->_init_parameters); return $self->parameters_changed(1); } sub get_parameters { my $self = shift; my @ret; foreach (@{$self->_init_parameters}) { next unless defined $self->$_(); push @ret, ($_, $self->$_()); } return @ret; } sub reset_parameters { my $self = shift; my @args = @_; $self->throw("reset_parameters requires named args") if @args % 2; ($_%2 ? 1 : $args[$_] =~ s/^-//) for (0..$#args); my %args = @args; my %reset; @reset{@{$self->_init_parameters}} = (undef) x @{$self->_init_parameters}; $reset{$_} = $args{$_} for keys %args; $self->_set_from_args( \%reset, -methods => $self->_init_parameters ); $self->parameters_changed(1); return 1; } =head2 parameters_changed() Title : parameters_changed Usage : $obj->parameters_changed($newval) Function: flag to indicate, well, you know Example : Returns : value of parameters_changed (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub parameters_changed { my $self = shift; return $self->{'parameters_changed'} = shift if @_; return $self->{'parameters_changed'}; } =head2 _init_parameters() Title : _init_parameters Usage : $fac->_init_parameters Function: identify the available input parameters using the wsdl object Returns : arrayref of parameter names (scalar strings) Args : none =cut sub _init_parameters { my $self = shift; return $self->{_params} if $self->{_params}; $self->throw("WSDL not yet initialized") unless $self->_wsdl; my $phash = {}; my $val = (values %{$self->_wsdl->request_parameters($self->util)})[0]; $$phash{$_} = undef for map { keys %$_ } @{$val}; my $params =$self->{_params} = [sort keys %$phash]; # create parm accessors $self->_set_from_args( $phash, -methods => $params, -create => 1, -code => 'my $self = shift; if (@_) { $self->parameters_changed(1); return $self->{\'_\'.$method} = shift; } $self->parameters_changed(0); return $self->{\'_\'.$method};' ); $self->parameters_changed(1); return $self->{_params}; } 1; bioperl-run-release-1-7-1/lib/Bio/DB/ESoap/000077500000000000000000000000001302566030400201445ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/DB/ESoap/WSDL.pm000077500000000000000000000471361302566030400212710ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::ESoap::WSDL # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::ESoap::WSDL - WSDL parsing for Entrez SOAP EUtilities =head1 SYNOPSIS Used by L # url $wsdl = Bio::DB::ESoap::WSDL->new( -url => "http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl" ); # local copy $wsdl = Bio::DB::ESoap::WSDL->new( -wsdl => "local/eutils.wsdl" ); %opns = %{ $wsdl->operations }; =head1 DESCRIPTION This module is a lightweight parser and container for WSDL XML files associated with the NCBI EUtilities SOAP server. XML facilities are provided by L. The following accessors provide names and structures useful for creating SOAP messages using L (e.g.): service() : the URL of the SOAP service operations() : hashref of the form {.., $operation_name => $soapAction, ...} request_parameters($operation) : request field names and namelists as an array of hashes result_parameters($operation) : result field names and namelists as an array of hashes The following accessors provide L objects pointing at key locations in the WSDL: root : the root of the WSDL docment _types_elt : the element _portType_elt : the element _binding_elt : the element _service_elt : the element _message_elts : an array of all top-level elements _operation_elts : an array of all elements contained in Parsing occurs lazily (on first read, not on construction); all information is cached. To clear the cache and force re-parsing, run $wsdl->clear_cache; The globals C<$NCBI_BASEURL>, C<$NCBI_ADAPTOR>, and C<%WSDL> are exported. $NCBI_ADAPTOR : the soap service cgi To construct a URL for a WSDL: $wsdl_eutils = $NCBI_BASEURL.$WSDL{'eutils'} $wsdl_efetch_omim = $NCBI_BASEURL.$WSDL{'f_omim'} # etc. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::DB::ESoap::WSDL; use strict; use Bio::Root::Root; use XML::Twig; use Bio::WebAgent; use File::Temp; use base qw(Bio::Root::Root Exporter); our @EXPORT = qw( $NCBI_BASEURL $NCBI_ADAPTOR %WSDL ); our $NCBI_BASEURL = "http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/"; our $NCBI_ADAPTOR = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/soap_adapter_2_0.cgi"; our %WSDL = ( 'eutils' => 'eutils.wsdl', 'f_pubmed' => 'efetch_pubmed.wsdl', 'f_pmc' => 'efetch_pmc.wsdl', 'f_nlmc' => 'efetch_nlmc.wsdl', 'f_journals' => 'efetch_journals.wsdl', 'f_omim' => 'efetch_omim.wsdl', 'f_taxon' => 'efetch_taxon.wsdl', 'f_snp' => 'efetch_snp.wsdl', 'f_gene' => 'efetch_gene.wsdl', 'f_seq' => 'efetch_seq.wsdl' ); =head2 new Title : new Usage : my $obj = new Bio::DB::ESoap::WSDL(); Function: Builds a new Bio::DB::ESoap::WSDL object Returns : an instance of Bio::DB::ESoap::WSDL Args : named args: -URL => $url_of_desired_wsdl -OR- -WSDL => $filename_of_local_wsdl_copy ( -WSDL will take precedence if both specified ) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($url, $wsdl) = $self->_rearrange( [qw( URL WSDL )], @args ); my (%sections, %cache); my $doc = 'wsdl:definitions'; $sections{'_message_elts'} = []; $sections{'_operation_elts'} = []; $self->_sections(\%sections); $self->_cache(\%cache); $self->_twig( XML::Twig->new( twig_handlers => { $doc => sub { $self->root($_) }, "$doc/binding" => sub { $self->_sections->{'_binding_elt'} = $_ }, "$doc/binding/operation" => sub { push @{$self->_sections->{'_operation_elts'}},$_ }, "$doc/message" => sub { push @{$self->_sections->{'_message_elts'}}, $_ }, "$doc/portType" => sub { $self->_sections->{'_portType_elt'} = $_ }, "$doc/service" => sub { $self->_sections->{'_service_elt'} = $_ }, "$doc/types" => sub { $self->_sections->{'_types_elt'} = $_ }, } ) ); if ($url || $wsdl ) { $self->url($url); $self->wsdl($wsdl); $self->_parse; } return $self; } =head1 Getters =head2 request_parameters() Title : request_parameters Usage : @params = $wsdl->request_parameters($operation_name) Function: get array of request (input) fields required by specified operation, according to the WSDL Returns : hash of arrays of hashes... Args : scalar string (operation or action name) =cut sub request_parameters { my $self = shift; my ($operation) = @_; my $is_action; $self->throw("Operation name must be specified") unless defined $operation; my $opn_hash = $self->operations; unless ( grep /^$operation$/, keys %$opn_hash ) { $is_action = grep /^$operation$/, values %$opn_hash; $self->throw("Operation name '$operation' is not recognized") unless ($is_action); } #check the cache here.... return $self->_cache("request_params_$operation") if $self->_cache("request_params_$operation"); # find the input message type in the portType elt if ($is_action) { my @a = grep {$$opn_hash{$_} eq $operation} keys %$opn_hash; # note this takes the first match $operation = $a[0]; $self->throw("Whaaa??") unless defined $operation; } #check the cache once more after translation.... return $self->_cache("request_params_$operation") if $self->_cache("request_params_$operation"); my $bookmarks = $self->_operation_bookmarks($operation); my $imsg_elt = $bookmarks->{'i_msg_elt'}; my $opn_schema = $bookmarks->{'schema'}; my $ret = { $imsg_elt->att('name') => [] }; # do a quick recursion: _get_types((values %$ret)[0], $imsg_elt, $opn_schema); return $self->_cache("request_params_$operation", $ret); 1; } =head2 result_parameters() Title : result_parameters Usage : $result_hash = $wsdl->result_parameters Function: retrieve a hash structure describing the result of running the specified operation according to the WSDL Returns : hash of arrays of hashes... Args : operation (scalar string) =cut sub result_parameters { my $self = shift; my ($operation) = @_; my $is_action; $self->throw("Operation name must be specified") unless defined $operation; my $opn_hash = $self->operations; unless ( grep /^$operation$/, keys %$opn_hash ) { $is_action = grep /^$operation$/, values %$opn_hash; $self->throw("Operation name '$operation' is not recognized") unless ($is_action); } #check the cache here.... return $self->_cache("result_params_$operation") if $self->_cache("result_params_$operation"); # find the input message type in the portType elt if ($is_action) { my @a = grep {$$opn_hash{$_} eq $operation} keys %$opn_hash; # note this takes the first match $operation = $a[0]; $self->throw("Whaaa??") unless defined $operation; } #check the cache once more after translation.... return $self->_cache("result_params_$operation") if $self->_cache("result_params_$operation"); # do work my $bookmarks = $self->_operation_bookmarks($operation); # eutilities results seem to be a mixture of xs:string element # and complex types which are just xs:seqs of xs:string elements # # cast these as a hash of hashes... my $omsg_elt = $bookmarks->{'o_msg_elt'}; my $opn_schema = $bookmarks->{'schema'}; my $ret = { $omsg_elt->att('name') => [] }; # do a quick recursion: _get_types((values %$ret)[0], $omsg_elt, $opn_schema); return $self->_cache("result_params_$operation", $ret); } sub response_parameters { shift->result_parameters( @_ ) } =head2 operations() Title : operations Usage : @opns = $wsdl->operations; Function: get a hashref with elts ( $operation_name => $soapAction ) for all operations defined by this WSDL Returns : array of scalar strings Args : none =cut sub operations { my $self = shift; return $self->_cache('operations') if $self->_cache('operations'); my %opns; foreach (@{$self->_parse->_operation_elts}) { $opns{$_->att('name')} = ($_->descendants('soap:operation'))[0]->att('soapAction'); } return $self->_cache('operations', \%opns); } =head2 service() Title : service Usage : $wsdl->service Function: gets the SOAP service url associated with this WSDL Returns : scalar string Args : none =cut sub service { my $self = shift; return $self->_cache('service') || $self->_cache('service', ($self->_parse->_service_elt->descendants('soap:address'))[0]->att('location')); } =head2 db() Title : db Usage : Function: If this is an efetch WSDL, returns the db name associated with it Returns : scalar string or undef Args : none =cut sub db { my $self = shift; $self->root->namespace('nsef') =~ /efetch_(.*?)$/; return $1; } =head1 Internals =head2 _operation_bookmarks() Title : _operation_bookmarks Usage : Function: find useful WSDL elements associated with the specified operation; return a hashref of the form { $key => $XML_Twig_Elt_obj, } Returns : hashref with keys: portType namespace schema i_msg_type i_msg_elt o_msg_type o_msg_elt Args : operation name (scalar string) Note : will import schema if necessary =cut sub _operation_bookmarks { my $self = shift; my $operation = shift; # check cache return $self->_cache("bookmarks_$operation") if $self->_cache("bookmarks_$operation"); # do work my %bookmarks; my $pT_opn = $self->_portType_elt->first_child( qq/ operation[\@name="$operation"] / ); my $imsg_type = $pT_opn->first_child('input')->att('message'); my $omsg_type = $pT_opn->first_child('output')->att('message'); # now lookup the schema element name from among the message elts my ($imsg_elt, $omsg_elt); foreach ( @{$self->_message_elts} ) { my $msg_name = $_->att('name'); if ( $imsg_type =~ qr/$msg_name/ ) { $imsg_elt = $_->first_child('part[@element=~/[Rr]equest/]')->att('element'); } if ( $omsg_type =~ qr/$msg_name/) { $omsg_elt = $_->first_child('part[@element=~/[Rr]esult/]')->att('element'); } last if ($imsg_elt && $omsg_elt); } $self->throw("Can't find request schema element corresponding to '$operation'") unless $imsg_elt; $self->throw("Can't find result schema element corresponding to '$operation'") unless $omsg_elt; # $imsg_elt has a namespace prefix, to lead us to the correct schema # as defined in the wsdl element. Get that schema $imsg_elt =~ /(.*?):/; my $opn_ns = $self->root->namespace($1); my $opn_schema = $self->_types_elt->first_child("xs:schema[\@targetNamespace='$opn_ns']"); $opn_schema ||= $self->_types_elt->first_child("xs:schema"); # only one $self->throw("Can't find types schema corresponding to '$operation'") unless defined $opn_schema; # need to import the schema? do it here. if ( my $import_elt = $opn_schema->first_child("xs:import") ) { my $import_url = $NCBI_BASEURL.$import_elt->att('schemaLocation'); my $imported = XML::Twig->new(); # better error checking here? eval { $imported->parse(Bio::WebAgent->new()->get($import_url)->content); }; $self->throw("Schema import failed (tried url '$import_url') : $@") if $@; my $imported_schema = $imported->root; # get included schemata my @included = $imported_schema->children("xs:include"); foreach (@included) { my $url = $NCBI_BASEURL.$_->att('schemaLocation'); my $incl = XML::Twig->new(); eval { $incl->parse( Bio::WebAgent->new()->get($url)->content ); }; $self->throw("Schema include failed (tried url '$url') : $@") if $@; # cut-n-paste my @incl = $incl->root->children; $_->cut; foreach my $child (@incl) { $child->cut; $child->paste( last_child => $_->former_parent ); } } # cut-n-paste $opn_schema->cut; $imported_schema->cut; $imported_schema->paste( first_child => $opn_schema->former_parent ); $opn_schema = $imported_schema; } # find the definition of $imsg_elt in $opn_schema $imsg_elt =~ s/.*?://; $imsg_elt = $opn_schema->first_child("xs:element[\@name='$imsg_elt']"); $self->throw("Can't find request element definition in schema corresponding to '$operation'") unless defined $imsg_elt; $omsg_elt =~ s/.*?://; $omsg_elt = $opn_schema->first_child("xs:element[\@name='$omsg_elt']"); $self->throw("Can't find result element definition in schema corresponding to '$operation'") unless defined $omsg_elt; @bookmarks{qw(portType i_msg_type o_msg_type namespace schema i_msg_elt o_msg_elt ) } = ($pT_opn, $imsg_type, $omsg_type, $opn_ns, $opn_schema, $imsg_elt, $omsg_elt); return $self->_cache("bookmarks_$operation", \%bookmarks); } =head2 _parse() Title : _parse Usage : $wsdl->_parse Function: parse the wsdl at url and create accessors for section twig elts Returns : self Args : =cut sub _parse { my $self = shift; my @args = @_; return $self if $self->_parsed; # already done $self->throw("Neither URL nor WSDL set in object") unless $self->url || $self->wsdl; eval { if ($self->wsdl) { $self->_twig->parsefile($self->wsdl); } else { eval { my $tfh = File::Temp->new(-UNLINK=>1); Bio::WebAgent->new()->get($self->url, ':content_file' => $tfh->filename); $tfh->close; $self->_twig->parsefile($tfh->filename); $self->wsdl($tfh->filename); }; $self->throw("URL parse failed : $@") if $@; } }; # $self->throw("Parser issue : $@") if $@; die $@ if $@; $self->_set_from_args( $self->_sections, -methods => [qw(_types_elt _message_elts _portType_elt _binding_elt _operation_elts _service_elt)], -create => 1 ); $self->_parsed(1); return $self; } =head2 root() Title : root Usage : $obj->root($newval) Function: holds the root Twig elt of the parsed WSDL Example : Returns : value of root (an XML::Twig::Elt) Args : on set, new value (an XML::Twig::Elt or undef, optional) =cut sub root { my $self = shift; return $self->{'root'} = shift if @_; return $self->{'root'}; } =head2 url() Title : url Usage : $obj->url($newval) Function: get/set the WSDL url Example : Returns : value of url (a scalar string) Args : on set, new value (a scalar or undef, optional) =cut sub url { my $self = shift; return $self->{'url'} = shift if @_; return $self->{'url'}; } =head2 wsdl() Title : wsdl Usage : $obj->wsdl($newval) Function: get/set wsdl XML filename Example : Returns : value of wsdl (a scalar string) Args : on set, new value (a scalar string or undef, optional) =cut sub wsdl { my $self = shift; my $file = shift; if (defined $file) { $self->throw("File not found") unless (-e $file) || (ref $file eq 'File::Temp'); return $self->{'wsdl'} = $file; } return $self->{'wsdl'}; } =head2 _twig() Title : _twig Usage : $obj->_twig($newval) Function: XML::Twig object for handling the wsdl Example : Returns : value of _twig (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _twig { my $self = shift; return $self->{'_twig'} = shift if @_; return $self->{'_twig'}; } =head2 _sections() Title : _sections Usage : $obj->_sections($newval) Function: holds hashref of twigs corresponding to main wsdl elements; filled by _parse() Example : Returns : value of _sections (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _sections { my $self = shift; return $self->{'_sections'} = shift if @_; return $self->{'_sections'}; } =head2 _cache() Title : _cache Usage : $wsdl->_cache($newval) Function: holds the wsdl info cache Example : Returns : value of _cache (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _cache { my $self = shift; my ($name, $value) = @_; unless (@_) { return $self->{'_cache'} = {}; } if (defined $value) { return $self->{'_cache'}->{$name} = $value; } return $self->{'_cache'}->{$name}; } sub clear_cache { shift->_cache() } =head2 _parsed() Title : _parsed Usage : $obj->_parsed($newval) Function: flag to indicate wsdl already parsed Example : Returns : value of _parsed (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub _parsed { my $self = shift; return $self->{'_parsed'} = shift if @_; return $self->{'_parsed'}; } # =head2 _get_types() # Title : _get_types # Usage : very internal # Function: recursively parse through custom types # Returns : # Args : arrayref, XML::Twig::Elt, XML::Twig::Elt # (return array, type element, schema root) # =cut sub _get_types { my ($res, $elt, $sch, $visited) = @_; my $is_choice; $visited ||= []; # assuming max 1 xs:sequence or xs:choice per element my $seq = ($elt->descendants('xs:sequence'))[0]; $is_choice = ($seq ? '' : '|'); $seq ||= ($elt->descendants('xs:choice'))[0]; return 1 unless $seq; foreach ( $seq->descendants('xs:element') ) { for my $type ($_->att('type') || $_->att('ref')) { !defined($type) && do { Bio::Root::Root->throw("neither type nor ref attributes defined; cannot proceed"); last; }; $type eq 'xs:string' && do { push @$res, { $_->att('name').$is_choice => 1}; last; }; do { # custom type # find the type def in schema $type =~ s/.*?://; # strip tns if (grep /^$type$/, @$visited) { # check for circularity push @$res, { $_->att('name').$is_choice => "$type(reused)"}if $_->att('name'); last; } push @$visited, $type; my $new_elt = $sch->first_child("xs:complexType[\@name='$type']"); if (defined $new_elt) { my $new_res = []; push @$res, { $_->att('name').$is_choice => $new_res }; _get_types($new_res, $new_elt, $sch, $visited); } else { # a 'ref', make sure it's defined $new_elt = $sch->first_child("xs:element[\@name='$type']"); $DB::single=1 unless $new_elt; Bio::Root::Root->throw("type not defined in schema; cannot proceed") unless defined $new_elt; push @$res, { $new_elt->att('name').$is_choice => 1 }; } last; } } } return 1; } sub DESTROY { my $self = shift; if (ref($self->wsdl) eq 'File::Temp') { unlink $self->wsdl->filename; } } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities.pm000077500000000000000000000600771302566030400224130ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities - Interface to the NCBI Entrez web service *BETA* =head1 SYNOPSIS use Bio::DB::SoapEUtilities; # factory construction my $fac = Bio::DB::SoapEUtilities->new() # executing a utility call #get an iteratable adaptor my $links = $fac->elink( -dbfrom => 'protein', -db => 'taxonomy', -id => \@protein_ids )->run(-auto_adapt => 1); # get a Bio::DB::SoapEUtilities::Result object my $result = $fac->esearch( -db => 'gene', -term => 'sonic and human')->run; # get the raw XML message my $xml = $fac->efetch( -db => 'gene', -id => \@gids )->run( -raw_xml => 1 ); # change parameters my $new_result = $fac->efetch( -db => 'gene', -id => \@more_gids)->run; # reset parameters $fac->efetch->reset_parameters( -db => 'nucleotide', -id => $nucid ); $result = $fac->efetch->run; # parsing and iterating the results $count = $result->count; @ids = $result->ids; while ( my $linkset = $links->next_link ) { $submitted = $linkset->submitted_id; } ($taxid) = $links->id_map($submitted_prot_id); $species_io = $fac->efetch( -db => 'taxonomy', -id => $taxid )->run( -auto_adapt => 1); $species = $species_io->next_species; $linnaeus = $species->binomial; =head1 DESCRIPTION This module allows the user to query the NCBI Entrez database via its SOAP (Simple Object Access Protocol) web service (described at L). The basic tools (C) are available as methods off a C factory object. Parameters for each tool can be queried, set and reset for each method through the L standard calls (C). Returned data can be retrieved, accessed and parsed in several ways, according to user preference. Adaptors and object iterators are available for C, C, C, and C results. =head1 USAGE The C system has been designed to be as easy (few includes, available parameter facilities, reasonable defaults, intuitive aliases, built-in pipelines) or as complex (accessors for underlying low-level objects, all parameters accessible, custom hooks for builder objects, facilities for providing local copies of WSDLs) as the user requires or desires. (To the extent that it does not succeed in either direction, it is up to the user to report to the mailing list (L)!) =head2 Factory To begin, make a factory: my $fac = Bio::DB::SoapEUtilities->new(); From the factory, utilities are called, parameters are set, and results or adaptors are retrieved. If you have your own copy of the wsdl, use my $fac = Bio::Db::SoapEUtilities->new( -wsdl_file => $my_wsdl ); otherwise, the correct one will be obtained over the network (by L and friends). =head2 Utilities and parameters To run any of the standard NCBI EUtilities (C), call the desired utility from the factory. To use a utility, you must set its parameters and run it to get a result. TMTOWTDI: # verbose my $fetch = $fac->efetch(); $fetch->set_parameters( -db => 'gene', -id => [828392, 790]); my $result = $fetch->run; # compact my $result = $fac->efetch(-db =>'gene',-id => [828392,790])->run; # change ids $fac->efetch->set_parameters( -id => 470338 ); $result = $fac->run; # another util $result = $fac->esearch(-db => 'protein', -term => 'BRCA and human')->run; # the utilities are kept separate %search_params = $fac->esearch->get_parameters; %fetch_params = $fac->efetch->get_parameters; $search_param{db}; # is 'protein' $fetch_params{db}; # is 'gene' The factory is L compliant: that means you can find out what you can set with @available_search = $fac->esearch->available_parameters; @available_egquery = $fac->egquery->available_parameters; For more information on parameters, see L. =head2 Results The "intermediate" object for C query results is the L. This is a BioPerly parsing of the SOAP message sent by NCBI when a query is C. This can be very useful on it's own, but most users will likely want to proceed directly to L, which take a C and turn it into more intuitive/familiar BioPerl objects. Go there if the following details are too gory. Results can be highly- or lowly-parsed, depending on the parameters passed to the factory C method. To get the raw XML message with no parsing, do my $xml = $fac->$util->run(-raw_xml => 1); # $xml is a scalar string To retrieve a L object with limited parsing, but with accessors to the L message (provided by L), do my $result = $fac->$util->run(-no_parse => 1); my $som = $result->som; my $method_hash = $som->method; # etc... To retrieve a C object with message elements parsed into accessors, including C and C, run without arguments: my $result = $fac->esearch->run() my $count = $result->count; my @Count = $result->Count; # counts for each member of # the translation stack my @ids = $result->IdList_Id; # from automatic message parsing @ids = $result->ids; # a convenient alias See L for more, even gorier details. =head2 Adaptors Adaptors convert EUtility Cs into convenient objects, via a handle that usually provides an iterator, in the spirit of L. These are probably more useful than the C to the typical user, and so you can retrieve them automatically by setting the C parameter C<-auto_adapt => 1>. In general, retrieve an adaptor like so: $adp = $fac->$util->run( -auto_adapt => 1 ); # iterate... while ( my $obj = $adp->next_obj ) { # do stuff with $obj } The adaptor itself occasionally possesses useful methods besides the iterator. The method C always works, but a natural alias is also always available: $seqio = $fac->esearch->run( -auto_adapt => 1 ); while ( my $seq = $seqio->next_seq ) { # do stuff with $seq } In the above example, C<-auto_adapt => 1> also instructs the factory to perform an C based on the ids returned by the C (if any), so that the adaptor returned iterates over L objects. Here is a rundown of the different adaptor flavors: =over =item * C, Fetch Adaptors, and BioPerl object iterators The C creates bona fide BioPerl objects. Currently, there are FetchAdaptor subclasses for sequence data (both Genbank and FASTA rettypes) and taxonomy data. The choice of FetchAdaptor is based on information in the result message, and should be transparent to the user. $seqio = $fac->efetch( -db =>'nucleotide', -id => \@ids, -rettype => 'gb' )->run( -auto_adapt => 1 ); while (my $seq = $seqio->next_seq) { my $taxio = $fac->efetch( -db => 'taxonomy', -id => $seq->species->ncbi_taxid )->run(-auto_adapt => 1); my $tax = $taxio->next_species; unless ( $tax->TaxId == $seq->species->ncbi_taxid ) { print "more work for MAJ" } } See the pod for the FetchAdaptor subclasses (e.g., L) for more detail. =item * C, the Link adaptor, and the C iterator The C manages LinkSets. In C, an C call B preserves the correspondence between submitted and retrieved ids. The mapping between these can be accessed from the adaptor object directly as C my $links = $fac->elink( -db => 'protein', -dbfrom => 'nucleotide', -id => \@nucids )->run( -auto_adapt => 1 ); # maybe more than one associated id... my @prot_0 = $links->id_map( $nucids[0] ); Or iterate over the linksets: while ( my $ls = $links->next_linkset ) { @ids = $ls->ids; @submitted_ids = $ls->submitted_ids; # etc. } =item * C, the DocSum adaptor, and the C iterator The C manages docsums, the C return type. The objects returned by iterating with a C have accessors that let you obtain field information directly. Docsums contain lots of easy-to-forget fields; use C to remind yourself. my $docs = $fac->esummary( -db => 'taxonomy', -id => 527031 )->run(-auto_adapt=>1); # iterate over docsums while (my $d = $docs->next_docsum) { @available_items = $docsum->item_names; # any available item can be called as an accessor # from the docsum object...watch your case... $sci_name = $d->ScientificName; $taxid = $d->TaxId; } =item * C, the GQuery adaptor, and the C iterator The C manages global query items returned by calls to C, which identifies all NCBI databases containing hits for your query term. The databases actually containing hits can be retrieved directly from the adaptor with C: my $queries = $fac->egquery( -term => 'BRCA and human' )->run(-auto_adapt=>1); my @dbs = $queries->found_in_dbs; Retrieve the global query info returned for B database with C: my $prot_q = $queries->query_by_db('protein'); if ($prot_q->count) { #do something } Or iterate as usual: while ( my $q = $queries->next_query ) { if ($q->status eq 'Ok') { # do sth } } =back =head2 Web environments and query keys To make large or complex requests for data, or to share queries, it may be helpful to use the NCBI WebEnv system to manage your queries. Each EUtility accepts the following parameters: -usehistory -WebEnv -QueryKey for this purpose. These store the details of your queries serverside. C attempts to make using these relatively straightforward. Use C objects to obtain the correct parameters, and don't forget C<-usehistory>: my $result1 = $fac->esearch( -term => 'BRCA and human', -db => 'nucleotide', -usehistory => 1 )->run( -no_parse=>1 ); my $result = $fac->esearch( -term => 'AND early onset', -QueryKey => $result1->query_key, -WebEnv => $result1->webenv )->run( -no_parse => 1 ); my $result = $fac->esearch( -db => 'protein', -term => 'sonic', -usehistory => 1 )->run( -no_parse => 1 ); # later (but not more than 8 hours later) that day... $result = $fac->esearch( -WebEnv => $result->webenv, -QueryKey => $result->query_key, -RetMax => 800 # get 'em all )->run; # note we're parsing the result... @all_ids = $result->ids; =head2 Error checking Two kinds of errors can ensue on an Entrez SOAP run. One is a SOAP fault, and the other is an error sent in non-faulted SOAP message from the server. The distinction is probably systematic, and I would welcome an explanation of it. To check for result errors, try something like: unless ( $result = $fac->$util->run ) { die $fac->errstr; # this will catch a SOAP fault } # a valid result object was returned, but it may carry an error if ($result->count == 0) { warn "No hits returned"; if ($result->ERROR) { warn "Entrez error : ".$result->ERROR; } } Error handling will be improved in the package eventually. =head1 SEE ALSO L, L, L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::DB::SoapEUtilities; use strict; use Bio::Root::Root; use Bio::DB::ESoap; use Bio::DB::SoapEUtilities::DocSumAdaptor; use Bio::DB::SoapEUtilities::FetchAdaptor; use Bio::DB::SoapEUtilities::GQueryAdaptor; use Bio::DB::SoapEUtilities::LinkAdaptor; use Bio::DB::SoapEUtilities::Result; use base qw(Bio::Root::Root Bio::ParameterBaseI ); our $AUTOLOAD; =head2 new Title : new Usage : my $eutil = new Bio::DB::SoapEUtilities(); Function: Builds a new Bio::DB::SoapEUtilities object Returns : an instance of Bio::DB::SoapEUtilities Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($db, $wsdl) = $self->_rearrange( [qw( DB WSDL_FILE )], @args ); $self->{db} = $db; $self->{'_wsdl_file'} = $wsdl; return $self; } =head2 run() Title : run Usage : $fac->$eutility->run(@args) Function: Execute the EUtility Returns : true on success, false on fault or error (reason in errstr(), for more detail check the SOAP message in last_result() ) Args : named params appropriate to utility -auto_adapt => boolean ( return an iterator over results as appropriate to util if true) -raw_xml => boolean ( return raw xml result; no processing ) Bio::DB::SoapEUtilities::Result constructor parms =cut sub run { my $self = shift; my @args = @_; $self->throw("run method requires named arguments") if @args % 2; $self->throw("call run method like '\$fac->\$eutility->run(\@args)") unless $self->_caller_util; my ($autofetch, $raw_xml) = $self->_rearrange( [qw( AUTO_ADAPT RAW_XML)], @args ); my ($adaptor); my %args = @args; # add tool argument for NCBI records $args{tool} = "BioPerl"; my %params = $self->get_parameters; $self->warn("No -email parameter set : be advised that NCBI requires a valid email to accompany all requests") unless $params{email}; my $util = $self->_caller_util; # pass util args to run only to a downstream utility (i.e., efetch # on autofetch.. # $self->set_parameters(%args) if %args; # kludge for elink : make sure to-ids and from-ids are associated if ( $util eq 'elink' ) { my $es = $self->_soap_facs($util); my $ids = $es->id; if (ref $ids eq 'ARRAY') { my %ids; @ids{@$ids} = (1) x scalar @$ids; $es->id(\%ids); } } $self->_soap_facs($util)->_client->outputxml($raw_xml); my $som = $self->{'_response_message'} = $self->_soap_facs($util)->run; # raw xml only... if ($raw_xml) { return $som; } # SOAP::SOM parsing... # check response status if ($som->fault) { $self->{'errstr'} = $som->faultstring; return 0; } # elsif non-fault error if (my $err = $som->valueof("//ErrorList")) { while ( my ($key, $val) = each %$err ) { $self->{'errstr'} .= join( " : ", $key, $val )."\n"; }; $self->{'errstr'} =~ s/\n$//; return 0; } # attach some key properties to the factory $self->{'_WebEnv'} = $som->valueof("//WebEnv"); # create convenient aliases off result for different utils my @alias_hash; for ($util) { /einfo/ && do { my %args = $self->get_parameters; if ($args{db}) { push @alias_hash, ( '-alias_hash' => { 'record_count' => 'DbInfo_Count', 'last_update' => 'DbInfo_LastUpdate', 'db' => 'DbInfo_DbName', 'description' => 'DbInfo_Description' } ); } else { push @alias_hash, ('-alias_hash' => {'dbs' => 'DbList_DbName'} ); } last; }; # put others here as nec } my $result = Bio::DB::SoapEUtilities::Result->new($self, @args, @alias_hash); # success, parse it out if ($autofetch) { for ($self->_caller_util) { $_ eq 'esearch' && do { # do an efetch with the same db and a returned list of ids... # reentering here! my $ids = $result->ids; if (!$result->count) { $self->warn("Can't fetch; no records returned"); return $result; } if (!$result->ids) { $self->warn("Can't fetch; no id list returned"); return $result; } if ( !$self->db ) { my %h = $self->get_parameters; $self->{db} = $h{db} || $h{DB}; } # pass run() args to the downstream utility here # (so can specify -rettype, basically) # note @args will contain -auto_adapt => 1 here. # keep the email arg my %parms = $self->get_parameters; $adaptor = $self->efetch( -db => $self->db, -id => $ids, -email => $parms{email}, -tool => $parms{tool}, @args )->run(-no_parse => 1, @args); last; }; $_ eq 'elink' && do { $adaptor = Bio::DB::SoapEUtilities::LinkAdaptor->new( -result => $result ); last; }; $_ eq 'esummary' && do { $adaptor = Bio::DB::SoapEUtilities::DocSumAdaptor->new( -result => $result ); last; }; $_ eq 'egquery' && do { $adaptor = Bio::DB::SoapEUtilities::GQueryAdaptor->new( -result => $result ); last; }; $_ eq 'efetch' && do { $adaptor = Bio::DB::SoapEUtilities::FetchAdaptor->new( -result => $result ); last; }; # else, ignore } return ($adaptor || $result); } else { return $result; 1; } } =head2 Useful Accessors =head2 response_message() Title : response_message Aliases : last_response, last_result Usage : $som = $fac->response_message Function: get the last response message Returns : a SOAP::SOM object Args : none =cut sub response_message { shift->{'_response_message'} } sub last_response { shift->{'_response_message'} } sub last_result { shift->{'_response_message'} } =head2 webenv() Title : webenv Usage : Function: contains WebEnv key referencing the session (set after run() ) Returns : scalar Args : none =cut sub webenv { shift->{'_WebEnv'} } =head2 errstr() Title : errstr Usage : $fac->errstr Function: get the last error, if any Example : Returns : value of errstr (a scalar) Args : none =cut sub errstr { shift->{'errstr'} } sub _wsdl_file { shift->{'_wsdl_file'} } =head2 Bio::ParameterBaseI compliance =head2 available_parameters() Title : available_parameters Usage : Function: get available request parameters for calling utility Returns : Args : -util => $desired_utility [optional, default is caller utility] =cut sub available_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); delete $args{'-util'}; delete $args{'-UTIL'}; $self->_soap_facs($util)->available_parameters(%args); } =head2 set_parameters() Title : set_parameters Usage : Function: Returns : none Args : -util => $desired_utility [optional, default is caller utility], named utility arguments =cut sub set_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); delete $args{'-util'}; delete $args{'-UTIL'}; $self->_soap_facs($util)->set_parameters(%args); } =head2 get_parameters() Title : get_parameters Usage : Function: Returns : array of named parameters Args : utility (scalar string) [optional] (default is caller utility) =cut sub get_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); return $self->_soap_facs($util)->get_parameters; } =head2 reset_parameters() Title : reset_parameters Usage : Function: Returns : none Args : -util => $desired_utility [optional, default is caller utility], named utility arguments =cut sub reset_parameters { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); delete $args{'-util'}; delete $args{'-UTIL'}; $self->_soap_facs($util)->reset_parameters(%args); } =head2 parameters_changed() Title : parameters_changed Usage : Function: Returns : boolean Args : utility (scalar string) [optional] (default is caller utility) =cut sub parameters_changed { my $self = shift; my @args = @_; my %args = @args; my $util = $args{'-util'} || $args{'-UTIL'} || $self->_caller_util; return unless $self->_soap_facs($util); return $self->_soap_facs($util)->parameters_changed; } # idea behind using autoload: attempt to buffer the module # against additions of new eutilities, and (of course) to # reduce work (laziness, not Laziness) sub AUTOLOAD { my $self = shift; my $util = $AUTOLOAD; my @args = @_; $util =~ s/.*:://; if ( $util =~ /^e/ ) { # this will bite me someday # create an ESoap factory for this utility my $fac = $self->_soap_facs($util); # check cache my @pms = ( -util => $util ); if ($self->_wsdl_file) { push @pms, ( -wsdl_file => $self->_wsdl_file ); } eval { $fac ||= Bio::DB::ESoap->new( @pms ); }; for ($@) { /^$/ && do { $self->_soap_facs($util,$fac); # put in cache last; }; /Utility .* not recognized/ && do { my $err = (ref $@ ? $@->text : $@); $self->throw($err); }; do { #else my $err = (ref $@ ? $@->text : $@); die $err; $self->throw("Problem creating ESoap client : $err"); }; } # arg setting $self->throw("Named arguments required") if @args % 2; $fac->set_parameters(@args) if @args; $self->_caller_util($util); return $self; # now, can do $obj->esearch()->run, etc, with methods in # this package, with an appropriate low-level factory # set up in the background. } elsif ($self->_caller_util) { # delegate to the appropriate soap factory my $method = $util; $util = $self->_caller_util; my $soapfac = $self->_soap_facs($util); if ( $soapfac && $soapfac->can($method) ) { return $soapfac->$method(@args); } } else { $self->throw("Can't locate method '$util' in module ". __PACKAGE__); } 1; } =head2 _soap_facs() Title : _soap_facs Usage : $self->_soap_facs($util, $fac) Function: caches Bio::DB::ESoap factories for the eutils in use by this instance Example : Returns : Bio::DB::ESoap object Args : $eutility, [optional on set] $esoap_factory_object =cut sub _soap_facs { my $self = shift; my ($util, $fac) = @_; $self->throw("Utility must be specified") unless $util; $self->{'_soap_facs'} ||= {}; if ($fac) { return $self->{'_soap_facs'}->{$util} = $fac; } return $self->{'_soap_facs'}->{$util}; } =head2 _caller_util() Title : _caller_util Usage : $self->_caller_util($newval) Function: the utility requested off the main SoapEUtilities object Example : Returns : value of _caller_util (a scalar string, a valid eutility) Args : on set, new value (a scalar string [optional]) =cut sub _caller_util { my $self = shift; return $self->{'_caller_util'} = shift if @_; return $self->{'_caller_util'}; } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/000077500000000000000000000000001302566030400220405ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/DocSumAdaptor.pm000077500000000000000000000132411302566030400251070ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::DocSumAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::DocSumAdaptor - Handle for Entrez SOAP DocSums =head1 SYNOPSIS my $fac = Bio::DB::SoapEUtilities->new(); # run a query, returning a DocSumAdaptor my $docs = $fac->esummary( -db => 'taxonomy', -id => 527031 )->run(-auto_adapt=>1); # iterate over docsums while (my $d = $docs->next_docsum) { @available_items = $docsum->item_names; # any available item can be called as an accessor # from the docsum object...watch your case... $sci_name = $d->ScientificName; $taxid = $d->TaxId; } =head1 DESCRIPTION This adaptor provides an iterator (C) and other convenience functions for parsing NCBI Entrez EUtility C SOAP results. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::DocSumAdaptor; use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::DocSumAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::DocSumAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::DocSumAdaptor Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($result) = $self->_rearrange([qw(RESULT)], @args); $self->throw("DocSumAdaptor requires a SoapEUtilities::Result argument") unless $result; $self->throw("DocSumAdaptor only works with elink results") unless $result->util eq 'esummary'; $self->{'_result'} = $result; $self->{'_idx'} = 1; return $self; } sub result { shift->{'_result'} } =head2 next_docsum() Title : next_docsum Usage : Function: return the next DocSum from the attached Result Returns : Args : =cut sub next_docsum { my $self = shift; my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; my $som = $self->result->som; return unless $som->valueof($stem); my ($ret, %params); my $get = sub { $som->valueof("$stem/".shift) }; $params{'-id'} = $get->('Id'); my $names = []; for (my $i = 1; my $data = $som->dataof("$stem/[$i]"); $i++) { if ( $data->value and $data->value !~ /^\s*$/) { my $name = $data->attr->{'Name'}; next unless $name; my $content = $som->valueof("$stem/[$i]/ItemContent"); unless (defined $content) { next unless $som->dataof("$stem/[$i]/Item"); my $h = {}; _traverse_items("$stem/[$i]", $som, $h); $content = $h; } push @$names, $name; $params{$name} = $content; } } $params{'_item_names'} = $names; my $class = ref($self)."::docsum"; $ret = $class->new(%params); ($self->{'_idx'})++; return $ret; } sub next_obj { shift->next_docsum(@_) } sub rewind { shift->{'_idx'} = 1; }; sub _traverse_items { my ($stem, $som, $h) = @_; for (my $i = 1; my $data = $som->dataof($stem."/[$i]"); $i++) { my $name = $data->attr->{'Name'}; next unless $name; if ($name =~ /Type$/) { # clip out this node _traverse_items("$stem/[$i]", $som, $h); } else { my $content = $som->valueof("$stem/[$i]/ItemContent"); if ($content) { $$h{$name} = $content; } else { $$h{$name} = {}; _traverse_items("$stem/[$i]", $som, $$h{$name}); } } } return; } 1; #### package Bio::DB::SoapEUtilities::DocSumAdaptor::docsum; use strict; use warnings; use base qw(Bio::Root::Root); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my %args = @args; $self->_set_from_args( \%args, -methods => [map { /^-?(.*)/ } keys %args], -create => 1, -code => 'my $self = shift; my $d = shift; my $k = \'_\'.$method; $self->{$k} = $d if $d; return (ref($self->{$k}) eq \'ARRAY\' ? @{$self->{$k}} : $self->{$k});' ); return $self; } =head2 item_names() Title : item_names Usage : @accs = $docsum->item_names Function: Return a list of items accessible from the object Returns : array of scalar strings Args : none =cut sub item_names { my $a = shift->{'__item_names'} ; return @$a if $a } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/FetchAdaptor.pm000077500000000000000000000146241302566030400247540ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::FetchAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::FetchAdaptor - Conversion of Entrez SOAP messages to BioPerl objects =head1 SYNOPSIS $fac = Bio::DB::SoapEUtilities->new(); $soap_result = $fac->efetch( -db => 'protein', -id => 2597988 ); $adp = Bio::DB::SoapEUtilities::FetchAdaptor( -result => $soap_result, -type => 'seq' ); while ( $gb_seq = $adp->next_obj ) { # do stuff } =head1 DESCRIPTION C is the base class of a system, modeled after L, to parse SOAP responses from the NCBI Entrez C utility into germane BioPerl objects. The user will rarely need to instantiate a C with L object as in the L. It usually suffices to use the C<-auto_adapt> parameter in the factory C method: my $fac = Bio::DB::SoapEUtilities->new(); my $taxio = $fac->efetch(-db => 'taxonomy', -id => 1394)->run(-auto_adapt=>1); my $sp = $taxio->next_species; # Bio::Species objects my $seqio = $fac->efetch(-db => 'protein', -id => 730439)->run(-auto_adapt=>1); my $seq = $seqio->next_seq; # Bio::Seq::RichSeq objects =head1 SEE ALSO L, C subclasses =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::DB::SoapEUtilities::FetchAdaptor; use strict; use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::FetchAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::FetchAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::FetchAdaptor Args : named arguments -som => $soap_som_object (soap message) -type => $type ( optional, forces loading of $type adaptor ) =cut sub new { my ($class,@args) = @_; $class = ref($class) || $class; if ($class =~ /.*?::FetchAdaptor::(\S+)/) { my $self = $class->SUPER::new(@args); $self->_initialize(@args); return $self; } else { my %args = @args; my $result = $args{'-result'} || $args{'-RESULT'}; $class->throw("Bio::DB::SoapEUtilities::Result argument required") unless $result; $class->throw("RESULT argument must be a Bio::DB::SoapEUtilities::Result object") unless ref($result) eq 'Bio::DB::SoapEUtilities::Result'; # identify the correct adaptor module to load using Result info my $type ||= $result->fetch_type; $class->throw("Can't determine fetch type for this result") unless $type; # $type ultimately contains a FetchAdaptor subclass return unless( $class->_load_adaptor($type, $result) ); return "Bio::DB::SoapEUtilities::FetchAdaptor::$type"->new(@args); } } =head2 _initialize() Title : _initialize Usage : Function: Returns : Args : =cut sub _initialize { my $self = shift; my @args = @_; my ($result, $type) = $self->_rearrange([qw( RESULT TYPE )], @args); $self->throw("Bio::DB::SoapEUtilities::Result argument required") unless $result; $self->throw("RESULT argument must be a Bio::DB::SoapEUtilities::Result object") unless ref($result) eq 'Bio::DB::SoapEUtilities::Result'; $self->{'_type'} = $type || $result->fetch_type; $self->{'_result'} = $result; 1; } =head2 _load_adaptor() Title : _load_adaptor Usage : Function: loads a FetchAdaptor subclass Returns : Args : adaptor type (subclass name) =cut sub _load_adaptor { my ($class, $type, $result) = @_; return unless $type; # specials for ($result->fetch_type) { $_ eq 'seq' && do { $_[1] = $type = 'species' if $result->fetch_db and $result->fetch_db eq 'taxonomy'; last; }; # else, leave $type alone } my $module = "Bio::DB::SoapEUtilities::FetchAdaptor::".$type; my $ok; eval { $ok = $class->_load_module($module); }; for ($@) { /^$/ && do { return $ok; }; /Can't locate/ && do { $class->throw("Fetch adaptor for '$type' not found"); }; do { # else $class->throw("Error in fetch adaptor for '$type' : $@"); }; } } =head2 obj_class() Title : obj_class Usage : $adaptor->obj_class Function: Returns the fully qualified BioPerl classname of the objects returned by next_obj() Returns : scalar string (class name) Args : none =cut sub obj_class { shift->throw_not_implemented } =head2 next_obj() Title : next_obj Usage : $obj = $adaptor->next_obj Function: Returns the next parsed BioPerl object from the adaptor Returns : object of class obj_class() Args : none =cut sub next_obj { shift->throw_not_implemented } =head2 rewind() Title : rewind Usage : Function: Rewind the adaptor's iterator Returns : Args : none =cut sub rewind { shift->throw_not_implemented } =head2 result() Title : result Usage : Function: contains the SoapEUtilities::Result object Returns : Bio::DB::SoapEUtilities::Result object Args : none =cut sub result { shift->{'_result'} } =head2 type() Title : type Usage : Function: contains the fetch type of this adaptor Returns : Args : =cut sub type { shift->{'_type'} } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/FetchAdaptor/000077500000000000000000000000001302566030400244045ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/FetchAdaptor/seq.pm000077500000000000000000000563601302566030400255470ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::FetchAdaptor::seq # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::FetchAdaptor::seq - Fetch adaptor for 'seq' efetch SOAP messages =head1 SYNOPSIS Imported by L as required. =head1 DESCRIPTION Returns an iterator over L or L objects, depending on the the return type of the C. A standard C to a sequence database will return a GenBank SOAP result; this will be parsed into rich sequence objects: my $fac = Bio::DB::SoapEUtilities->new; my $seqio = $fac->efetch(-db => 'protein', -id => 730439)->run(-auto_adapt=>1); my $seq = $seqio->next_seq; $seq->species->binomial; # returns 'Bacillus caldolyticus' An C with C<-rettype => 'fasta'> will be parsed into L objects (VERY much faster): $seqio = $fac->efetch( -rettype => 'fasta' )->run(-auto_adapt=>1); $seq = $seqio->next_seq; $seq->species; # undef $seq->desc; # kitchen sink To find out the object type returned: $class = $seqio->obj_class; as for all L objects. =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 CONTRIBUTORS Much inspiration from L and family. =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::FetchAdaptor::seq; use strict; use Bio::Root::Root; use Bio::Annotation::Collection; use Bio::Annotation::Comment; use Bio::Annotation::DBLink; use Bio::Annotation::Reference; use Bio::Annotation::SimpleValue; use Bio::Factory::FTLocationFactory; use Bio::SeqFeature::Generic; use Bio::Seq::SeqBuilder; use Bio::Seq::SeqFactory; use Bio::Species; use base qw(Bio::DB::SoapEUtilities::FetchAdaptor Bio::Root::Root); our %VALID_ALPHABET = ( 'AA' => 'protein', 'DNA' => 'dna', 'RNA' => 'rna' ); our %TYPE_XLT = ( 'Bio::Seq' => ['TSeqSet','TSeq'], 'Bio::Seq::RichSeq' => ['GBSet', 'GBSeq'] ); sub _initialize { my ($self, @args) = @_; $self->SUPER::_initialize(@args); my ($builder, $seqfac ) = $self->_rearrange( [qw(SEQBUILDER SEQFACTORY)], @args ); # choose rich or simple seq based on result my ($t) = keys %{$self->result->som->method}; for ($t) { /^GB/ && do { $t = 'GB'; # genbank info $self->{'_obj_class'} = ($seqfac ? $seqfac->type : 'Bio::Seq::RichSeq'); last; }; /^T/ && do { $t = 'T'; # fasta info $self->{'_obj_class'} = ($seqfac ? $seqfac->type : 'Bio::Seq'); last; }; $self->throw("FetchAdaptor::seq : unrecognized result elt type '$t', can't parse"); } $self->{'_builder'} = $builder || Bio::Seq::SeqBuilder->new(); $self->{'_builder'}->sequence_factory( $seqfac || Bio::Seq::SeqFactory->new( -type => $self->{'_obj_class'} ) ); $self->{'_locfac'} = Bio::Factory::FTLocationFactory->new(); $self->{'_idx'} = 1; 1; } sub rewind { shift->{'_idx'} = 1 } sub obj_class { shift->{'_obj_class'} } sub builder { shift->{'_builder'} }; sub locfac { shift->{'_locfac'} }; sub next_obj { my $self = shift; my $t = $TYPE_XLT{$$self{_obj_class}}; my $stem = "//$$t[0]/[".$self->{'_idx'}."]"; my $som = $self->result->som; my $seqid; return unless defined $som->valueof("$stem"); my $get = sub { $som->valueof("$stem/$$t[1]_".shift) }; # speed up (?) by caching top-level data hash my $toplev = $som->valueof("$stem"); my $get_tl = sub { $toplev->{"$$t[1]_".shift} }; my %params = (-verbose => $self->verbose); if ($t->[0] =~ /^T/) { $params{'-display_id'} = $get_tl->('accver'); $params{'-primary_id'} = $get_tl->('gi'); $params{'-length'} = $get_tl->('length'); $params{'-desc'} = $get_tl->('defline'); $params{'-seq'} = $get_tl->('sequence'); $params{'-alphabet'} = $get_tl->('seqtype') || undef; $self->builder->add_slot_value(%params); ($self->{_idx})++; if ( !$self->builder->want_object ) { # skip $self->builder->make_object; goto &next_obj; } else { return $self->builder->make_object; } } elsif ($t->[0] =~ /^GB/) { # source, id, alphabet $params{'-display_id'} = $get_tl->('locus'); $params{'-length'} = $get_tl->('length'); $get_tl->('moltype') =~ /(AA|[DR]NA)/; $params{'-alphabet'} = $VALID_ALPHABET{$1} || ''; # molecule, division, dates $params{'-molecule'} = $get_tl->('moltype'); $params{'-is_circular'} = ($get_tl->('topology') eq 'circular'); $params{'-division'} = $get->('division'); $params{'-dates'} = [$get_tl->('create-date'), $get_tl->('update-date')]; $self->builder->add_slot_value(%params); %params = (); if ( !$self->builder->want_object ) { # skip this $self->builder->make_object; ($self->{_idx})++; goto &next_obj; } # accessions, version, pid, description $get_tl->('accession-version') =~ /.*\.([0-9]+)$/; $params{'-version'} = $params{'-seq_version'} = $1; my @secondary_ids; my @ids = $get->('other-seqids/GBSeqid'); foreach (@ids) { /^gi\|([0-9]+)/ && do { $seqid = $params{'-primary_id'} = $1; $params{'-accession_number'} = $_; # correct? next; }; do { # else push @secondary_ids, $_; next; }; } $params{'-secondary_accessions'} = \@secondary_ids; $params{'-desc'} = $get->('definition'); # sequence if ( $self->builder->want_slot('seq')) { $params{'-seq'} = $get->('sequence'); } # keywords if ($get->('keywords')) { my @kw; foreach my $kw ($som->valueof("$stem/GBSeq_keywords/*")) { push @kw, $kw; } $params{'-keywords'} = join(' ',@kw); } $self->builder->add_slot_value(%params); %params = (); my $ann; # annotations if ($self->builder->want_slot('annotation')) { $ann = Bio::Annotation::Collection->new(); # references if ($get->('references')) { $ann->add_Annotation('reference', $_) for _read_references($stem,$som); } # comment if ($get_tl->('comment')) { $ann->add_Annotation('comment', Bio::Annotation::Comment->new( -tagname => 'comment', -text => $get_tl->('comment') ) ); } # project if ( $get_tl->('project') ) { $ann->add_Annotation('project', Bio::Annotation::SimpleValue->new( -value => $get_tl->('project') ) ); } # contig if ($get_tl->('contig')) { $ann->add_Annotation('contig', Bio::Annotation::SimpleValue->new( -value => $get_tl->('contig') ) ); } # dblink if ($get_tl->('source-db')) { _read_db_source($ann, $get); } $self->builder->add_slot_value(-annotation => $ann); } # features my $feats; if ($self->builder->want_slot('features')) { $feats = _read_features($stem,$som,$self->locfac,$get); $self->builder->add_slot_value( -features => $feats ); } # organism data if ( $self->builder->want_slot('species') && $get_tl->('source') ) { my $sp = _read_species($get); if ($sp && !$sp->ncbi_taxid) { my ($src) = grep { $_->primary_tag eq 'source' } @$feats; if ($src) { foreach my $val ($src->get_tag_values('db_xref')) { $sp->ncbi_taxid(substr($val,6)) if index($val,"taxon:") == 0; } } } $self->builder->add_slot_value( -species => $sp ); } } else { $self->throw("FetchAdaptor::seq : unrecognized result elt type '$t', can't parse"); } ($self->{_idx})++; return $self->builder->make_object; } # mostly ripped from Bio::SeqIO::genbank... sub _read_species { my ($get) = @_; my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown', 'Unspecified', 'Unknown', 'None', 'unclassified', 'unidentified organism', 'not supplied'); # dictionary of synonyms for taxid 32644 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified'); # all above can be part of valid species name my( $sub_species, $species, $genus, $sci_name, $common, $abbr_name, $organelle); $sci_name = $get->('organism') || return; # parse out organelle, common name, abbreviated name if present; # this should catch everything, but falls back to # entire GBSeq_taxonomy element just in case if ($get->('source') =~ m{^ (mitochondrion|chloroplast|plastid)? \s*(.*?) \s*(?: \( (.*?) \) )?\.? $}xms ) { ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional } else { $abbr_name = $get->('source'); # nothing caught; this is a backup! } # Convert data in classification lines into classification array. my @class = split(/; /, $get->('taxonomy')); # do we have a genus? my $possible_genus = quotemeta($class[-1]) . ($class[-2] ? "|" . quotemeta($class[-2]) : ''); if ($sci_name =~ /^($possible_genus)/) { $genus = $1; ($species) = $sci_name =~ /^$genus\s+(.+)/; } else { $species = $sci_name; } # is this organism of rank species or is it lower? # (we don't catch everything lower than species, but it doesn't matter - # this is just so we abide by previous behaviour whilst not calling a # species a subspecies) if ($species && $species =~ /subsp\.|var\./) { ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/; } # Don't make a species object if it's empty or "Unknown" or "None" # return unless $genus and $genus !~ /^(Unknown|None)$/oi; # Don't make a species object if it belongs to taxid 32644 my $src = $get->('source'); return unless ($species || $genus) and !grep { $_ eq $src } @unkn_names; # Bio::Species array needs array in Species -> Kingdom direction push(@class, $sci_name); @class = reverse @class; my $make = Bio::Species->new(); $make->scientific_name($sci_name); $make->classification(@class) if @class > 0; $make->common_name( $common ) if $common; $make->name('abbreviated', $abbr_name) if $abbr_name; $make->organelle($organelle) if $organelle; return $make; } sub next_seq { shift->next_obj } sub _read_references { my ($stem, $som) = @_; my @ret; for ( my $i = 1; $som->valueof($stem."/GBSeq_references/[$i]"); $i++ ) { my $get = sub { $som->valueof($stem."/GBSeq_references/[$i]/GBReference_".shift ) }; my %params; $params{'-title'} = $get->('title'); $params{'-pubmed'} = $get->('pubmed'); $params{'-medline'} = $get->('pubmed'); $params{'-journal'} = $get->('journal'); $params{'-comment'} = $get->('remark'); $params{'-consortium'} = $get->('consortium'); my $pos = $get->('position'); $pos and $pos =~ /^([0-9]+)[.]+([0-9]+)$/; $params{'-start'} = $1; $params{'-end'} = $2; $params{'-gb_reference'} = $get->('reference'); $params{'-authors'} = ''; foreach my $author ( $get->('authors/*') ) { $params{'-authors'} .= " $author"; } push @ret, Bio::Annotation::Reference->new( -tagname => 'reference', %params); } return @ret; } sub _read_features { my ($stem, $som, $locfac, $get_pri) = @_; my @ret; my $seqid = $get_pri->('primary-accession'); for ( my $i = 1; $get_pri->("feature-table/[$i]"); $i++ ) { my $get = sub { $som->valueof($stem."/GBSeq_feature-table/[$i]/GBFeature_".shift ) }; my $loc; my $sf = Bio::SeqFeature::Generic->direct_new(); if ($get->('location')) { # may have to parse GBIntervals instead here... $loc = $locfac->from_string( $get->('location') ); if ($seqid && !($loc->is_remote)) { $loc->seq_id($seqid); } } $sf->location($loc); $sf->seq_id($seqid); $sf->primary_tag($get->('key')); $sf->source_tag('EMBL/GenBank/SwissProt'); # fill other fields using $sf->add_tag_value... # qualifiers are name => value pairs. add as tags # to this feature if ($get->('quals')) { foreach ($get->('quals/*')) { $sf->add_tag_value( $_->{'GBQualifier_name'}, $_->{'GBQualifier_value'} ); } } if ($get->('partial5')) { $sf->add_tag_value( 'is_partial5', $get->('partial5') eq 'true' ? 1 : 0) } if ($get->('partial3')) { $sf->add_tag_value( 'is_partial3', $get->('partial3') eq 'true' ? 1 : 0) } push @ret, $sf; } return \@ret; } sub _read_db_source { my ($ann, $get) = @_; my $dbsource = $get->('source-db'); # ripped mainly from Bio::SeqIO::genbank... # deal with UniProKB dbsources if( $dbsource =~ s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,[^.]+\.\s*// ) { $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $2, -database => $1, -tagname => 'dblink')); if( $dbsource =~ s/created:\s+([^.]+)\.\s*// ) { $ann->add_Annotation ('swissprot_dates', Bio::Annotation::SimpleValue->new (-tagname => 'date_created', -value => $1)); } while( $dbsource =~ s/\s+(sequence|annotation)\s+ updated:\s+([^.]+)\.\s*//xg ) { $ann->add_Annotation ('swissprot_dates', Bio::Annotation::SimpleValue->new (-tagname => 'date_updated', -value => $2)); } $dbsource =~ s/\n/ /g; if ( $dbsource =~ s/xrefs:\s+ ((?:\S+,\s+)+\S+)\s+xrefs/xrefs/x ) { # will use $i to determine even or odd # for swissprot the accessions are paired my $i = 0; for my $dbsrc ( split(/,\s+/,$1) ) { if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) { my ($id,$version) = ($1,$2); $version ='' unless defined $version; my $db; if( $id =~ /^\d\S{3}/) { $db = 'PDB'; } else { $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; } $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $id, -version => $version, -database => $db, -tagname => 'dblink')); } } } elsif ( $dbsource =~ s/xrefs:\s+(.+)\s+xrefs/xrefs/i ) { # download screwed up and ncbi didn't put # acc in for gi numbers my $i = 0; for my $id ( split(/\,\s+/,$1) ) { my ($acc,$db); if( $id =~ /gi:\s+(\d+)/ ) { $acc= $1; $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) { $acc= $1; $db = 'PDB'; } else { $acc= $id; $db = ''; } $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $acc, -database => $db, -tagname => 'dblink')); } } else { warn "Cannot match $dbsource"; } if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+ ((?:\S+,\s+)+\S+)//x ) { for my $id ( split(/\,\s+/,$1) ) { my $db; # quote from Bio::SeqIO::genbank: # this is because GenBank dropped the spaces!!! # I'm sure we're not going to get this right $db = substr($id,0,index($id,':')); $id = substr($id,index($id,':')+1); $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $id, -database => $db, -tagname => 'dblink')); } } } else { if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) { my ($db,$id,$version) = ($1,$2,$3); $ann->add_Annotation ('dblink', Bio::Annotation::DBLink->new (-primary_id => $id, -version => $version, -database => $db || 'GenBank', -tagname => 'dblink')); } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) { my ($id, $db, $version); if ($2 eq ':') { ($db, $id) = ($1, $3); } else { ($db, $id, $version) = ('GenBank', $1, $3); } $ann->add_Annotation('dblink', Bio::Annotation::DBLink->new( -primary_id => $id, -version => $version, -database => $db, -tagname => 'dblink') ); } else { warn "Unrecognized DBSOURCE data: $dbsource"; } } return 1; } 1; __END__ here\'s an example: PROTEIN 0 HASH(0x439b8a8) 'GBSet' => HASH(0x439c010) 'GBSeq' => HASH(0x43a79c8) 'GBSeq_accession-version' => 'CAA53922.1' 'GBSeq_comment' => 'On Nov 8, 1997 this sequence version replaced gi:443947.' 'GBSeq_create-date' => '18-JAN-1994' 'GBSeq_definition' => 'sonic hedgehog [Mus musculus]' 'GBSeq_division' => 'ROD' 'GBSeq_feature-table' => HASH(0x43abf4c) 'GBFeature' => HASH(0x43b23b4) 'GBFeature_intervals' => HASH(0x43b800c) 'GBInterval' => HASH(0x43b83fc) 'GBInterval_accession' => 'CAA53922.1' 'GBInterval_from' => 1 'GBInterval_to' => 437 'GBFeature_key' => 'CDS' 'GBFeature_location' => '1..437' 'GBFeature_quals' => HASH(0x43b8378) 'GBQualifier' => HASH(0x43baeb0) 'GBQualifier_name' => 'db_xref' 'GBQualifier_value' => 'UniProtKB/Swiss-Prot:Q62226' 'GBSeq_length' => 437 'GBSeq_locus' => 'CAA53922' 'GBSeq_moltype' => 'AA' 'GBSeq_organism' => 'Mus musculus' 'GBSeq_other-seqids' => HASH(0x43ab028) 'GBSeqid' => 'gi|2597988' 'GBSeq_primary-accession' => 'CAA53922' 'GBSeq_references' => HASH(0x43abe80) 'GBReference' => HASH(0x43af1f8) 'GBReference_authors' => HASH(0x43af3e4) 'GBAuthor' => 'McMahon,A.P.' 'GBReference_journal' => 'Submitted (03-NOV-1997) A.P. McMahon, Harvard University, 16 Divinity Ave., Cambridge, MA 02138, USA' 'GBReference_position' => '1..437' 'GBReference_reference' => 3 'GBReference_title' => 'Direct Submission' 'GBSeq_sequence' => 'mllllarcflvilassllvcpglacgpgrgfgkrrhpkkltplaykqfipnvaektlgasgryegkitrnserfkeltpnynpdiifkdeentgadrlmtqrckdklnalaisvmnqwpgvklrvtegwdedghhseeslhyegravdittsdrdrskygmlarlaveagfdwvyyeskahihcsvkaensvaaksggcfpgsatvhleqggtklvkdlrpgdrvlaaddqgrllysdfltfldrdegakkvfyvietleprerllltaahllfvaphndsgptpgpsalfasrvrpgqrvyvvaerggdrrllpaavhsvtlreeeagayapltahgtilinrvlascyavieehswahrafapfrlahallaalapartdgggggsipaaqsateargaeptagihwysqllyhigtwlldsetmhplgmavkss' 'GBSeq_source' => 'Mus musculus (house mouse)' 'GBSeq_source-db' => 'embl accession X76290.1' 'GBSeq_taxonomy' => 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muroidea; Muridae; Murinae; Mus' 'GBSeq_topology' => 'linear' 'GBSeq_update-date' => '04-NOV-1997' NUCLEOTIDE 0 HASH(0x42c1a44) 'GBSet' => HASH(0x42dd728) 'GBSeq' => HASH(0x44bc2c8) 'GBSeq_accession-version' => 'NR_029721.1' 'GBSeq_comment' => 'PROVISIONAL REFSEQ: This record is based on preliminary annotation provided by NCBI staff in collaboration with miRBase. The reference sequence was derived from AL645478.15.; ~Summary: microRNAs (miRNAs) are short (20-24 nt) non-coding RNAs that are involved in post-transcriptional regulation of gene expression in multicellular organisms by affecting both the stability and translation of mRNAs. miRNAs are transcribed by RNA polymerase II as part of capped and polyadenylated primary transcripts (pri-miRNAs) that can be either protein-coding or non-coding. The primary transcript is cleaved by the Drosha ribonuclease III enzyme to produce an approximately 70-nt stem-loop precursor miRNA (pre-miRNA), which is further cleaved by the cytoplasmic Dicer ribonuclease to generate the mature miRNA and antisense miRNA star (miRNA*) products. The mature miRNA is incorporated into a RNA-induced silencing complex (RISC), which recognizes target mRNAs through imperfect base pairing with the miRNA and most commonly results in translational inhibition or destabilization of the target mRNA. The RefSeq represents the predicted microRNA stem-loop. [provided by RefSeq]; ~Sequence Note: This record represents a predicted microRNA stem-loop as defined by miRBase. Some sequence at the 5\' and 3\' ends may not be included in the intermediate precursor miRNA produced by Drosha cleavage.' 'GBSeq_create-date' => '29-OCT-2009' 'GBSeq_definition' => 'Mus musculus microRNA 196a-1 (Mir196a-1), microRNA' 'GBSeq_division' => 'ROD' 'GBSeq_feature-table' => HASH(0x4579f0c) 'GBFeature' => HASH(0x457ab6c) 'GBFeature_intervals' => HASH(0x457fa20) 'GBInterval' => HASH(0x45813d0) 'GBInterval_accession' => 'NR_029721.1' 'GBInterval_from' => 24 'GBInterval_to' => 45 'GBFeature_key' => 'ncRNA' 'GBFeature_location' => '24..45' 'GBFeature_quals' => HASH(0x45813e8) 'GBQualifier' => HASH(0x4581a90) 'GBQualifier_name' => 'db_xref' 'GBQualifier_value' => 'MGI:2676860' 'GBSeq_length' => 102 'GBSeq_locus' => 'NR_029721' 'GBSeq_moltype' => 'ncRNA' 'GBSeq_organism' => 'Mus musculus' 'GBSeq_other-seqids' => HASH(0x456bea8) 'GBSeqid' => 'gi|262205520' 'GBSeq_primary' => 'REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP~1-102 AL645478.15 79764-79865 ' 'GBSeq_primary-accession' => 'NR_029721' 'GBSeq_references' => HASH(0x45744ac) 'GBReference' => HASH(0x457ac20) 'GBReference_authors' => HASH(0x457f36c) 'GBAuthor' => 'Tuschl,T.' 'GBReference_journal' => 'RNA 9 (2), 175-179 (2003)' 'GBReference_position' => '1..102' 'GBReference_pubmed' => 12554859 'GBReference_reference' => 9 'GBReference_title' => 'New microRNAs from mouse and human' 'GBSeq_sequence' => 'tgagccgggactgttgagtgaagtaggtagtttcatgttgttgggcctggctttctgaacacaacgacatcaaaccacctgattcatggcagttactgcttc' 'GBSeq_source' => 'Mus musculus (house mouse)' 'GBSeq_strandedness' => 'single' 'GBSeq_taxonomy' => 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muroidea; Muridae; Murinae; Mus' 'GBSeq_topology' => 'linear' 'GBSeq_update-date' => '06-JAN-2010' bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/FetchAdaptor/species.pm000077500000000000000000000141501302566030400264010ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::FetchAdaptor::species # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::FetchAdaptor::species - Fetch adaptor for 'taxonomy' efetch SOAP messages =head1 SYNOPSIS Imported by L as required. =head1 DESCRIPTION Returns an iterator over L objects: my $fac = Bio::DB::SoapEUtilities->new; my $taxio = $fac->efetch(-db => 'taxonomy', -id => 1394)->run(-auto_adapt=>1); my $sp = $taxio->next_species; $sp->binomial; # returns 'Bacillus caldolyticus' To find out the object type returned: $class = $seqio->obj_class; # $class is 'Bio::Species' as for all L objects. =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::FetchAdaptor::species; use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Species; use base qw(Bio::DB::SoapEUtilities::FetchAdaptor Bio::Root::Root ); sub _initialize { my ($self, @args) = @_; $self->SUPER::_initialize(@args); # my ($builder, $seqfac ) = $self->_rearrange( [qw(SEQBUILDER # SEQFACTORY)], @args ); $self->{'_obj_class'} = 'Bio::Species' ; $self->{'_idx'} = 1; 1; } sub rewind { shift->{'_idx'} = 1 } sub obj_class { shift->{'_obj_class'} } sub next_species { shift->next_obj } sub next_obj { my $self = shift; my $stem = "//TaxaSet/[".$self->{'_idx'}."]"; # my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; my $som = $self->result->som; return unless defined $som->valueof($stem); my $get = sub { $som->valueof("$stem/".shift) }; my $toplev = $som->valueof("$stem"); my $get_tl = sub { $toplev->{ shift @_ } }; my $sp = _read_species($get_tl); $self->warn("FetchAdaptor::species - parse error, no Bio::Species returned") unless $sp; ($self->{_idx})++; return $sp; } 1; # mostly ripped from Bio::SeqIO::genbank... sub _read_species { my ($get) = @_; my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown', 'Unspecified', 'Unknown', 'None', 'unclassified', 'unidentified organism', 'not supplied'); # dictionary of synonyms for taxid 32644 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified'); # all above can be part of valid species name my( $sub_species, $species, $genus, $sci_name, $common, $abbr_name, $organelle); $sci_name = $get->('ScientificName') || return; # no "source" elt like gb format./maj # parse out organelle, common name, abbreviated name if present; # this should catch everything, but falls back to # entire GBSeq_taxonomy element just in case # if ($get->('source') =~ m{^ # (mitochondrion|chloroplast|plastid)? # \s*(.*?) # \s*(?: \( (.*?) \) )?\.? # $}xms ) { # ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional # } else { # $abbr_name = $get->('source'); # nothing caught; this is a backup! # } # # Convert data in classification lines into classification array. my @class = split(/; /, $get->('Lineage')); # do we have a genus? my $possible_genus = quotemeta($class[-1]) . ($class[-2] ? "|" . quotemeta($class[-2]) : ''); if ($sci_name =~ /^($possible_genus)/) { $genus = $1; ($species) = $sci_name =~ /^$genus\s+(.+)/; } else { $species = $sci_name; } # is this organism of rank species or is it lower? # (we don't catch everything lower than species, but it doesn't matter - # this is just so we abide by previous behaviour whilst not calling a # species a subspecies) if ($species && $species =~ /subsp\.|var\./) { ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/; } # Don't make a species object if it's empty or "Unknown" or "None" # return unless $genus and $genus !~ /^(Unknown|None)$/oi; # Don't make a species object if it belongs to taxid 32644 my $src = $get->('ScientificName'); return unless ($species || $genus) and !grep { $_ eq $src } @unkn_names; # Bio::Species array needs array in Species -> Kingdom direction push(@class, $sci_name); @class = reverse @class; my $make = Bio::Species->new(); $make->scientific_name($sci_name); $make->classification(@class) if @class > 0; $make->common_name( $get->('CommonName')); $make->name('abbreviated', $abbr_name) if $abbr_name; $make->organelle($organelle) if $organelle; $make->ncbi_taxid( $get->('TaxId') ); $make->division( $get->('Division') ); return $make; } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/GQueryAdaptor.pm000077500000000000000000000137551302566030400251430ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::GQueryAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::GQueryAdaptor - Handle for Entrez SOAP GlobalQuery items =head1 SYNOPSIS my $fac = Bio::DB::SoapEUtilities->new(); # run a query, returning a GQueryAdaptor my $queries = $fac->egquery( -term => 'BRCA and human' )->run(-auto_adapt=>1); # all databases with hits my @dbs = $queries->found_in_dbs; # queries by database my $prot_count = $queries->query_by_db('prot')->count; # iterate over gquery while ( my $q = $queries->next_query ) { my $db = $q->db; my $count = $q->count; my $status = $q->status; } =head1 DESCRIPTION This adaptor provides an iterator (C) and other convenience functions for parsing NCBI Entrez EUtility C SOAP results. =head1 SEE ALSO L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::GQueryAdaptor; use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::GQueryAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::GQueryAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::GQueryAdaptor Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($result) = $self->_rearrange([qw(RESULT)], @args); $self->throw("GQueryAdaptor requires a SoapEUtilities::Result argument") unless $result; $self->throw("GQueryAdaptor only works with egquery results") unless $result->util eq 'egquery'; $self->{'_result'} = $result; $self->{'_query_by_db'} = {}; $self->{'_idx'} = 1; return $self; } sub result { shift->{'_result'} } =head2 next_query() Title : next_query Usage : Function: return the next global query from the attached Result Returns : Args : =cut sub next_query { my $self = shift; # my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; # not consistent, kludge as follows: my $stem = "//eGQueryResult/[".$self->{'_idx'}."]"; my $som = $self->result->som; return unless $som->valueof($stem); my ($ret, %params); my $get = sub { $som->valueof("$stem/".shift) }; my $toplev = $get->(''); my $get_tl = sub { $toplev->{ shift @_ } }; $params{'-term'} = $som->valueof("//Term"); my $names = []; $params{'-count'} = $get_tl->('Count'); $params{'-db'} = $get_tl->('DbName'); $params{'-status'} = $get_tl->('Status'); my $class = ref($self)."::gquery"; $ret = $class->new(%params); $self->{_query_by_db}->{$params{'-db'}} = $ret; ($self->{'_idx'})++; return $ret; } sub next_obj { shift->next_query(@_) } sub rewind { shift->{'_idx'} = 1; }; =head2 found_in_dbs() Title : found_in_dbs Usage : Function: Return list of db names containing hits for the query term Returns : array of scalar strings Args : none =cut sub found_in_dbs { my $self = shift; return @{$self->{'_found_in_dbs'}} if $self->{'_found_in_dbs'}; my $som = $self->result->som; $self->{'_found_in_dbs'} = []; foreach ($som->valueof("//eGQueryResult/*")) { push @{$self->{'_found_in_dbs'}}, $_->{'DbName'} if $_->{'Count'}; } return @{$self->{'_found_in_dbs'}}; } =head2 query_by_db() Title : query_by_db Usage : Function: get gquery object by db name Returns : Args : db name (scalar string) =cut sub query_by_db { my $self = shift; my $db = shift; $self->throw("db must be specified") unless $db; return $self->{_query_by_db}->{$db} if $self->{_query_by_db}->{$db}; my $som = $self->result->som; my $i; for ($i = 1; my $val = $som->valueof("//eGQueryResult/[$i]/DbName"); $i++) { last if $val eq $db; } my $curidx = $self->{_idx}; my $query; { local $self->{_idx} = $i; $query = $self->next_query; } return $query; } 1; #### package Bio::DB::SoapEUtilities::GQueryAdaptor::gquery; use strict; use warnings; use base qw(Bio::Root::Root); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my %args = @args; $self->_set_from_args( \%args, -methods => [map { /^-?(.*)/ } keys %args], -create => 1, -code => 'my $self = shift; my $d = shift; my $k = \'_\'.$method; $self->{$k} = $d if $d; return (ref $self->{$k} eq \'ARRAY\') ? @{$self->{$k}} : $self->{$k};' ); return $self; } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/LinkAdaptor.pm000077500000000000000000000134711302566030400246170ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::LinkAdaptor # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::LinkAdaptor - Handle for Entrez SOAP LinkSets =head1 SYNOPSIS my $fac = Bio::DB::SoapEUtilities->new(); # run a query, returning a LinkAdaptor $fac->elink( -db => 'nucleotide', -dbfrom => 'protein', -id => [qw(828392 790 470338)]); my $links = $fac->elink->run( -auto_adapt => 1); # get the linked ids corresponding to the submitted ids # (may be arrays if multiple crossrefs, or undef if none) my @nucids = $links->id_map(828392); # iterate over linksets while ( my $ls = $links->next_linkset ) { my @from_ids = $ls->submitted_ids; my @to_ids = $ls->ids; my $from_db = $ls->db_from; my $to_db = $ls->db_to; } =head1 DESCRIPTION This adaptor provides an iterator (C) and other convenience functions for parsing NCBI Entrez EUtility C SOAP results. =head1 SEE ALSO L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::LinkAdaptor; use strict; use warnings; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use base qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Bio::DB::SoapEUtilities::LinkAdaptor(); Function: Builds a new Bio::DB::SoapEUtilities::LinkAdaptor object Returns : an instance of Bio::DB::SoapEUtilities::LinkAdaptor Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($result) = $self->_rearrange([qw(RESULT)], @args); $self->throw("LinkAdaptor requires a SoapEUtilities::Result argument") unless $result; $self->throw("LinkAdaptor only works with elink results") unless $result->util eq 'elink'; $self->{'_result'} = $result; $self->{'_idx'} = 1; return $self; } sub result { shift->{'_result'} } =head2 next_linkset() Title : next_linkset Usage : Function: return the next LinkSet from the attached Result Returns : Args : =cut sub next_linkset { my $self = shift; my $stem = "//Body/".$self->result->result_type."/[".$self->{'_idx'}."]"; return unless $self->result->som and $self->result->som->valueof($stem); my $som = $self->result->som; my ($ret, %params); my $get = sub { $som->valueof("$stem/".shift) }; $params{'-db_from'} = $get->('DbFrom'); $params{'-db_to'} = $get->('LinkSetDb/DbTo'); $params{'-link_name'} = $get->('LinkSetDb/LinkName'); $params{'-submitted_ids'} = [$get->('IdList/*')]; $params{'-ids'} = [$get->('LinkSetDb/Link/*')]; $params{'-webenv'} = $get->('WebEnv'); my $class = ref($self)."::linkset"; $ret = $class->new(%params); ($self->{'_idx'})++; return $ret; } sub next_obj { shift->next_linkset(@_) } sub rewind { shift->{'_idx'} = 1; }; =head2 id_map() Title : id_map Usage : $to_id = $adaptor->id_map($from_id) Function: Return 'to-database' ids corresponding to given specified 'from-database' or submitted ids Returns : array of scalars (to-database ids or arrayrefs of ids) Args : array of scalars (from-database ids) =cut sub id_map { my $self = shift; my @from_ids = @_; my $som = $self->result->som; my $stem = "//Body/".$self->result->result_type."/"; if (!defined $self->{'_id_map'}) { my $h = {}; for (my $i=1; $som->valueof($stem."[$i]"); $i++) { # note this assumes that in the elink query, # ids were provided individually (not as a comma-sep # list). This is the standard behavior for elink # in SoapEUtilities. my @to_ids = $som->valueof($stem."[$i]/LinkSetDb/Link/*"); $$h{$som->valueof($stem."[$i]/IdList/[1]")} = (@to_ids == 1 ? $to_ids[0] : \@to_ids); } $self->{'_id_map'} = $h; } return @{$self->{'_id_map'}}{@from_ids}; } package Bio::DB::SoapEUtilities::LinkAdaptor::linkset; use strict; use warnings; use base qw(Bio::Root::Root); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my %args = @args; $self->_set_from_args( \%args, -methods => [map { /^-?(.*)/ } keys %args], -create => 1, -code => 'my $self = shift; my $d = shift; my $k = \'_\'.$method; $self->{$k} = $d if $d; return (ref $self->{$k} eq \'ARRAY\') ? @{$self->{$k}} : $self->{$k};' ); return $self; } 1; bioperl-run-release-1-7-1/lib/Bio/DB/SoapEUtilities/Result.pm000077500000000000000000000302661302566030400236660ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::DB::SoapEUtilities::Result # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::DB::SoapEUtilities::Result - Accessor object for SoapEUtilities results =head1 SYNOPSIS $fac = Bio::DB::SoapEUtilities->new(); $result = $fac->esearch( -db => 'gene', -term => 'hedgehog')->run; $count = $result->count; # case important; $result->Count could be arrayref @ids = $result->ids; =head1 DESCRIPTION This module attempts to make Entrez Utilities SOAP responses as user-friendly and intuitive as possible. These responses can be complex structures with much useful data; but users will generally desire the values of some key fields. The L object provides access to all response values via systematically named accessor methods, and commonly used values as convenience methods. The 'raw' SOAP message (a L object as returned by L) is also provided. =over =item Convenience accessors If a list of record ids is returned by the call, C will return these as an array reference: @seq_ids = $result->ids; The total count of returned records is provided by C: $num_recs = $result->count; If C was specified in the SOAP call, the NCBI-assigned web environment (that can be used in future calls) is available in C, and the query key assigned to the result in C: $next_result = $fac->efetch( -WebEnv => $result->webenv, -QueryKey => $result->query_key ); =item Walking the response This module uses C to provide accessor methods for all response data. Here is an example of a SOAP response as returned by a C call off the L object: DB<5> x $result->som->method 0 HASH(0x2eac9a4) 'Count' => 148 'IdList' => HASH(0x4139578) 'Id' => 100136227 'QueryKey' => 1 'QueryTranslation' => 'sonic[All Fields] AND hedgehog[All Fields]' 'RetMax' => 20 'RetStart' => 0 'TranslationSet' => '' 'TranslationStack' => HASH(0x4237b4c) 'OP' => 'GROUP' 'TermSet' => HASH(0x42c43bc) 'Count' => 2157 'Explode' => 'Y' 'Field' => 'All Fields' 'Term' => 'hedgehog[All Fields]' 'WebEnv' => 'NCID_1_150423569_130.14.22.101_9001_1262703782' Some of the data values here (at the tips of the data structure) are actually arrays of values ( e.g., the tip C Id> ), other tips are simple scalars. With this in mind, C accessor methods work as follows: Data values (at the tips of the response structure) are acquired by calling a method with the structure keys separated by underscores (if necessary): $query_key = $result->QueryKey; # $query_key == 1 $ids = $result->IdList_Id; # @$ids is an array of record ids Data I below a particular node in the response structure can also be obtained with similarly constructed method names. These 'internal node accessors' return a hashref, containing all data leaves below the node, keyed by the accessor names: $data_hash = $result->TranslationStack DB<3> x $data_hash 0 HASH(0x43569d4) 'TranslationStack_OP' => ARRAY(0x42d9988) 0 'AND' 1 'GROUP' 'TranslationStack_TermSet_Count' => ARRAY(0x4369c64) 0 148 1 148 2 2157 'TranslationStack_TermSet_Explode' => ARRAY(0x4368998) 0 'Y' 1 'Y' 'TranslationStack_TermSet_Field' => ARRAY(0x4368260) 0 'All Fields' 1 'All Fields' 'TranslationStack_TermSet_Term' => ARRAY(0x436c97c) 0 'sonic[All Fields]' 1 'hedgehog[All Fields]' Similarly, the call C< $result->TranslationStack_TermSet > would return a similar hash containing the last 4 elements of the example hash above. Creating accessors is somewhat costly, especially for fetch responses which can be deep and complex (not unlike BioPerl developers). Portions of the response tree can be ignored by setting C<-prune_at_node> to a arrayref of nodes to skip. Nodes should be specified in L format, e.g. ...::Result->new( -prune_at_nodes => ['//GBSeq_references'] ); Accessor creation can be skipped altogether by passing C<-no_parse => 1> to the C constructor. This is recommended if a result is being passed to a L. The original SOAP message with all data is always available in C<$result->som>. =back =over Other methods =item accessors() An array of available data accessor names. This contains only the data "tips". The internal node accessors are autoloaded. =item ok() True if no SOAP fault. =item errstr() Returns the SOAP fault error string. =item som() The original C message. =item util() The EUtility associated with the result. =back =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::DB::SoapEUtilities::Result; use strict; use warnings; use Bio::Root::Root; use base qw(Bio::Root::Root ); our $AUTOLOAD; our %ID_LIST_ELT = ( esearch => 'IdList_Id', esummary => 'DocSum_Id', elink => 'LinkSet_IdList_Id' ); # an object of accessors sub new { my $class = shift; my @args = @_; my $self = $class->SUPER::new(@args); my $eutil_obj = shift @args; my ($alias_hash, $prune_at_nodes, $no_parse, $make_index) = $self->_rearrange( [qw( ALIAS_HASH PRUNE_AT_NODES NO_PARSE INDEX_ACCESSORS ) ], @args); $self->throw("Result constructor requires Bio::DB::SoapEUtilities ". "argument") unless ($eutil_obj and ref($eutil_obj) eq 'Bio::DB::SoapEUtilities'); $alias_hash ||= {}; $$alias_hash{ 'ids' } = ($ID_LIST_ELT{$eutil_obj->_caller_util} || 'IdList_Id'); if ($prune_at_nodes) { $prune_at_nodes = [$prune_at_nodes] unless ref $prune_at_nodes; } $self->{'_util'} = $eutil_obj->_caller_util; my $som = $self->{'_som'} = $eutil_obj->last_result; return unless ( $som and ref($som) eq 'SOAP::SOM' ); return $self unless $self->ok; # SOAP fault $self->{'_result_type'} = $eutil_obj->_soap_facs($self->util)->_result_elt_name; $self->{'_accessors'} = []; $self->{'_WebEnv'} = $som->valueof("//WebEnv"); $self->{'_QueryKey'} = $som->valueof("//QueryKey"); $self->{'_fetch_type'} = $eutil_obj->_soap_facs($self->util)->_wsdl->db; $self->{'_fetch_db'} = ($self->util eq 'efetch' ? $eutil_obj->_soap_facs($self->util)->db : undef); return ($no_parse ? $self : $self->parse_methods($alias_hash, $prune_at_nodes)); } =head2 parse_methods() Title : parse_methods Usage : Function: parse out the accessor methods Returns : self (Result object) Args : $alias_hash (hashref), $prune_at_nodes (scalar or arrayref) =cut sub parse_methods { my $self = shift; # parse message into accessors my ($alias_hash, $prune_at_nodes) = @_; my @methods = keys %{$self->som->method}; my %methods; foreach my $m (@methods) { _traverse_methods($m, '/', '', $self->som, \%methods, $self->{'_accessors'}, $prune_at_nodes); } # convenience aliases... if ($alias_hash && ref($alias_hash) eq 'HASH') { for (keys %$alias_hash) { if ($methods{ $$alias_hash{$_} }) { # avoid undef'd accessors $methods{$_} = $methods{ $$alias_hash{$_} }; push @{$self->{_accessors}}, $_; } } } # specials... if ($methods{Count}) { push @{$self->{'_accessors'}}, 'count'; for (ref $methods{Count}) { /^$/ && do { $methods{count} = $methods{Count}; last; }; /ARRAY/ && do { $methods{count} = $methods{Count}->[0]; last; }; } } else { #work harder my @toplev = keys %{$self->som->method}; my ($set) = grep /^.*?S(et|um)$/, @toplev; if ($set) { $methods{count} = 0; # kludge out NCBI inconsistencies my $stem = ($set =~ /(?:DocSum|LinkSet)/ ? "//Body/".$self->result_type."/*" : "//$set/*"); foreach ($self->som->valueof($stem)) { $methods{count}++; } } push @{$self->{'_accessors'}}, 'count'; } $self->_set_from_args( \%methods, -methods => $self->{'_accessors'}, -case_sensitive => 1, -create => 1 ); return $self; } =head2 util() Title : util Usage : Function: Name of the utility producing this result object. Returns : scalar string Args : =cut sub util { shift->{'_util'} } =head2 som() Title : som Usage : Function: get the original SOAP::SOM object Returns : a SOAP::SOM object Args : none =cut sub som { shift->{'_som'} } =head2 ok() Title : ok Usage : Function: Returns : true if no SOAP fault Args : =cut sub ok { !(shift->som->fault) } =head2 errstr() Title : errstr Usage : Function: Returns : fault string of SOAP object Args : none =cut sub errstr { shift->som->faultstring } =head2 accessors() Title : accessors Usage : Function: get the list of created accessors for this result Returns : array of scalar strings Args : none Note : does not include valid AUTOLOADed accessors; see DESCRIPTION =cut sub accessors { my $a = shift->{'_accessors'} ; @$a if $a } =head2 webenv() Title : webenv Usage : Function: contains WebEnv key referencing this result's session Returns : scalar Args : none =cut sub webenv { shift->{'_WebEnv'} } =head2 query_key()() Title : query_key() Usage : Function: contains the web query key assigned to this result Returns : scalar Args : =cut sub query_key { shift->{'_QueryKey'} } =head2 fetch_type() Title : fetch_type Usage : Function: Get the efetch database name according to WSDL Returns : scalar string (db name) or undef if N/A Args : none =cut sub fetch_type { shift->{'_fetch_type'} } sub fetch_db { shift->{'_fetch_db'} } sub result_type { shift->{'_result_type'} } sub _traverse_methods { my ($m, $skey, $key, $som, $hash, $acc, $prune) = @_; if ($prune) { foreach (@$prune) { return if "$skey\/$m" =~ /^$_/; } } my $val = $som->valueof("$skey\/$m"); for (ref $val) { /^$/ && do { my @a = $som->valueof("$skey\/$m"); my $M = $m; # camelcase it $M =~ s/([-_])([a-zA-Z0-9])/\u$2/g; my $k = ($key ? "$key\_" : "").$M; push @{$acc}, $k; if (@a == 1) { $$hash{$k} = $a[0]; } else { $$hash{$k} = \@a; } return; }; /HASH/ && do { foreach my $k (keys %$val) { my $M = $m; # camelcase it $M =~ s/([-_])([a-zA-Z0-9])/\u$2/g; _traverse_methods( $k, "$skey\/$m", ($key ? "$key\_" : "").$M, $som, $hash, $acc, $prune ); } return; }; do { #else, huh? Bio::Root::Root->throw("SOAP::SOM parse error : please contact the mailing list"); }; } } sub AUTOLOAD { my $self = shift; my $accessor = $AUTOLOAD; $accessor =~ s/.*:://; my @list = grep /^${accessor}_/, @{$self->{'_accessors'}}; unless (@list) { $self->debug("Accessor '$accessor' not present in this result"); return; } my %ret; foreach (@list) { $ret{$_} = $self->$_; } return \%ret; } 1; bioperl-run-release-1-7-1/lib/Bio/Factory/000077500000000000000000000000001302566030400202575ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Factory/EMBOSS.pm000066400000000000000000000165011302566030400216100ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Factory::EMBOSS # # Please direct questions and support issues to # # Cared for by Heikki Lehvaslaiho # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Factory::EMBOSS - EMBOSS application factory class =head1 SYNOPSIS # get an EMBOSS factory use Bio::Factory::EMBOSS; $f = Bio::Factory::EMBOSS -> new(); # get an EMBOSS application object from the factory $water = $f->program('water') || die "Program not found!\n"; # here is an example of running the application - # water can compare 1 sequence against 1 or more sequences # in a database using Smith-Waterman my $seq_to_test; # this would have a seq here my @seqs_to_check; # this would be a list of seqs to compare # (could be just 1) my $wateroutfile = 'out.water'; $water->run({-sequences => $seq_to_test, -seqall => \@seqs_to_check, -gapopen => '10.0', -gapextend => '0.5', -outfile => $wateroutfile}); # now you might want to get the alignment use Bio::AlignIO; my $alnin = Bio::AlignIO->new(-format => 'emboss', -file => $wateroutfile); while ( my $aln = $alnin->next_aln ) { # process the alignment -- these will be Bio::SimpleAlign objects } =head1 DESCRIPTION The EMBOSS factory class encapsulates access to EMBOSS programs. A factory object allows creation of only known applications. If you want to check command line options before sending them to the program set $prog-Everbose to positive integer. The value is passed on to programs objects and the ADC description of the available command line options is parsed and compared to input. See also L and L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing lists Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Heikki Lehvaslaiho Email heikki-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Factory::EMBOSS; use vars qw(@ISA $EMBOSSVERSION); use strict; use Bio::Root::Root; use Bio::Tools::Run::EMBOSSApplication; use Bio::Factory::ApplicationFactoryI; @ISA = qw(Bio::Root::Root Bio::Factory::ApplicationFactoryI ); $EMBOSSVERSION = "2.0.0"; sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); # set up defaults my($location) = $self->_rearrange([qw(LOCATION )], @args); $self->{ '_programs' } = {}; $self->{ '_programgroup' } = {}; $self->{ '_groups' } = {}; $self->location($location) if $location; $self->_program_list; # retrieve info about available programs return $self; } =head2 location Title : location Usage : $embossfactory->location Function: get/set the location of EMBOSS programs. Valid values are 'local' and 'novella'. Returns : string, defaults to 'local' Args : string =cut sub location { my ($self, $value) = @_; my %location = ('local' => '1', 'novella' => '1' ); if (defined $value) { $value = lc $value; if ($location{$value}) { $self->{'_location'} = $value; } else { $self->warn("Value [$value] not a valid value for ". "location(). Defaulting to [local]"); $self->{'_location'} = 'local'; } } $self->{'_location'} ||= 'local'; return $self->{'_location'}; } =head2 program Title : program Usage : $embossfactory->program('program_name') Function: Creates a representation of a single EMBOSS program and issues a warning if the program was not found. Returns : Bio::Tools::Run::EMBOSSApplication object or undef Args : string, program name =cut sub program { my ($self, $value) = @_; unless( $self->{'_programs'}->{$value} ) { $self->warn("Application [$value] is not available!"); return undef; } my $attr = {}; $attr->{name} = $value; $attr->{verbose} = $self->verbose; my $appl = Bio::Tools::Run::EMBOSSApplication->new($attr); return $appl; } =head2 version Title : $self->version Usage : $embossfactory->version() Function: gets the version of EMBOSS programs Throws : if EMBOSS suite is not accessible Returns : version value Args : None =cut sub version { my ($self) = @_; my $version = `embossversion -auto`; $self->throw("EMBOSS suite of programs is not available") if $?; chop $version; # compare versions $self->throw("EMBOSS has to be at least version $EMBOSSVERSION got $version\n") if $version lt $EMBOSSVERSION; return $version; } =head2 Programs These methods allow the programmer to query the EMBOSS suite and find out which program names can be used and what arguments can be used. =head2 program_info Title : program_info Usage : $embossfactory->program_info('emma') Function: Finds out if the program is available. Returns : definition string of the program, undef if program name not known Args : string, prgramname =cut sub program_info { my ($self, $value) = @_; return $self->{'_programs'}->{$value}; } =head2 Internal methods Do not call these methods directly =head2 _program_list Title : _program_list Usage : $embossfactory->_program_list() Function: Finds out what programs are available. Writes the names into an internal hash. Returns : true if successful Args : None =cut sub _program_list { my ($self) = @_; if( $^O =~ /Mac/i ) { return; } { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; local * SAVERR; open SAVERR, ">&STDERR"; open STDERR, ">$null"; open(WOSSOUT, "wossname -auto |") || return; open STDERR, ">&SAVERR"; } local $/ = "\n\n"; while( ) { my ($groupname) = (/^([A-Z][A-Z0-9 ]+)$/m); #print $groupname, "\n" if $groupname; $self->{'_groups'}->{$groupname} = [] if $groupname; while ( /^([a-z]\w+) +(.+)$/mg ) { #print "$1\t$2 \n" if $1; $self->{'_programs'}->{$1} = $2 if $1; $self->{'_programgroup'}->{$1} = $groupname if $1; push @{$self->{'_groups'}->{$groupname}}, $1 if $1; } } close(WOSSOUT); } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/000077500000000000000000000000001302566030400206055ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Installer/Clustalw.pm000066400000000000000000000070461302566030400227500ustar00rootroot00000000000000 # BioPerl module for Bio::Installer::Clustalw # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::Clustalw - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::Clustalw; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'ftp://ftp.ebi.ac.uk/pub/software/unix/clustalw', 'BIN_FOLDER' => '', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "$ENV{'HOME'}", 'PACKAGE_NAME' => 'clustalw1.83.UNIX.tar.gz', 'DIRECTORY_NAME' => 'clustalw1.83', 'ENV_NAME' => 'CLUSTALDIR', ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; my $dir; $self->_decompress; $self->_execute_make; $dir = $self->destination_install_dir; $self->_remember_env; } =head2 _execute_make Title : _execute_make Usage : Function: Example : Returns : Args : =cut sub _execute_make{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir . "/" . $self->directory_name; print "\n\nCompiling with make -- (this might take a while)\n\n";sleep 1; if (($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nCalling make (this might take a while)\n\n";sleep 1; $call = "make"; system("$call") == 0 or $self->throw("Error when trying to run make"); } else { $self->throw("_execute_make not yet implemented in this platform"); } } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/EMBOSS.pm000066400000000000000000000113541302566030400221370ustar00rootroot00000000000000 # BioPerl module for Bio::Installer::EMBOSS # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::EMBOSS - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::EMBOSS; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'ftp://ftp.uk.embnet.org/pub/EMBOSS/', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "/usr/local", 'PACKAGE_NAME' => 'EMBOSS-latest.tar.gz', 'DIRECTORY_NAME' => 'EMBOSS-2.9.0' ); } #FIXME: regexp directory_name so that is not hardcoded =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; $self->_decompress; $self->_execute_typical_nix_install; $self->_remember_env_is_in_path; } =head2 _execute_typical_nix_install Title : _execute_typical_nix_install Usage : Function: Example : Returns : Args : =cut sub _execute_typical_nix_install{ my ($self,@args) = @_; my $call; #FIXME: regexp directory_name so that is not hardcoded, after EMBOSS-latest is downloaded my $destination = $self->destination_install_dir . "/" . $self->directory_name; print "\n\nTypical linux install -- configure -- make -- make install (this might take a while)\n\n";sleep 1; if (($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nCalling configure (this might take a while)\n\n";sleep 1; $call = "./configure --prefix=$destination"; system("$call") == 0 or $self->throw("Error when trying to run configure"); print "\n\nCalling make (this might take a while)\n\n";sleep 1; $call = "make"; system("$call") == 0 or $self->throw("Error when trying to run make"); print "\n\nCalling make install (this might take a while)\n\n";sleep 1; if (-w "$destination") { $call = "make install"; } else { print "\n\n You may need administrative password to install\n" ."this program so that bioperl can see it in the binary PATH\n"; $call = "su -c \"make install\""; } system("$call") == 0 or $self->throw("Error when trying to run make install"); } else { $self->throw("_execute_typical_nix not yet implemented in this platform"); } } =head2 _remember_env_is_in_path Title : _remember_env_is_in_path Usage : Function: Example : Returns : Args : =cut sub _remember_env_is_in_path{ my ($self,@args) = @_; my $dir; $dir = $self->destination_install_dir; unless ($dir =~ m|/usr/local|) { print STDERR < # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::Generic - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::Generic; use vars qw(@ISA); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; # Download file use LWP; use HTTP::Request::Common; @ISA = qw(Bio::Root::Root ); =head2 new Title : new Usage : my $obj = new Generic(); Function: Builds a new Generic object Returns : an instance of Generic Args : -origin_download_dir => from where is going to be downloaded -destination_download_dir => where is going to be saved -destination_install_dir => where is going to be installed -package_name => name of the package to be downloaded -directory_name => name of the directory once has been decompressed =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($origin_download_dir, $destination_download_dir, $destination_install_dir, $package_name, $directory_name) = $self->_rearrange( [qw(ORIGIN_DOWNLOAD_DIR DESTINATION_DOWNLOAD_DIR DESTINATION_INSTALL_DIR PACKAGE_NAME DIRECTORY_NAME)], @args); defined $origin_download_dir && $self->origin_download_dir($origin_download_dir); defined $destination_download_dir && $self->destination_download_dir($destination_download_dir); defined $destination_install_dir && $self->destination_install_dir($destination_install_dir); defined $package_name && $self->package_name($package_name); defined $directory_name && $self->directory_name($directory_name); return $self; } =head2 origin_download_dir Title : origin_download_dir Usage : $obj->origin_download_dir($newval) Function: Example : Returns : value of origin_download_dir (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub origin_download_dir{ my $self = shift; return $self->{'origin_download_dir'} = shift if @_; return $self->{'origin_download_dir'} || $self->get_default('ORIGIN_DOWNLOAD_DIR'); } =head2 destination_download_dir Title : destination_download_dir Usage : $obj->destination_download_dir($newval) Function: Example : Returns : value of destination_download_dir (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub destination_download_dir{ my $self = shift; return $self->{'destination_download_dir'} = shift if @_; return $self->{'destination_download_dir'} || $self->get_default('DESTINATION_DOWNLOAD_DIR'); } =head2 destination_install_dir Title : destination_install_dir Usage : $obj->destination_install_dir($newval) Function: Example : Returns : value of destination_install_dir (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub destination_install_dir{ my $self = shift; return $self->{'destination_install_dir'} = shift if @_; return $self->{'destination_install_dir'} || $self->get_default('DESTINATION_INSTALL_DIR'); } =head2 package_name Title : package_name Usage : $obj->package_name($newval) Function: Example : Returns : value of package_name (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub package_name{ my $self = shift; return $self->{'package_name'} = shift if @_; return $self->{'package_name'} || $self->get_default('PACKAGE_NAME'); } =head2 directory_name Title : directory_name Usage : $obj->directory_name($newval) Function: Example : Returns : value of directory_name (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub directory_name{ my $self = shift; return $self->{'directory_name'} = shift if @_; return $self->{'directory_name'} || $self->get_default('DIRECTORY_NAME'); } =head2 env_name Title : env_name Usage : Function: Example : Returns : Args : =cut sub env_name{ my $self = shift; return $self->{'env_name'} = shift if @_; return $self->{'env_name'} || $self->get_default('ENV_NAME'); } =head2 _remember_env Title : _remember_env Usage : Function: Example : Returns : Args : =cut sub _remember_env{ my ($self,@args) = @_; my $dir; $dir = $self->destination_install_dir; $dir =~ s|/$||; $dir .= "/" . $self->directory_name . "/" . $self->get_default('BIN_FOLDER'); my $env_name = $self->env_name; print STDERR <destination_download_dir . "/" . $self->package_name;; my $destination_install_dir = $self->destination_install_dir; if (($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { $call = "tar xzvf $destination --directory=$destination_install_dir"; system("$call") == 0 or $self->throw("Error when trying to decompress package"); $call = "rm -f $destination"; system("$call") == 0 or $self->throw("Error when trying to delete compressed package"); } else { $self->throw("_decompress not yet implemented in this platform"); } } =head2 download Title : download Usage : $installer->download(); Function: Example : Returns : Args : =cut sub download{ my ($self,@args) = @_; print "\n\nDownloading package...(this might take a while)\n\n";sleep 1; my $file = $self->origin_download_dir; $file =~ s|/$||; $file .= "/" . $self->package_name; my $destination = $self->destination_download_dir; $destination =~ s|/$||; $destination .= "/" . $self->package_name;; my $ua = LWP::UserAgent->new; my $response = $ua->request( GET($file), $destination ); die "Error at $file\n ", $response->status_line, "\n Aborting" unless $response->is_success; print "Package successfully downloaded at $destination.\n"; return $self; } =head2 uninstall Title : uninstall Usage : $installer->uninstall(); Function: Example : Returns : Args : =cut sub uninstall{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir; $destination =~ s|/$||; $destination .= "/" . $self->directory_name; print "\n\nUninstalling now: this will delete the installed program\n\n"; if (($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { $call = "rm -rf $destination"; system("$call") == 0 or die "Error when trying to delete installed program $?\n"; } else { $self->throw("uninstall not yet implemented in this platform"); } return $self; } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/Hyphy.pm000066400000000000000000000065751302566030400222610ustar00rootroot00000000000000 # BioPerl module for Bio::Installer::Hyphy # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::Hyphy - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.shtml - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::Hyphy; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'http://www.hyphy.org/current', 'BIN_FOLDER' => '', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "$ENV{'HOME'}", 'PACKAGE_NAME' => 'HYPHY_Source.tgz', 'DIRECTORY_NAME' => 'HYPHY_Source', 'ENV_NAME' => 'HYPHYDIR', ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; my $dir; $self->_decompress; $self->_execute_Hyphy_install_script; $dir = $self->destination_install_dir; $self->_remember_env; } =head2 _execute_Hyphy_install_script Title : _execute_Hyphy_install_script Usage : Function: Example : Returns : Args : =cut sub _execute_Hyphy_install_script{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir; $destination =~ s|/$||; $destination .= "/" . $self->directory_name; chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nCompiling now... (this might take a while)\n\n"; $call = "sh build.sh MP"; system("$call") == 0 or die "Error when trying to run install script $?\n"; } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/Muscle.pm000066400000000000000000000066711302566030400224050ustar00rootroot00000000000000 # BioPerl module for Bio::Installer::Muscle # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::Muscle - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.shtml - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::Muscle; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'http://igs-server.cnrs-mrs.fr/~cnotred/Packages', 'BIN_FOLDER' => 'bin', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "$ENV{'HOME'}", 'PACKAGE_NAME' => 'T-COFFEE_distribution.tar.gz', 'DIRECTORY_NAME' => 'T-COFFEE_distribution_Version_1.37', 'ENV_NAME' => 'MUSCLEDIR', ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; my $dir; $self->_decompress; $self->_execute_Muscle_install_script; $dir = $self->destination_install_dir; $self->_remember_env; } =head2 _execute_Muscle_install_script Title : _execute_Muscle_install_script Usage : Function: Example : Returns : Args : =cut sub _execute_Muscle_install_script{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir; $destination =~ s|/$||; $destination .= "/" . $self->directory_name; chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nCompiling now... (this might take a while)\n\n"; $call = "sh install"; system("$call") == 0 or die "Error when trying to run install script $?\n"; } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/PAML.pm000066400000000000000000000121441302566030400216760ustar00rootroot00000000000000# BioPerl module for Bio::Installer::PAML # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::PAML - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::PAML; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'http://abacus.gene.ucl.ac.uk/software/', 'BIN_FOLDER' => 'src', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "$ENV{'HOME'}", 'PACKAGE_NAME' => 'paml4a.tar.gz', 'DIRECTORY_NAME' => 'paml4', 'ENV_NAME' => 'PAMLDIR', ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; $self->_decompress; $self->_tweak_paml_makefile; $self->_execute_paml_makefile; $self->_remember_env; } =head2 _execute_paml_makefile Title : _execute_paml_makefile Usage : Function: Example : Returns : Args : =cut sub _execute_paml_makefile{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir . "/" . $self->directory_name . "/src"; chdir $destination or $self->throw("Cant cd to $destination"); print "\n\nCompiling now... (this might take a while)\n\n"; if (($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { $call = "make"; system("$call") == 0 or $self->throw("Error when trying to run make"); } else { $self->throw("_execute_paml_makefile not yet implemented in this platform"); } } =head2 _tweak_paml_makefile Title : _tweak_paml_makefile Usage : Function: Example : Returns : Args : =cut sub _tweak_paml_makefile{ my ($self,@args) = @_; my $gcc3; my $return = qx/gcc -v 2>&1/; if ($return =~ /version 3/) { $return = `cat /proc/cpuinfo`; if( ($return =~ /mmx/) || ($return =~ /3dnow/)) { my $destination = $self->destination_install_dir . "/" . $self->directory_name . "/src"; chdir $destination or $self->throw("Cant cd to $destination"); my $new = "Makefile"; open(OLD, "< Makefile.UNIX") or $self->throw("can't open Makefile.UNIX"); open(NEW, "> $new") or $self->throw("can't open $new"); while () { # change $_, then... $_ =~ s/CFLAGS = -O3/\# CFLAGS = -O3/g; if( $return =~ /3dnow/ ) { $_ =~ s/#CFLAGS = -march=athlon -mcpu=athlon -O4 -funroll-loops -fomit-frame-pointer -finline-functions/CFLAGS = -march=athlon -mcpu=athlon -O4 -funroll-loops -fomit-frame-pointer -finline-functions/g; } elsif( $return =~ /mmx/ ) { $_ =~ s/#CFLAGS = -march=pentiumpro -mcpu=pentiumpro -O4 -funroll-loops -fomit-frame-pointer -finline-functions/CFLAGS = -march=pentiumpro -mcpu=pentiumpro -O4 -funroll-loops -fomit-frame-pointer -finline-functions/g; } print NEW $_ or $self->throw("can't write $new"); } close(OLD) or $self->throw("can't close Makefile.UNIX"); close(NEW) or $self->throw("can't close $new"); } } } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/Probcons.pm000066400000000000000000000111251302566030400227300ustar00rootroot00000000000000 # BioPerl module for Bio::Installer::Probcons # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::Probcons - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::Probcons; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'http://probcons.stanford.edu/', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "/usr/local", 'PACKAGE_NAME' => 'probcons_v1_09.tar.gz', 'DIRECTORY_NAME' => 'probcons' ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; $self->_decompress; $self->_execute_typical_nix_install; $self->_remember_env_is_in_path; } =head2 _execute_typical_nix_install Title : _execute_typical_nix_install Usage : Function: Example : Returns : Args : =cut sub _execute_typical_nix_install{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir . "/" . $self->directory_name; print "\n\nTypical linux install -- configure -- make -- make install (this might take a while)\n\n";sleep 1; if (($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nConfigure not needed\n\n";sleep 1; # $call = "./configure --prefix=$destination"; # system("$call") == 0 or $self->throw("Error when trying to run configure"); print "\n\nCalling make (this might take a while)\n\n";sleep 1; $call = "make"; system("$call") == 0 or $self->throw("Error when trying to run make"); # print "\n\nCalling make install (this might take a while)\n\n";sleep 1; # if (-w "$destination") { # $call = "make install"; # } else { # print "\n\n You may need administrative password to install\n" # ."this program so that bioperl can see it in the binary PATH\n"; # $call = "su -c \"make install\""; # } # system("$call") == 0 or $self->throw("Error when trying to run make install"); } else { $self->throw("_execute_typical_nix not yet implemented in this platform"); } } =head2 _remember_env_is_in_path Title : _remember_env_is_in_path Usage : Function: Example : Returns : Args : =cut sub _remember_env_is_in_path{ my ($self,@args) = @_; my $dir; $dir = $self->destination_install_dir; unless ($dir =~ m|/usr/local|) { print STDERR < # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::SLR - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.shtml - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::SLR; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'http://www.ebi.ac.uk/goldman-srv/SLR/download/current/', 'BIN_FOLDER' => 'bin', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "$ENV{'HOME'}", 'PACKAGE_NAME' => 'slr_source.tgz', 'DIRECTORY_NAME' => 'slr', 'ENV_NAME' => 'SLRDIR', ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; my $dir; $self->_decompress; # $self->_execute_slr_install_script; $dir = $self->destination_install_dir; $self->_remember_env; } =head2 _execute_slr_install_script Title : _execute_slr_install_script Usage : Function: Example : Returns : Args : =cut sub _execute_slr_install_script{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir; $destination =~ s|/$||; $destination .= "/" . $self->directory_name; chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nCompiling now... (this might take a while)\n\n"; $call = "sh build.sh MP"; system("$call") == 0 or die "Error when trying to run install script $?\n"; } 1; bioperl-run-release-1-7-1/lib/Bio/Installer/TCoffee.pm000066400000000000000000000071461302566030400224660ustar00rootroot00000000000000 # BioPerl module for Bio::Installer::TCoffee # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Installer::TCoffee - DESCRIPTION of Object =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Installer::TCoffee; use vars qw(@ISA %DEFAULTS); use strict; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Installer::Generic; @ISA = qw(Bio::Installer::Generic ); BEGIN { %DEFAULTS = ( 'ORIGIN_DOWNLOAD_DIR' => 'http://www.tcoffee.org/Packages/', 'BIN_FOLDER' => 'bin', 'DESTINATION_DOWNLOAD_DIR' => '/tmp', 'DESTINATION_INSTALL_DIR' => "$ENV{'HOME'}", # 'PACKAGE_NAME' => 'T-COFFEE_distribution_Version_4.45.tar.gz', 'PACKAGE_NAME' => 'T-COFFEE_distribution_Version_5.37.tar.gz', # 'DIRECTORY_NAME' => 'T-COFFEE_distribution_Version_4.45', 'DIRECTORY_NAME' => 'T-COFFEE_distribution_Version_5.37', 'ENV_NAME' => 'TCOFFEEDIR', ); } =head2 get_default Title : get_default Usage : Function: Example : Returns : Args : =cut sub get_default { my $self = shift; my $param = shift; return $DEFAULTS{$param}; } =head2 install Title : install Usage : $installer->install(); Function: Example : Returns : Args : =cut sub install{ my ($self,@args) = @_; my $dir; $self->_decompress; $self->_execute_tcoffee_install_script; $dir = $self->destination_install_dir; $self->_remember_env; } =head2 _execute_tcoffee_install_script Title : _execute_tcoffee_install_script Usage : Function: Example : Returns : Args : =cut sub _execute_tcoffee_install_script{ my ($self,@args) = @_; my $call; my $destination = $self->destination_install_dir; $destination =~ s|/$||; $destination .= "/" . $self->directory_name; chdir $destination or die "Cant cd to $destination $!\n"; print "\n\nCompiling now... (this might take a while)\n\n"; $call = "sh install"; system("$call") == 0 or die "Error when trying to run install script $?\n"; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/000077500000000000000000000000001302566030400177505ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/000077500000000000000000000000001302566030400205145ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/000077500000000000000000000000001302566030400224325ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Amap.pm000066400000000000000000000344411302566030400236540ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Amap # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Amap - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the Amap (2.0) program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Amap->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. #To run amap with training, try something like: #First round to generate train.params $factory = Bio::Tools::Run::Alignment::Amap->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'emissions' => '', 'verbose' => '', 'train' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/train.params"); #Second round to use train.params to get a high qual alignment $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); $aln = ''; $factory = ''; $factory = Bio::Tools::Run::Alignment::Amap->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'verbose' => '', 'paramfile' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/outfile.afa"); $aln = $factory->align($seq_array_ref); =head1 DESCRIPTION Amap uses a Sequence Annealing algorithm which is an incremental method for building multiple alignments. You can get it and see information about it at this URL http://bio.math.berkeley.edu/amap/ =head2 Helping the module find your executable FIXME: Amap uses the same parameters as Probcons, plus a few others. I haven't had time to check all the changes from the Probcons.pm runnable. Feel free to do it. You will need to enable Amap to find the amap program. This can be done in (at least) three ways: 1. Make sure the amap executable is in your path (i.e. 'which amap' returns a valid program 2. define an environmental variable AMAPDIR which points to a directory containing the 'amap' app: In bash export AMAPDIR=/home/progs/amap or In csh/tcsh setenv AMAPDIR /home/progs/amap 3. include a definition of an environmental variable AMAPDIR in every script that will BEGIN {$ENV{AMAPDIR} = '/home/progs/amap'; } use Bio::Tools::Run::Alignment::Amap; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email foo@bar.com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Amap; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @AMAP_PARAMS @AMAP_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @AMAP_PARAMS = qw (CONSISTENCY ITERATIVE-REFINEMENT PRE-TRAINING ANNOT TRAIN PARAMFILE MATRIXFILE CLUSTALW PAIRS VITERBI VERBOSE EMISSIONS EDGE-WEIGHT-THRESHOLD GAP-FACTOR); #FIXME: Last line are switches, dunno how to set them, #gave as params @AMAP_SWITCHES = qw(); @OTHER_SWITCHES = qw(PROGRESSIVE NOREORDER ALIGNMENT-ORDER MAXSTEP PRINT-POSTERIORS); # Authorize attribute fields foreach my $attr ( @AMAP_PARAMS, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'amap'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{AMAPDIR}) if $ENV{AMAPDIR}; } =head2 new Title : new Usage : my $amap = Bio::Tools::Run::Alignment::Amap->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Amap Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on) if $on; my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #AMAP version 1.09 - align multiple protein sequences and print to standard output $string =~ /AMAP\s+version.+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run amap return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to amap program Example : Returns : nothing; amap output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to amap =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." $infilename $params"; $self->debug( "amap command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Amap call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for amap program Example : Returns : name of file containing amap data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to amap!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for amap program Example : Returns : parameter string to be passed to amap during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @AMAP_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } for $attr ( @AMAP_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by Amap $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $amap->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $amap->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Blat.pm000077500000000000000000000367411302566030400236700ustar00rootroot00000000000000# # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Blat =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Blat; my $factory = Bio::Tools::Run::Alignment::Blat->new(-db => $database); # $report is a Bio::SearchIO-compliant object my $report = $factory->run($seqobj); =head1 DESCRIPTION Wrapper module for Blat program. This newer version allows for all parameters to be set by passing them as an option to new(). Key bits not implemented yet (TODO): =over 3 =item * Implement all needed L methods Missing are a few, including version(). =item * Re-implement using L Would like to get this running under something less reliant on OS-dependent changes within code. =item * No .2bit or .nib conversions yet These require callouts to faToNib or faTwoTwoBit, which may or may not be installed on a user's machine. We can possibly add functionality to check for faToTwoBit/faToNib and other UCSC tools in the future. =back =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Chris Fields - cjfields at bioperl dot org Original author - Bala Email bala@tll.org.sg =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Blat; use strict; use warnings; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); use Bio::SeqIO; use Bio::Root::Root; use Bio::Factory::ApplicationFactoryI; use Bio::SearchIO; use Bio::Tools::Run::WrapperBase; our ($PROGRAM, $PROGRAMDIR, $PROGRAMNAME); our %BLAT_PARAMS = map {$_ => 1} qw(ooc t q tileSize stepSize oneOff minMatch minScore minIdentity maxGap makeOoc repmatch mask qMask repeats minRepeatsDivergence dots out maxIntron); our %BLAT_SWITCHES = map {$_ => 1} qw(prot noHead trimT noTrimA trimHardA fastMap fine extendThroughN); our %LOCAL_ATTRIBUTES = map {$_ => 1} qw(db DB qsegment hsegment searchio outfile_name quiet); our %searchio_map = ( 'psl' => 'psl', 'pslx' => 'psl', # I don't think there is support for this yet 'axt' => 'axt', 'blast' => 'blast', 'sim4' => 'sim4', 'wublast' => 'blast', 'blast8' => 'blasttable', 'blast9' => 'blasttable' ); =head2 new Title : new Usage : $blat->new( -db => '' ) Function: Create a new Blat factory Returns : A new Bio::Tools::Run::Alignment::Blat object Args : -db : Mandatory parameter. See db() -qsegment : see qsegment() -tsegment : see tsegment() Also, Blat parameters and flags are accepted: -t, -q, -minIdentity, -minScore, -out, -trimT, ... See Blat's manual for details. =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->set_parameters(@args); return $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: Get the program name Returns : string Args : None =cut sub program_name { return 'blat'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{BLATDIR}) if $ENV{BLATDIR}; } =head2 run Title : run() Usage : $obj->run($query) Function: Run Blat. Returns : A Bio::SearchIO object that holds the results Args : A Bio::PrimarySeqI object or a file of query sequences =cut sub run { my ($self, $query) = @_; if (ref($query) ) { # it is an object if (ref($query) =~ /GLOB/) { $self->throw("Cannot use filehandle as argument to run()"); } $query = $self->_writeSeqFile($query); } return $self->_run($query); } =head2 align Title : align Usage : $obj->align($query) Function: Alias to run() =cut sub align { return shift->run(@_); } =head2 db Title : db Usage : $obj->db() Function: Get or set the file of database sequences (.fa , .nib or .2bit) Returns : Database filename Args : Database filename =cut sub db { my $self = shift; return $self->{blat_db} = shift if @_; return $self->{blat_db}; } # this is a kludge for tests (so one might expect this to be used elsewhere). # None of the other parameters worked in the past, so not replacing them *DB = \&db; =head2 qsegment Title : qsegment Usage : $obj->qsegment('sequence_a:0-1000') Function : pass in a B string for the query sequence(s) Returns : string Args : string Note : Requires the sequence(s) in question be 2bit or nib format Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence begins at 0, but start isn't counted with length), whereas BioPerl coordinates are 1-based closed (sequence begins with 1, both start and end are counted in the length of the segment). For example, a segment that is 'sequence_a:0-1000' will have BioPerl coordinates of 'sequence_a:1-1000', both with the same length (1000). =cut sub qsegment { my $self = shift; return $self->{blat_qsegment} = shift if @_; return $self->{blat_qsegment}; } =head2 tsegment Title : tsegment Usage : $obj->tsegment('sequence_a:0-1000') Function : pass in a B string for the target sequence(s) Returns : string Args : string Note : Requires the sequence(s) in question be 2bit or nib format Reminder : UCSC segment/regions coordinates are 0-based half-open (sequence begins at 0, but start isn't counted with length), whereas BioPerl coordinates are 1-based closed (sequence begins with 1, both start and end are counted in the length of the segment). For example, a segment that is 'sequence_a:0-1000' will have BioPerl coordinates of 'sequence_a:1-1000', both with the same length (1000). =cut sub tsegment { my $self = shift; return $self->{blat_tsegment} = shift if @_; return $self->{blat_tsegment}; } =head2 outfile_name Title : outfile_name Usage : $obj->outfile_name('out.blat') Function : Get or set the name for the BLAT output file Returns : string Args : string =cut # override this, otherwise one gets a default of 'mlc' sub outfile_name { my $self = shift; return $self->{blat_outfile} = shift if @_; return $self->{blat_outfile}; } =head2 searchio Title : searchio Usage : $obj->searchio{-writer => $writer} Function : Pass in additional parameters to the returned Bio::SearchIO parser Returns : Hash reference with Bio::SearchIO parameters Args : Hash reference Note : Currently, this implementation overrides any passed -format parameter based on whether the output is changed ('out'). This may change if requested, but we can't see the utility of doing so, as requesting mismatched output/parser combinations is just a recipe for disaster =cut sub searchio { my ($self, $params) = @_; if ($params && ref $params eq 'HASH') { delete $params->{-format}; $self->{blat_searchio} = $params; } return $self->{blat_searchio} || {}; } =head1 Bio::ParameterBaseI-specific methods These methods are part of the Bio::ParameterBaseI interface =head2 set_parameters Title : set_parameters Usage : $pobj->set_parameters(%params); Function: sets the parameters listed in the hash or array Returns : None Args : [optional] hash or array of parameter/values. These can optionally be hash or array references Note : This only sets parameters; to set methods use the method name =cut sub set_parameters { my $self = shift; # circumvent any issues arising from passing in refs my %args = (ref($_[0]) eq 'HASH') ? %{$_[0]} : (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_; # set the parameters passed in, but only ones supported for the program %args = map { my $a = $_; $a =~ s{^-}{}; $a => $args{$_}; } sort keys %args; while (my ($key, $val) = each %args) { if (exists $BLAT_PARAMS{$key}) { $self->{parameters}->{$key} = $val; } elsif (exists $BLAT_SWITCHES{$key}) { $self->{parameters}->{$key} = $BLAT_SWITCHES{$key} ? 1 : 0; } elsif ($LOCAL_ATTRIBUTES{$key} && $self->can($key)) { $self->$key($val); } } } =head2 reset_parameters Title : reset_parameters Usage : resets values Function: resets parameters to either undef or value in passed hash Returns : none Args : [optional] hash of parameter-value pairs =cut sub reset_parameters { my $self = shift; delete $self->{parameters}; if (@_) { $self->set_parameters(@_); } } =head2 validate_parameters Title : validate_parameters Usage : $pobj->validate_parameters(1); Function: sets a flag indicating whether to validate parameters via set_parameters() or reset_parameters() Returns : Bool Args : [optional] value evaluating to True/False Note : NYI =cut sub validate_parameters { 0 } =head2 parameters_changed Title : parameters_changed Usage : if ($pobj->parameters_changed) {...} Function: Returns boolean true (1) if parameters have changed Returns : Boolean (0 or 1) Args : None Note : This module does not run state checks, so this always returns True =cut sub parameters_changed { 1 } =head2 available_parameters Title : available_parameters Usage : @params = $pobj->available_parameters() Function: Returns a list of the available parameters Returns : Array of parameters Args : [optional] name of executable being used; defaults to returning all available parameters =cut sub available_parameters { my ($self, $exec) = @_; my @params = (sort keys %BLAT_PARAMS, sort keys %BLAT_SWITCHES); return @params; } =head2 get_parameters Title : get_parameters Usage : %params = $pobj->get_parameters; Function: Returns list of set key-value pairs, parameter => value Returns : List of key-value pairs Args : none =cut sub get_parameters { my ($self, $option) = @_; $option ||= ''; # no option my %params; if (exists $self->{parameters}) { %params = map {$_ => $self->{parameters}->{$_}} sort keys %{$self->{parameters}}; } else { %params = (); } return %params; } =head1 to_* methods All to_* methods are implementation-specific =head2 to_exe_string Title : to_exe_string Usage : $string = $pobj->to_exe_string; Function: Returns string (command line string in this case) Returns : String Args : =cut sub to_exe_string { my ($self, @passed) = @_; my ($seq) = $self->_rearrange([qw(SEQ_FILE)], @passed); $self->throw("Must provide a seq_file") unless defined $seq; my %params = $self->get_parameters(); my ($exe, $db, $qseg, $tseg) = ($self->executable, $self->db, $self->qsegment, $self->tsegment); $self->throw("Executable not found") unless defined($exe); if ($tseg) { $db .= ":$tseg"; } if ($qseg) { $seq .= ":$qseg"; } my @params; for my $p (sort keys %BLAT_SWITCHES) { if (exists $params{$p}) { push @params, "-$p" } } for my $p (sort keys %BLAT_PARAMS) { if (exists $params{$p}) { push @params, "-$p=$params{$p}" } } # this only passes in the first seq file (no globs are allow AFAIK) push @params, ($db, $seq); # quiet! Unfortunately, it is NYI my $string = "$exe ".join(' ',@params); return $string; } #=head2 _input # # Title : _input # Usage : obj->_input($seqFile) # Function: Internal (not to be used directly) # Returns : # Args : # #=cut sub _input() { my ($self,$infile1) = @_; if (defined $infile1) { $self->{'input'} = $infile1; } return $self->{'input'}; } #=head2 _database # # Title : _database # Usage : obj->_database($seqFile) # Function: Internal (not to be used directly) # Returns : # Args : # #=cut sub _database() { my ($self,$infile1) = @_; $self->{'db'} = $infile1 if(defined $infile1); return $self->{'db'}; } #=head2 _run # # Title : _run # Usage : $obj->_run() # Function: Internal (not to be used directly) # Returns : A Bio::SearchIO object that contains the results # Args : File of sequences # #=cut sub _run { my ($self, $seq_file) = @_; my $str = $self->to_exe_string(-seq_file => $seq_file); my $out = $self->outfile_name || $self->_tempfile; $str .= " $out".$self->_quiet; $self->debug($str."\n") if( $self->verbose > 0 ); my %params = $self->get_parameters; my $status = system($str); $self->throw( "Blat call ($str) crashed: $? \n") unless $status==0; my $format = exists($params{out}) ? $searchio_map{$params{out}} : 'psl'; my @io = ref ($out) !~ /GLOB/ ? (-file => $out,) : (-fh => $out,); my $blat_obj = Bio::SearchIO->new(%{$self->searchio}, @io, -query_type => $params{prot} ? 'protein' : $params{q} || 'dna', -hit_type => $params{prot} ? 'protein' : $params{t} || 'dna', -format => $format); return $blat_obj; } #=head2 _writeSeqFile # # Title : _writeSeqFile # Usage : obj->_writeSeqFile($seq) # Function: Internal (not to be used directly) # Returns : # Args : # #=cut sub _writeSeqFile { my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$Bio::Root::IO::TEMPDIR); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); return $inputfile; } sub _tempfile { my $self = shift; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$Bio::Root::IO::TEMPDIR); # this is because we only want a unique filename close($tfh); return $outfile; } sub _quiet { my $self = shift; my $q = ''; # BLAT output goes to a file, all other output is STDOUT if ($self->quiet) { $q = $^O =~ /Win/i ? ' 2>&1 NUL' : ' > /dev/null 2>&1'; } return $q; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Clustalw.pm000066400000000000000000000747471302566030400246110ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Clustalw # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Clustalw - Object for the calculation of a multiple sequence alignment from a set of unaligned sequences or alignments using the Clustalw program =head1 SYNOPSIS # Build a clustalw alignment factory @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); # $aln is a SimpleAlign object. # or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: $aln = $factory->profile_align($aln1,$aln2); # where $aln1 and $aln2 are Bio::SimpleAlign objects. # Or one can pass the factory an alignment and one or more unaligned # sequences to be added to the alignment. For example: $aln = $factory->profile_align($aln1,$seq); # $seq is a Bio::Seq object. # Get a tree of the sequences $tree = $factory->tree(\@seq_array); # Get both an alignment and a tree ($aln, $tree) = $factory->run(\@seq_array); # Do a footprinting analysis on the supplied sequences, getting back the # most conserved sub-alignments my @results = $factory->footprint(\@seq_array); foreach my $result (@results) { print $result->consensus_string, "\n"; } # There are various additional options and input formats available. # See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION Note: this DESCRIPTION only documents the Bioperl interface to Clustalw. Clustalw, itself, is a large & complex program - for more information regarding clustalw, please see the clustalw documentation which accompanies the clustalw distribution. Clustalw is available from (among others) ftp://ftp.ebi.ac.uk/pub/software/. Clustalw.pm has only been tested using version 1.8 of clustalw. Compatibility with earlier versions of the clustalw program is currently unknown. Before running Clustalw successfully it will be necessary: to install clustalw on your system, and to ensure that users have execute privileges for the clustalw program. =head2 Helping the module find your executable You will need to enable Clustalw to find the clustalw program. This can be done in (at least) three ways: 1. Make sure the clustalw executable is in your path so that which clustalw returns a clustalw executable on your system. 2. Define an environmental variable CLUSTALDIR which is a directory which contains the 'clustalw' application: In bash: export CLUSTALDIR=/home/username/clustalw1.8 In csh/tcsh: setenv CLUSTALDIR /home/username/clustalw1.8 3. Include a definition of an environmental variable CLUSTALDIR in every script that will use this Clustalw wrapper module, e.g.: BEGIN { $ENV{CLUSTALDIR} = '/home/username/clustalw1.8/' } use Bio::Tools::Run::Alignment::Clustalw; If you are running an application on a webserver make sure the webserver environment has the proper PATH set or use the options 2 or 3 to set the variables. =head2 How it works Bio::Tools::Run::Alignment::Clustalw is an object for performing a multiple sequence alignment from a set of unaligned sequences and/or sub-alignments by means of the clustalw program. Initially, a clustalw "factory object" is created. Optionally, the factory may be passed most of the parameters or switches of the clustalw program, e.g.: @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); Any parameters not explicitly set will remain as the defaults of the clustalw program. Additional parameters and switches (not available in clustalw) may also be set. Currently, the only such parameter is "quiet", which when set to a non-zero value, suppresses clustalw terminal output. Not all clustalw parameters are supported at this stage. By default, Clustalw output is returned solely in a the form of a Bio::SimpleAlign object which can then be printed and/or saved in multiple formats using the AlignIO.pm module. Optionally the raw clustalw output file can be saved if the calling script specifies an output file (with the clustalw parameter OUTFILE). Currently only the GCG-MSF output file formats is supported. Not all parameters and features have been implemented yet in Perl format. Alignment parameters can be changed and/or examined at any time after the factory has been created. The program checks that any parameter/switch being set/read is valid. However, currently no additional checks are included to check that parameters are of the proper type (eg string or numeric) or that their values are within the proper range. As an example, to change the value of the clustalw parameter ktuple to 3 and subsequently to check its value one would write: $ktuple = 3; $factory->ktuple($ktuple); $get_ktuple = $factory->ktuple(); Once the factory has been created and the appropriate parameters set, one can call the method align() to align a set of unaligned sequences, or call profile_align() to add one or more sequences or a second alignment to an initial alignment. Input to align() may consist of a set of unaligned sequences in the form of the name of file containing the sequences. For example, $inputfilename = 't/data/cysprot.fa'; $aln = $factory-Ealign($inputfilename); Alternately one can create an array of Bio::Seq objects somehow $str = Bio::SeqIO->new(-file=> 't/data/cysprot.fa', -format => 'Fasta'); @seq_array =(); while ( my $seq = $str->next_seq() ) {push (@seq_array, $seq) ;} and pass the factory a reference to that array $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); In either case, align() returns a reference to a SimpleAlign object which can then used (see L). Once an initial alignment exists, one can pass the factory additional sequence(s) to be added (ie aligned) to the original alignment. The alignment can be passed as either an alignment file or a Bio:SimpleAlign object. The unaligned sequence(s) can be passed as a filename or as an array of BioPerl sequence objects or as a single BioPerl Seq object. For example (to add a single sequence to an alignment), $str = Bio::AlignIO->new(-file=> 't/data/cysprot1a.msf'); $aln = $str->next_aln(); $str1 = Bio::SeqIO->new(-file=> 't/data/cysprot1b.fa'); $seq = $str1->next_seq(); $aln = $factory->profile_align($aln,$seq); In either case, profile_align() returns a reference to a SimpleAlign object containing a new SimpleAlign object of the alignment with the additional sequence(s) added in. Finally one can pass the factory a pair of (sub)alignments to be aligned against each other. The alignments can be passed in the form of either a pair of alignment files or a pair of Bio:SimpleAlign objects. For example, $profile1 = 't/data/cysprot1a.msf'; $profile2 = 't/data/cysprot1b.msf'; $aln = $factory->profile_align($profile1,$profile2); or $str1 = Bio::AlignIO->new(-file=> 't/data/cysprot1a.msf'); $aln1 = $str1->next_aln(); $str2 = Bio::AlignIO->new(-file=> 't/data/cysprot1b.msf'); $aln2 = $str2->next_aln(); $aln = $factory->profile_align($aln1,$aln2); In either case, profile_align() returns a reference to a SimpleAlign object containing an (super)alignment of the two input alignments. For more examples of syntax and use of Clustalw, the user is encouraged to look at the script Clustalw.t in the t/ directory. Note: Clustalw is still under development. Various features of the clustalw program have not yet been implemented. If you would like that a specific clustalw feature be added to this perl contact bioperl-l@bioperl.org. These can be specified as parameters when instantiating a new Clustalw object, or through get/set methods of the same name (lowercase). =head1 PARAMETER FOR ALIGNMENT COMPUTATION =head2 KTUPLE Title : KTUPLE Description : (optional) set the word size to be used in the alignment This is the size of exactly matching fragment that is used. INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity. For longer sequences (e.g. >1000 residues) you may need to increase the default =head2 TOPDIAGS Title : TOPDIAGS Description : (optional) number of best diagonals to use The number of k-tuple matches on each diagonal (in an imaginary dot-matrix plot) is calculated. Only the best ones (with most matches) are used in the alignment. This parameter specifies how many. Decrease for speed; increase for sensitivity. =head2 WINDOW Title : WINDOW Description : (optional) window size This is the number of diagonals around each of the 'best' diagonals that will be used. Decrease for speed; increase for sensitivity. =head2 PAIRGAP Title : PAIRGAP Description : (optional) gap penalty for pairwise alignments This is a penalty for each gap in the fast alignments. It has little affect on the speed or sensitivity except for extreme values. =head2 FIXEDGAP Title : FIXEDGAP Description : (optional) fixed length gap penalty =head2 FLOATGAP Title : FLOATGAP Description : (optional) variable length gap penalty =head2 MATRIX Title : MATRIX Default : PAM100 for DNA - PAM250 for protein alignment Description : (optional) substitution matrix used in the multiple alignments. Depends on the version of clustalw as to what default matrix will be used PROTEIN WEIGHT MATRIX leads to a new menu where you are offered a choice of weight matrices. The default for proteins in version 1.8 is the PAM series derived by Gonnet and colleagues. Note, a series is used! The actual matrix that is used depends on how similar the sequences to be aligned at this alignment step are. Different matrices work differently at each evolutionary distance. DNA WEIGHT MATRIX leads to a new menu where a single matrix (not a series) can be selected. The default is the matrix used by BESTFIT for comparison of nucleic acid sequences. =head2 TYPE Title : TYPE Description : (optional) sequence type: protein or DNA. This allows you to explicitly overide the programs attempt at guessing the type of the sequence. It is only useful if you are using sequences with a VERY strange composition. =head2 OUTPUT Title : OUTPUT Description : (optional) clustalw supports GCG or PHYLIP or PIR or Clustal format. See the Bio::AlignIO modules for which formats are supported by bioperl. =head2 OUTFILE Title : OUTFILE Description : (optional) Name of clustalw output file. If not set module will erase output file. In any case alignment will be returned in the form of SimpleAlign objects =head2 TRANSMIT Title : TRANSMIT Description : (optional) transitions not weighted. The default is to weight transitions as more favourable than other mismatches in DNA alignments. This switch makes all nucleotide mismatches equally weighted. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Peter Schattner Email schattner@alum.mit.edu =head1 CONTRIBUTORS Jason Stajich jason-AT-bioperl_DOT_org Sendu Bala bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Alignment::Clustalw; use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::IO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @CLUSTALW_PARAMS = qw(output ktuple topdiags window pairgap fixedgap floatgap matrix type transit dnamatrix outfile gapopen gapext maxdiv gapdist hgapresidues pwmatrix pwdnamatrix pwgapopen pwgapext score transweight seed helixgap outorder strandgap loopgap terminalgap helixendin helixendout strandendin strandendout program reps outputtree seed bootlabels bootstrap); our @CLUSTALW_SWITCHES = qw(help check options negative noweights endgaps nopgap nohgap novgap kimura tossgaps kimura tossgaps njtree); our @OTHER_SWITCHES = qw(quiet); our $PROGRAM_NAME = 'clustalw'; our $PROGRAM_DIR = $ENV{'CLUSTALDIR'} || $ENV{'CLUSTALWDIR'}; =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@CLUSTALW_PARAMS, @CLUSTALW_SWITCHES, @OTHER_SWITCHES], -create => 1); return $self; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $prog = $self->executable; my $string = `$prog -- 2>&1` ; $string =~ /\(?([\d.]+)\)?/xms; return $1 || undef; } =head2 run Title : run Usage : ($aln, $tree) = $factory->run($inputfilename); ($aln, $tree) = $factory->run($seq_array_ref); Function: Perform a multiple sequence alignment, generating a tree at the same time. (Like align() and tree() combined.) Returns : A SimpleAlign object containing the sequence alignment and a Bio::Tree::Tree object with the tree relating the sequences. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. =cut sub run { my ($self,$input) = @_; my ($temp,$infilename, $seq); my ($attr, $value, $switch); $self->io->_io_cleanup(); # Create input file pointer $infilename = $self->_setinput($input); $self->throw("Bad input data (sequences need an id) or less than 2 sequences in $input!") unless $infilename; # Create parameter string to pass to clustalw program my $param_string = $self->_setparams(); # run clustalw return $self->_run('both', $infilename, $param_string); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; $self->io->_io_cleanup(); # Create input file pointer my $infilename = $self->_setinput($input); $self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !") unless $infilename; # Create parameter string to pass to clustalw program my $param_string = $self->_setparams(); # run clustalw my $aln = $self->_run('align', $infilename, $param_string); } =head2 profile_align Title : profile_align Usage : $aln = $factory->profile_align(@simple_aligns); or $aln = $factory->profile_align(@subalignment_filenames); Function: Perform an alignment of 2 (sub)alignments Returns : Reference to a SimpleAlign object containing the (super)alignment. Args : Names of 2 files containing the subalignments or references to 2 Bio::SimpleAlign objects. Throws an exception if arguments are not either strings (eg filenames) or references to SimpleAlign objects. =cut sub profile_align { my ($self,$input1,$input2) = @_; $self->io->_io_cleanup(); # Create input file pointer my $infilename1 = $self->_setinput($input1, 1); my $infilename2 = $self->_setinput($input2, 2); if (!$infilename1 || !$infilename2) {$self->throw("Bad input data: $input1 or $input2 !");} unless ( -e $infilename1 and -e $infilename2) {$self->throw("Bad input file: $input1 or $input2 !");} # Create parameter string to pass to clustalw program my $param_string = $self->_setparams(); # run clustalw my $aln = $self->_run('profile-aln', $infilename1, $infilename2, $param_string); } =head2 add_sequences Title : add_sequences Usage : Function: Align and add sequences into an alignment Example : Returns : Reference to a SimpleAlign object containing the (super)alignment. Args : Names of 2 files, the first one containing an alignment and the second one containing sequences to be added or references to 2 Bio::SimpleAlign objects. Throws an exception if arguments are not either strings (eg filenames) or references to SimpleAlign objects. =cut sub add_sequences { my ($self,$input1,$input2) = @_; my ($temp,$infilename1,$infilename2,$input,$seq); $self->io->_io_cleanup(); # Create input file pointer $infilename1 = $self->_setinput($input1,1); $infilename2 = $self->_setinput($input2,2); if (!$infilename1 || !$infilename2) {$self->throw("Bad input data: $input1 or $input2 !");} unless ( -e $infilename1 and -e $infilename2) {$self->throw("Bad input file: $input1 or $input2 !");} # Create parameter string to pass to clustalw program my $param_string = $self->_setparams(); # run clustalw my $aln = $self->_run('add_sequences', $infilename1, $infilename2, $param_string); } =head2 tree Title : tree Usage : @params = ('bootstrap' => 1000, 'tossgaps' => 1, 'kimura' => 1, 'seed' => 121, 'bootlabels'=> 'nodes', 'quiet' => 1); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $tree_obj = $factory->tree($aln_obj); or $tree_obj = $factory->tree($treefilename); Function: Retrieve a tree corresponding to the input Returns : Bio::TreeIO object Args : Bio::SimpleAlign or filename of a tree =cut sub tree { my ($self,$input) = @_; $self->io->_io_cleanup(); # Create input file pointer my $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !");} # Create parameter string to pass to clustalw program my $param_string = $self->_setparams(); # run clustalw my $tree = $self->_run('tree', $infilename, $param_string); } =head2 footprint Title : footprint Usage : @alns = $factory->footprint($treefilename, $window_size, $diff); @alns = $factory->footprint($seqs_array_ref); Function: Aligns all the supplied sequences and slices out of the alignment those regions along a sliding window who's tree length differs significantly from the total average tree length. Returns : list of Bio::SimpleAlign objects Args : first argument as per run(), optional second argument to specify the size of the sliding window (default 5 bp) and optional third argument to specify the % difference from the total tree length needed for a window to be considered a footprint (default 33%). =cut sub footprint { my ($self, $in, $slice_size, $deviate) = @_; my ($simplealn, $tree) = $self->run($in); # total tree length? my $total_length = $tree->total_branch_length; # tree length along sliding window, picking regions that significantly # deviate from the average tree length $slice_size ||= 5; $deviate ||= 33; my $threshold = $total_length - (($total_length / 100) * $deviate); my $length = $simplealn->length; my $below = 0; my $found_minima = 0; my $minima = [$threshold, '']; my @results; for my $i (1..($length - $slice_size + 1)) { my $slice = $simplealn->slice($i, ($i + $slice_size - 1), 1); my $tree = $self->tree($slice); $self->throw("No tree returned") unless defined $tree; my $slice_length = $tree->total_branch_length; $slice_length <= $threshold ? ($below = 1) : ($below = 0); if ($below) { unless ($found_minima) { if ($slice_length < ${$minima}[0]) { $minima = [$slice_length, $slice]; } else { push(@results, ${$minima}[1]); $minima = [$threshold, '']; $found_minima = 1; } } } else { $found_minima = 0; } } return @results; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to clustalw program Returns : nothing; clustalw output is written to a temporary file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to clustalw =cut sub _run { my ($self, $command, $infile1, $infile2, $param_string) = @_; my ($instring, $tree); my $quiet = $self->quiet() || $self->verbose() < 0; if ($command =~ /align|both/) { if ($^O eq 'dec_osf') { $instring = $infile1; $command = ''; } else { $instring = " -infile=". '"' . $infile1 . '"'; } $param_string .= " $infile2"; } if ($command =~ /profile/) { $instring = "-profile1=$infile1 -profile2=$infile2"; chmod 0777, $infile1, $infile2; $command = '-profile'; } if ($command =~ /add_sequences/) { $instring = "-profile1=$infile1 -profile2=$infile2"; chmod 0777, $infile1,$infile2; $command = '-sequences'; } if ($command =~ /tree/) { if( $^O eq 'dec_osf' ) { $instring = $infile1; $command = ''; } else { $instring = " -infile=". '"' . $infile1 . '"'; } $param_string .= " $infile2"; $self->debug( "Program ".$self->executable."\n"); my $commandstring = $self->executable."$instring"."$param_string"; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 1>$null" if $quiet; $self->debug( "clustal command = $commandstring"); my $status = system($commandstring); unless( $status == 0 ) { $self->warn( "Clustalw call ($commandstring) crashed: $? \n"); return undef; } return $self->_get_tree($infile1, $param_string); } my $output = $self->output || 'gcg'; $self->debug( "Program ".$self->executable."\n"); my $commandstring = $self->executable." $command"." $instring"." -output=$output". " $param_string"; $self->debug( "clustal command = $commandstring\n"); open(my $pipe, "$commandstring |") || $self->throw("ClustalW call ($commandstring) failed to start: $? | $!"); my $score; while (<$pipe>) { print unless $quiet; # Kevin Brown suggested the following regex, though it matches multiple # times: we pick up the last one $score = $1 if ($_ =~ /Score:(\d+)/); # This one is printed at the end and seems the most appropriate to pick # up; we include the above regex incase 'Alignment Score' isn't given $score = $1 if ($_ =~ /Alignment Score (-?\d+)/); } close($pipe) || ($self->throw("ClustalW call ($commandstring) crashed: $?")); my $outfile = $self->outfile(); # retrieve alignment (Note: MSF format for AlignIO = GCG format of clustalw) my $format = $output =~ /gcg/i ? 'msf' : $output; if ($format =~ /clustal/i) { $format = 'clustalw'; # force clustalw incase 'clustal' is requested } my $in = Bio::AlignIO->new(-file => $outfile, -format=> $format); my $aln = $in->next_aln(); $in->close; $aln->score($score); if ($command eq 'both') { $tree = $self->_get_tree($infile1, $param_string); } # Clean up the temporary files created along the way... # Replace file suffix with dnd to find name of dendrogram file(s) to delete unless ( $self->save_tempfiles ) { foreach my $f ($infile1, $infile2) { $f =~ s/\.[^\.]*$// ; unlink $f .'.dnd' if ($f ne ''); } } if ($command eq 'both') { return ($aln, $tree); } return $aln; } sub _get_tree { my ($self, $treefile, $param_string) = @_; $treefile =~ s/\.[^\.]*$// ; if ($param_string =~ /-bootstrap/) { $treefile .= '.phb'; } elsif ($param_string =~ /-tree/) { $treefile .= '.ph'; } else { $treefile .= '.dnd'; } my $in = Bio::TreeIO->new('-file' => $treefile, '-format'=> 'newick'); my $tree = $in->next_tree; unless ( $self->save_tempfiles ) { foreach my $f ( $treefile ) { unlink $f if( $f ne '' ); } } return $tree; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for clustalw program Returns : name of file containing clustalw data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($infilename, $seq, $temp, $tfh); # suffix is used to distinguish alignment files If $input is not a # reference it better be the name of a file with the sequence/ # alignment data... unless (ref $input) { # check that file exists or throw $infilename = $input; return unless -e $input; return $infilename; } # $input may be an array of BioSeq objects... if (ref($input) eq "ARRAY") { # Open temporary file for both reading & writing of BioSeq array ($tfh,$infilename) = $self->io->tempfile(-dir=>$self->tempdir); $temp = Bio::SeqIO->new('-fh'=>$tfh, '-format' =>'Fasta'); # Need at least 2 seqs for alignment return unless (scalar(@$input) > 1); foreach $seq (@$input) { return unless (defined $seq && $seq->isa("Bio::PrimarySeqI") and $seq->id()); $temp->write_seq($seq); } $temp->close(); close($tfh); undef $tfh; return $infilename; } # $input may be a SimpleAlign object. elsif (ref($input) eq "Bio::SimpleAlign") { # Open temporary file for both reading & writing of SimpleAlign object ($tfh,$infilename) = $self->io->tempfile(-dir=>$self->tempdir); $temp = Bio::AlignIO->new('-fh'=> $tfh, '-format' => 'fasta'); $temp->write_aln($input); close($tfh); undef $tfh; return $infilename; } # or $input may be a single BioSeq object (to be added to a previous alignment) elsif (ref($input) && $input->isa("Bio::PrimarySeqI") && $suffix==2) { # Open temporary file for both reading & writing of BioSeq object ($tfh,$infilename) = $self->io->tempfile(); $temp = Bio::SeqIO->new(-fh=> $tfh, '-format' =>'Fasta'); $temp->write_seq($input); close($tfh); undef $tfh; return $infilename; } return; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for clustalw program Returns : parameter string to be passed to clustalw during align or profile_align Args : name of calling object =cut sub _setparams { my $self = shift; my $param_string = $self->SUPER::_setparams(-params => \@CLUSTALW_PARAMS, -switches => \@CLUSTALW_SWITCHES, -dash => 1, -lc => 1, -join => '='); # Set default output file if no explicit output file selected unless ($param_string =~ /outfile/) { my ($tfh, $outfile) = $self->io->tempfile(-dir => $self->tempdir()); close($tfh); undef $tfh; $self->outfile($outfile); $param_string .= " -outfile=\"$outfile\"" ; } $param_string .= ' 2>&1'; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/DBA.pm000066400000000000000000000433651302566030400233710ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::DBA # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::DBA - Object for the alignment of two sequences using the DNA Block Aligner program. =head1 SYNOPSIS use Bio::Tools::Run::Alignment::DBA; # Build a dba alignment factory my @params = ('matchA' => 0.75, 'matchB' => '0.55', 'dymem' =>'linear'); my $factory = Bio::Tools::Run::Alignment::DBA->new(@params); # Pass the factory a filename with 2 sequences to be aligned. $inputfilename = 't/data/dbaseq.fa'; # @hsps is an array of GenericHSP objects my @hsps = $factory->align($inputfilename); # or my @files = ('t/data/dbaseq1.fa','t/data/dbaseq2.fa'); my @hsps = $factory->align(\@files); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; my @hsps = $factory->align($seq_array_ref); =head1 DESCRIPTION DNA Block Aligner program (DBA) was developed by Ewan Birney. DBA is part of the Wise package available at L. You will need to enable dba to find the dba program. This can be done in a few different ways: 1. Define an environmental variable WISEDIR: export WISEDIR =/usr/local/share/wise2.2.0 2. Include a definition of an environmental variable WISEDIR in every script that will use DBA.pm: $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; 3. Make sure that the dba application is in your PATH. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::DBA; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @DBA_SWITCHES @DBA_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Search::HSP::GenericHSP; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @DBA_PARAMS = qw(MATCHA MATCHB MATCHC MATCHD GAP BLOCKOPEN UMATCH SINGLE NOMATCHN PARAMS KBYTE DYMEM DYDEBUG ERRORLOG); @OTHER_SWITCHES = qw(OUTFILE); @DBA_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD ALIGN LABEL); # Authorize attribute fields foreach my $attr ( @DBA_PARAMS, @DBA_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'dba'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin") if $ENV{WISEDIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/'PROGRAM'/i ) { $self->executable($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe = $self->executable(); return undef unless defined $exe; my $string = `$exe -- ` ; $string =~ /\(([\d.]+)\)/; return $1 || undef; } =head2 align Title : align Usage : $inputfilename = 't/data/seq.fa'; @hsps = $factory->align($inputfilename); or #@seq_array is array of Seq objs $seq_array_ref = \@seq_array; @hsps = $factory->align($seq_array_ref); or my @files = ('t/data/seq1.fa','t/data/seq2.fa'); @hsps = $factory->align(\@files); Function: Perform a DBA alignment Returns : An array of Bio::Search::HSP::GenericHSP objects Args : Name of a file containing a set of 2 fasta sequences or else a reference to an array to 2 Bio::Seq objects. or else a reference to an array of 2 file names containing 1 fasta sequence each Throws an exception if argument is not either a string (eg a filename) or a reference to an array of 2 Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; my ($temp,$infile1, $infile2, $seq); my ($attr, $value, $switch); # Create input file pointer ($infile1,$infile2)= $self->_setinput($input); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !");} # Create parameter string to pass to dba program my $param_string = $self->_setparams(); # run dba my @hsps = $self->_run($infile1,$infile2,$param_string); return @hsps; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to dba program Example : Returns : nothing; dba output is written to a temp file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to dba =cut sub _run { my ($self,$infile1,$infile2,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); unless( $self->outfile){ my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir); close($tfh); undef $tfh; $self->outfile($outfile); } my $outfile = $self->outfile(); my $commandstring = $self->executable." $param_string -pff $infile1 $infile2 > $outfile"; $self->debug( "dba command = $commandstring"); my $status = system($commandstring); $self->throw( "DBA call ($commandstring) crashed: $? \n") unless $status==0; #parse pff format and return a Bio::Search::HSP::GenericHSP array my $hsps = $self->_parse_results($outfile); return @{$hsps}; } =head2 _parse_results Title : __parse_results Usage : Internal function, not to be called directly Function: Parses dba output Example : Returns : an reference to an array of GenericHSPs Args : the name of the output file =cut sub _parse_results { my ($self,$outfile) = @_; $outfile||$self->throw("No outfile specified"); my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align); my $count = 0; my @hsps; open(OUT,$outfile); my (%query,%subject); while(my $entry = ){ if($entry =~ /^>(.+)/ ) { $tempname = $1; if( defined $name ) { if($count == 0){ my @parse = split("\t",$name); $query{seqname} = $parse[0]; $query{start} = $parse[3]; $query{end} = $parse[4]; $query{score} = $parse[5]; $query{strand} = ($parse[6] eq '+') ? 1 : -1; my @tags = split(";",$parse[8]); foreach my $tag(@tags){ $tag =~/(\S+)\s+(\S+)/; $query{$1} = $2; } $query{seq} = $seqchar; $count++; } elsif ($count == 1){ my @parse = split("\t",$name); $subject{seqname} = $parse[0]; $subject{start} = $parse[3]; $subject{end} = $parse[4]; $subject{score} = $parse[5]; $subject{strand} = ($parse[6] eq '+') ? 1:-1; my @tags = split(";",$parse[8]); foreach my $tag(@tags){ $tag =~/(\S+)\s+(\S+)/; $subject{$1} = $2; } $subject{seq} = $seqchar; #create homology string my $xor = $query{seq}^$subject{seq}; my $identical = $xor=~tr/\c@/*/; $xor=~tr/*/ /c; my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm =>'DBA', -score =>$query{score}, -hsp_length =>length($query{seq}), -query_gaps =>$query{gaps}, -hit_gaps =>$subject{gaps}, -query_name =>$query{seqname}, -query_start =>$query{start}, -query_end =>$query{end}, -hit_name =>$subject{seqname}, -hit_start =>$subject{start}, -hit_end =>$subject{end}, -hit_length =>length($self->_subject_seq->seq), -query_length =>length($self->_query_seq->seq), -query_seq =>$query{seq}, -hit_seq =>$subject{seq}, -conserved =>$identical, -identical =>$identical, -homology_seq =>$xor); push @hsps, $hsp; $count = 0; } } $name = $tempname; $seqchar = ""; next; } $entry =~ s/[^A-Za-z\.\-]//g; $seqchar .= $entry; } #do for the last entry if($count == 1){ my @parse = split("\t",$name); $subject{seqname} = $parse[1]; $subject{start} = $parse[3]; $subject{end} = $parse[4]; $subject{score} = $parse[5]; $subject{strand} = ($parse[6] eq '+') ? 1:-1; my @tags = split(";",$parse[8]); foreach my $tag(@tags){ $tag =~/(\S+)\s+(\S+)/; $subject{$1} = $2; } $subject{seq} = $seqchar; #create homology string my $xor = $query{seq}^$subject{seq}; my $identical = $xor=~tr/\c@/*/; $xor=~tr/*/ /c; my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm =>'DBA', -score =>$query{score}, -hsp_length =>length($query{seq}), -query_gaps =>$query{gaps}, -hit_gaps =>$subject{gaps}, -query_name =>$query{seqname}, -query_start =>$query{start}, -query_end =>$query{end}, -hit_name =>$subject{seqname}, -hit_start =>$subject{start}, -hit_end =>$subject{end}, -hit_length =>length($self->_subject_seq->seq), -query_length =>length($self->_query_seq->seq), -query_seq =>$query{seq}, -hit_seq =>$subject{seq}, -conserved =>$identical, -identical =>$identical, -homology_seq =>$xor); push @hsps, $hsp; } return \@hsps; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for dba program Example : Returns : name of file containing dba data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2); #there is gotta be some repetition here...need to clean up if (ref($input) ne "ARRAY"){ #a single file containg 2 seqeunces $infilename = $input; unless(-e $input){return 0;} my $in = Bio::SeqIO->new(-file => $infilename , '-format' => 'Fasta'); ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'Fasta','-flush'=>1); my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'Fasta','-flush'=>1); my $seq1 = $in->next_seq() || return 0; my $seq2 = $in->next_seq() || return 0; $out1->write_seq($seq1); $out2->write_seq($seq2); $self->_query_seq($seq1); $self->_subject_seq($seq2); $out1->close(); $out2->close(); close($tfh1); close($tfh2); undef $tfh1; undef $tfh2; return $outfile1,$outfile2; } else { scalar(@{$input}) == 2 || $self->throw("dba alignment can only be run on 2 sequences not."); if(ref($input->[0]) eq ""){#passing in two file names my $in1 = Bio::SeqIO->new(-file => $input->[0], '-format' => 'fasta'); my $in2 = Bio::SeqIO->new(-file => $input->[1], '-format' => 'fasta'); ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); my $seq1 = $in1->next_seq() || return 0; my $seq2 = $in2->next_seq() || return 0; $out1->write_seq($seq1); $out2->write_seq($seq2); $self->_query_seq($seq1); $self->_subject_seq($seq2); close($tfh1); close($tfh2); undef $tfh1; undef $tfh2; return $outfile1,$outfile2; } elsif($input->[0]->isa("Bio::PrimarySeqI") && $input->[1]->isa("Bio::PrimarySeqI")) { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); $out1->write_seq($input->[0]); $out2->write_seq($input->[1]); $self->_query_seq($input->[0]); $self->_subject_seq($input->[1]); close($tfh1); close($tfh2); undef $tfh1; undef $tfh2; return $outfile1,$outfile2; } else { return 0; } } return 0; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for dba program Example : Returns : parameter string to be passed to dba during align or profile_align Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @DBA_PARAMS ) { $value = $self->$attr(); next unless (defined $value); # next if $attr =~/outfile/i; my $attr_key = lc $attr; #put params in format expected by dba if($attr_key =~ /match([ABCDabcd])/i){ $attr_key = "match".uc($1); } $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( @DBA_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } return $param_string; } =head2 _query_seq() Title : _query_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_seq'} = $seq; } return $self->{'_query_seq'}; } =head2 _subject_seq() Title : _subject_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _subject_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_subject_seq'} = $seq; } return $self->{'_subject_seq'}; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Exonerate.pm000077500000000000000000000174331302566030400247350ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Exonerate =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Exonerate; use Bio::SeqIO; my $qio = Bio::SeqIO->new(-file=>$ARGV[0],-format=>'fasta'); my $query = $qio->next_seq(); my $tio = Bio::SeqIO->new(-file=>$ARGV[1],-format=>'fasta'); my $target = $sio->next_seq(); #exonerate parameters can all be passed via arguments parameter. #parameters passed are not checked for validity my $run = Bio::Tools::Run::Alignment::Exonerate-> new(arguments=>'--model est2genome --bestn 10'); my $searchio_obj = $run->run($query,$target); while(my $result = $searchio->next_result){ while( my $hit = $result->next_hit ) { while( my $hsp = $hit->next_hsp ) { print $hsp->start."\t".$hsp->end."\n"; } } } =head1 DESCRIPTION Wrapper for Exonerate alignment program. You can get exonerate at http://www.ebi.ac.uk/~guy/exonerate/. This wrapper is written without parameter checking. All parameters are passed via the arugment parameter that is passed in the constructor. See SYNOPSIS. For exonerate parameters, run exonerate --help for more details. =head1 PROGRAM VERSIONS The tests have been shown to pass with exonorate versions 2.0 - 2.2. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh-at-stanford.edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Exonerate; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @EXONERATE_PARAMS %OK_FIELD); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; use Bio::SearchIO; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'exonerate'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{EXONERATEDIR}) if $ENV{EXONERATEDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $factory= Bio::Tools::Run::Phrap->new(); Function: creates a new Phrap factory Returns: Bio::Tools::Run::Phrap Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe -v` ; #exonerate from exonerate version 2.0.0\n... my ($version) = $string =~ /exonerate version ([\d+\.]+)/m; $version =~ s/\.(\d+)$/$1/; return $version || undef; } =head2 run Title : run() Usage : my $feats = $factory->run($seq) Function: Runs Phrap Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$query,$target) = @_; my @feats; my ($file1) = $self->_writeInput($query); my ($file2) = $self->_writeInput($target); my $assembly = $self->_run($file1,$file2); return $assembly; } =head2 _input Title : _input Usage : $factory->_input($seqFile) Function: get/set for input file Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; $self->{'input'} = $infile1 if(defined $infile1); return $self->{'input'}; } =head2 _run Title : _run Usage : $factory->_run() Function: Makes a system call and runs Phrap Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self,$query,$target)= @_; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); my $param_str = $self->_setparams." ".$self->arguments; my $str = $self->executable." $param_str $query $target "." > $outfile"; $self->debug( "$str\n"); my $status = system($str); $self->throw( "Exonerate call ($str) crashed: $? \n") unless $status==0; my $filehandle; my $exonerate_obj = Bio::SearchIO->new(-file=>"$outfile",-format=>'exonerate'); close($tfh); undef $tfh; unlink $outfile; return $exonerate_obj; } =head2 _writeInput Title : _writeInput Usage : $factory->_writeInput($query,$target) Function: Creates a file from the given seq object Returns : A string(filename) Args : Bio::PrimarySeqI =cut sub _writeInput{ my ($self,$query) = @_; my ($fh,$infile1); if (ref($query) =~ /ARRAY/i) { my @infilearr; ($fh, $infile1) = $self->io->tempfile(); my $temp = Bio::SeqIO->new( -file => ">$infile1", -format => 'Fasta' ); foreach my $seq1 (@$query) { unless ($seq1->isa("Bio::PrimarySeqI")) { return 0; } $temp->write_seq($seq1); push @infilearr, $infile1; } } elsif($query->isa("Bio::PrimarySeqI")) { ($fh, $infile1) = $self->io->tempfile(); my $temp = Bio::SeqIO->new( -file => ">$infile1", -format => 'Fasta' ); $temp->write_seq($query); } else { $infile1 = $query; } return $infile1; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self) = @_; my $param_string = ''; foreach my $attr(@EXONERATE_PARAMS){ next if($attr=~/PROGRAM/); my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .= $attr_key.' '.$value; } return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Gmap.pm000066400000000000000000000176511302566030400236660ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Gmap # # Cared for by George Hartzell # # Copyright George Hartzell # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Gmap - Wrapper for running gmap. =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Gmap; use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file=>$filename ,-format=>'fasta'); my @seq; while(my $seq = $sio->next_seq()){ push @seq,$seq; } my $mapper =Bio::Tools::Run::Gmap->new(); my $result = $mapper->run(\@seq); =head1 DESCRIPTION Bioperl-run wrapper around gmap. See L for information about gmap. It requires a reference to an array of bioperl SeqI objects and returns a reference to a filehandle from which the gmap output can be read. One can explicitly set the name of the genome database (defaults to NHGD_R36) using the 'genome_db()' method. One can also explicitly set the flags that are passed to gmap (defaults to '-f 9 -5 -e') using the 'flags()' method. The name of the gmap executable can be overridden using the program_name() method and the directory in which to find that executable can be overridden using the program_dir() method. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - George Hartzell Email hartzell@alerce.com Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... # TODO handle stderr output from gmap. package Bio::Tools::Run::Alignment::Gmap; use strict; use warnings; # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::SeqIO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::Alignment::Gmap(); Function: Builds a new Bio::Tools::Run::Alignment::Gmap object Returns : an instance of Bio::Tools::Run::Alignment::Gmap Args : =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{_program_name} = 'gmap'; return $self; } =head2 version Title : version Usage : print "gmap version: " . $mapper->version() . "\n"; Function: retrieves and returns the version of the gmap package. Example : Returns : scalar string containing the version number. Probably looks like YYYY-MM-DD. Args : none. =cut sub version { my ($self,@args) = @_; my $version; my $str = $self->executable; $str .= ' --version'; $self->debug("gmap version command = $str\n"); open(GMAPRUN, "$str |") || $self->throw($@); { local $/ = undef; my $result = ; ($version) = ($result =~ m|.*Part of GMAP package, version (.*).*|); } return($version); } =head2 program_name Title : program_name Usage : $mapper->program_name('gmap-dev'); my $pname = $mapper->program_name(); Function: sets/gets the name of the program to run. Returns : string representing the name of the executable. Args : [optional] string representing the name of the executable to set. =cut sub program_name { my $self = shift; $self->{_program_name} = shift if @_; return $self->{_program_name}; } =head2 program_dir Title : program_dir Usage : $mapper->program_dir('/usr/local/sandbox/gmap/bin'); my $pdir = $mapper->program_dir(); Function: sets/gets the directory path in which to find the gmap executable. Returns : string representing the path to the directory. Args : [optional] string representing the directory path to set. =cut sub program_dir { my $self = shift; $self->{_program_dir} = shift if @_; return $self->{_program_dir}; } =head2 input_file Title : input_file Usage : $mapper->input_file('/tmp/moose.fasta'); my $filename = $mapper->input_file(); Function: sets/gets the name of a file containing sequences to be mapped. Returns : string containing the name of the query sequence. Args : [optional] string representing the directory path to set. =cut sub input_file { my $self = shift; $self->{_input_file} = shift if @_; return $self->{_input_file}; } =head2 genome_db Title : genome_db Usage : $mapper->genome_db('NHGD_R36'); my $genome_db = $mapper->genome_db(); Function: sets/gets the name of the genome database, this will be passed to gmap using its '-d' flag. Returns : name of the genome database. Args : [optional] string representing the genome db to set. =cut sub genome_db { my $self = shift; $self->{_genome_db} = shift if @_; return $self->{_genome_db}; } =head2 flags Title : flags Usage : $mapper->flags('-A -e -5'); my $flags = $mapper->flags(); Function: sets/gets the flags that will be passed to gmap. Returns : the current value of the flags that will be passed to gmap. Args : [optional] the flags to set. =cut sub flags { my $self = shift; $self->{_flags} = shift if @_; return $self->{_flags}; } =head2 run Title : run Usage : $mapper->run() Function: runs gmap Example : Returns : a file handle, opened for reading, for gmap's output. Args : An array of references query sequences (as Bio::Seq objects) =cut sub run { my $self = shift; $self->input_file( $self->_build_fasta_input_file(@_) ) if(@_); my $result = $self->_run(); return $result; } =head2 _build_fasta_input_file Title : _build_fasta_input_file Usage : my $seq_file = $self->_build_fasta_input_file(@_); Function: Example : Returns : The name of the temporary file that contains the sequence. Args : A reference to an array of Bio::Seq objects. =cut use File::Temp; sub _build_fasta_input_file { my $self = shift; my $seqs = shift; my $seq_count = 0; # the object returned by File::Temp->new() is magic. Used normally # it behaves as a filehandle opened onto the temporary file. Used # as a string it behaves as a string that is the name of the # temporary file. # It is up to the user to remove the when finished with it. my $file_tmp = File::Temp->new( TEMPLATE => 'mvp-gmap-tempfile-XXXXXX', TMPDIR => 1, UNLINK => 0, ); my $seqio = Bio::SeqIO->new( -fh => $file_tmp, -format => 'Fasta' ); if (ref($seqs) =~ /ARRAY/i) { foreach my $seq (@$seqs) { throw Bio::Root::BadParameter(-text => "sequence args must be a Bio::SeqI subclass.", ) unless ($seq->isa("Bio::PrimarySeqI")); $seqio->write_seq($seq); $seq_count++; } } if ($seq_count == 0) { throw Bio::Root::BadParameter(-text => <executable; $str .= ' -d' . ($self->genome_db() || 'NHGD_R36'); $str .= ' ' . ($self->flags() || '-f 9 -5 -e'); $str .= ' ' . $self->input_file(); $str .= " 2> $null"; $str .= '; rm -f ' . $self->input_file(); $self->debug("gmap command = $str\n"); open(GMAPRUN, "$str |") || $self->throw("Can't open gmap (command = \"$str\"): $!"); return (\*GMAPRUN); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Kalign.pm000066400000000000000000000335121302566030400242010ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Kalign # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Kalign - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the KALIGN program =head1 SYNOPSIS # Build a kalign alignment factory $factory = Bio::Tools::Run::Alignment::Kalign->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION Please cite: Timo Lassmann and Erik L.L. Sonnhammer (2005) Kalign - an accurate and fast multiple sequence alignment algorithm. BMC Bioinformatics 6:298 http://msa.cgb.ki.se/downloads/kalign/current.tar.gz =head2 Helping the module find your executable You will need to enable Kalign to find the kalign program. This can be done in (at least) three ways: 1. Make sure the kalign executable is in your path (i.e. 'which kalign' returns a valid program 2. define an environmental variable KALIGNDIR which points to a directory containing the 'kalign' app: In bash export KALIGNDIR=/home/progs/kalign or In csh/tcsh setenv KALIGNDIR /home/progs/kalign 3. include a definition of an environmental variable KALIGNDIR in every script that will BEGIN {$ENV{KALIGNDIR} = '/home/progs/kalign'; } use Bio::Tools::Run::Alignment::Kalign; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email idontlikespam@hotmail.com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Kalign; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @KALIGN_PARAMS @KALIGN_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @KALIGN_PARAMS = qw(IN OUT GAPOPEN GAPEXTENSION TERMINAL_GAP_EXTENSION_PENALTY MATRIX_BONUS SORT FEATURE DISTANCE TREE ZCUTOFF FORMAT MAXMB MAXHOURS MAXITERS); @KALIGN_SWITCHES = qw(QUIET); # Authorize attribute fields foreach my $attr ( @KALIGN_PARAMS, @KALIGN_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'kalign'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{KALIGNDIR}) if $ENV{KALIGNDIR}; } =head2 new Title : new Usage : my $kalign = Bio::Tools::Run::Alignment::Kalign->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Kalign Args : -outfile_name => $outname =cut sub new { my ($class,@args) = @_; my( @kalign_args, @obj_args); while( my $arg = shift @args ) { if( $arg =~ /^-/ ) { push @obj_args, $arg, shift @args; } else { push @kalign_args,$arg, shift @args; } } my $self = $class->SUPER::new(@obj_args); my ($on) = $self->_rearrange([qw(OUTFILE_NAME)],@obj_args); $self->outfile_name($on || ''); my ($attr, $value); # FIXME: only tested with fasta output format right now... $self->aformat($DEFAULTS{'AFORMAT'}); while ( @kalign_args) { $attr = shift @kalign_args; $value = shift @kalign_args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } if( defined $self->out ) { $self->outfile_name($self->out); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 2 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; # Kalign version 2.01, Copyright (C) 2004, 2005, 2006 Timo Lassmann return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; $string =~ /Kalign\s+version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my $infilename; if( defined $input ) { $infilename = $self->_setinput($input); } elsif( defined $self->in ) { $infilename = $self->_setinput($self->in); } else { $self->throw("No inputdata provided\n"); } if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run kalign return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to kalign program Example : Returns : nothing; kalign output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to kalign =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." -in $infilename $params"; $self->debug( "kalign command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Kalign call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for kalign program Example : Returns : name of file containing kalign data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to kalign!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for kalign program Example : Returns : parameter string to be passed to kalign during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @KALIGN_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' -'.$attr_key; $param_string .= $attr_key .' '.$value; } for $attr ( @KALIGN_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } $param_string .= " -out ".$self->outfile_name; if ($self->quiet() || $self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $kalign->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $kalign->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Lagan.pm000066400000000000000000000464561302566030400240310ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Lagan # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Stephen Montgomery # # Special thanks to Jason Stajich. # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Lagan - Object for the local execution of the LAGAN suite of tools (including MLAGAN for multiple sequence alignments) =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Lagan; @params = ('chaos' => "The contents of this string will be passed as args to chaos", #Read you chaos README file for more info/This functionality #has not been tested and will be integrated in future versions. 'order' => "\"-gs -7 -gc -2 -mt 2 -ms -1\"", #Where gap start penalty of- 7, gap continue of -2, match of 2, #and mismatch of -1. 'recurse' => "\"(12,25),(7,25),(4,30)"\", #A list of (wordlength,score cutoff) pairs to be used in the #recursive anchoring 'tree' => "\"(sample1 (sample2 sample3))"\", #Used by mlagan / tree can also be passed when calling mlagan directly #SCORING PARAMETERS FOR MLAGAN: 'match' => 12, 'mismatch' => -8, 'gapstart' => -50, 'gapend' => -50, 'gapcont' => -2, ); =head1 DESCRIPTION To run mlagan/lagan, you must have an environment variable that points to the executable directory with files lagan.pl etc. "LAGAN_DIR=/opt/lagan_executables/" Simply having the executables in your path is not supported because the executables themselves only work with the environment variable set. All lagan and mlagan parameters listed in their Readmes can be set except for the mfa flag which has been turned on by default to prevent parsing of the alignment format. TO USE LAGAN: my $lagan = Bio::Tools::Run::Alignment::Lagan->new(@params); my $report_out = $lagan->lagan($seq1, $seq2); A SimpleAlign object is returned. TO USE MLAGAN: my $lagan = Bio::Tools::Run::Alignment::Lagan->new(); my $tree = "(($seqname1 $seqname2) $seqname3)"; my @sequence_objs; #an array of bioperl Seq objects ##If you use an unblessed seq array my $seq_ref = \@sequence_objs; bless $seq_ref, "ARRAY"; my $report_out = $lagan->mlagan($seq_ref, $tree); A SimpleAlign object is returned Only basic mlagan/lagan functionality has been implemented due to the iterative development of their project. Future maintenance upgrades will include enhanced features and scoring. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Stephen Montgomery Email smontgom@bcgsc.bc.ca Genome Sciences Centre in beautiful Vancouver, British Columbia CANADA =head1 CONTRIBUTORS MLagan/Lagan is the hard work of Michael Brudno et al. Sendu Bala bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Lagan; use strict; use Bio::Root::IO; use Bio::Seq; use Bio::SeqIO; use Bio::AlignIO; use Bio::SimpleAlign; use File::Spec; use Bio::Matrix::IO; use Cwd; use base qw(Bio::Tools::Run::WrapperBase); our @LAGAN_PARAMS = qw(chaos order recurse mfa out lazy maskedonly usebounds rc translate draft info fastreject); our @OTHER_PARAMS = qw(outfile); our @LAGAN_SWITCHES = qw(silent quiet); our @MLAGAN_PARAMS = qw(nested postir translate lazy verbose tree match mismatch gapstart gapend gapcont out version); #Not all of these parameters are useful in this context, care #should be used in setting only standard ones #The LAGAN_DIR environment variable must be set our $PROGRAM_DIR = $ENV{'LAGAN_DIR'} || ''; sub new { my($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@LAGAN_PARAMS, @OTHER_PARAMS, @LAGAN_SWITCHES, @MLAGAN_PARAMS], -create => 1); my ($tfh, $tempfile) = $self->io->tempfile(); my $outfile = $self->out || $self->outfile || $tempfile; $self->out($outfile); close($tfh); undef $tfh; return $self; } =head2 lagan Runs the Lagan pairwise alignment algorithm Inputs should be two PrimarySeq objects. Returns an SimpleAlign object / preloaded with the tmp file of the Lagan multifasta output. =cut sub lagan { my ($self, $input1, $input2) = @_; $self->io->_io_cleanup(); my $executable = 'lagan.pl'; #my (undef, $tempfile) = $self->io->tempfile(); #$self->out($tempfile); my ($infile1, $infile2) = $self->_setinput($executable, $input1, $input2); my $lagan_report = &_generic_lagan( $self, $executable, $infile1, $infile2 ); } =head2 mlagan Runs the Mlagan multiple sequence alignment algorithm. Inputs should be an Array of Primary Seq objects and a Phylogenetic Tree in String format or as a Bio::Tree::TreeI compliant object. Returns an SimpleAlign object / preloaded with the tmp file of the Mlagan multifasta output. =cut sub mlagan { my ($self, $input1, $tree) = @_; $self->io->_io_cleanup(); my $executable = 'mlagan'; if ($tree && ref($tree) && $tree->isa('Bio::Tree::TreeI')) { # fiddle tree so mlagan will like it my %orig_ids; foreach my $node ($tree->get_nodes) { my $seq_id = $node->name('supplied'); $seq_id = $seq_id ? shift @{$seq_id} : ($node->node_name ? $node->node_name : $node->id); $orig_ids{$seq_id} = $node->id; $node->id($seq_id); } # convert to string my $tree_obj = $tree; $tree = $tree->simplify_to_leaves_string; # more fiddling $tree =~ s/ /_/g; $tree =~ s/"//g; $tree =~ s/,/ /g; # unfiddle the tree object foreach my $node ($tree_obj->get_nodes) { $node->id($orig_ids{$node->id}); } } my $infiles; ($infiles, $tree) = $self->_setinput($executable, $input1, $tree); my $lagan_report = &_generic_lagan ( $self, $executable, $infiles, $tree ); } =head2 nuc_matrix Title : nuc_matrix Usage : my $matrix_obj = $obj->nuc_matrix(); -or- $obj->nuc_matrix($matrix_obj); -or- $obj->nuc_matrix($matrix_file); Function: Get/set the substitution matrix for use by mlagan. By default the file $LAGAN_DIR/nucmatrix.txt is used by mlagan. By default this method returns a corresponding Matrix. Returns : Bio::Matrix::Mlagan object Args : none to get, OR to set: Bio::Matrix::MLagan object OR filename of an mlagan substitution matrix file NB: due to a bug in mlagan 2.0, the -nucmatrixfile option does not work, so this Bioperl wrapper is unable to simply point mlagan to your desired matrix file (or to a temp file generated from your matrix object). Instead the $LAGAN_DIR/nucmatrix.txt file must actually be replaced. This wrapper will make a back-up copy of that file, write the new file in its place, then revert things back to the way they were after the alignment has been produced. For this reason, $LAGAN_DIR must be writable, as must $LAGAN_DIR/nucmatrix.txt. =cut sub nuc_matrix { my ($self, $thing, $gap_open, $gap_continue) = @_; if ($thing) { if (-e $thing) { my $min = Bio::Matrix::IO->new(-format => 'mlagan', -file => $thing); $self->{_nuc_matrix} = $min->next_matrix; } elsif (ref($thing) && $thing->isa('Bio::Matrix::Mlagan')) { $self->{_nuc_matrix} = $thing; } else { $self->throw("Unknown kind of thing supplied, '$thing'"); } $self->{_nuc_matrix_set} = 1; } unless (defined $self->{_nuc_matrix}) { # read the program default file my $min = Bio::Matrix::IO->new(-format => 'mlagan', -file => File::Spec->catfile($PROGRAM_DIR, 'nucmatrix.txt')); $self->{_nuc_matrix} = $min->next_matrix; } $self->{_nuc_matrix_set} = 1 if defined wantarray; return $self->{_nuc_matrix}; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file(s) for Lagan executables Returns : name of files containing Lagan data input / or array of files and phylo tree for Mlagan data input =cut sub _setinput { my ($self, $executable, $input1, $input2) = @_; my ($fh, $infile1, $infile2, $temp1, $temp2, $seq1, $seq2); $self->io->_io_cleanup(); SWITCH: { if (ref($input1) =~ /ARRAY/i) { ##INPUTS TO MLAGAN / WILL hAVE TO BE CHANGED IF LAGAN EVER ##SUPPORTS MULTI-INPUT my @infilearr; foreach $seq1 (@$input1) { ($fh, $infile1) = $self->io->tempfile(); my $temp = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); unless ($seq1->isa("Bio::PrimarySeqI")) { return 0; } $temp->write_seq($seq1); close $fh; undef $fh; push @infilearr, $infile1; } $infile1 = \@infilearr; last SWITCH; } elsif ($input1->isa("Bio::PrimarySeqI")) { ##INPUTS TO LAGAN ($fh, $infile1) = $self->io->tempfile(); #Want to make sure their are no white spaces in sequence. #Happens if input1 is taken from an alignment. my $sequence = $input1->seq(); $sequence =~ s/\W+//g; $input1->seq($sequence); $temp1 = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); $temp1->write_seq($input1); close $fh; undef $fh; last SWITCH; } } SWITCH2: { if (ref($input2)) { if ($input2->isa("Bio::PrimarySeqI")) { ($fh, $infile2) = $self->io->tempfile(); #Want to make sure their are no white spaces in #sequence. Happens if input2 is taken from an #alignment. my $sequence = $input2->seq(); $sequence =~ s/\W+//g; $input2->seq($sequence); $temp2 = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); $temp2->write_seq($input2); close $fh; undef $fh; last SWITCH2; } } else { $infile2 = $input2; ##A tree as a scalar has been passed, pass it through } } return ($infile1, $infile2); } =head2 _generic_lagan Title : _generic_lagan Usage : internal function not called directly Returns : SimpleAlign object =cut sub _generic_lagan { my ($self, $executable, $input1, $input2) = @_; my $param_string = $self->_setparams($executable); my $lagan_report = &_runlagan($self, $executable, $param_string, $input1, $input2); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for (m)Lagan program Returns : parameter string to be passed to Lagan Args : Reference to calling object and name of (m)Lagan executable =cut sub _setparams { my ($self, $executable) = @_; my (@execparams, $nucmatrixfile); if ($executable eq 'lagan.pl') { @execparams = @LAGAN_PARAMS; } elsif ($executable eq 'mlagan') { @execparams = @MLAGAN_PARAMS; if ($self->{_nuc_matrix_set}) { # we create this file on every call because we have no way of # knowing if user altered the matrix object (my $handle, $nucmatrixfile) = $self->io->tempfile(); my $mout = Bio::Matrix::IO->new(-format => 'mlagan', -fh => $handle); $mout->write_matrix($self->nuc_matrix); $self->{_nucmatrixfile} = $nucmatrixfile; } } ##EXPAND OTHER LAGAN SUITE PROGRAMS HERE my $param_string = $self->SUPER::_setparams(-params => [@execparams], -dash => 1); $param_string .= " -nucmatrixfile $nucmatrixfile" if $nucmatrixfile; return $param_string . " -mfa "; } =head2 _runlagan Title : _runlagan Usage : Internal function, not to be called directly Function: makes actual system call to (m)Lagan program Example : Returns : Report object in the SimpleAlign object =cut sub _runlagan { my ($self, $executable, $param_string, $input1, $input2) = @_; my ($lagan_obj, $exe); if ( ! ($exe = $self->executable($executable))) { return; } my $version = $self->version; my $command_string; if ($executable eq 'lagan.pl') { $command_string = $exe . " " . $input1 . " " . $input2 . $param_string; } if ($executable eq 'mlagan') { $command_string = $exe; foreach my $tempfile (@$input1) { $command_string .= " " . $tempfile; } if (defined $input2) { $command_string .= " -tree " . "\"" . $input2 . "\""; } $command_string .= " " . $param_string; my $matrix_file = $self->{_nucmatrixfile}; if ($version <= 3 && $matrix_file) { # mlagan 2.0 bug-workaround my $orig = File::Spec->catfile($PROGRAM_DIR, 'nucmatrix.txt'); -e $orig || $self->throw("Strange, $orig doesn't seem to exist"); system("cp $orig $orig.bk") && $self->throw("Backup of $orig failed: $!"); system("cp $matrix_file $orig") && $self->throw("Copy of $matrix_file -> $orig failed: $!"); } } if (($self->silent || $self->quiet) && ($^O !~ /os2|dos|MSWin32|amigaos/)) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $command_string .= " > $null 2> $null"; } # will do brute-force clean up of junk files generated by lagan my $cwd = cwd(); opendir(my $cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); my %ok_files; foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /anch/) { $ok_files{$thing} = 1; } } closedir($cwd_dir); $self->debug("$command_string\n"); my $status = system(($version <= 3 ? '_POSIX2_VERSION=1 ' : '').$command_string); # temporary hack whilst lagan script 'rechaos.pl' uses obsolete sort syntax if ($version <= 1 && $self->{_nucmatrixfile}) { my $orig = File::Spec->catfile($PROGRAM_DIR, 'nucmatrix.txt'); system("mv $orig.bk $orig") && $self->warn("Restore of $orig from $orig.bk failed: $!"); } opendir($cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /anch/) { unlink($thing) unless $ok_files{$thing}; } } closedir($cwd_dir); my $outfile = $self->out(); my $align = Bio::AlignIO->new( '-file' => $outfile, '-format' => 'fasta' ); my $aln = $align->next_aln(); return $aln; } =head2 executable Title : executable Usage : my $exe = $lagan->executable('mlagan'); Function: Finds the full path to the 'lagan' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found Thanks to Jason Stajich for providing the framework for this subroutine =cut sub executable { my ($self, $exename, $exe, $warn) = @_; $exename = 'lagan.pl' unless defined $exename; if ( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless ( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); unless( $exe ) { if ( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } # even if its executable, we still need the environment variable to have # been set if (! $PROGRAM_DIR) { $self->warn("Environment variable LAGAN_DIR must be set, even if the lagan executables are in your path"); $self->{'_pathtoexe'}->{$exename} = undef; } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_path Title : program_path Usage : my $path = $lagan->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Thanks to Jason Stajich for providing the framework for this subroutine =cut sub program_path { my ($self,$program_name) = @_; my @path; push @path, $self->program_dir if $self->program_dir; push @path, $program_name .($^O =~ /mswin/i ?'':''); # Option for Windows variants / None so far return Bio::Root::IO->catfile(@path); } =head2 program_dir Title : program_dir Usage : my $dir = $lagan->program_dir(); Function: Abstract get method for dir of program. To be implemented by wrapper. Returns : string representing program directory Thanks to Jason Stajich for providing the framework for this subroutine =cut sub program_dir { $PROGRAM_DIR; } =head2 version Title : version Usage : my $version = $lagan->version; Function: returns the program version Returns : number Args : none =cut sub version { my $self = shift; my $exe = $self->executable('mlagan') || return; open(my $VER, "$exe -version 2>&1 |") || die "Could not open command '$exe -version'\n"; my $version; while (my $line = <$VER>) { ($version) = $line =~ /(\d+\S+)/; } close($VER) || die "Could not complete command '$exe -version'\n"; return $version; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/MAFFT.pm000066400000000000000000000423211302566030400236270ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::MAFFT # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::MAFFT - run the MAFFT alignment tools =head1 SYNOPSIS # Build a MAFFT alignment factory $factory = Bio::Tools::Run::Alignment::MAFFT->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); #There are various additional options available. =head1 DESCRIPTION You can get MAFFT from L. "fftnsi" is the default method for Mafft version 4 in this implementation. See Bio::Tools::Run::Alignment::Clustalw for a description on how to specify parameters to the underlying alignment program. See the MAFFT manual page for a description of the MAFFT parameters. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.html - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::MAFFT; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @MAFFT4_PARAMS @MAFFT4_SWITCHES @OTHER_SWITCHES %OK_FIELD @MAFFT_ALN_METHODS @MAFFT6_PARAMS @MAFFT6_SWITCHES %OK_FIELD6 ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'OUTPUT' => 'fasta', 'METHOD' => 'fftnsi', 'CYCLES' => 2); @MAFFT4_PARAMS =qw( METHOD CYCLES ); @MAFFT4_SWITCHES = qw( NJ ALL_POSITIVE); # NB: Mafft6 options are case-sensitive (eg. --lop and --LOP is different) @MAFFT6_PARAMS = qw( weighti retree maxiterate partsize groupsize op ep lop lep lexp LOP LEXP bl jtt tm aamatrix fmodel seed ); @MAFFT6_SWITCHES = qw( auto 6merpair globalpair localpair genafpair fastapair fft nofft noscore memsave parttree dpparttree fastaparttree clustalout inputorder reorder treeout nuc amino ); @OTHER_SWITCHES = qw(QUIET ALIGN OUTPUT OUTFILE); @MAFFT_ALN_METHODS = qw(fftnsi fftns nwnsi nwns fftnsrough nwnsrough); #@MAFFT6_ALN_METHODS = qw(linsi ginsi einsi fftnsi fftns nwnsi nwns) # Authorize attribute fields foreach my $attr ( @MAFFT4_SWITCHES,@MAFFT4_PARAMS,@OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } foreach my $attr ( @MAFFT6_PARAMS, @MAFFT6_SWITCHES ) { $OK_FIELD6{$attr}++ } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'mafft'; } =head2 executable Title : executable Usage : my $exe = $blastfactory->executable('blastall'); Function: Finds the full path to the 'codeml' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exename, $exe,$warn) = @_; $exename = $self->program_name unless (defined $exename ); if( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); # This is how I meant to split up these conditionals --jason # if exe is null we will execute this (handle the case where # PROGRAMDIR pointed to something invalid) unless( $exe ) { # we didn't find it in that last conditional if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_path Title : program_path Usage : my $path = $factory->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Args : none =cut sub program_path { my ($self,$program_name) = @_; my @path; push @path, $self->program_dir if $self->program_dir; push @path, $program_name .($^O =~ /mswin/i ?'.exe':''); return Bio::Root::IO->catfile(@path); } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return File::Spec->rel2abs($ENV{MAFFTDIR}) if $ENV{MAFFTDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } $self->output($DEFAULTS{'OUTPUT'}) unless( $self->output ); if ( ! $self->_version6 ) { $self->method($DEFAULTS{'METHOD'}) unless( $self->method ); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; # NB: Mafft6 options are case-sensitive if ( $self->_version6 ) { if ( $OK_FIELD6{ $attr } ) { # Don't want the attrs to clash with bioperl attributes $self->{version6attrs}{$attr} = shift if @_; return $self->{version6attrs}{$attr}; } } $attr = uc $attr; # aliasing $attr = 'OUTFILE' if $attr eq 'OUTFILE_NAME'; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysis run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return unless $exe = $self->executable; # this is a bit of a hack, but MAFFT is just a gawk script # so we are actually grepping the scriptfile # UPDATE (Torsten Seemann) # it now seems to be a 'sh' script and the format has changed # slightly. i've tried to make the change compatible with both... # version="v5.860 (2006/06/12)"; export version if( open(my $NAME, "grep 'export version' $exe | ") ) { while(<$NAME>) { if( /version.*?([\d.a-z]+)\s+/ ) { return $1; } } $self->warn("No version found"); close($NAME); } else { $self->warn("$!"); } return; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : array ref of Bio::PrimarySeqI objects OR filename of sequences to run with =cut sub run { my ($self,$seqs) = @_; return $self->align($seqs); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is an array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename,$type) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my ($param_string,$outstr) = $self->_setparams(); # run mafft return $self->_run($infilename, $param_string,$outstr); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to tcoffee program Example : Returns : nothing; tcoffee output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to tcoffee =cut sub _run { my ($self,$infilename,$paramstr,$outstr) = @_; my $commandstring = $self->executable()." $paramstr $infilename $outstr"; $self->debug( "mafft command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile(); if( !-e $outfile || -z $outfile ) { $self->warn( "MAFFT call crashed: $? [command $commandstring]\n"); return; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->output); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for mafft programs Example : Returns : name of file containing mafft data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return; } elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/ ) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for mafft program Example : Returns : parameter string to be passed to mafft program Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($outfile,$param_string) = ('',''); # Set default output file if no explicit output file selected unless (defined($outfile = $self->outfile) ) { my $tfh; ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile($outfile); } my ($attr,$value); if ( $self->_version6 ) { for $attr ( @MAFFT6_SWITCHES) { $value = $self->$attr(); next unless defined $value; my $attr_key = lc $attr; #put switches in format expected by mafft $attr_key = ' --'.$attr_key; $param_string .= $attr_key ; } for $attr ( @MAFFT6_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key; $param_string .= $attr_key .' '.$value; } if ( ! $self->no_param_checks ) { my @incompatible = qw/auto 6merpair globalpair localpair genafpair fastapair/; my @set = grep { $self->$_ } @incompatible; if ( @set > 1 ) { $self->throw("You can't specify more than one of @set"); } } } else { for $attr ( @MAFFT4_SWITCHES) { $value = $self->$attr(); next unless defined $value; my $attr_key = lc $attr; #put switches in format expected by mafft $attr_key = ' --'.$attr_key; $param_string .= $attr_key ; } # Method is a version 4 option my $method = $self->method; $self->throw("no method ") unless defined $method; if( $method !~ /(rough|nsi)$/ && defined $self->cycles) { $param_string .= " ".$self->cycles; } } my $outputstr = " 1>$outfile" ; if ($self->quiet() || $self->verbose < 0) { $param_string .= " --quiet"; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $outputstr .= " 2> $null"; } return ($param_string, $outputstr); } =head2 methods Title : methods Usage : my @methods = $self->methods() Function: Get/Set Alignment methods - NOT VALIDATED Returns : array of strings Args : arrayref of strings =cut sub methods { my ($self) = shift; return @MAFFT_ALN_METHODS; } =head2 _version6 Title : _version6 Usage : Internal function, not to be called directly Function: Check if the version of MAFFT is 6 Example : Returns : Boolean Args : None =cut sub _version6 { my $self = shift; if ( ! defined $self->{_version6} ) { my $version = $self->version || ''; if ( $version =~ /^v6/ ) { $self->{_version6} = 1; } else { $self->{_version6} = ''; } } return $self->{_version6}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $mafft->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $mafft->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/MSAProbs.pm000077500000000000000000000452211302566030400244250ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::MSAProbs # # Please direct questions and support issues to # # Cared for by Jessen Bredeson # # Copyright Jessen Bredeson # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::MSAProbs - Object for the calculation of a multiple sequence alignment (MSA) from a set of unaligned sequences using the MSAProbs program =head1 SYNOPSIS # Build a msaprobs alignment factory $factory = Bio::Tools::Run::Alignment::MSAProbs->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION MSAProbs is Liu, Schmidt, and Maskell's (2010) alignment program using HMM and partition function posterior probabilities. For more a more in-depth description see the original publication: Liu, Y., Schmidt, B., and Maskell, D. L. (2010) MSAProbs: multiple sequence alignment based on pair hidden Markov models and partition function posterior probabilities. I 26(16): 1958-1964 doi:10.1093/bioinformatics/btq338 -OR- http://bioinformatics.oxfordjournals.org/content/26/16/1958.abstract You can download the source code from http://sourceforge.net/projects/msaprobs/ It is recommended you use at least version 0.9; behaviour with earlier versions is questionable. =head2 Helping the module find your executable You will need to help MSAProbs to find the 'msaprobs' executable. This can be done in (at least) three ways: 1. Make sure the msaprobs executable is in your path (i.e. 'which msaprobs' returns a valid program) 2. define an environmental variable MSAPROBSDIR which points to a directory containing the 'msaprobs' app: In bash export MSAPROBSDIR=/home/progs/msaprobs or In csh/tcsh setenv MSAPROBSDIR /home/progs/msaprobs 3. include a definition of an environmental variable MSAPROBSDIR in every script that will BEGIN {$ENV{MSAPROBSDIR} = '/home/progs/msaprobs'; } use Bio::Tools::Run::Alignment::MSAProbs; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ =head1 AUTHOR - Jessen Bredeson Email jessenbredeson@berkeley.edu =head1 CONTRIBUTIONS This MSAProbs module was adapted from the Bio::Tools::Run::Alignment::Muscle module, written by Jason Stajich and almost all of the credit should be given to him. Email jason-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::MSAProbs; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS %MSAPROBS_PARAMS %MSAPROBS_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::GuessSeqFormat; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'QUIET' => 1, '_AFORMAT' => 'fasta', '_CONSISTENCY' => 2, '_ITERATIONS' => 10, '_CLUSTALW' => 0, '_ALIGNMENT_ORDER' => 0 ); %MSAPROBS_PARAMS = ( 'NUM_THREADS' => 'NUM_THREADS', 'CONSISTENCY' => 'C', 'ITERATIONS' => 'IR', 'ANNOT_FILE' => 'ANNOT' ); %MSAPROBS_SWITCHES = ( 'CLUSTALW' => 'CLUSTALW', 'ALIGNMENT_ORDER' => 'A' ); # Authorize attribute fields %OK_FIELD = map{ uc($_) => 1 } qw(INFILE OUTFILE VERBOSE QUIET VERSION), keys %MSAPROBS_PARAMS, keys %MSAPROBS_SWITCHES; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'msaprobs'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{MSAPROBSDIR}) if $ENV{MSAPROBSDIR}; } =head2 version Title : version Usage : exit if $prog->version() < 0.9.4 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my( $exe,$version ); return unless $exe = $self->executable; my $string = `$exe -version 2>&1` ; $string =~ /MSAPROBS\s+VERSION\s+([\d\.]+)/i; $version =~ s/\.(\d+)$/$1/ if ($version = $1); return $version || undef; } =head2 new Title : new Usage : my $msaprobs = Bio::Tools::Run::Alignment::MSAProbs->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::MSAProbs Args : -outfile => $outname =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my( @msap_args, @obj_args, $field ); while( my $arg = shift @args ) { $field = uc $arg; $field =~ s/^-//; $arg = '-'.$arg if $arg !~ /^-/; $self->throw("Invalid argument: $field") unless $OK_FIELD{$field}; push @msap_args, lc($arg),shift @args; } map{ $self->{lc($_)} = $DEFAULTS{$_} } keys %DEFAULTS; $self->_set_from_args(\@msap_args, -create => 1, -case_sensitive => 1, -methods => [map{lc($_);} keys %OK_FIELD]); return $self; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my( $self,$input ) = @_; $input ||= $self->infile; return $self->align($input); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my $infilename; if( defined($input) ) { $infilename = $self->_setinput($input); } elsif( defined($self->infile) ) { $infilename = $self->_setinput($self->infile); } else { $self->throw("No inputdata provided\n"); } unless( $infilename ) { $self->throw("Bad input data or less than 2 sequences in $infilename !"); } my $param_string = $self->_setparams(); # run msaprobs return &_run($self, $infilename, $param_string); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 infile Title : infile Usage : $prog->infile($filename) Function: get/set the fasta (and only a fasta) file to run on or the array reference containing the Bio::SeqI objects Returns : name of input sequence file or object array ref Args : name of input sequence file or object array ref =cut =head2 outfile Title : outfile Usage : $prog->outfile($filename) Function: get/set the file to save output to Returns : outfile name if set Args : newvalue (optional) =cut =head2 annot_file Title : annot_file Usage : $prog->annot_file($filename) Function: get/set the file name to write the MSA annotation to Returns : filename or undef Args : filename (optional) =cut =head2 num_threads Title : num_threads Usage : $prog->num_threads($cores) Function: get/set number of cores on your machine Returns : integer Args : integer (optional; executable auto-detects) =cut =head2 consistency Title : consistency Usage : $prog->consistency($passes) Function: get/set the number of consistency transformation passes Returns : integer Args : integer 0..5, [default 2] (optional) =cut =head2 iterations Title : iterations Usage : $prog->iterations($passes) Function: get/set the number of iterative-refinement passes Returns : integer Args : integer 0..1000, [default 10] (optional) =cut =head2 alignment_order Title : alignment_order Usage : $prog->alignment_order($bool) Function: specify whether or not to output aligned sequences in alignment order, not input order Returns : boolean Args : boolean [default: off] (optional) =cut =head2 clustalw Title : clustalw Usage : $prog->clustalw($bool) Function: write output in clustalw format; makes no sense unless outfile() is also specified Returns : boolean Args : boolean [default: off] (optional) =cut =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $msaprobs->outfile_name(); Function: Get the name of the output file from a run (if you wanted to do something special) Returns : string Args : none =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $msaprobs->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut =head1 Private Methods =cut =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to msaprobs program Example : Returns : nothing; msaprobs output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to msaprobs =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable.' '.$infilename.$params; $self->debug( "msaprobs command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name; if( !-s $outfile ) { $self->warn( "MSAProbs call crashed: $? [command $commandstring]\n"); return undef; } if( $self->clustalw ){ $outfile = $self->_clustalize($outfile); $self->aformat('clustalw'); } my $in = Bio::AlignIO->new( '-file' => $outfile, '-format' => $self->aformat, '-displayname_flat' => 1 ); my $aln = $in->next_aln(); undef $in; return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for msaprobs program Example : Returns : name of file containing msaprobs data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my( $self,$input ) = @_; my( $infilename,$outtemp,$tfh,@sequences ); if (! ref $input) { # check that file exists or throw return unless (-s $input && -r $input); # let's peek and guess $infilename = $input; open(IN,$input) || $self->throw("Cannot open $input"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); $header =~ /^>\s*\S+/ || $self->throw("Need to provide a FASTA-formatted file to msaprobs!"); my $inseqio = Bio::SeqIO->new( -file => $input, -format => 'fasta' ); while( my $seq = $inseqio->next_seq ){ push @sequences, $seq; } undef $inseqio; # have to check each seq for terminal '*', so # continue below and write clean output to temp file }elsif( ref($input) =~ /ARRAY/i ){ # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return; }elsif( $input->[0]->isa('Bio::PrimarySeqI') ){ @sequences = @$input; }else{ $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); return; } }else{ $self->warn("Got $input and don't know what to do with it\n"); return; } ($tfh,$infilename) = $self->io->tempfile(); $outtemp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my( @out,$string ); my $ct = 1; while( my $seq = shift @sequences){ return unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/){ $seq->display_id( "Seq".$ct++ ); } $string = $seq->seq; $string =~ s/\*$//; $seq->seq($string); if( $string =~ tr/~.-/~.-/ ){ $self->warn("These sequences may have already been aligned!"); } push @out, $seq; } $outtemp->write_seq(@out); $outtemp->close(); undef $outtemp; close($tfh); $tfh = undef; return $infilename; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for msaprobs program Example : Returns : parameter string to be passed to msaprobs during align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr,$method,$value,$param_string); $param_string = ''; unless( defined $self->outfile ){ $self->aformat($DEFAULTS{'AFORMAT'}); $self->clustalw(0); } #put switches/params in format expected by MSAProbs for $attr ( keys %MSAPROBS_PARAMS ){ $method = lc $attr; $value = $self->$method(); next unless (defined $value); my $attr_key = lc $MSAPROBS_PARAMS{$attr}; $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( keys %MSAPROBS_SWITCHES ){ $method = lc $attr; $value = $self->$method(); next unless $value; my $attr_key = lc $MSAPROBS_SWITCHES{$attr}; $attr_key = ' -'.$attr_key; $param_string .= $attr_key; } # Set default output file if no explicit file specified # or if a clustalw-formatted file is desired... if( $self->clustalw || ! $self->outfile ) { my ($tfh, $outfile) = $self->io->tempfile(-dir => $self->tempdir); close($tfh); undef $tfh; $self->outfile_name($outfile); }else{ $self->outfile_name($self->outfile); } my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= ' -v' if $self->verbose > 0; $param_string .= ' >'.$self->outfile_name; $param_string .= " 2>$null" if $self->quiet && $self->verbose < 1; $self->arguments($param_string); return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } sub _clustalize { my $self = shift; my $infile = shift; my $outfile = $self->outfile; local $/ = "\n"; my( $in,$out,$firstline,$line ); $in = Bio::Root::IO->new(-file => $infile); $out = Bio::Root::IO->new(-file => '>'.$outfile); while( defined( $firstline = $in->_readline )) { last if $firstline !~ /^\s*$/; } $in->_pushback('CLUSTALW format, '.$firstline); while( defined( $line = $in->_readline )) { $out->_print( $line ); } $out->close(); $in->close(); undef $out; undef $in; $self->debug($outfile); return $outfile if -s $outfile; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Muscle.pm000066400000000000000000000417571302566030400242360ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Muscle # # Please direct questions and support issues to # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Muscle - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the MUSCLE program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Muscle->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION Muscle is Robert Edgar's progressive alignment program. You can get it and see information about it at this URL http://www.drive5.com/muscle It is recommended you use at least version 3.6. Behaviour with earlier versions is questionable. =head2 Helping the module find your executable You will need to enable Muscle to find the muscle program. This can be done in (at least) three ways: 1. Make sure the muscle executable is in your path (i.e. 'which muscle' returns a valid program 2. define an environmental variable MUSCLEDIR which points to a directory containing the 'muscle' app: In bash export MUSCLEDIR=/home/progs/muscle or In csh/tcsh setenv MUSCLEDIR /home/progs/muscle 3. include a definition of an environmental variable MUSCLEDIR in every script that will BEGIN {$ENV{MUSCLEDIR} = '/home/progs/muscle'; } use Bio::Tools::Run::Alignment::Muscle; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Muscle; use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::IO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our %DEFAULTS = ( 'AFORMAT' => 'fasta' ); our @MUSCLE_PARAMS = qw(in out tree1 log loga scorefile gapopen seqtype maxmb maxhours maxiters kband in1 in2 usetree usetree_nowarn weight1 weight2 smoothwindow SUEFF smoothscoreceil root1 root2 refinewindow physout phyiout objscore minsmoothscore minbestcolscore hydrofactor hydro anchorspacing center cluster1 cluster2 clwout diagbreak diaglength diagmargin distance1 distance2); our @MUSCLE_SWITCHES = qw(quiet verbose diags refine stable group clw clwstrict msf brenner cluster dimer fasta html le anchors noanchors phyi phys profile refinew sp spscore spn sv); our $PROGRAM_NAME = 'muscle'; our $PROGRAM_DIR = Bio::Root::IO->catfile($ENV{MUSCLEDIR}) if $ENV{MUSCLEDIR}; =head2 new Title : new Usage : my $muscle = Bio::Tools::Run::Alignment::Muscle->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Muscle Args : -outfile_name => $outname =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->aformat( $DEFAULTS{'AFORMAT'} ); $self->_set_from_args( \@args, -methods => [ @MUSCLE_PARAMS, @MUSCLE_SWITCHES ], -create => 1 ); my ($out) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME)], @args ); $self->outfile_name( $out || '' ); $self->aformat('msf') if $self->msf; $self->aformat('clustalw') if $self->clw || $self->clwstrict; if ( defined $self->out ) { $self->outfile_name( $self->out ); } return $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1`; $string =~ /MUSCLE\s+v(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (e.g. a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ( $self, $input ) = @_; # Create input file pointer $self->io->_io_cleanup(); my $infilename; if ( defined $input ) { $infilename = $self->_setinput($input); } elsif ( defined $self->in ) { $infilename = $self->_setinput( $self->in ); } else { $self->throw("No inputdata provided\n"); } if ( !$infilename ) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run muscle return &_run( $self, $infilename, $param_string ); } =head2 profile Title : profile Usage : $alnfilename = /t/data/cysprot.msa'; $seqsfilename = 't/data/cysprot.fa'; $aln = $factory->profile($alnfilename,$seqsfilename); Function: Perform a profile alignment on a MSA to include more seqs Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing the fasta MSA and name of a file containing a set of unaligned fasta sequences Comments: This only works for muscle version 3.52. Some early versions of the 3.6 sources had a bug that caused a segfault with -profile. The attached should fix it, if not let Bob Edgar know. =cut sub profile { my ( $self, $alnfilename, $seqsfilename ) = @_; # Create input file pointer $self->io->_io_cleanup(); if ( $self->version ne '3.52' ) { $self->throw("profile does not work for this version of muscle\n"); } my $infilename; if ( defined $alnfilename ) { if ( !ref $alnfilename ) { # check that file exists or throw $infilename = $alnfilename; unless ( -e $infilename ) { return 0; } # let's peek and guess open( IN, $infilename ) || $self->throw("Cannot open $infilename"); my $header; while ( defined( $header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ) { $self->throw( "Need to provide a FASTA format file to muscle profile!"); } } } else { $self->throw("No inputdata provided\n"); } if ( !$infilename ) { $self->throw( "Bad input data or less than 2 sequences in $infilename !"); } if ( defined $seqsfilename ) { if ( !ref $seqsfilename ) { # check that file exists or throw $infilename = $seqsfilename; unless ( -e $infilename ) { return 0; } # let's peek and guess open( IN, $infilename ) || $self->throw("Cannot open $infilename"); my $header; while ( defined( $header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ) { $self->throw( "Need to provide a FASTA format file to muscle profile!"); } } } else { $self->throw("No inputdata provided\n"); } if ( !$infilename ) { $self->throw( "Bad input data or less than 2 sequences in $infilename !"); } my $param_string = $self->_setparams(); # run muscle $self->{_profile} = 1; return &_run( $self, "$alnfilename -in2 $seqsfilename", $param_string ); } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat { my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to muscle program Example : Returns : nothing; muscle output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to muscle =cut sub _run { my ( $self, $infilename, $params ) = @_; my $commandstring; if ( $self->{_profile} ) { $commandstring = $self->executable . " -profile -in1 $infilename $params"; $self->{_profile} = 0; } else { $commandstring = $self->executable . " -in $infilename $params"; } $self->debug("muscle command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if ( !-e $outfile || -z $outfile ) { $self->warn("Muscle call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new( '-file' => $outfile, '-format' => $self->aformat ); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for muscle program Example : Returns : name of file containing muscle data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ( $self, $input ) = @_; my ( $infilename, $seq, $temp, $tfh ); if ( !ref $input ) { # check that file exists or throw $infilename = $input; unless ( -e $input ) { return 0; } # let's peek and guess open( IN, $infilename ) || $self->throw("Cannot open $infilename"); my $header; while ( defined( $header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ) { $self->throw("Need to provide a FASTA format file to muscle!"); } return ($infilename); } elsif ( ref($input) =~ /ARRAY/i ) { # $input may be an array of BioSeq objects... # Open temporary file for both reading & writing of array ( $tfh, $infilename ) = $self->io->tempfile(); if ( !ref( $input->[0] ) ) { $self->warn( "passed an array ref which did not contain objects to _setinput" ); return undef; } elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new( '-fh' => $tfh, '-format' => 'fasta' ); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if ( !defined $seq->display_id || $seq->display_id =~ /^\s+$/ ) { $seq->display_id( "Seq" . $ct++ ); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry " . $input->[0] . " and don't know what to do with it\n" ); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for muscle program Example : Returns : parameter string to be passed to muscle during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ( $attr, $value, $param_string ); $param_string = ''; my $laststr; for $attr (@MUSCLE_PARAMS) { $value = $self->$attr(); next unless ( defined $value ); my $attr_key = lc $attr; $attr_key = ' -' . $attr_key; $param_string .= $attr_key . ' ' . $value; } for $attr (@MUSCLE_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; # put switches in format expected by tcoffee $attr_key = ' -' . $attr_key; $param_string .= $attr_key; } # Set default output file if no explicit output file selected unless ( $self->outfile_name ) { my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $self->tempdir() ); close($tfh); undef $tfh; $self->outfile_name($outfile); } $param_string .= " -out " . $self->outfile_name; if ( $self->quiet() || $self->verbose < 0 ) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $muscle->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $muscle->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Pal2Nal.pm000077500000000000000000000216341302566030400242320ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::Pal2Nal # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Pal2Nal - Wrapper for Pal2Nal =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Pal2Nal; # Make a Pal2Nal factory $factory = Bio::Tools::Run::Alignment::Pal2Nal->new(); # Run Pal2Nal with a protein alignment file and a multi-fasta nucleotide # file my $aln = $factory->run($protein_alignfilename, $nucleotide_filename); # or with Bioperl objects $aln = $factory->run($protein_bio_simplalign, [$nucleotide_bio_seq1, $nucleotide_bio_seq2]); # combinations of files/ objects are possible # $aln isa Bio::SimpleAlign of the nucleotide sequences aligned according to # the protein alignment =head1 DESCRIPTION This is a wrapper for running the Pal2Nal perl script by Mikita Suyama. You can get details here: http://coot.embl.de/pal2nal/. Pal2Nal is used for aligning a set of nucleotide sequences based on an alignment of their translations. You can try supplying normal pal2nal command-line arguments to new(), eg. new() or calling arg-named methods (excluding the initial hyphen, eg. $factory->(1) to set the - arg). You will need to enable this Pal2Nal wrapper to find the pal2nal.pl script. This can be done in (at least) three ways: 1. Make sure the script is in your path. 2. Define an environmental variable PAL2NALDIR which is a directory which contains the script: In bash: export PAL2NALDIR=/home/username/pal2nal/ In csh/tcsh: setenv PAL2NALDIR /home/username/pal2nal 3. Include a definition of an environmental variable PAL2NALDIR in every script that will use this Pal2Nal wrapper module, e.g.: BEGIN { $ENV{PAL2NALDIR} = '/home/username/pal2nal/' } use Bio::Tools::Run::Alignment::Pal2Nal; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Pal2Nal; use strict; use Bio::AlignIO; use Bio::SeqIO; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'pal2nal.pl'; our $PROGRAM_DIR = $ENV{'PAL2NALDIR'}; # methods for the pal2nal args we support our @PARAMS = qw(codontable); our @SWITCHES = qw(blockonly nogap nomismatch); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(output html h nostderr); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Alignment::Pal2Nal->new() Function: creates a new Pal2Nal factory. Returns : Bio::Tools::Run::Alignment::Pal2Nal Args : Most options understood by pal2nal.pl can be supplied as key => value pairs. These options can NOT be used with this wrapper: -output -html -h -nostderr =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($protein_align_file, $multi_fasta_nucleotide); -or- $result = $factory->run($prot_align_object, [$bioseq_object1, ...]); Function: Runs pal2nal on a protein alignment and set of nucleotide sequences. Returns : Bio::SimpleAlign; Args : The first argument represents a protein alignment, the second argument a set of nucleotide sequences. The alignment can be provided as an alignment file readable by Bio::AlignIO, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The nucleotide sequences can be provided as a single filename of a fasta file containing multiple nucleotide sequences, or an array ref of filenames, each file containing one sequence. Alternatively, an array ref of Bio::PrimarySeqI compliant objects can be supplied. In all cases, the protein alignment sequence names must correspond to the names of the supplied nucleotide sequences. =cut sub run { my ($self, $aln, $nucs) = @_; ($aln && $nucs) || $self->throw("alignment and nucleotides must be supplied"); $aln = $self->_alignment($aln); # gaps must be -, not . my $fixed_aln = Bio::SimpleAlign->new(); foreach my $seq ($aln->each_seq) { my $str = $seq->seq; $str =~ s/\./-/g; $fixed_aln->add_seq(Bio::LocatableSeq->new(-id => $seq->id, -seq => $str)); } $self->_alignment($fixed_aln); my $nucs_file; if (-e $nucs) { $nucs_file = $nucs; } elsif (ref($nucs) eq 'ARRAY') { (my $tempfh, $nucs_file) = $self->io->tempfile('-dir' => $self->tempdir(), UNLINK => ($self->save_tempfiles ? 0 : 1)); close($tempfh); my $sout = Bio::SeqIO->new(-file => ">".$nucs_file, -format => 'fasta'); foreach my $nuc (@{$nucs}) { if (-e $nuc) { my $sin = Bio::SeqIO->new(-file => $nuc); while (my $nuc_seq = $sin->next_seq) { $sout->write_seq($nuc_seq); } } elsif (ref($nuc) && $nuc->isa('Bio::PrimarySeqI')) { $sout->write_seq($nuc); } else { $self->throw("Don't understand nucleotide argument '$nuc'"); } } } else { $self->throw("Don't understand nucleotide argument '$nucs'"); } return $self->_run($nucs_file); } sub _run { my ($self, $nucs_file) = @_; my $exe = $self->executable || return; my $aln_file = $self->_write_alignment; my ($rfh, $result_file) = $self->io->tempfile('-dir' => $self->tempdir(), UNLINK => ($self->save_tempfiles ? 0 : 1)); my ($efh, $error_file) = $self->io->tempfile('-dir' => $self->tempdir(), UNLINK => ($self->save_tempfiles ? 0 : 1)); close($rfh); undef $rfh; close($efh); undef $efh; my $command = $exe.$self->_setparams($aln_file, $nucs_file, $result_file, $error_file); $self->debug("pal2nal command = $command\n"); system($command) && $self->throw("pal2nal call ($command) failed: $! | $?"); open(my $errfh, '<', $error_file); my $errors; while (<$errfh>) { $errors .= $_; } close($errfh); $self->throw("pal2nal call ($command) had errors:\n$errors") if $errors; my $ain = Bio::AlignIO->new(-file => $result_file, -format => 'fasta'); my $aln = $ain->next_aln; $ain->close; return $aln; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $nucs_file, $result_file, $error_file) = @_; my $param_string = ' '.$aln_file; $param_string .= ' '.$nucs_file; $param_string .= $self->SUPER::_setparams(-params => \@PARAMS, -switches => \@SWITCHES, -dash => 1); $param_string .= ' -output fasta'; $param_string .= " > $result_file 2> $error_file"; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Probalign.pm000066400000000000000000000334161302566030400247140ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Probalign # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Probalign - Object for the calculation of a multiple sequence alignment from a set of unaligned sequences or alignments using the Probalign program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Probalign->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. $factory = Bio::Tools::Run::Alignment::Probalign->new(); $factory->outfile_name("$dir/$subdir/$outdir/outfile.afa"); $aln = $factory->align($seq_array_ref); =head1 DESCRIPTION Probalign: multiple sequence alignment using partition function posterior probabilities. Probalign uses partition function posterior probability estimates to compute maximum expected accuracy multiple sequence alignments. You can get it and see information about it at this URL http://www.cs.njit.edu/usman/probalign =head2 Helping the module find your executable You will need to enable Probalign to find the probalign program. This can be done in (at least) three ways: 1. Make sure the probalign executable is in your path (i.e. 'which probalign' returns a valid program 2. define an environmental variable PROBALIGNDIR which points to a directory containing the 'probalign' app: In bash export PROBALIGNDIR=/home/progs/probalign or In csh/tcsh setenv PROBALIGNDIR /home/progs/probalign 3. include a definition of an environmental variable PROBALIGNDIR in every script that will BEGIN {$ENV{PROBALIGNDIR} = '/home/progs/probalign'; } use Bio::Tools::Run::Alignment::Probalign; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Probalign; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @PROBALIGN_PARAMS @PROBALIGN_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @PROBALIGN_PARAMS = qw (TEMPERATURE SCORE_MATRIX GAP-OPEN GAP-EXTENSION); @PROBALIGN_SWITCHES = qw(CLUSTALW VERBOSE ALIGNMENT-ORDER NUC PROT); @OTHER_SWITCHES = qw(); # Authorize attribute fields foreach my $attr ( @PROBALIGN_PARAMS, @PROBALIGN_SWITCHES, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'probalign'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PROBALIGNDIR}) if $ENV{PROBALIGNDIR}; } =head2 new Title : new Usage : my $probalign = Bio::Tools::Run::Alignment::Probalign->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Probalign Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on || ''); my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #PROBALIGN version 1.09 - align multiple protein sequences and print to standard output $string =~ /PROBALIGN\s+Beta\s+Version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run probalign return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to probalign program Example : Returns : nothing; probalign output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to probalign =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." $infilename $params"; $self->debug( "probalign command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Probalign call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for probalign program Example : Returns : name of file containing probalign data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to probalign!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for probalign program Example : Returns : parameter string to be passed to probalign during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @PROBALIGN_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } for $attr ( @PROBALIGN_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $probalign->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $probalign->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Probcons.pm000066400000000000000000000355541302566030400245710ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Probcons # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Probcons - Object for the calculation of an iterative multiple sequence alignment from a set of unaligned sequences or alignments using the Probcons program =head1 SYNOPSIS # Build a muscle alignment factory $factory = Bio::Tools::Run::Alignment::Probcons->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. #To run probcons with training, try something like: #First round to generate train.params $factory = Bio::Tools::Run::Alignment::Probcons->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'emissions' => '', 'verbose' => '', 'train' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/train.params"); #Second round to use train.params to get a high qual alignment $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); $aln = ''; $factory = ''; $factory = Bio::Tools::Run::Alignment::Probcons->new ( 'iterative-refinement' => '1000', 'consistency' => '5', 'pre-training' => '20', 'verbose' => '', 'paramfile' => "$dir/$subdir/$outdir/train.params", ); $factory->outfile_name("$dir/$subdir/$outdir/outfile.afa"); $aln = $factory->align($seq_array_ref); =head1 DESCRIPTION Probcons is a Probabilistic Consistency-based Multiple Alignment of Amino Acid Sequences. You can get it and see information about it at this URL http://probcons.stanford.edu/ =head2 Helping the module find your executable You will need to enable Probcons to find the probcons program. This can be done in (at least) three ways: 1. Make sure the probcons executable is in your path (i.e. 'which probcons' returns a valid program 2. define an environmental variable PROBCONSDIR which points to a directory containing the 'probcons' app: In bash export PROBCONSDIR=/home/progs/probcons or In csh/tcsh setenv PROBCONSDIR /home/progs/probcons 3. include a definition of an environmental variable PROBCONSDIR in every script that will BEGIN {$ENV{PROBCONSDIR} = '/home/progs/probcons'; } use Bio::Tools::Run::Alignment::Probcons; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Probcons; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @PROBCONS_PARAMS @PROBCONS_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'fasta' ); @PROBCONS_PARAMS = qw (CONSISTENCY ITERATIVE-REFINEMENT PRE-TRAINING ANNOT TRAIN PARAMFILE MATRIXFILE CLUSTALW PAIRS VITERBI VERBOSE EMISSIONS); #FIXME: Last line are switches, dunno how to set them, #gave as params @PROBCONS_SWITCHES = qw(); @OTHER_SWITCHES = qw(); # Authorize attribute fields foreach my $attr ( @PROBCONS_PARAMS, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'probcons'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PROBCONSDIR}) if $ENV{PROBCONSDIR}; } =head2 new Title : new Usage : my $probcons = Bio::Tools::Run::Alignment::Probcons->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Probcons Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on || ''); my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); if ($attr =~ /verbose/i) { $self->{verbose_set} = 1; } } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #PROBCONS version 1.09 - align multiple protein sequences and print to standard output $string =~ /PROBCONS\s+version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams($infilename); # run probcons return &_run($self, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to probcons program Example : Returns : nothing; probcons output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to probcons =cut sub _run { my ($self, $params) = @_; my $commandstring = $self->executable." $params"; $self->debug( "probcons command = $commandstring \n"); my $status = system($commandstring); if ($status) { $self->warn( "Probcons call crashed: $? [command $commandstring]\n"); return; } my $outfile = $self->outfile_name(); if (-e $outfile || -z $outfile) { my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat); my $aln = $in->next_aln(); return $aln; } return; # some modes of operation do not generate an output alignment } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for probcons program Example : Returns : name of file containing probcons data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to probcons!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for probcons program Example : Returns : parameter string to be passed to probcons during align or profile_align Args : name of calling object =cut sub _setparams { my ($self, $infilename) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @PROBCONS_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } if ($self->{verbose_set}) { $param_string .= ' --verbose'; } for $attr ( @PROBCONS_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " $infilename > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $probcons->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $probcons->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Proda.pm000066400000000000000000000334261302566030400240450ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Proda # # Please direct questions and support issues to # # Cared for by Albert Vilella # # # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Proda - Object for the calculation of sets of multiple sequence alignments from a set of unaligned sequences or alignments using the Proda program. =head1 SYNOPSIS # Build a Proda alignment factory $factory = Bio::Tools::Run::Alignment::Proda->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # @alns is an array of SimpleAlign objects. @alns = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; @alns = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. $factory = Bio::Tools::Run::Alignment::Proda->new(); @alns = $factory->align($seq_array_ref); =head1 DESCRIPTION You can get it and see information about it at this URL http://proda.stanford.edu This program will return one or more local alignments for the different repeated or rearranged regions in the sequences. If a sequences contains more than one of those patterns, it will be present more than once in the alignment. The difference will be in that the id contain the start and end, like myseqid(123-456) and myseqid(567-890), instead of simply myseqid as in the original input file. This is true for all the output ids, even if they are present only once. =head2 Helping the module find your executable You will need to enable Proda to find the proda program. This can be done in (at least) three ways: 1. Make sure the proda executable is in your path (i.e. 'which proda' returns a valid program 2. define an environmental variable PRODADIR which points to a directory containing the 'proda' app: In bash export PRODADIR=/home/progs/proda or In csh/tcsh setenv PRODADIR /home/progs/proda 3. include a definition of an environmental variable PRODADIR in every script that will BEGIN {$ENV{PRODADIR} = '/home/progs/proda'; } use Bio::Tools::Run::Alignment::Proda; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Proda; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS @PRODA_PARAMS @PRODA_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); BEGIN { %DEFAULTS = ( 'AFORMAT' => 'proda' ); @PRODA_PARAMS = qw (L); @PRODA_SWITCHES = qw(POSTERIOR TRAN SILENT); @OTHER_SWITCHES = qw(); # Authorize attribute fields foreach my $attr ( @PRODA_PARAMS, @PRODA_SWITCHES, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'proda'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PRODADIR}) if $ENV{PRODADIR}; } =head2 new Title : new Usage : my $proda = Bio::Tools::Run::Alignment::Proda->new(); Function: Constructor Returns : Bio::Tools::Run::Alignment::Proda Args : -outfile_name => $outname =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($on) = $self->SUPER::_rearrange([qw(OUTFILE_NAME)], @args); $self->outfile_name($on || ''); my ($attr, $value); $self->aformat($DEFAULTS{'AFORMAT'}); while ( @args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; #PRODA version 1.09 - align multiple protein sequences and print to standard output $string =~ /ProDA\s+version\s+(\d+\.\d+)/m; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(\@seqs); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : Arrayref of Bio::PrimarySeqI objects or a filename to run on =cut sub run { my $self = shift; return $self->align(shift); } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename) = $self->_setinput($input); if (! $infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run proda return &_run($self, $infilename, $param_string); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to proda program Example : Returns : nothing; proda output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to proda =cut sub _run { my ($self,$infilename,$params) = @_; my $commandstring = $self->executable." $infilename $params"; $self->debug( "proda command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile_name(); if( !-e $outfile || -z $outfile ) { $self->warn( "Proda call crashed: $? [command $commandstring]\n"); return undef; } my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->aformat, ); my @alns; while(my $aln = $in->next_aln) { push @alns, $aln; } return @alns; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for proda program Example : Returns : name of file containing proda data input AND Args : Arrayref of Seqs or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(IN,$infilename) || $self->throw("Cannot open $infilename"); my $header; while( defined ($header = ) ) { last if $header !~ /^\s+$/; } close(IN); if ( $header !~ /^>\s*\S+/ ){ $self->throw("Need to provide a FASTA format file to proda!"); } return ($infilename); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an # array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for proda program Example : Returns : parameter string to be passed to proda during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @PRODA_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; $attr_key = ' --'.$attr_key unless ($attr eq 'ANNOT'); $attr_key = ' -'.$attr_key if ($attr eq 'ANNOT'); $param_string .= $attr_key .' '.$value; } for $attr ( @PRODA_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } #FIXME: This may be only for *nixes. Double check in other OSes $param_string .= " > ".$self->outfile_name; if ($self->verbose < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null"; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $proda->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $proda->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/Sim4.pm000066400000000000000000000247741302566030400236220ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Alignment::Sim4 # # Please direct questions and support issues to # # Cared for by # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::Sim4 - Wrapper for Sim4 program that allows for alignment of cdna to genomic sequences =head1 SYNOPSIS use Bio::Tools::Run::Alignment::Sim4; my @params = (W=>15,K=>17,D=>10,N=>10,cdna_seq=>"mouse_cdna.fa",genomic_seq=>"mouse_genomic.fa"); my $sim4 = Bio::Tools::Run::Alignment::Sim4->new(@params); my @exon_sets = $sim4->align; foreach my $set(@exon_sets){ foreach my $exon($set->sub_SeqFeature){ print $exon->start."\t".$exon->end."\t".$exon->strand."\n"; print "\tMatched ".$exon->est_hit->seq_id."\t".$exon->est_hit->start."\t".$exon->est_hit->end."\n"; } } One can also provide a est database $sio = Bio::SeqIO->new(-file=>"est.fa",-format=>"fasta"); @est_seq=(); while(my $seq = $sio->next_seq){ push @est_seq,$seq; } my @exon_sets = $factory->align(\@est_seq,$genomic); =head1 DESCRIPTION Sim4 program is developed by Florea et al. for aligning cdna/est sequence to genomic sequences Florea L, Hartzell G, Zhang Z, Rubin GM, Miller W. A computer program for aligning a cDNA sequence with a genomic DNA sequence. Genome Res 1998 Sep;8(9):967-74 The program is available for download here: http://globin.cse.psu.edu/ =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::Sim4; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SIM4_PARAMS @OTHER_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Sim4::Results; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable Sim4 to find the Sim4 program. This # can be done in (at least) two ways: # # 1. define an environmental variable SIM4DIR # export SIM4DIR =/usr/local/share/sim4 # where the sim4 package is installed # # 2. include a definition of an environmental variable SIM4 in # every script that will use Sim4.pm # $ENV{SIMR4DIR} = '/usr/local/share/sim4'; BEGIN { @SIM4_PARAMS= qw(A W X K C R D H P N B); @OTHER_PARAMS= qw(CDNA_SEQ GENOMIC_SEQ OUTFILE); @OTHER_SWITCHES = qw(SILENT QUIET VERBOSE); # Authorize attribute fields foreach my $attr ( @SIM4_PARAMS, @OTHER_PARAMS, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'sim4'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SIM4DIR}) if $ENV{SIM4DIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); # to facilitiate tempfile cleanup $self->io->_initialize_io(); $self->A(0); # default my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; if ($attr =~/est_first/i ) { #NEW $self->{est_first} = $value; #NEW next; #NEW } #NEW next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/'PROGRAM'/i ) { $self->executable($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : not supported Function: Cannot determine from program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef; } =head2 align Title : align Usage : $cdna = 't/data/cdna.fa'; $genomic = 't/data/cdna.fa'; @exon_set = $factory->align($cdna,$genomic); or #@seq_array is array of Seq objs $cdna = \@seq_array; @exon_set = $factory->align($cdna,$genomic); of @exon_set = $factory->align($cdna->[0],$genomic) Function: Perform a Sim4 alignment Returns : An array of Bio::SeqFeature::Generic objects which has exons as sub seqfeatures. Args : Name of two files containing fasta sequences, or 2 Bio::SeqI objects or a combination of both first is assumed to be cdna second is assumed to be genomic More than one cdna may be provided. If an object, assume that its an array ref. =cut sub align { my ($self,$cdna,$genomic) = @_; $self->cdna_seq($cdna) if $cdna; $self->throw("Need to provide a cdna sequence") unless $self->cdna_seq; $self->genomic_seq($genomic) if $genomic; $self->throw("Need to provide a genomic sequence") unless $self->genomic_seq; my ($temp,$infile1, $infile2, $est_first,$seq); my ($attr, $value, $switch); # Create input file pointer ($est_first,$infile1,$infile2)= $self->_setinput($self->cdna_seq,$self->genomic_seq); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in align!");} # Create parameter string to pass to Sim4 program my $param_string = $self->_setparams(); # run Sim4 my @exon_sets = $self->_run($est_first,$infile1,$infile2,$param_string); return @exon_sets; } ################################################# #internal methods =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to Sim4 program Example : Returns : nothing; Sim4 output is written to a temp file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to Sim4 =cut sub _run { my ($self,$estfirst,$infile1,$infile2,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); if(! $self->outfile){ my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir); close($tfh); undef $tfh; $self->outfile($outfile); } my $outfile = $self->outfile(); my $commandstring = $self->executable." $infile1 $infile2 $param_string > $outfile"; if($self->quiet || $self->silent || ($self->verbose < 0)){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 2>$null"; } $self->debug( "Sim4 command = $commandstring"); my $status = system($commandstring); $self->throw( "Sim4 call ($commandstring) crashed: $? \n") unless $status==0; #use Sim4 parser my $sim4_parser = Bio::Tools::Sim4::Results->new(-file=>$outfile,-estfirst=>$estfirst); my @out; while(my $exonset = $sim4_parser->next_exonset){ push @out, $exonset; } return @out; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for Sim4 program Example : Returns : name of file containing Sim4 data input Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self, $cdna,$genomic) = @_; my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2); #my $estfirst=1; my $estfirst= defined($self->{est_first}) ? $self->{_est_first} : 1; my ($cdna_file,$genomic_file); #a sequence obj if(ref($cdna)) { my @cdna = ref $cdna eq "ARRAY" ? @{$cdna} : ($cdna); ($tfh1,$cdna_file) = $self->io->tempfile(-dir=>$self->tempdir); my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta'); foreach my $c (@cdna){ $seqio->write_seq($c); } close $tfh1; undef $tfh1; #if we have a est database, then input will go second if($#cdna > 0){ $estfirst=0; } } else { my $sio = Bio::SeqIO->new(-file=>$cdna,-format=>"fasta"); my $count = 0; while(my $seq = $sio->next_seq){ $count++; } $estfirst = $count > 1 ? 0:1; $cdna_file = $cdna; } if( ref($genomic) ) { ($tfh1,$genomic_file) = $self->io->tempfile(-dir=>$self->tempdir); my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta'); $seqio->write_seq($genomic); close $tfh1; undef $tfh1; } else { $genomic_file = $genomic; } return ($estfirst,$cdna_file,$genomic_file) if $estfirst; return ($estfirst,$genomic_file,$cdna_file); } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Sim4 program Example : Returns : parameter string to be passed to Sim4 during align or profile_align Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @SIM4_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = uc $attr; #put params in format expected by Sim4 $attr_key = ' '.$attr_key; $param_string .= $attr_key.'='.$value; } return $param_string; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/StandAloneFasta.pm000066400000000000000000000354011302566030400260020ustar00rootroot00000000000000#StandAloneFasta.pm v1.00 2002/11/01 # #Bioperl module for Bio::Tools::Run::Alignment::StandAloneFasta # # Written by Tiequan Zhang # Please direct questions and support issues to # # Cared for by Shawn Hoon # Copyright Tiequan Zhang # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::StandAloneFasta - Object for the local execution of the Fasta3 programs ((t)fasta3, (t)fastx3, (t)fasty3 ssearch3) =head1 SYNOPSIS #!/usr/bin/perl use Bio::Tools::Run::Alignment::StandAloneFasta; use Bio::SeqIO; use strict; my @arg=( 'b' =>'15', 'O' =>'resultfile', 'H'=>'', 'program'=>'fasta34' ); my $factory=Bio::Tools::Run::Alignment::StandAloneFasta->new(@arg); $factory->ktup(1); $factory->library('p'); #print result file name print $factory->O; my @fastreport=$factory->run($ARGV[0]); foreach (@fastreport) { print "Parsed fasta report:\n"; my $result = $_->next_result; while( my $hit = $result->next_hit()) { print "\thit name: ", $hit->name(), "\n"; while( my $hsp = $hit->next_hsp()) { print "E: ", $hsp->evalue(), "frac_identical: ", $hsp->frac_identical(), "\n"; } } } #pass in seq objects my $sio = Bio::SeqIO->new(-file=>$ARGV[0],-format=>"fasta"); my $seq = $sio->next_seq; my @fastreport=$factory->run($ARGV[0]); =head1 DESCRIPTION This wrapper works with version 3 of the FASTA program package (see W. R. Pearson and D. J. Lipman (1988), "Improved Tools for Biological Sequence Analysis", PNAS 85:2444-2448 (Pearson and Lipman, 1988); W. R. Pearson (1996) "Effective protein sequence comparison" Meth. Enzymol. 266:227-258 (Pearson, 1996); Pearson et. al. (1997) Genomics 46:24-36 (Zhang et al., 1997); Pearson, (1999) Meth. in Molecular Biology 132:185-219 (Pearson, 1999). Version 3 of the FASTA packages contains many programs for searching DNA and protein databases and one program (prss3) for evaluating statistical significance from randomly shuffled sequences. Fasta is available at ftp://ftp.virginia.edu/pub/fasta =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Tiequan Zhang Adapted for bioperl by Shawn Hoon Enhanced by Jason Stajich Email tqzhang1973@yahoo.com shawnh@fugu-sg.org jason-at-bioperl.org =head1 Appendix The rest of the documendation details each of the object methods. Internal methods are preceded with a underscore =cut package Bio::Tools::Run::Alignment::StandAloneFasta; use vars qw ($AUTOLOAD @ISA $library %parameters $ktup @FASTA_PARAMS %OK_FIELD @OTHER_PARAMS); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::Seq; use Bio::SeqIO; use Bio::SearchIO; use Bio::Tools::Run::WrapperBase; BEGIN { @FASTA_PARAMS=qw(a A b c E d f g h H i j l L M m n O o p Q q r R s S w x y z); @OTHER_PARAMS =qw(program output database); foreach my $att (@FASTA_PARAMS, @OTHER_PARAMS) {$OK_FIELD{$att}++;} $ktup=2; %parameters=('H' => '', 'q' => '', 'm' =>'1', 'O' =>''); } @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); sub new { my ($caller,@args)=@_; #chained new my $self = $caller->SUPER::new(@args); while(@args){ my $attr = shift @args; my $value = shift @args; next if ($attr=~/^-/ || ! $attr); $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; return $self->program(@_); } =head2 executable Title : executable Usage : my $exe = $blastfactory->executable('blastall'); Function: Finds the full path to the 'codeml' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exename, $exe,$warn) = @_; $exename = 'fasta34' unless defined $exename; if( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); # This is how I meant to split up these conditionals --jason # if exe is null we will execute this (handle the case where # PROGRAMDIR pointed to something invalid) unless( $exe ) { # we didn't find it in that last conditional if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{FASTADIR}) if $ENV{FASTADIR}; } =head2 run Title : run Usage : my @fasta_object = $factory->($input,$onefile); where $factory is the name of executable FASTA program; $input is file name containing the sequences in the format of fasta or Bio::Seq object or array of Bio::Seq object; $onefile is 0 if you want to save the outputs to different files default: outputs are saved in one file Function: Attempts to run an executable FASTA program and return array of fasta objects containing the fasta report Returns : aray of fasta report object If the user specify the output file(s), the raw fasta report will be saved Args : sequence object OR array reference of sequence objects filename of file containing fasta formatted sequences =cut sub run { my ($self,$input,$onefile)=@_; local * FASTARUN; $self->io->_io_cleanup; my $program = $self->executable($self->program_name) || $self->throw("FASTA program not found or not executable.\n"); # You should specify a library file $self->throw("You didn't choose library.\n") unless ( $library); my @seqs = $self->_setinput($input); return 0 unless (@seqs); my @fastobj; my ($fhout, $tempoutfile)=$self->io->tempfile(-dir=>$self->tempdir); my $outfile=$self->O(); # The outputs from executable FASTA program will # be saved into different files if $onefile is 0, # else will be concatenated into one file my $onfile = (!defined $onefile || $onefile =~ /^0$/); unless( $onfile ) { my $count=0; # do some fancy stuff here to test if we are running fasta34 # with mlib so we just pass in a single file rather than # running fasta N times # (not implemented yet) foreach my $seq (@seqs){ $count++; # Decide if the output will be saved into a temporary file if( $outfile ) { $self->O(sprintf("%s_%d",$outfile,$count)); } my ($fhinput,$teminputfile)= $self->io->tempfile(-dir=>$self->tempdir); my $temp = Bio::SeqIO->new(-fh=>$fhinput, '-format'=>'Fasta'); $temp->write_seq($seq); $temp->close(); close $fhinput; undef $fhinput; my $para= $self->_setparams; $para .=" $teminputfile $library $ktup"; $para ="$program $para"; my $object; unless( $outfile ) { open(FASTARUN, "$para |") || $self->throw($@); $object = Bio::SearchIO->new(-fh=>\*FASTARUN, -format=>"fasta"); } else { if ( $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $para .= " >$null 2>$null"; } else { $self->debug("Going to execute: $para"); } my $status = system($para); $self->throw("$para crashed: $?\n" )unless ($status==0); $object = Bio::SearchIO->new(-file=>$self->O, -format=>"fasta"); } push @fastobj, $object; } } else { if ($outfile){ open (FILE, ">$outfile") or $self->throw("can't use $outfile:$!"); close(FILE); } foreach my $seq (@seqs){ my ($fhinput,$teminputfile)=$self->io->tempfile(-dir=>$self->tempdir); my $temp=Bio::SeqIO->new(-fh=>$fhinput, '-format'=>'fasta'); $temp->write_seq($seq); $temp->close(); close $fhinput; undef $fhinput; $self->O($tempoutfile) if( $outfile ); my $para= $self->_setparams; $para .= " $teminputfile $library $ktup"; $para ="$program $para"; my $object; unless( $outfile ) { open(FASTARUN, "$para |") || $self->throw($@); $object=Bio::SearchIO->new(-fh=>\*FASTARUN, -format=>"fasta"); } else { if ( $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $para .= " >$null 2>$null"; } else { $self->debug("Going to execute: $para"); } my $status = system($para); $self->throw("$para crashed: $?\n" )unless ($status==0); $object = Bio::SearchIO->new(-file=>$self->O, -format=>"fasta"); } push @fastobj, $object; # The output in the temporary file # will be saved at the end of $outfile if($outfile){ open (FHOUT, $tempoutfile) or die("can't open the $tempoutfile file"); open (FH, ">>$outfile") or die("can't use the $outfile file"); print FH (); close (FHOUT); close (FH); } } } return @fastobj; } =head2 library Title : library Usage : my $lb = $self->library Function: Fetch or set the name of the library to search against Returns : The name of the library Args : No argument if user wants to fetch the name of library file; A letter or a string of letter preceded by %; (e.g. P or %pn, the letter is the character in the third field of any line of fastlibs file ) or the name of library file (if environmental variable FASTLIBS is not set); if user wants to set the name of library file to search against =cut sub library { my($self,$lb)=@_; return $library if (!defined($lb)); if ( ($lb =~ /^%[a-zA-Z]+$/)||($lb=~ /^[a-zA-Z]$/)){ if(! defined $ENV{'FASTLIBS'} ){ $self->throw("abbrv. list request but FASTLIBS undefined, cannot use $lb"); } } else { unless ( -e $lb){ $self->throw("cannot open $lb library"); } } return $library=$lb; } *database = \&library; =head2 output Title : output Usage : $obj->output($newval) Function: The output directory if we want to use this Example : Returns : value of output (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub output{ my $self = shift; return $self->{'output'} = shift if @_; return $self->{'output'}; } =head2 ktup Title : ktup Usage : my $ktup = $self->ktup Function: Fetch or set the ktup value for executable FASTA programs Example : Returns : The value of ktup if defined, else undef is returned Args : No argument if user want to fetch ktup value; A integer value between 1-6 if user want to set the ktup value =cut sub ktup { my($self,$k)=@_; if(!defined($k)){return $ktup;} if ($k =~ /^[1-6]$/){ $ktup=$k; return $ktup } else { $self->warn("You should set the ktup value between 1-6. The FASTA program will decide your default ktup value."); return $ktup= undef; } } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file(s) for Blast executable Example : Returns : array of Bio::Seq object reference Args : Seq object reference or input file name =cut sub _setinput { my ($self, $input) = @_; if( ! defined $input ) { $self->throw("Calling fasta program with no input"); } my @seqs; if( ! ref $input ) { if( -e $input ) { my $seqio = Bio::SeqIO->new(-format => 'fasta', -file => $input); while( my $seq = $seqio->next_seq ) { push @seqs, $seq; } } else { $self->throw("Input $input was not a valid filename"); } } elsif( ref($input) =~ /ARRAY/i ) { foreach ( @$input ) { if( ref($_) && $_->isa('Bio::PrimarySeqI') ) { push @seqs, $_; } else { $self->warn("Trying to add a " . ref($_) ." but expected a Bio::PrimarySeqI"); } } if( ! @seqs) { $self->throw("Did not pass in valid input -- no sequence objects found"); } } elsif( $input->isa('Bio::PrimarySeqI') ) { push @seqs, $input; } return @seqs; } =head2 _exist Title : _exist Usage : Internal function, not to be called directly Function: Determine whether a executable FASTA program can be found Cf. the DESCRIPTION section of this POD for how to make sure for your FASTA installation to be found. This method checks for existence of the blastall executable in the path. Returns : 1 if FASTA program found at expected location, 0 otherwise. Args : none =cut sub _exist { my $exe = shift @_; return 0 unless($exe =~ /fast|ssearch/); $exe .='.exe' if ($^O =~ /mswin/i); my $f; return ($f=Bio::Root::IO->exists_exe($exe))&&(-x $f); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for FASTA executable Returns : part of parameter string to be passed to FASTA program Args : none =cut sub _setparams { my ($self,$attr,$value); $self = shift; my $para = ""; foreach my $attr(@FASTA_PARAMS) { $value = $self->$attr(); next unless (defined $value); $para .=" -$attr $value"; } $para .= " -q "; return $para; } 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Alignment/TCoffee.pm000066400000000000000000001123621302566030400243100ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Alignment::TCoffee # # Please direct questions and support issues to # # Cared for by Jason Stajich, Peter Schattner # # Copyright Jason Stajich, Peter Schattner # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Alignment::TCoffee - Object for the calculation of a multiple sequence alignment from a set of unaligned sequences or alignments using the TCoffee program =head1 SYNOPSIS # Build a tcoffee alignment factory @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::TCoffee->new(@params); # Pass the factory a list of sequences to be aligned. $inputfilename = 't/cysprot.fa'; # $aln is a SimpleAlign object. $aln = $factory->align($inputfilename); # or where @seq_array is an array of Bio::Seq objects $seq_array_ref = \@seq_array; $aln = $factory->align($seq_array_ref); # Or one can pass the factory a pair of (sub)alignments #to be aligned against each other, e.g.: # where $aln1 and $aln2 are Bio::SimpleAlign objects. $aln = $factory->profile_align($aln1,$aln2); # Or one can pass the factory an alignment and one or more # unaligned sequences to be added to the alignment. For example: # $seq is a Bio::Seq object. $aln = $factory->profile_align($aln1,$seq); #There are various additional options and input formats available. #See the DESCRIPTION section that follows for additional details. =head1 DESCRIPTION Note: this DESCRIPTION only documents the (Bio)perl interface to TCoffee. =head2 Helping the module find your executable You will need to enable TCoffee to find the t_coffee program. This can be done in (at least) three ways: 1. Make sure the t_coffee executable is in your path so that which t_coffee returns a t_coffee executable on your system. 2. Define an environmental variable TCOFFEEDIR which is a dir which contains the 't_coffee' app: In bash export TCOFFEEDIR=/home/username/progs/T-COFFEE_distribution_Version_1.37/bin In csh/tcsh setenv TCOFFEEDIR /home/username/progs/T-COFFEE_distribution_Version_1.37/bin 3. Include a definition of an environmental variable TCOFFEEDIR in every script that will use this TCoffee wrapper module. BEGIN { $ENV{TCOFFEDIR} = '/home/username/progs/T-COFFEE_distribution_Version_1.37/bin' } use Bio::Tools::Run::Alignment::TCoffee; If you are running an application on a webserver make sure the webserver environment has the proper PATH set or use the options 2 or 3 to set the variables. =head1 PARAMETERS FOR ALIGNMENT COMPUTATION There are a number of possible parameters one can pass in TCoffee. One should really read the online manual for the best explanation of all the features. See http://igs-server.cnrs-mrs.fr/~cnotred/Documentation/t_coffee/t_coffee_doc.html These can be specified as parameters when instantiating a new TCoffee object, or through get/set methods of the same name (lowercase). =head2 IN Title : IN Description : (optional) input filename, this is specified when align so should not use this directly unless one understand TCoffee program very well. =head2 TYPE Title : TYPE Args : [string] DNA, PROTEIN Description : (optional) set the sequence type, guessed automatically so should not use this directly =head2 PARAMETERS Title : PARAMETERS Description : (optional) Indicates a file containing extra parameters =head2 EXTEND Title : EXTEND Args : 0, 1, or positive value Default : 1 Description : Flag indicating that library extension should be carried out when performing multiple alignments, if set to 0 then extension is not made, if set to 1 extension is made on all pairs in the library. If extension is set to another positive value, the extension is only carried out on pairs having a weigth value superior to the specified limit. =head2 DP_NORMALISE Title : DP_NORMALISE Args : 0 or positive value Default : 1000 Description : When using a value different from 0, this flag sets the score of the highest scoring pair to 1000. =head2 DP_MODE Title : DP_MODE Args : [string] gotoh_pair_wise, myers_miller_pair_wise, fasta_pair_wise cfasta_pair_wise Default : cfast_fair_wise Description : Indicates the type of dynamic programming used by the program gotoh_pair_wise : implementation of the gotoh algorithm (quadratic in memory and time) myers_miller_pair_wise : implementation of the Myers and Miller dynamic programming algorithm ( quadratic in time and linear in space). This algorithm is recommended for very long sequences. It is about 2 time slower than gotoh. It only accepts tg_mode=1. fasta_pair_wise: implementation of the fasta algorithm. The sequence is hashed, looking for ktuples words. Dynamic programming is only carried out on the ndiag best scoring diagonals. This is much faster but less accurate than the two previous. cfasta_pair_wise : c stands for checked. It is the same algorithm. The dynamic programming is made on the ndiag best diagonals, and then on the 2*ndiags, and so on until the scores converge. Complexity will depend on the level of divergence of the sequences, but will usually be L*log(L), with an accuracy comparable to the two first mode ( this was checked on BaliBase). =head2 KTUPLE Title : KTUPLE Args : numeric value Default : 1 or 2 (1 for protein, 2 for DNA ) Description : Indicates the ktuple size for cfasta_pair_wise dp_mode and fasta_pair_wise. It is set to 1 for proteins, and 2 for DNA. The alphabet used for protein is not the 20 letter code, but a mildly degenerated version, where some residues are grouped under one letter, based on physicochemical properties: rk, de, qh, vilm, fy (the other residues are not degenerated). =head2 NDIAGS Title : NDIAGS Args : numeric value Default : 0 Description : Indicates the number of diagonals used by the fasta_pair_wise algorithm. When set to 0, n_diag=Log (length of the smallest sequence) =head2 DIAG_MODE Title : DIAG_MODE Args : numeric value Default : 0 Description : Indicates the manner in which diagonals are scored during the fasta hashing. 0 indicates that the score of a diagonal is equal to the sum of the scores of the exact matches it contains. 1 indicates that this score is set equal to the score of the best uninterrupted segment 1 can be useful when dealing with fragments of sequences. =head2 SIM_MATRIX Title : SIM_MATRIX Args : string Default : vasiliky Description : Indicates the manner in which the amino acid is being degenerated when hashing. All the substitution matrix are acceptable. Categories will be defined as sub-group of residues all having a positive substitution score (they can overlap). If you wish to keep the non degenerated amino acid alphabet, use 'idmat' =head2 MATRIX Title : MATRIX Args : Default : Description : This flag is provided for compatibility with ClustalW. Setting matrix = 'blosum' is equivalent to -in=Xblosum62mt , -matrix=pam is equivalent to in=Xpam250mt . Apart from this, the rules are similar to those applying when declaring a matrix with the -in=X fl =head2 GAPOPEN Title : GAPOPEN Args : numeric Default : 0 Description : Indicates the penalty applied for opening a gap. The penalty must be negative. If you provide a positive value, it will automatically be turned into a negative number. We recommend a value of 10 with pam matrices, and a value of 0 when a library is used. =head2 GAPEXT Title : GAPEXT Args : numeric Default : 0 Description : Indicates the penalty applied for extending a gap. =head2 COSMETIC_PENALTY Title : COSMETIC_PENALTY Args : numeric Default : 100 Description : Indicates the penalty applied for opening a gap. This penalty is set to a very low value. It will only have an influence on the portions of the alignment that are unalignable. It will not make them more correct, but only more pleasing to the eye ( i.e. Avoid stretches of lonely residues). The cosmetic penalty is automatically turned off if a substitution matrix is used rather than a library. =head2 TG_MODE Title : TG_MODE Args : 0,1,2 Default : 1 Description : (Terminal Gaps) 0: indicates that terminal gaps must be panelized with a gapopen and a gapext penalty. 1: indicates that terminal gaps must be penalized only with a gapext penalty 2: indicates that terminal gaps must not be penalized. =head2 WEIGHT Title : WEIGHT Args : sim or sim_ or integer value Default : sim Description : Weight defines the way alignments are weighted when turned into a library. sim indicates that the weight equals the average identity within the match residues. sim_matrix_name indicates the average identity with two residues regarded as identical when their substitution value is positive. The valid matrices names are in matrices.h (pam250mt) . Matrices not found in this header are considered to be filenames. See the format section for matrices. For instance, -weight=sim_pam250mt indicates that the grouping used for similarity will be the set of classes with positive substitutions. Other groups include sim_clustalw_col ( categories of clustalw marked with :) sim_clustalw_dot ( categories of clustalw marked with .) Value indicates that all the pairs found in the alignments must be given the same weight equal to value. This is useful when the alignment one wishes to turn into a library must be given a pre-specified score (for instance if they come from a structure super-imposition program). Value is an integer: -weight=1000 Note : Weight only affects methods that return an alignment to T-Coffee, such as ClustalW. On the contrary, the version of Lalign we use here returns a library where weights have already been applied and are therefore insensitive to the -weight flag. =head2 SEQ_TO_ALIGN Title : SEQ_TO_ALIGN Args : filename Default : no file - align all the sequences Description : You may not wish to align all the sequences brought in by the -in flag. Supplying the seq_to_align flag allows for this, the file is simply a list of names in Fasta format. However, note that library extension will be carried out on all the sequences. =head1 PARAMETERS FOR TREE COMPUTATION AND OUTPUT =head2 NEWTREE Title : NEWTREE Args : treefile Default : no file Description : Indicates the name of the new tree to compute. The default will be .dnd, or . Format is Phylip/Newick tree format =head2 USETREE Title : USETREE Args : treefile Default : no file specified Description : This flag indicates that rather than computing a new dendrogram, t_coffee can use a pre-computed one. The tree files are in phylips format and compatible with ClustalW. In most cases, using a pre-computed tree will halve the computation time required by t_coffee. It is also possible to use trees output by ClustalW or Phylips. Format is Phylips tree format =head2 TREE_MODE Title : TREE_MODE Args : slow, fast, very_fast Default : very_fast Description : This flag indicates the method used for computing the dendrogram. slow : the chosen dp_mode using the extended library, fast : The fasta dp_mode using the extended library. very_fast: The fasta dp_mode using pam250mt. =head2 QUICKTREE Title : QUICKTREE Args : Default : Description : This flag is kept for compatibility with ClustalW. It indicates that: -tree_mode=very_fast =head1 PARAMETERS FOR ALIGNMENT OUTPUT =head2 OUTFILE Title : OUTFILE Args : out_aln file, default, no Default : default ( yourseqfile.aln) Description : indicates name of output alignment file =head2 OUTPUT Title : OUTPUT Args : format1, format2 Default : clustalw Description : Indicated format for outputting outputfile Supported formats are: clustalw_aln, clustalw: ClustalW format. gcg, msf_aln : Msf alignment. pir_aln : pir alignment. fasta_aln : fasta alignment. phylip : Phylip format. pir_seq : pir sequences (no gap). fasta_seq : fasta sequences (no gap). As well as: score_html : causes the output to be a reliability plot in HTML score_pdf : idem in PDF. score_ps : idem in postscript. More than one format can be indicated: -output=clustalw,gcg, score_html =head2 CASE Title : CASE Args : upper, lower Default : upper Description : triggers choice of the case for output =head2 CPU Title : CPU Args : value Default : 0 Description : Indicates the cpu time (micro seconds) that must be added to the t_coffee computation time. =head2 OUT_LIB Title : OUT_LIB Args : name of library, default, no Default : default Description : Sets the name of the library output. Default implies .tc_lib =head2 OUTORDER Title : OUTORDER Args : input or aligned Default : input Description : Sets the name of the library output. Default implies .tc_lib =head2 SEQNOS Title : SEQNOS Args : on or off Default : off Description : Causes the output alignment to contain residue numbers at the end of each line: =head1 PARAMETERS FOR GENERIC OUTPUT =head2 RUN_NAME Title : RUN_NAME Args : your run name Default : Description : This flag causes the prefix to be replaced by when renaming the default files. =head2 ALIGN Title : ALIGN Args : Default : Description : Indicates that the program must produce the alignment. This flag is here for compatibility with ClustalW =head2 QUIET Title : QUIET Args : stderr, stdout, or filename, or nothing Default : stderr Description : Redirects the standard output to either a file. -quiet on its own redirect the output to /dev/null. =head2 CONVERT Title : CONVERT Args : Default : Description : Indicates that the program must not compute the alignment but simply convert all the sequences, alignments and libraries into the format indicated with -output. This flag can also be used if you simply want to compute a library ( i.e. You have an alignment and you want to turn it into a library). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich, Peter Schattner Email jason-at-bioperl-dot-org, schattner@alum.mit.edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Alignment::TCoffee; use vars qw($AUTOLOAD @ISA $PROGRAM_NAME $PROGRAM_DIR %DEFAULTS @TCOFFEE_PARAMS @TCOFFEE_SWITCHES @OTHER_SWITCHES %OK_FIELD ); use strict; use Cwd; use Bio::Seq; use Bio::SeqIO; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); # You will need to enable TCoffee to find the tcoffee program. This can be done # in (at least) twp ways: # 1. define an environmental variable TCOFFEE: # export TCOFFEEDIR=/home/progs/tcoffee or # 2. include a definition of an environmental variable TCOFFEEDIR # in every script that will # use Bio::Tools::Run::Alignment::TCoffee.pm. # BEGIN {$ENV{TCOFFEEDIR} = '/home/progs/tcoffee'; } BEGIN { $PROGRAM_NAME = 't_coffee'; $PROGRAM_DIR = $ENV{'TCOFFEEDIR'}; %DEFAULTS = ( 'MATRIX' => 'blosum', 'OUTPUT' => 'clustalw', 'AFORMAT'=> 'msf', 'METHODS' => [qw(lalign_id_pair clustalw_pair)] ); @TCOFFEE_PARAMS = qw(IN TYPE PARAMETERS DO_NORMALISE EXTEND DP_MODE KTUPLE NDIAGS DIAG_MODE SIM_MATRIX MATRIX GAPOPEN GAPEXT COSMETIC_PENALTY TG_MODE WEIGHT SEQ_TO_ALIGN NEWTREE USETREE TREE_MODE OUTFILE OUTPUT CASE CPU OUT_LIB OUTORDER SEQNOS RUN_NAME CONVERT ); @TCOFFEE_SWITCHES = qw(QUICKTREE); @OTHER_SWITCHES = qw(QUIET ALIGN KEEPDND); # Authorize attribute fields foreach my $attr ( @TCOFFEE_PARAMS, @TCOFFEE_SWITCHES, @OTHER_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/); # don't want named parameters $self->$attr($value); } $self->matrix($DEFAULTS{'MATRIX'}) unless( $self->matrix ); $self->output($DEFAULTS{'OUTPUT'}) unless( $self->output ); $self->methods($DEFAULTS{'METHODS'}) unless $self->methods; # $self->aformat($DEFAULTS{'AFORMAT'}) unless $self->aformat; return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; # aliasing $attr = 'OUTFILE' if $attr eq 'OUTFILE_NAME'; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe -quiet=stdout 2>&1` ; $string =~ /Version_([\d.]+)/; return $1 || undef; } =head2 run Title : run Usage : my $output = $application->run(-seq => $seq, -profile => $profile, -type => 'profile-aln'); Function: Generic run of an application Returns : Bio::SimpleAlign object Args : key-value parameters allowed for TCoffee runs AND -type => profile-aln or alignment for profile alignments or just multiple sequence alignment -seq => either Bio::PrimarySeqI object OR array ref of Bio::PrimarySeqI objects OR filename of sequences to run with -profile => profile to align to, if this is an array ref will specify the first two entries as the two profiles to align to each other =cut sub run{ my ($self,@args) = @_; my ($type,$seq,$profile) = $self->_rearrange([qw(TYPE SEQ PROFILE)], @args); if( $type =~ /align/i ) { return $self->align($seq); } elsif( $type =~ /profile/i ) { return $self->profile_align($profile,$seq); } else { $self->warn("unrecognized alignment type $type\n"); } return undef; } =head2 align Title : align Usage : $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); or $seq_array_ref = \@seq_array; # @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); Function: Perform a multiple sequence alignment Returns : Reference to a SimpleAlign object containing the sequence alignment. Args : Name of a file containing a set of unaligned fasta sequences or else an array of references to Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or a reference to an array of Bio::Seq objects. If argument is string, throws exception if file corresponding to string name can not be found. If argument is Bio::Seq array, throws exception if less than two sequence objects are in array. =cut sub align { my ($self,$input) = @_; # Create input file pointer $self->io->_io_cleanup(); my ($infilename,$type) = $self->_setinput($input); if (!$infilename) { $self->throw("Bad input data or less than 2 sequences in $input !"); } my $param_string = $self->_setparams(); # run tcoffee return $self->_run('align', [$infilename,$type], $param_string); } ################################################# =head2 profile_align Title : profile_align Usage : Function: Perform an alignment of 2 (sub)alignments Example : Returns : Reference to a SimpleAlign object containing the (super)alignment. Args : Names of 2 files containing the subalignments or references to 2 Bio::SimpleAlign objects. Note : Needs to be updated to run with newer TCoffee code, which allows more than two profile alignments. Throws an exception if arguments are not either strings (eg filenames) or references to SimpleAlign objects. =cut sub profile_align { my $self = shift; my $input1 = shift; my $input2 = shift; my ($temp,$infilename1,$infilename2,$type1,$type2,$input,$seq); $self->io->_io_cleanup(); # Create input file pointers ($infilename1,$type1) = $self->_setinput($input1); ($infilename2,$type2) = $self->_setinput($input2); unless ($type1) { $self->throw("Unknown type for first argument"); } unless ($type2) { $self->throw("Unknown type for second argument") } if (!$infilename1 || !$infilename2) { $self->throw("Bad input data: $input1 or $input2 !"); } my $param_string = $self->_setparams(); # run tcoffee my $aln = $self->_run('profile-aln', [$infilename1,$type1], [$infilename2,$type2], $param_string) ; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to tcoffee program Example : Returns : nothing; tcoffee output is written to a temporary file OR specified output file Args : Name of a file containing a set of unaligned fasta sequences and hash of parameters to be passed to tcoffee =cut sub _run { my ($infilename, $infile1,$infile2) = ('','',''); my $self = shift; my $command = shift; my $instring; if ($command =~ /align/) { my $infile = shift ; my $type; ($infilename,$type) = @$infile; $instring = '-in='.join(',',($infilename, 'X'.$self->matrix, $self->methods)); } if ($command =~ /profile/) { my $in1 = shift ; my $in2 = shift ; my ($type1,$type2); ($infile1,$type1) = @$in1; ($infile2,$type2) = @$in2; # in later versions (tested on 5.72 and 7.54) the API for profile # alignment changed. This attempts to do the right thing for older # versions but corrects for newer ones if ($self->version && $self->version < 5) { # this breaks severely on newer TCoffee (>= v5) unless (($self->matrix =~ /none/i) || ($self->matrix =~ /null/i) ) { $instring = '-in='.join(',', ($type2.$infile2), 'X'.$self->matrix, (map {'M'.$_} $self->methods) ); $instring .= ' -profile='.$infile1; } else { $instring = '-in='.join(',',( $type1.$infile1, $type2.$infile2, (map {'M'.$_} $self->methods) ) ); } } else { if ($type2 eq 'S') { # second infile is a sequence, not an alignment $instring .= ' -profile='.join(',',$infile1); $instring .= ' -seq='.join(',',$infile2); } else { $instring .= ' -profile='.join(',',$infile1,$infile2); } $instring .= ' -matrix='.$self->matrix unless (($self->matrix =~ /none/i) || ($self->matrix =~ /null/i)) ; $instring .= ' -method='.join(',',$self->methods) if ($self->methods) ; } } my $param_string = shift; # my ($paramfh,$parameterFile) = $self->io->tempfile; # print $paramfh ( "$instring\n-output=gcg$param_string") ; # close($paramfh); # my $commandstring = "t_coffee -output=gcg -parameters $parameterFile" ; ##MJL my $commandstring = $self->executable." $instring $param_string"; #$self->debug( "tcoffee command = $commandstring \n"); my $status = system($commandstring); my $outfile = $self->outfile(); if( !-e $outfile || -z $outfile ) { $self->warn( "TCoffee call crashed: $? [command $commandstring]\n"); return undef; } # retrieve alignment (Note: MSF format for AlignIO = GCG format of # tcoffee) my $in = Bio::AlignIO->new('-file' => $outfile, '-format' => $self->output); my $aln = $in->next_aln(); # Replace file suffix with dnd to find name of dendrogram file(s) to delete if( ! $self->keepdnd ) { foreach my $f ( $infilename, $infile1, $infile2 ) { next if( !defined $f || $f eq ''); $f =~ s/\.[^\.]*$// ; # because TCoffee writes these files to the CWD if( $Bio::Root::IO::PATHSEP ) { my @line = split(/$Bio::Root::IO::PATHSEP/, $f); $f = pop @line; } else { (undef, undef, $f) = File::Spec->splitpath($f); } unlink $f .'.dnd' if( $f ne '' ); } } return $aln; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for tcoffee program Example : Returns : name of file containing tcoffee data input AND type of file (if known, S for sequence, L for sequence library, A for sequence alignment) Args : Seq or Align object reference or input file name =cut sub _setinput { my ($self,$input) = @_; my ($infilename, $seq, $temp, $tfh); # If $input is not a reference it better be the name of a # file with the sequence/ alignment data... my $type = ''; if (! ref $input) { # check that file exists or throw $infilename = $input; unless (-e $input) {return 0;} # let's peek and guess open(my $IN,$infilename) || $self->throw("Cannot open $infilename"); my $header = <$IN>; if( $header =~ /^\s+\d+\s+\d+/ || $header =~ /Pileup/i || $header =~ /clustal/i ) { # phylip $type = 'A'; } # On some systems, having filenames with / in them (ie. a file in a # directory) causes t-coffee to completely fail. It warns on all systems. # The -no_warning option solves this, but there is still some strange # bug when doing certain profile-related things. This is magically solved # by copying the profile file to a temp file in the current directory, so # it its filename supplied to t-coffee contains no / # (It's messy here - I just do this to /all/ input files to most easily # catch all variants of providing a profile - it may only be the last # form (isa("Bio::PrimarySeqI")) that causes a problem?) my (undef, undef, $adjustedfilename) = File::Spec->splitpath($infilename); if ($adjustedfilename ne $infilename) { my ($fh, $tempfile) = $self->io->tempfile(-dir => cwd()); seek($IN, 0, 0); while (<$IN>) { print $fh $_; } close($fh); (undef, undef, $tempfile) = File::Spec->splitpath($tempfile); $infilename = $tempfile; $type = 'S'; } close($IN); return ($infilename,$type); } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an array of BioSeq objects... # Open temporary file for both reading & writing of array ($tfh,$infilename) = $self->io->tempfile(-dir => cwd()); (undef, undef, $infilename) = File::Spec->splitpath($infilename); if( ! ref($input->[0]) ) { $self->warn("passed an array ref which did not contain objects to _setinput"); return undef; } elsif( $input->[0]->isa('Bio::PrimarySeqI') ) { $temp = Bio::SeqIO->new('-fh' => $tfh, '-format' => 'fasta'); my $ct = 1; foreach $seq (@$input) { return 0 unless ( ref($seq) && $seq->isa("Bio::PrimarySeqI") ); if( ! defined $seq->display_id || $seq->display_id =~ /^\s+$/) { $seq->display_id( "Seq".$ct++); } $temp->write_seq($seq); } $temp->close(); undef $temp; close($tfh); $tfh = undef; $type = 'S'; } elsif( $input->[0]->isa('Bio::Align::AlignI' ) ) { $temp = Bio::AlignIO->new('-fh' => $tfh, '-format' => $self->aformat); foreach my $aln (@$input) { next unless ( ref($aln) && $aln->isa("Bio::Align::AlignI") ); $temp->write_aln($aln); } $temp->close(); undef $temp; close($tfh); $tfh = undef; $type = 'A'; } else { $self->warn( "got an array ref with 1st entry ". $input->[0]. " and don't know what to do with it\n"); } return ($infilename,$type); # $input may be a SimpleAlign object. } elsif ( $input->isa("Bio::Align::AlignI") ) { # Open temporary file for both reading & writing of SimpleAlign object ($tfh, $infilename) = $self->io->tempfile(-dir => cwd()); (undef, undef, $infilename) = File::Spec->splitpath($infilename); $temp = Bio::AlignIO->new(-fh=>$tfh, '-format' => 'clustalw'); $temp->write_aln($input); close($tfh); undef $tfh; return ($infilename,'A'); } # or $input may be a single BioSeq object (to be added to # a previous alignment) elsif ( $input->isa("Bio::PrimarySeqI")) { # Open temporary file for both reading & writing of BioSeq object ($tfh,$infilename) = $self->io->tempfile(-dir => cwd()); (undef, undef, $infilename) = File::Spec->splitpath($infilename); $temp = Bio::SeqIO->new(-fh=> $tfh, '-format' =>'Fasta'); $temp->write_seq($input); $temp->close(); close($tfh); undef $tfh; return ($infilename,'S'); } else { $self->warn("Got $input and don't know what to do with it\n"); } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for tcoffee program Example : Returns : parameter string to be passed to tcoffee during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @TCOFFEE_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; if( $attr_key =~ /matrix/ ) { $self->{'_in'} = [ "X".lc($value) ]; } else { $attr_key = ' -'.$attr_key; $param_string .= $attr_key .'='.$value; } } for $attr ( @TCOFFEE_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by tcoffee $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } # Set default output file if no explicit output file selected unless ($self->outfile ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile($outfile); $param_string .= " -outfile=$outfile" ; } if ($self->quiet() || $self->verbose < 0) { $param_string .= ' -quiet';} # -no_warning is required on some systems with certain versions or failure # is guaranteed if ($self->version >= 4 && $self->version < 4.7) { $param_string .= ' -no_warning'; } return $param_string; } =head2 aformat Title : aformat Usage : my $alignmentformat = $self->aformat(); Function: Get/Set alignment format Returns : string Args : string =cut sub aformat{ my $self = shift; $self->{'_aformat'} = shift if @_; return $self->{'_aformat'}; } =head2 methods Title : methods Usage : my @methods = $self->methods() Function: Get/Set Alignment methods - NOT VALIDATED Returns : array of strings Args : arrayref of strings =cut sub methods{ my ($self) = shift; @{$self->{'_methods'}} = @{shift || []} if @_; return @{$self->{'_methods'} || []}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $tcoffee->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $tcoffee->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Analysis.pm000066400000000000000000000573051302566030400226470ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Analysis # # Please direct questions and support issues to # # Cared for by Martin Senger # For copyright and disclaimer see below. # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Analysis - Module representing any (remote or local) analysis tool =head1 SYNOPSIS # run analysis 'seqret' using a default location and a default # access method (which means using a Web Service at EBI) use Bio::Tools::Run::Analysis; print new Bio::Tools::Run::Analysis (-name => 'edit::seqret') ->wait_for ({ sequence_direct_data => 'tatatacgtatacga', osformat => 'embl' }) ->result ('outseq'); # run a longer job without waiting for its completion use Bio::Tools::Run::Analysis; my $job = Bio::Tools::Run::Analysis->new(-name => 'edit::seqret') ->run ({ sequence_direct_data => 'tatatacgtatacga', osformat => 'embl' }); # ...and after a while $job->result ('outseq'); # get all results in the same invocation (as a hash reference # with result names as keys) - let the module decide which # results are binary (images in this examples) and save those # in file (or files); it also shows how to tell that the module # should read input data from a local file first use Bio::Tools::Run::Analysis; my $results = Bio::Tools::Run::Analysis->new(-name => 'alignment_multiple::prettyplot') ->wait_for ( { msf_direct_data => '@/home/testdata/my.seq' } ) ->results ('?'); use Data::Dumper; print Dumper ($results); # get names, types of all inputs and results, # get short and detailed (in XML) service description use Bio::Tools::Run::Analysis; my $service = Bio::Tools::Run::Analysis->new(-name => 'edit::seqret'); my $hash1 = $service->input_spec; my $hash2 = $service->result_spec; my $hash3 = $service->analysis_spec; my $xml = $service->describe; # get current job status use Bio::Tools::Run::Analysis; print new Bio::Tools::Run::Analysis (-name => 'edit::seqret') ->run ( { #...input data... } ) ->status; # run a job and print its job ID, keep the job un-destroyed use Bio::Tools::Run::Analysis; my $job = Bio::Tools::Run::Analysis->new(-name => 'edit::seqret', -destroy_on_exit => 0) ->run ( { sequence_direct_data => '@/home/testdata/mzef.seq' } ); print $job->id . "\n"; # ...it prints (for example): # edit::seqret/c8ef56:ef535489ac:-7ff4 # ...in another time, on another planet, you may say use Bio::Tools::Run::Analysis; my $job = Bio::Tools::Run::Analysis::Job->new(-name => 'edit::seqret', -id => 'edit::seqret/c8ef56:ef535489ac:-7ff4'); print join ("\n", $job->status, 'Finished: ' . $job->ended (1), # (1) means 'formatted' 'Elapsed time: ' . $job->elapsed, $job->last_event, $job->result ('outseq') ); # ...or you may achieve the same keeping module # Bio::Tools::Run::Analysis::Job invisible use Bio::Tools::Run::Analysis; my $job = Bio::Tools::Run::Analysis->new(-name => 'edit::seqret') ->create_job ('edit::seqret/c8ef56:ef535489ac:-7ff4'); print join ("\n", $job->status, # ... ); # ...and later you may free this job resources $job->remove; # # --- See DESCRIPTION for using generator 'applmaker.pl': # =head1 DESCRIPTION The module represents an access to the local and/or remote analysis tools in a unified way that allows adding new access methods (protocols) seamlessly. At the moment of writing, there is available a I access to almost all EMBOSS applications, running at the European Bioinformatics Institute. The documentation of all C methods are to be found in C. A tutorial (and examples how to call almost all public methods) is in the script C (go to the C directory and type C). The module C uses general approach allowing to set arbitrary input data and to retrieve results by naming them. However, sometimes is more convenient to use a specific module, representing one analysis tool, that already knows about available input and result names. Such analyses-specific Perl modules can be generated by C generator. Its features and usage are documented in the generator (go to the C directory and type C). # this will generate module Seqret.pm perl papplmaker.PLS -n edit.seqret -m Seqret # ...which can be used with data-specific methods use Seqret; my $outseq = new Seqret ->sequence_direct_data ('@/home/testdata/my.seq') ->osformat ('embl') ->wait_for ->outseq ; print $outseq; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Martin Senger (martin.senger@gmail.com) =head1 COPYRIGHT Copyright (c) 2003, Martin Senger and EMBL-EBI. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =head1 SEE ALSO =over =item * http://www.ebi.ac.uk/soaplab/Perl_Client.html =back =head1 APPENDIX Here is the rest of the object methods. Internal methods are preceded with an underscore _. =cut # Let the code begin... package Bio::Tools::Run::Analysis; use vars qw(@ISA $Revision); use strict; use Bio::Root::Root; use Bio::AnalysisI; @ISA = qw(Bio::Root::Root Bio::AnalysisI); BEGIN { $Revision = q[$Id$]; } # ----------------------------------------------------------------------------- =head2 new Usage : my $tool = Bio::Tools::Run::Analysis->new(-access => 'soap', -name => 'edit.seqret', ... ); Returns : a new Bio::Tools::Run::Analysis object representing the given tool Args : There may be additional arguments which are specific to the access method (see methods 'new' or '_initialize' of the access-specific implementations (such as module Bio::Tools::Run::Analysis::soap for a SOAP-based access). The recognised and used arguments are: -access -location -name -httpproxy -timeout It builds, populates and returns a new C object. This is how it is seen from the outside. But in fact, it builds, populates and returns a more specific lower-level object, for example C object - which one it depends on the C<-access> parameter. =over =item -access It indicates what lower-level module to load. Default is 'soap'. Other (but future) possibilities may be: -access => 'novella' -access => 'local' =item -location A location of the service. The contents is access-specific (see details in the lower-level implementation modules). Default is C ( services running at European Bioinformatics Institute on top of most of EMBOSS analyses, and on few others). =item -name A name of an analysis tool, or a name of its higher-level abstraction, possibly including a category where the analysis belong to. There is no default value (which usually means that this parameter is mandatory unless your I<-location> parameter includes also the name (but it is then access-dependent). =item -destroy_on_exit =E '0' Default value is '1' which means that all Bio::Tools::Run::Analysis::Job objects - when being finalised - will send a request to the remote site to forget the results of these jobs. If you change it to '0' make sure that you know the job identification - otherwise you will not be able to re-established connection with it (later, when you use your program again). This can be done by calling method C on the job object (such object is returned by any of these methods: C, C, C). =item -httpproxy In addition to the I parameter, you may need to specify also a location/URL of an HTTP proxy server (if your site requires one). The expected format is C. There is no default value. It is also an access-specific parameter which may not be used by all access methods. =item -timeout For long(er) running jobs the HTTP connection may be time-outed. In order to avoid it (or, vice-versa, to call timeout sooner) you may specify C with the number of seconds the connection will be kept alive. Zero means to keep it alive forever. The default value is two minutes. =back =cut sub new { my ($caller,@args) = @_; my $class = ref($caller) || $caller; if ($class eq 'Bio::Tools::Run::Analysis') { # this is called only the first time when somebody calls: 'new # Bio::Tools::Run::Analysis (...)', and it actually loads a 'real-work-doing' # module and call this new() method again (unless the loaded # module has its own new() method) my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys my $access = $param {'-access'} || # use -access parameter &Bio::Tools::Run::Analysis::Utils::_guess_access ( \%param ) || # or guess from other parameters 'soap'; # or use a default access method $access = "\L$access"; # normalize capitalization to lower case # remember the access method (putting it into @args means that the # object - when created - will remember it) push (@args, (-access => $access)) unless $param {'-access'}; # load module with the real implementation - as defined in $access return undef unless (&Bio::Tools::Run::Analysis::Utils::_load_access_module ($access)); # this calls this same method new() - but now its object part # (see the upper branche above) is called return "Bio::Tools::Run::Analysis::$access"->new (@args); } else { # if $caller is an object, or if it is an underlying # 'real-work-doing' class (e.g. Bio::Tools::Run::Analysis::soap) then # we want to call SUPER to create and bless a new object my ($self) = $class->SUPER::new (@args); # now the $self is an empty object - we will populate it from # the $caller - if $caller is an object (so we do cloning here) if (ref ($caller)) { %{ $self } = %{ $caller }; } # and finally add values from '@args' into the newly created # object (the values will overwrite the values copied above); # this is done by calling '_initialize' of the 'real-work-doing' # class (if there is no one there, there is always an empty one # in Bio::Root::Root) $self->_initialize (@args); return $self; } } # # Create a hash with named inputs, all extracted # from the given data. # sub _prepare_inputs { my $self = shift; my %inputs = (); # collect here input data foreach my $input (@_) { next unless defined $input; # an element can be an array reference # (with scalar elements: 'name = [[@]value]') if (ref $input eq 'ARRAY') { foreach my $elem (@$input) { unless (ref $elem) { # taking only scalars my ($name, $value) = split (/\s*=\s*/, $elem, 2); next unless $name; # am I paranoid ? $value = 1 unless defined $value; $inputs{$name} = $value; next; } } } # ...or an element can be a hash # (name => [@]value) elsif (ref $input eq 'HASH') { foreach my $name (keys %$input) { my $value = $$input{$name}; $inputs{$name} = $value; } } # ...or an element can be a scalar (which means that it # represents a name of a boolean parameter (an option) elsif (ref \$input eq 'SCALAR') { $input =~ s/^@/\\@/; # this cannot be a filename $inputs{$input} = 1; } # everything else is ignored else { warn "Unrecognized input data type: $input\n"; } } # extracted inputs may be actually filenames and we want the # contents of the files instead # TBD: to support also filehandlers here? foreach my $name (keys %inputs) { $inputs{$name} = $self->_read_value ($inputs{$name}); } return \%inputs; } # --- if a $value is a filename, read it and return its contents # otherwise return the $value itself; if $value start with # an escaped '@', change it to a normal '@' sub _read_value { my ($self, $value) = @_; return unless defined $value; if ($value =~ s/^\@//) { my ($buf); open (DATA, $value) || $self->throw ("Cannot read from '$value' ($!)"); binmode (DATA); undef $value; while (read (DATA, $buf, 8 * 2**10)) { $value .= $buf; } close DATA; } elsif ($value =~ s/^\\\@/@/) { } $value; } # --- save $value of result $name into file $filename + $seq; # use some default filename if $filename not given #$part = $self->_save_result (-value => $part, # -name => $name, # -filename => $filename, # -template => $template, # -seq => $seq++); sub _save_result { my ($self, %params) = @_; my $name = $params{'-name'} || 'result'; # invent filename (if not given) from the given or default template my $filename = $params{'-filename'}; unless ($filename) { $filename = $params{'-template'}; $filename = "\$ANALYSIS_*_$name" unless $filename; # replace $ANALYSIS and $RESULT in the filename if ($filename =~ /\$\{?ANALYSIS\}?/) { # (better to ask if we need it because getting # the analysis name may require going to server) my $analysis = $self->analysis_name; $analysis =~ s/[:\/]/_/g; # would be troubles in filename $filename =~ s/\$\{?ANALYSIS\}?/$analysis/ig; } $filename =~ s/\$\{?RESULT\}?/$name/ig; } # include the sequential number before file extension (if any) my $seq = $params{'-seq'}; if ($seq) { my $pos = rindex ($filename, '.'); if ($pos > -1) { substr ($filename, $pos, 0) = ".$seq"; # insert $seq } else { $filename .= ".$seq"; # add $seq } } # replace '*' in filename with a unique number while ($filename =~ /\*/) { my $unique_name; my $number = 1; while (1) { ($unique_name = $filename) =~ s/\*/$number/; last unless -e $unique_name; $number++; } $filename = $unique_name; } # and finally write the file open (DATA, ">$filename") || $self->throw ("Error by saving result '$name' into '$filename' ($!)"); binmode (DATA); print (DATA $params{'-value'}) || $self->throw ("Error by writing result '$name' into '$filename' ($!)"); close DATA || $self->throw ("Error by closing result '$name' in '$filename' ($!)"); return $filename; } =head2 VERSION and Revision Usage : print $Bio::Tools::Run::Analysis::VERSION; print $Bio::Tools::Run::Analysis::Revision; =cut # ----------------------------------------------------------------------------- # Bio::Tools::Run::Analysis::Job # A module representing an invocation (execution, job) of an analysis. # ----------------------------------------------------------------------------- package Bio::Tools::Run::Analysis::Job; =head1 Module Bio::Tools::Run::Analysis::Job It represents a job, a single execution of an analysis tool. Usually you do not instantiate these objects - they are returned by methods C, C, and C of C object. However, if you wish to re-create a job you need to know its ID (method C gives it to you). The ID can be passed directly to the C method, or again you may use C of a C object with the ID as parameter. See SYNOPSIS above for an example. Remember that all public methods of this module are described in details in interface module C and in the tutorial in the C script. =cut use vars qw(@ISA); use strict; use Bio::Root::Root; @ISA = qw(Bio::Root::Root Bio::AnalysisI::JobI); # ----------------------------------------------------------------------------- =head2 new Usage : my $job = Bio::Tools::Run::Analysis::Job->new (-access => 'soap', -name => 'edit.seqret', -id => 'xxxyyy111222333' ); Returns : a re-created object representing a job Args : The same arguments as for Bio::Tools::Run::Analysis object: -access -location -name -httpproxy -timeout (and perhaps others) Additionally and specifically for this object: -id -analysis =over =item -id A job ID created some previous time and now used to re-create the same job (in order to re-gain access to this job results, for example). =item -analysis A C object whose properties (such as C<-access> and C<-location> are used to re-create this job object. =back =cut sub new { my ($caller, @args) = @_; my $class = ref($caller) || $caller; if ($class eq 'Bio::Tools::Run::Analysis::Job') { # this is called only the first time when somebody calls: #'Bio::Tools::Run::Analysis::Job->new(...)' my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys if ($param {'-analysis'}) { # usually a new Job object is created from an existing # Analysis object - which means that the Analysis already # loaded a 'real-work-doing' Job object, so we need just # to create a Job object (by calling its new() method, # which calls actually this new() method again - but its # 'object' part - see below my $analysis = $param {'-analysis'}; return undef unless $analysis->{'_access'}; # TBD: error message here? my $access = $analysis->{'_access'}; return "Bio::Tools::Run::Analysis::Job::$access"->new (@args); } else { # if a new Job object is created directly (by a user, not # by a parent Analysis object) we need to create the # Analysis object first (because it is the Analysis object # who knows how to contact the underlying analysis tool), # and only then let the Analysis create this Job object # (which may be an empty Job - if there is no 'id' in @args) return new Bio::Tools::Run::Analysis (@args)->create_job ($param {'-id'}); } } else { # if $caller is an object, or if it is an underlying # 'real-work-doing' class (e.g. Bio::Tools::Run::Analysis::Job::soap) then # we want to call SUPER to create and bless a new object my ($self) = $class->SUPER::new (@args); # now the $self is an empty object - we will populate it from # the $caller - if $caller is an object (so we do cloning here) if (ref ($caller)) { %{ $self } = %{ $caller }; } # and finally add values from '@args' into the newly created # object (the values will overwrite the values copied above); # this is done by calling '_initialize' of the 'real-work-doing' # class (if there is no one there, there is always an empty one # in Bio::Root::Root) $self->_initialize (@args); return $self; } } sub id { shift->{'_id'}; } # --------------------------------------------------------------------- # # A Utility module... # # --------------------------------------------------------------------- package Bio::Tools::Run::Analysis::Utils; =head1 Module Bio::Tools::Run::Analysis::Utils It contains several general utilities. These are C, not methods. Therefore call them like, for example: &Bio::Tools::Run::Analysis::Utils::format_time (...); =cut # ----------------------------------------------------------------------------- =head2 format_time Usage : Bio::Tools::Run::Analysis::Utils::format_time ($time); Returns : Slightly formatted $time Args : $time is number of seconds from the beginning of Epoch It returns what C returns which means that return value is different in the array and scalar context (see localtime). If C<$time> is ``-1'' it returns 'n/a' (in the scalar context) or an empty array (in the array context). If C<$time> is too small to represent the distance from the beginning of the Epoch, it returns it unchanged (the same in any contex) - this is reasonable for C<$time> representing an elapsed time. The function is used to format times coming back from various job time methods. =cut sub format_time { my $time = shift; return wantarray ? () : 'n/a' if "$time" eq '-1'; return $time if $time < 1000000000; return localtime $time; } # ----------------------------------------------------------------------------- # It processes given result names which may be of various different # types and returns a hash reference with result names as keys and # values being result destinations (such as file names, or templates # how to create filenames. # # Or, it returns a scalar ('@[template]' or '?[template]') if there # were no real result names but only a global rule how to create # result destinantions for all results. # # Or, it returns 'undef' if there were no result names at all. sub normalize_names { return undef unless @_; my %names = (); foreach (@_) { if (ref $_ eq 'HASH') { %names = (%names, %$_); } elsif (not ref $_) { my ($name, $dest) = split (/\s*=\s*/, $_, 2); return $name if $name =~ /^@/; # special: it nullifies other rules return $name if $name =~ /^\?/; # ditto $names{$name} = $dest; # $dest may be undef } } \%names; } # ----------------------------------------------------------------------------- =head2 _load_access_module Usage : $class->_load_access_module ($access) Returns : 1 on success, undef on failure Args : 'access' should contain the last part of the name of a module who does the real implementation It does (in the run-time) a similar thing as require Bio::Tools::Run::Analysis::$access It prints an error on STDERR if it fails to find and load the module (for example, because of the compilation errors in the module). =cut sub _load_access_module { my ($access) = @_; my $load = "Bio/Tools/Run/Analysis/$access.pm"; eval { require $load; }; if ( $@ ) { Bio::Root::Root->throw (<. Rememeber that this method is called only if there was no I<-access> parameter which could tell directly what access method to use. =cut sub _guess_access { my ($rh_params) = @_; return undef; } 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Analysis/000077500000000000000000000000001302566030400222775ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Analysis/soap.pm000066400000000000000000000533021302566030400236020ustar00rootroot00000000000000# $Id$ # # BioPerl module Bio::Tools::Run::Analysis::soap.pm # # Please direct questions and support issues to # # Cared for by Martin Senger # For copyright and disclaimer see below. # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Analysis::soap - A SOAP-based access to the analysis tools =head1 SYNOPSIS Do not use this object directly, it is recommended to access it and use it through the C module: use Bio::Tools::Run::Analysis; my $tool = Bio::Tools::Run::Analysis->new(-access => 'soap', -name => 'seqret'); =head1 DESCRIPTION This object allows to execute and to control a remote analysis tool (an application, a program) using the SOAP middleware, All its public methods are documented in the interface module C and explained in tutorial available in the C script. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Martin Senger (martin.senger@gmail.com) =head1 COPYRIGHT Copyright (c) 2003, Martin Senger and EMBL-EBI. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =head1 SEE ALSO =over 4 =item * http://www.ebi.ac.uk/soaplab/Perl_Client.html =back =head1 BUGS AND LIMITATIONS None known at the time of writing this. =head1 APPENDIX Here is the rest of the object methods. Internal methods are preceded with an underscore _. =cut # Let the code begin... package Bio::Tools::Run::Analysis::soap; use vars qw(@ISA $Revision $DEFAULT_LOCATION); use strict; use Bio::Tools::Run::Analysis; use SOAP::Lite on_fault => sub { my $soap = shift; my $res = shift; my $msg = ref $res ? "--- SOAP FAULT ---\n" . 'faultcode: ' . $res->faultcode . "\n" . 'faultstring: ' . Bio::Tools::Run::Analysis::soap::_clean_msg ($res->faultstring) : "--- TRANSPORT ERROR ---\n" . $soap->transport->status . "\n$res\n"; Bio::Tools::Run::Analysis::soap->throw ($msg); } ; @ISA = qw(Bio::Tools::Run::Analysis); BEGIN { $Revision = q[$Id$]; # where to go $DEFAULT_LOCATION = 'http://www.ebi.ac.uk/soaplab/services'; } # ----------------------------------------------------------------------------- =head2 _initialize Usage : my $tool = Bio::Tools::Run::Analysis->new(-access => 'soap', -name => 'seqret', ...); (_initialize is internally called from the 'new()' method) Returns : nothing interesting Args : This module recognises and uses following arguments: -location -name -httpproxy -timeout Additionally, the main module Bio::Tools::Run::Analysis recognises also: -access It populates calling object with the given arguments, and then - for some attributes and only if they are not yet populated - it assigns some default values. This is an actual new() method (except for the real object creation and its blessing which is done in the parent class Bio::Root::Root in method _create_object). Note that this method is called always as an I method (never as a I method) - and that the object who calls this method may already be partly initiated (from Bio::Tools::Run::Analysis::new method); so if you need to do some tricks with the 'class invocation' you need to change Bio::Analysis I method, not this one. =over 4 =item -location A URL (also called an I) defining where is located a Web Service representing this analysis tool. Default is C (services running at European Bioinformatics Institute on top of most of EMBOSS analyses, and few others). For example, if you run your own Web Service using Java(TM) Apache Axis toolkit, the location might be something like C. =item -name A name of a Web Service (also called a I or a I). There is no default value (which usually means that this parameter is mandatory unless your I<-location> parameter includes also a Web Service name). =item -destroy_on_exit =E '0' Default value is '1' which means that all Bio::Tools::Run::Analysis::Job objects - when being finalised - will send a request to the remote Web Service to forget the results of these jobs. If you change it to '0' make sure that you know the job identification - otherwise you will not be able to re-established connection with it (later, when you use your script again). This can be done by calling method C on the job object (such object is returned by any of these methods: C, C, C). =item -httpproxy In addition to the I parameter, you may need to specify also a location/URL of an HTTP proxy server (if your site requires one). The expected format is C. There is no default value. =item -timeout For long(er) running jobs the HTTP connection may be time-outed. In order to avoid it (or, vice-versa, to call timeout sooner) you may specify C with the number of seconds the connection will be kept alive. Zero means to keep it alive forever. The default value is two minutes. =back =cut sub _initialize { my ($self, @args) = @_; # make a hashtable from @args my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys # copy all @args into this object (overwriting what may already be # there) - changing '-key' into '_key' my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ $new_key } = $param { $key }; } # finally add default values for those keys who have default value # and who are not yet in the object $self->{'_location'} = $DEFAULT_LOCATION unless $self->{'_location'}; # create a SOAP::Lite object, the main worker if (defined $self->{'_httpproxy'}) { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}, timeout => (defined $self->{'_timeout'} ? $self->{'_timeout'} : 120), proxy => ['http' => $self->{'_httpproxy'}]); } else { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}, timeout => (defined $self->{'_timeout'} ? $self->{'_timeout'} : 120), ); } $self->{'_soap'}->uri ($self->{'_name'}) if $self->{'_name'}; # forget cached things which should not be cloned into new # instances (because they may represent a completely different # analysis delete $self->{'_analysis_spec'}; delete $self->{'_input_spec'}; delete $self->{'_result_spec'}; } # # Create a hash with named inputs, all extracted # from the given data. # # The main job is done in the SUPER class - here we do # only the SOAP-specific stuff. # sub _prepare_inputs { my $self = shift; my $rh_inputs = $self->SUPER::_prepare_inputs (@_); foreach my $name (keys %{$rh_inputs}) { my $value = $$rh_inputs{$name}; # value of type ref ARRAY is send as byte[][] if (ref $value eq 'ARRAY') { my @bytes = map { SOAP::Data->new (type => 'base64', value => $_) } @$value; $$rh_inputs{$name} = \@bytes; next; } } return $rh_inputs; } # --------------------------------------------------------------------- # # Here are the methods implementing Bio::AnalysisI interface # (documentation is in Bio::AnalysisI) # # --------------------------------------------------------------------- sub analysis_name { my $self = shift; ${ $self->analysis_spec }{'name'}; } # Map getAnalysisType() sub analysis_spec { my ($self) = @_; return $self->{'_analysis_spec'} if $self->{'_analysis_spec'}; my $soap = $self->{'_soap'}; $self->{'_analysis_spec'} = $soap->getAnalysisType->result; } # String describe() sub describe { my ($self) = @_; my $soap = $self->{'_soap'}; $soap->describe->result; } # Map[] getInputSpec() sub input_spec { my ($self) = @_; return $self->{'_input_spec'} if $self->{'_input_spec'}; my $soap = $self->{'_soap'}; $self->{'_input_spec'} = $soap->getInputSpec->result; } # Map[] getResultSpec() sub result_spec { my ($self) = @_; return $self->{'_result_spec'} if $self->{'_result_spec'}; my $soap = $self->{'_soap'}; $self->{'_result_spec'} = $soap->getResultSpec->result; } # String createJob (Map inputs) # String createJob (String id) # String createJob () sub create_job { my ($self, $params) = @_; my $job_id; my $force_to_live; # if $params is a reference then it contains *all* input data # (see details in '_prepare_inputs' how they can be coded) - # send it to the server to get a unique job ID if (ref $params) { my $rh_inputs = $self->_prepare_inputs ($params); my $soap = $self->{'_soap'}; $job_id = $soap->createJob (SOAP::Data->type (map => $rh_inputs))->result; # if $params is a defined scalar it represents a job ID obtained in # some previous invocation - such job already exists on the server # side, just re-create it here using the same job ID # (in this case, such job will *not* be implicitly destroyed on exit) } elsif (defined $params) { $job_id = $params; $force_to_live = 1; # finally, if $params is undef, ask server to create an empty job # (and give me its unique job ID), the input data may be added # later using 'set_data' method(s) - see scripts/applmaker.pl } else { my $soap = $self->{'_soap'}; $job_id = $soap->createEmptyJob->result; # this method may not exist on server (TBD) } if ($force_to_live) { return new Bio::Tools::Run::Analysis::Job (-analysis => $self, -id => $job_id, -destroy_on_exit => 0, ); } elsif (defined $self->{'_destroy_on_exit'}) { return new Bio::Tools::Run::Analysis::Job (-analysis => $self, -id => $job_id, -destroy_on_exit => $self->{'_destroy_on_exit'}, ); } else { return new Bio::Tools::Run::Analysis::Job (-analysis => $self, -id => $job_id, ); } } # String createAndRun (Map inputs) sub run { my $self = shift; return $self->create_job (@_)->run; } # Map runAndWaitFor (Map inputs) sub wait_for { my $self = shift; return $self->run (@_)->wait_for; } # --------------------------------------------------------------------- # # Here are internal methods fo Bio::Tools::Run::Analysis::soap... # # --------------------------------------------------------------------- # Do something (or nothing) with $rh_resuls (coming from the server) # depending on rules defined in $rh_rules. # # $rh_results: keys are result names, values are results themselves # (either scalars or array references - if one result is split into # more parts). # # $rh_rules: keys are result names, values say what to do with # results: undef ... do nothing, return unchanged result # - ... send it to STDOUT, return nothing # @[template] ... put it into file (invent its name, # perhaps based on template), return filename # ?[template] ... ask server for result type, then decide: # put a binary result into file (invent its name) # and return the filename, for other result type # do nothing and return result unchanged # Special cases: if $rh_rules is scalar '@[template]', do with ALL results # as described above for @[template], or # if $rh_rules is scalar '?[template]', do with ALL results # as described above for ?[template]. sub _process_results { my ($self, $rh_results, $rh_rules) = @_; my $default_rule = $rh_rules if defined $rh_rules && $rh_rules =~ /^[\?@]/; foreach my $name (keys %$rh_results) { my $rule = $default_rule ? $default_rule : $$rh_rules{$name}; next unless $rule; next if $rule =~ /^\?/ && ! $self->is_binary ($name); my ($prefix, $template) = $rule =~ /^([\?@])(.*)/; $template = $ENV{'RESULT_FILENAME_TEMPLATE'} unless $template; my $filename = $rule unless $template || $prefix; my $stdout = ($rule eq '-'); if (ref $$rh_results{$name}) { # --- result value is an array reference my $seq = 1; foreach my $part (@{ $$rh_results{$name} }) { print STDOUT $part && next if $stdout; $part = $self->_save_result (-value => $part, -name => $name, -filename => $filename, -template => $template, -seq => $seq++); } } else { # --- result value is a scalar print STDOUT $$rh_results{$name} && next if $stdout; $$rh_results{$name} = $self->_save_result (-value => $$rh_results{$name}, -name => $name, -filename => $filename, -template => $template); } delete $$rh_results{$name} if $stdout; } $rh_results; } # --------------------------------------------------------------------- # # is the given result $name binary? # =head2 is_binary Usage : if ($service->is_binary ('graph_result')) { ... } Returns : 1 or 0 Args : $name is a result name we are interested in =cut sub is_binary { my ($self, $name) = @_; foreach my $result (@{ $self->result_spec }) { if ($result->{'name'} eq $name) { return ($result->{'type'} =~ /^byte\[/); } } return 0; } # --------------------------------------------------------------------- # # Here are internal subroutines (NOT methods) # for Bio::Tools::Run::Analysis::soap # # --------------------------------------------------------------------- sub _clean_msg { my ($msg) = @_; $msg =~ s/^org\.embl\.ebi\.SoaplabShare\.SoaplabException\:\s*//; $msg; } # --------------------------------------------------------------------- # # Here is the rest of Bio::Analysis::soap # # --------------------------------------------------------------------- =head2 VERSION and Revision Usage : print $Bio::Tools::Run::Analysis::soap::VERSION; print $Bio::Tools::Run::Analysis::soap::Revision; =cut =head2 Defaults Usage : print $Bio::Tools::Run::Analysis::soap::DEFAULT_LOCATION; =cut # --------------------------------------------------------------------- # # Bio::Tools::Run::Analysis::Job::soap # ------------------------------------ # A module representing a job (an invocation, an execution) # of an analysis (the analysis itself is represented by # a Bio::Tools::Run::Analysis::soap object) # # Documentation is in Bio::AnalysisI::JobI. # # --------------------------------------------------------------------- package Bio::Tools::Run::Analysis::Job::soap; use vars qw(@ISA); use strict; @ISA = qw(Bio::Tools::Run::Analysis::Job); sub _initialize { my ($self, @args) = @_; # make a hashtable from @args my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys # copy all @args into this object (overwriting what may already be # there) - changing '-key' into '_key' my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ $new_key } = $param { $key }; } # finally add default values for those keys who have default value # and who are not yet in the object $self->{'_destroy_on_exit'} = 1 unless defined $self->{'_destroy_on_exit'}; } # --------------------------------------------------------------------- # # Here are the methods implementing Bio::AnalysisI::JobI interface # (documentation is in Bio::AnalysisI) # # --------------------------------------------------------------------- # void run (String jobID) sub run { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->run (SOAP::Data->type (string => $self->{'_id'})); return $self; } # void waitFor (String jobID) sub wait_for { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->waitFor (SOAP::Data->type (string => $self->{'_id'})); return $self; } # void terminate (String jobID) sub terminate { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->terminate (SOAP::Data->type (string => $self->{'_id'})); return $self; } # String getLastEvent (String jobID) sub last_event { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->getLastEvent (SOAP::Data->type (string => $self->{'_id'}))->result; } # String getStatus (String jobID) sub status { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->getStatus (SOAP::Data->type (string => $self->{'_id'}))->result; } # long getCreated (String jobID) sub created { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $time = $soap->getCreated (SOAP::Data->type (string => $self->{'_id'}))->result; $formatted ? Bio::Tools::Run::Analysis::Utils::format_time ($time) : $time; } # long getStarted (String jobID) sub started { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $time = $soap->getStarted (SOAP::Data->type (string => $self->{'_id'}))->result; $formatted ? Bio::Tools::Run::Analysis::Utils::format_time ($time) : $time; } # long getEnded (String jobID) sub ended { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $time = $soap->getEnded (SOAP::Data->type (string => $self->{'_id'}))->result; $formatted ? Bio::Tools::Run::Analysis::Utils::format_time ($time) : $time; } # long getElapsed (String jobID) sub elapsed { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; $soap->getElapsed (SOAP::Data->type (string => $self->{'_id'}))->result; } # Map getCharacterictics (String jobID) sub times { my ($self, $formatted) = @_; my $soap = $self->{'_analysis'}->{'_soap'}; my $rh_times = $soap->getCharacteristics (SOAP::Data->type (string => $self->{'_id'}))->result; map { $_ = Bio::Tools::Run::Analysis::Utils::format_time ($_) } values %$rh_times if $formatted; return $rh_times; } # Map getResults (String jobID) # Map getResults (String jobID, String[] resultNames) # Retrieving NAMED results: # ------------------------- # results ('name1', ...) => return results as they are, no storing into files # # results ( { 'name1' => 'filename', ... } ) => store into 'filename', return 'filename' # results ( 'name1=filename', ...) => ditto # # results ( { 'name1' => '-', ... } ) => send result to the STDOUT, do not return anything # results ( 'name1=-', ...) => ditto # # results ( { 'name1' => '@', ... } ) => store into file whose name is invented by # this method, perhaps using RESULT_NAME_TEMPLATE env # results ( 'name1=@', ...) => ditto # # results ( { 'name1' => '?', ... } ) => find of what type is this result and then use # {'name1'=>'@' for binary files, and a regular # return for non-binary files # results ( 'name=?', ...) => ditto # # Retrieving ALL results: # ----------------------- # results() => return all results as they are, no storing into files # # results ('@') => return all results, as if each of them given # as {'name' => '@'} (see above) # # results ('?') => return all results, as if each of them given # as {'name' => '?'} (see above) # # Misc: # ----- # * results(...) equals to result(...) # * any result can be returned as a scalar value, or as an array reference # (the latter is used for results consisting of more parts, such images); # this applies regardless whether the returned result is the result itself # or a filename created for the result sub results { my $self = shift; my $rh_names = Bio::Tools::Run::Analysis::Utils::normalize_names (@_); my $soap = $self->{'_analysis'}->{'_soap'}; if (ref $rh_names) { # retrieve only named results return $self->{'_analysis'}->_process_results ($soap->getSomeResults (SOAP::Data->type (string => $self->{'_id'}), [ keys %$rh_names ])->result, $rh_names); } else { # no result names given: take all return $self->{'_analysis'}->_process_results ($soap->getResults (SOAP::Data->type (string => $self->{'_id'}))->result, $rh_names); } } sub result { my $self = shift; my $rh_results = $self->results (@_); (values %$rh_results)[0]; } sub remove { shift->{'_destroy_on_exit'} = 1; } # # job objects are being destroyed if they have attribute # '_destroy_on_exit' set to true - which is a default value # (void destroy (String jobID) # sub DESTROY { my $self = shift; my $soap = $self->{'_analysis'}->{'_soap'}; return unless $self->{'_destroy_on_exit'} && $self->{'_id'}; # ignore all errors here eval { $soap->destroy (SOAP::Data->type (string => $self->{'_id'})); } } 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/AnalysisFactory.pm000066400000000000000000000251241302566030400241710ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::AnalysisFactory # # Please direct questions and support issues to # # Cared for by Martin Senger # For copyright and disclaimer see below. # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::AnalysisFactory - A directory of analysis tools =head1 SYNOPSIS # list all available analyses from the default location, # using a default (SOAP) access method use Bio::Tools::Run::AnalysisFactory; my $list = Bio::Tools::Run::AnalysisFactory->new(); ->available_analyses; use Data::Dumper; print Dumper ($list); # ditto, but from a different location use Bio::Tools::Run::AnalysisFactory; my $list = Bio::Tools::Run::AnalysisFactory->new(-location => 'http://somewhere/something') ->available_analyses; # ...and using a different access method # (this example is not yet impelmented) use Bio::Tools::Run::AnalysisFactory; my $list = Bio::Tools::Run::AnalysisFactory->new(-location => 'http://somewhere/something', -access => 'novella') ->available_analyses; # list available categories of analyses use Bio::Tools::Run::AnalysisFactory; my $categories = Bio::Tools::Run::AnalysisFactory->new(); ->available_categories; use Data::Dumper; print Dumper ($categories); # show all analyses group by categories use Bio::Tools::Run::AnalysisFactory; my $factory = Bio::Tools::Run::AnalysisFactory->new(); foreach $cat ( @{ $factory->available_categories } ) { my @sublist = @{ $factory->available_analyses ($cat) }; print "$cat:\n\t", join ("\n\t", @{ $factory->available_analyses ($cat) }), "\n"; } # create an analysis object use Bio::Tools::Run::AnalysisFactory; $service = Bio::Tools::Run::AnalysisFactory->new(); ->create_analysis ('edit.seqret'); $service->run ( #... )->results; =head1 DESCRIPTION The module represents a list of available analysis tools from a given location using a given access method. Additionally, for any of the available analyses, it can create an object of type C. The module is a higher-level abstraction whose main job is to load a 'real-work-doing' implementation. Which one is used, it depends on the C<-access> parameter. The same design is used here as for C module. There is available a I access to almost all EMBOSS applications, running at European Bioinformatics Institute. The documentation of all C methods are to be found in C. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Martin Senger (martin.senger@gmail.com) =head1 COPYRIGHT Copyright (c) 2003, Martin Senger and EMBL-EBI. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =head1 SEE ALSO =over 4 =item * http://www.ebi.ac.uk/soaplab/Perl_Client.html =back =head1 APPENDIX Here is the rest of the object methods. Internal methods are preceded with an underscore _. =cut # Let the code begin... package Bio::Tools::Run::AnalysisFactory; use vars qw(@ISA $Revision); use strict; use Bio::Root::Root; use Bio::Factory::AnalysisI; @ISA = qw(Bio::Root::Root Bio::Factory::AnalysisI); BEGIN { $Revision = q$Id$; } # ----------------------------------------------------------------------------- # Available (understood) parameters: # -access # (+ parameters used in guessing an access) # ----------------------------------------------------------------------------- =head2 new Usage : my $factory = Bio::Tools::Run::AnalysisFactory->new(-access => 'soap', -location => 'http://...'); Returns : a new Bio::Tools::Run::AnalysisFactory object representing a list of available analyses Args : There may be additional arguments which are specific to the access method (see methods 'new' or '_initialize' of the access-specific implementations (such as module Bio::Tools::Run::AnalysisFactory::soap for a SOAP-based access). The recognised and used arguments are: -access -location -httpproxy -timeout It builds, populates and returns a new C object. This is how it is seen from the outside. But in fact, it builds, populates and returns a more specific lower-level object, for example C object - which one it is it depends on the C<-access> parameter. =over 4 =item -access It indicates what lower-level module to load. Default is 'soap'. Other (but future) possibilities are: -access => 'novella' -access => 'local' =item -location A location of the service. The contents is access-specific (see details in the lower-level implementation modules). Default is C (there are services running at European Bioinformatics Institute on top of most of EMBOSS analyses, and on some others). =item -httpproxy In addition to the I parameter, you may need to specify also a location/URL of an HTTP proxy server (if your site requires one). The expected format is C. There is no default value. It is also an access-specific parameter which may not be used by all access methods. =item -timeout For long(er) running jobs the HTTP connection may be time-outed. In order to avoid it (or, vice-versa, to call timeout sooner) you may specify C with the number of seconds the connection will be kept alive. Zero means to keep it alive forever. The default value is two minutes. =back =cut sub new { my ($caller,@args) = @_; my $class = ref($caller) || $caller; if ($class eq 'Bio::Tools::Run::AnalysisFactory') { # this is called only the first time when somebody calls: 'new # Bio::Tools::Run::AnalysisFactory (...)', and it actually loads a # 'real-work-doing' module and call this new() method again # (unless the loaded module has its own new() method) my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys my $access = $param {'-access'} || # use -access parameter $class->_guess_access ( \%param ) || # or guess from other parameters 'soap'; # or use a default access method $access = "\L$access"; # normalize capitalization to lower case # remember the access method (putting it into @args means that the # object - when created - will remember it) push (@args, (-access => $access)) unless $param {'-access'}; # load module with the real implementation - as defined in $access return undef unless (&_load_access_module ($access)); # this calls this same method new() - but now its object part # (see the upper branche above) is called return "Bio::Tools::Run::AnalysisFactory::$access"->new (@args); } else { # if $caller is an object, or if it is an underlying # 'real-work-doing' class (e.g. Bio::Tools::Run::AnalysisFactory::soap) # then we want to call SUPER to create and bless a new object my ($self) = $class->SUPER::new (@args); # now the $self is an empty object - we will populate it from # the $caller - if $caller is an object (so we do cloning here) if (ref ($caller)) { %{ $self } = %{ $caller }; } # and finally add values from '@args' into the newly created # object (the values will overwrite the values copied above); # this is done by calling '_initialize' of the 'real-work-doing' # class (if there is no one there, there is always an empty one # in Bio::Root::Root) $self->_initialize (@args); return $self; } } # ----------------------------------------------------------------------------- =head2 _load_access_module Usage : $class->_load_access_module ($access) Returns : 1 on success, undef on failure Args : 'access' should contain the last part of the name of a module who does the real implementation It does (in the run-time) a similar thing as require Bio::Tools::Run::AnalysisFactory::$access It prints an error on STDERR if it fails to find and load the module (for example, because of the compilation errors in the module). =cut sub _load_access_module { my ($access) = @_; my $load = "Bio/Tools/Run/AnalysisFactory/$access.pm"; eval { require $load; }; if ( $@ ) { Bio::Root::Root->throw (<_guess_access ($rh_params) Returns : string with a guessed access protocol (e.g. 'soap'), or undef if the guessing failed Args : 'rh_params' is a hash reference containing parameters given to the 'new' method. It makes an expert guess what kind of access/transport protocol should be used to access the underlying analysis. The guess is based on the parameters in I. Rememeber that this method is called only if there was no I<-access> parameter which could tell directly what access method to use. =cut sub _guess_access { my ($class, $rh_params) = @_; return undef; } # ----------------------------------------------------------------------------- =head2 VERSION and Revision Usage : print $Bio::Tools::Run::AnalysisFactory::VERSION; print $Bio::Tools::Run::AnalysisFactory::Revision; =cut 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/AnalysisFactory/000077500000000000000000000000001302566030400236275ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/AnalysisFactory/soap.pm000066400000000000000000000217371302566030400251410ustar00rootroot00000000000000# $Id$ # # BioPerl module Bio::Tools::Run::AnalysisFactory::soap.pm # # Please direct questions and support issues to # # Cared for by Martin Senger # For copyright and disclaimer see below. # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::AnalysisFactory::soap - A SOAP-based access to the list of analysis tools =head1 SYNOPSIS Do not use this object directly, it is recommended to access it and use it through the I module: use Bio::Tools::Run::AnalysisFactory; my $list = Bio::Tools::Run::AnalysisFactory->new(-access => 'soap') ->available_analyses; print join ("\n", @$list) . "\n"; =head1 DESCRIPTION All public methods are documented in the interface module C. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Martin Senger (martin.senger@gmail.com) =head1 COPYRIGHT Copyright (c) 2003, Martin Senger and EMBL-EBI. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =head1 SEE ALSO =over =item * http://www.ebi.ac.uk/soaplab/Perl_Client.html =back =head1 BUGS AND LIMITATIONS None known at the time of writing this. =head1 APPENDIX The main documentation details are in C. =cut # Let the code begin... package Bio::Tools::Run::AnalysisFactory::soap; use vars qw(@ISA $Revision $DEFAULT_LOCATION @DEFAULT_DIR_SERVICE); use strict; use Bio::Tools::Run::AnalysisFactory; use Bio::Tools::Run::Analysis; use SOAP::Lite on_fault => sub { my $soap = shift; my $res = shift; my $msg = ref $res ? "--- SOAP FAULT ---\n" . 'faultcode: ' . $res->faultcode . "\n" . 'faultstring: ' . Bio::Tools::Run::AnalysisFactory::soap::_clean_msg ($res->faultstring) : "--- TRANSPORT ERROR ---\n" . $soap->transport->status . "\n$res\n"; Bio::Tools::Run::AnalysisFactory::soap->throw ($msg); } ; @ISA = qw(Bio::Tools::Run::AnalysisFactory); BEGIN { $Revision = q[$Id$]; # where to go... $DEFAULT_LOCATION = 'http://www.ebi.ac.uk/soaplab/services'; # ...and what to find there # (this is a list of service names available from the given location; # those that do not exist are ignored; if none exists then only # location - without any service name appended - is used) @DEFAULT_DIR_SERVICE = ('AnalysisFactory', 'GowlabFactory'); } # ----------------------------------------------------------------------------- =head2 _initialize Usage : my $factory = Bio::Tools::Run::AnalysisFactory->new(@args); (_initialize is internally called from the 'new()' method) Returns : nothing interesting Args : This module recognises and uses following arguments: -location -httpproxy -soap Additionally, the main module Bio::Tools::Run::AnalysisFactory recognises also: -access It populates calling object with the given arguments, and then - for some attributes and only if they are not yet populated - it assigns some default values. This is an actual new() method (except for the real object creation and its blessing which is done in the parent class Bio::Root::Root in method _create_object). Note that this method is called always as an I method (never as a I method) - and that the object who calls this method may already be partly initiated (from Bio::Tools::Run::AnalysisFactory::new method); so if you need to do some tricks with the 'class invocation' you need to change Bio::Tools::Run::AnalysisFactory I method, not this one. =over =item -location A URL (also called an I) defining where is located a Web Service functioning for this object. Default is C (a service running at European Bioinformatics Institute on top of most of the EMBOSS analyses, and on top of few others). For example, if you run your own Web Service using Java(TM) Apache Axis toolkit, the location might be something like C. =item -httpproxy In addition to the I parameter, you may need to specify also a location/URL of an HTTP proxy server (if your site requires one). The expected format is C. There is no default value. =item -soap Defines your own SOAP::Lite object. Useful if you need finer-grained access to many features and attributes of the wonderful Paul Kulchenko's module. =back =cut # ' sub _initialize { my ($self, @args) = @_; # make a hashtable from @args my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys # copy all @args into this object (overwriting what may already be # there) - changing '-key' into '_key' my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ $new_key } = $param { $key }; } # finally add default values for those keys who have default value # and who are not yet in the object $self->{'_location'} = $DEFAULT_LOCATION unless $self->{'_location'}; # create a SOAP object which will do the main job # ('uri' (representing a service name) will be added before each call) unless ($self->{'_soap'}) { if (defined $self->{'_httpproxy'}) { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}, proxy => ['http' => $self->{'_httpproxy'}]); } else { $self->{'_soap'} = SOAP::Lite -> proxy ($self->{'_location'}); } } } sub _clean_msg { my ($msg) = @_; $msg =~ s/^org\.embl\.ebi\.SoaplabShare\.SoaplabException\:\s*//; $msg; } # String[] getAvailableCategories() sub available_categories { my ($self) = @_; my $soap = $self->{'_soap'}; my @result = (); my $okay = 0; foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { push (@result, @{ $soap->getAvailableCategories->result }); }; $okay = 1 unless $@; } return $soap->getAvailableCategories->result unless $okay; \@result; } # String[] getAvailableAnalyses() # String[] getAvailableAnalysesInCategory (String categoryName) sub available_analyses { my ($self, $category) = @_; my $soap = $self->{'_soap'}; my @result = (); my $okay = 0; if (defined $category) { foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { push (@result, @{ $soap->getAvailableAnalysesInCategory (SOAP::Data->type (string => $category))->result }); }; $okay = 1 unless $@; } return $soap->getAvailableAnalysesInCategory (SOAP::Data->type (string => $category)) ->result unless $okay; \@result; } else { foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { push (@result, @{ $soap->getAvailableAnalyses->result }); }; $okay = 1 unless $@; } return $soap->getAvailableAnalyses->result unless $okay; \@result; } } # String getServiceLocation (String analysisName) sub create_analysis { my ($self, $name) = @_; # service name my @name = ('-name', $name) if $name; # ask for an endpoint my $soap = $self->{'_soap'}; my $location; foreach my $service_name (@DEFAULT_DIR_SERVICE) { $soap-> uri ($service_name); eval { $location = $soap->getServiceLocation (SOAP::Data->type (string => $name))->result; }; last if defined $location; } unless (defined $location) { $location = $soap->getServiceLocation (SOAP::Data->type (string => $name)) ->result; } my @location = ('-location', $location) if $location; # share some of my properties with the new Bio::Analysis object my @access = ('-access', $self->{'_access'}) if $self->{'_access'}; my @httpproxy = ('-httpproxy', $self->{'_httpproxy'}) if $self->{'_httpproxy'}; Bio::Tools::Run::Analysis->new(@name, @location, @httpproxy, @access); } =head2 VERSION and Revision Usage : print $Bio::Tools::Run::AnalysisFactory::soap::VERSION; print $Bio::Tools::Run::AnalysisFactory::soap::Revision; =cut 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/AssemblerBase.pm000066400000000000000000000673621302566030400236000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::AssemblerBase # # Please direct questions and support issues to # # Cared for by Florent Angly # # Copyright Florent Angly # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::AssemblerBase - base class for wrapping external assemblers =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Describe the object here # use of globals for configuration... # I've created the separate Config.pm module, and 'use'd it in the # main module, for instance... # other configuration globals: # $use_dash = [1|single|double|mixed] =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Florent Angly Email florent dot angly at gmail dot com =head1 CONTRIBUTORS Mark A. Jensen - maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::AssemblerBase; use strict; use Bio::SeqIO; use Bio::Assembly::IO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::ParameterBaseI); our $default_out_type = 'Bio::Assembly::ScaffoldI'; =head2 program_name Title : program_name Usage : $assembler>program_name() Function: get/set the executable name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->{'_program_name'} = $val if $val; return $self->{'_program_name'}; } =head2 program_dir Title : program_dir Usage : $assembler->program_dir() Function: get/set the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 out_type Title : out_type Usage : $assembler->out_type('Bio::Assembly::ScaffoldI') Function: get/set the desired type of output Returns : The type of results to return Args : Type of results to return (optional): 'Bio::Assembly::IO' object 'Bio::Assembly::ScaffoldI' object (default) The name of a file to save the results in =cut sub out_type { my ($self, $val) = @_; if (defined $val) { $self->{'_out_type'} = $val; } else { if (not defined $self->{'_out_type'}) { $self->{'_out_type'} = $default_out_type; } } return $self->{'_out_type'}; } =head2 _assembly_format Title : _assembly_format Usage : $assembler->_assembly_format('ace') Function: get/set the driver to use to parse the assembly results Returns : the driver to use to parse the assembly results Args : the driver to use to parse the assembly results (optional) =cut sub _assembly_format { my ($self, $asm_format) = @_; if (defined $asm_format) { $self->{'_assembly_format'} = $asm_format; } return $self->{'_assembly_format'}; } =head2 _assembly_variant Title : _assembly_variant Usage : $assembler->_assembly_variant('454') Function: get/set the driver variant to use to parse the assembly results. For example, the ACE format has the ACE-454 and the ACE-consed variants Returns : the driver variant to use to parse the assembly results Args : the driver variant to use to parse the assembly results (optional) =cut sub _assembly_variant { my ($self, $asm_variant) = @_; if (defined $asm_variant) { $self->{'_assembly_variant'} = $asm_variant; } return $self->{'_assembly_variant'}; } =head2 _check_executable Title : _check_executable Usage : $assembler->_check_executable() Function: Verifies that the program executable can be found, or throw an error. Returns: 1 for success Args : - =cut sub _check_executable { my ($self) = @_; if (not defined $self->executable()) { $self->throw("Could not find the executable '".$self->program_name()."'. ". 'You can use $self->program_dir() and $self->program_name() to '. "specify the location of the program."); } return 1; } =head2 _check_sequence_input Title : _check_sequence_input Usage : $assembler->_check_sequence_input($seqs) Function: Check that the sequence input is a valid file, or an arrayref of sequence objects (Bio::PrimarySeqI or Bio::SeqI). If not, an exception is thrown. Returns : 1 if the check passed Args : sequence input =cut sub _check_sequence_input { my ($self, $seqs) = @_; if (not $seqs) { $self->throw("Must supply sequences as a FASTA filename or a sequence object". " (Bio::PrimarySeqI or Bio::SeqI) array reference"); } else { if (ref($seqs) =~ m/ARRAY/i ) { for my $seq (@$seqs) { unless ($seq->isa('Bio::PrimarySeqI') || $seq->isa('Bio::SeqI')) { $self->throw("Not a valid Bio::PrimarySeqI or Bio::SeqI object"); } } } else { if (not -f $seqs) { $self->throw("Input file '$seqs' does not seem to exist."); } } } return 1; } =head2 _check_optional_quality_input Title : _check_optional_quality_input Usage : $assembler->_check_optional_quality_input($quals) Function: If a quality score input is provided, check that it is either a valid file or an arrayref of quality score objects (Bio::Seq:: QualI or Bio::Seq::Quality). If not, an exception is thrown. Returns : 1 if the check passed (or quality score input was provided) Args : quality score input =cut sub _check_optional_quality_input { my ($self, $quals) = @_; if (defined $quals) { if (ref($quals) =~ m/ARRAY/i) { for my $qual (@$quals) { unless ($qual->isa('Bio::Seq::QualI') || $qual->isa('Bio::Seq::Quality')) { $self->throw("Not a valid Bio::Seq::QualI or Bio::Seq::Quality object"); } } } else { if (not -f $quals) { $self->throw("Input file '$quals' does not seem to exist."); } } } return 1; } =head2 _prepare_input_file Title : _prepare_input_file Usage : ($fasta_file, $qual_file) = $assembler->_prepare_input_file(\@seqs, \@quals); Function: Create the input FASTA and QUAL files as needed. If the input sequences are provided in a (FASTA) file, the optional input quality scores are also expected to be in a (QUAL) file. If the input sequences are an arrayref of bioperl sequence objects, the optional input quality scores are expected to be an arrayref of bioperl quality score objects, in the same order as the sequence objects. Returns : - input filehandle - input filename Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut sub _prepare_input_files { my ($self, $seqs, $quals) = @_; # Set up input FASTA and QUAL files $self->io->_initialize_io(); #$self->tempdir(); my $fasta_file; my $qual_file; if ( ref($seqs) =~ m/ARRAY/i ) { # Input sequences are an arrayref of Bioperl sequence objects if (defined $quals && not ref($quals) =~ m/ARRAY/i) { $self->throw("The input sequences are an arrayref of sequence objects. ". "Expecting the quality scores as an arrayref of quality score objects"); } else { # The input qualities are not defined or are an arrayref of quality objects # Write temp FASTA and QUAL input files ($fasta_file, $qual_file) = $self->_write_seq_file($seqs, $quals); } } else { # Sequence input is a FASTA file $fasta_file = $seqs; if (defined $quals && ref($quals) =~ m/ARRAY/i) { # Quality input is defined and is an arrayref of quality objects $self->throw("The input sequences are in a FASTA file. Expecting the ". "quality scores in a QUAL file."); } else { # Input quality scores is either not defined or is a QUAL file $qual_file = $quals; } } return $fasta_file, $qual_file; } =head2 _write_seq_file Title : _write_seq_file Usage : ($fasta_file, $qual_file) = $assembler->_write_seq_file(\@seqs, \@quals) Function: Write temporary FASTA and QUAL files on disk Returns : name of FASTA file name of QUAL file (undef if no quality scoress) Args : - arrayref of sequence objects - optional arrayref of quality score objects =cut sub _write_seq_file { my ($self, $seqs, $quals) = @_; # Store the sequences in temporary FASTA files my $tmpdir = $self->tempdir(); my ($fasta_h, $fasta_file) = $self->io->tempfile( -dir => $tmpdir ); my ($qual_h, $qual_file ) = $self->io->tempfile( -dir => $tmpdir ); my $fasta_out = Bio::SeqIO->new( -fh => $fasta_h , -format => 'fasta'); my $qual_out = Bio::SeqIO->new( -fh => $qual_h , -format => 'qual' ); my $use_qual_file = 0; my $size = scalar @$seqs; for ( my $i = 0 ; $i < $size ; $i++ ) { my $seq = $$seqs[$i]; # Make sure that all sequences have an ID (to prevent TIGR Assembler crash) if (not defined $seq->id) { my $newid = 'tmp'.$i; print $newid."\n"; $seq->id($newid); $self->warn("A sequence had no ID. Its ID is now $newid"); } my $seqid = $seq->id; # Write the FASTA entries in files (and QUAL if appropriate) $fasta_out->write_seq($seq); if ($seq->isa('Bio::Seq::Quality')) { # Quality scores embedded in seq object if (scalar @{$seq->qual} > 0) { $qual_out->write_seq($seq); $use_qual_file = 1; } } else { # Quality score in a different object from the sequence object my $qual = $$quals[$i]; if (defined $qual) { my $qualid = $qual->id; if ($qualid eq $seqid) { # valid quality score information $qual_out->write_seq($qual); $use_qual_file = 1; } else { # ID mismatch between sequence and quality score $self->warn("Sequence object with ID $seqid does not match quality ". "score object with ID $qualid"); } } } } close($fasta_h); close($qual_h); $fasta_out->close(); $qual_out->close(); return undef if scalar @$seqs <= 0; $qual_file = undef if $use_qual_file == 0; return $fasta_file, $qual_file; } =head2 _prepare_output_file Title : _prepare_output_file Usage : ($out_fh, $out_file) = $assembler->_prepare_output_file( ); Function: Prepare the output file Returns : - output filehandle - output filename Args : none =cut sub _prepare_output_file { my ($self) = @_; my ($output_fh, $output_file); my $out_type = $self->out_type(); if ( (not $out_type eq 'Bio::Assembly::ScaffoldI') && (not $out_type eq 'Bio::Assembly::IO' ) ) { # Output is a file with specified name $output_file = $out_type; open $output_fh, '>', $output_file or $self->throw("Could not write file ". "'$output_file': $!"); } else { ( $output_fh, $output_file ) = $self->io->tempfile( -dir => $self->tempdir() ); } $self->outfile_name($output_file); return $output_fh, $output_file; } =head2 _export_results Title : _export_results Usage : $results = $assembler->_export_results($asm_file); Function: Export the assembly results Returns : Exported assembly (file or IO object or assembly object) Args : -Name of the file containing an assembly - -keep_asm => boolean (if true, do not unlink $asm_file) -[optional] additional named args required by the B:A:IO object =cut sub _export_results { my ($self, $asm_file, @named_args) = @_; my $results; my $asm_io; my $asm; my %args = @named_args; my $keep_asm = $args{'-keep_asm'}; delete $args{'-keep_asm'}; my $out_type = $self->out_type(); if ( (not $out_type eq 'Bio::Assembly::ScaffoldI') && (not $out_type eq 'Bio::Assembly::IO' ) ) { # Results are the assembler output file $results = $asm_file; } else { $asm_io = Bio::Assembly::IO->new( -file => "<$asm_file", -format => $self->_assembly_format(), -variant => $self->_assembly_variant(), @named_args ); # this unlink is a problem for Bio::DB::Sam (in B:A:I:sam), which needs # the original bam file around. unlink $asm_file unless $keep_asm; if ($out_type eq 'Bio::Assembly::IO') { # Results are a Bio::Assembly::IO object $results = $asm_io; } else { $asm = $asm_io->next_assembly(); $asm_io->close; if ($out_type eq 'Bio::Assembly::ScaffoldI') { # Results are a Bio::Assembly::Scaffold object $results = $asm; } else { $self->throw("The return type has to be 'Bio::Assembly::IO', 'Bio::". "Assembly::ScaffoldI' or a file name."); } } } $self->cleanup(); return $results; } =head2 _register_program_commands() Title : _register_program_commands Usage : $assembler->_register_program_commands( \@commands, \%prefixes ) Function: Register the commands a program accepts (for programs that act as frontends for a set of commands, each command having its own set of params/switches) Returns : true on success Args : arrayref to a list of commands (scalar strings), hashref to a translation table of the form { $prefix1 => $command1, ... } [optional] Note : To implement a program with this kind of calling structure, include a parameter called 'command' in the @program_params global Note : The translation table is used to associate parameters and switches specified in _set_program_options with the correct program command. In the globals @program_params and @program_switches, specify elements as 'prefix1|param' and 'prefix1|switch', etc. =cut sub _register_program_commands { my ($self, $commands, $prefixes) = @_; $self->{'_options'}->{'_commands'} = $commands; $self->{'_options'}->{'_prefixes'} = $prefixes; return 1; } =head2 _set_program_options Title : _set_program_options Usage : $assembler->_set_program_options( \@ args ); Function: Register the parameters and flags that an assembler takes. Returns : 1 for success Args : - arguments passed by the user - parameters that the program accepts, optional (default: none) - switches that the program accepts, optional (default: none) - parameter translation, optional (default: no translation occurs) - dash option for the program parameters, [1|single|double|mixed], optional (default: yes, use single dashes only) - join, optional (default: ' ') =cut sub _set_program_options { my ($self, $args, $params, $switches, $translation, $qual_param, $use_dash, $join) = @_; # I think we need to filter on the basis of -command here... my %args = @$args; my $cmd = $args{'-command'} || $args{'command'}; if ($cmd) { my (@p,@s, %x); $self->warn('Command found, but no commands registered; invoke _register_program_commands') unless $self->{'_options'}->{'_commands'}; $self->throw("Command '$cmd' not registered") unless grep /^$cmd$/, @{$self->{'_options'}->{'_commands'}}; if ($self->{'_options'}->{'_prefixes'}) { $cmd = $self->{'_options'}->{'_prefixes'}->{$cmd}; } # else, the command is its own prefix # problem here: if a param/switch does not have a prefix (pfx|), then # should probably allow it to pass thru... @p = (grep(!/^.*?\|/, @$params), $cmd ? grep(/^${cmd}\|/, @$params) : ()); @s = (grep(!/^.*?\|/, @$switches), $cmd ? grep(/^${cmd}\|/, @$switches) : ()); s/.*?\|// for @p; s/.*?\|// for @s; @x{@p, @s} = @{$translation}{ grep( !/^.*?\|/, @$params, @$switches), $cmd ? grep(/^${cmd}\|/, @$params, @$switches) : () }; $translation = \%x; $params = \@p; $switches = \@s; } $self->{'_options'}->{'_params'} = $params; $self->{'_options'}->{'_switches'} = $switches; $self->{'_options'}->{'_translation'} = $translation; $self->{'_options'}->{'_qual_param'} = $qual_param; if (not defined $use_dash) { $self->{'_options'}->{'_dash'} = 1; } else { $self->{'_options'}->{'_dash'} = $use_dash; } if (not defined $join) { $self->{'_options'}->{'_join'} = ' '; } else { $self->{'_options'}->{'_join'} = $join; } # if there is a parameter 'command' in @program_params, and # new is called with new( -command => $cmd, ... ), then # _set_from_args will create an accessor $self->command containing # the value $cmd... $self->_set_from_args( $args, -methods => [ @$params, @$switches, 'program_name', 'program_dir', 'out_type' ], -create => 1, # when our parms are accessed, signal parameters are unchanged for # future reads (until set_parameters is called) -code => 'my $self = shift; $self->parameters_changed(0); return $self->{\'_\'.$method} = shift if @_; return $self->{\'_\'.$method};' ); return 1; } =head2 _translate_params Title : _translate_params Usage : @options = $assembler->_translate_params( ); Function: Translate the Bioperl arguments into the arguments to pass to the assembler on the command line Returns : Arrayref of arguments Args : none =cut sub _translate_params { my ($self) = @_; # Get option string my $params = $self->{'_options'}->{'_params'}; my $switches = $self->{'_options'}->{'_switches'}; my $join = $self->{'_options'}->{'_join'}; my $dash = $self->{'_options'}->{'_dash'}; my $translat = $self->{'_options'}->{'_translation'}; # patch to access the multiple dash choices of _setparams... my @dash_args; $dash ||= 1; # default as advertised for ($dash) { $_ == 1 && do { @dash_args = ( -dash => 1 ); last; }; /^s/ && do { #single dash only @dash_args = ( -dash => 1); last; }; /^d/ && do { # double dash only @dash_args = ( -double_dash => 1); last; }; /^m/ && do { # mixed dash: one-letter opts get -, # long opts get -- @dash_args = ( -mixed_dash => 1); last; }; do { $self->warn( "Dash spec '$dash' not recognized; using 'single'" ); @dash_args = ( -dash => 1 ); }; } my $options = $self->_setparams( -params => $params, -switches => $switches, -join => $join, @dash_args ); # Translate options my @options = split(/(\s|$join)/, $options); for (my $i = 0; $i < scalar @options; $i++) { my ($prefix, $name) = ( $options[$i] =~ m/^(-{0,2})(.+)$/ ); if (defined $name) { if ($name =~ /command/i) { $name = $options[$i+2]; # get the command splice @options, $i, 4; unshift @options, $name; # put it first } elsif (defined $$translat{$name}) { $options[$i] = $prefix.$$translat{$name}; } } else { splice @options, $i, 1; $i--; } } $options = join('', @options); # this is a kludge for mixed options: the reason mixed doesn't # work right on the pass through _setparams is that the # *aliases* and not the actual params are passed to it. # here we just rejigger the dashes if ($dash =~ /^m/) { $options =~ s/--([a-z0-9](?:\s|$))/-$1/gi; } # Now arrayify the options @options = split(' ', $options); return \@options; } =head2 _prepare_input_sequences Title : _prepare_input_sequences Usage : ($seqs, $quals) = $assembler->_prepare_input_sequences(\@seqs, \@quals); Function: Do something to the input sequence and qual objects. By default, nothing happens. Overload this method in the specific assembly module if processing of the sequences is needed (e.g. as in the TigrAssembler module). Returns : - sequence input - optional quality score input Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut sub _prepare_input_sequences { my ($self, $seqs, $quals) = @_; return $seqs, $quals; } =head2 _collate_subcmd_args() Title : _collate_subcmd_args Usage : $args_hash = $self->_collate_subcmd_args Function: collate parameters and switches into command-specific arg lists for passing to new() Returns : hash of named argument lists Args : [optional] composite cmd prefix (scalar string) [default is 'run'] =cut sub _collate_subcmd_args { my $self = shift; my $cmd = shift; my %ret; # default command is 'run' $cmd ||= 'run'; my @subcmds = @{$self->{'_options'}->{'_composite_commands'}->{$cmd}}; my %subcmds; my $cur_options = $self->{'_options'}; # collate foreach my $subcmd (@subcmds) { # find the composite cmd form of the argument in # the current params and switches # e.g., map_max_mismatches my @params = grep /^${subcmd}_/, @{$$cur_options{'_params'}}; my @switches = grep /^${subcmd}_/, @{$$cur_options{'_switches'}}; $ret{$subcmd} = []; # create an argument list suitable for passing to new() of # the subcommand factory... foreach my $opt (@params, @switches) { my $subopt = $opt; $subopt =~ s/^${subcmd}_//; push(@{$ret{$subcmd}}, '-'.$subopt => $self->$opt) if defined $self->$opt; } } return \%ret; } =head2 run Title : run Usage : $assembly = $assembler->run(\@seqs, \@quals); or $assembly = $assembler->run($fasta_file, $qual_file); Function: Run the assembler. The specific assembler wrapper needs to provide the $assembler->_run() method. Returns : Assembly results (file, IO object or Assembly object) Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut sub run { my ($self, $seqs, $quals) = @_; # Sanity checks $self->_check_executable(); $self->_check_sequence_input($seqs); $self->_check_optional_quality_input($quals); # Process objects if needed $self->_prepare_input_sequences($seqs, $quals); # Write input files my ($fasta_file, $qual_file) = $self->_prepare_input_files($seqs,$quals); # If needed, set the program argument for a QUAL file my $qual_param = $self->{'_options'}->{'_qual_param'}; if (defined $qual_param) { if ($qual_file) { # Set the quality input parameter $quals = $self->$qual_param($qual_file); } else { # Remove the quality input parameter $quals = $self->$qual_param(undef); } } # Assemble my $output_file = $self->_run($fasta_file, $qual_file); # Export results in desired object type my $asm = $self->_export_results($output_file); return $asm; } =head1 Bio:ParameterBaseI compliance =head2 set_parameters() Title : set_parameters Usage : $pobj->set_parameters(%params); Function: sets the parameters listed in the hash or array Returns : true on success Args : [optional] hash or array of parameter/values. =cut sub set_parameters { my ($self, @args) = @_; # currently stored stuff my $opts = $self->{'_options'}; my $params = $opts->{'_params'}; my $switches = $opts->{'_switches'}; my $translation = $opts->{'_translation'}; my $qual_param = $opts->{'_qual_param'}; my $use_dash = $opts->{'_dash'}; my $join = $opts->{'_join'}; $self->_set_program_options(\@args, $params, $switches, $translation, $qual_param, $use_dash, $join); # the question is, are previously-set parameters left alone when # not specified in @args? $self->parameters_changed(1); return 1; } =head2 reset_parameters() Title : reset_parameters Usage : resets values Function: resets parameters to either undef or value in passed hash Returns : none Args : [optional] hash of parameter-value pairs =cut sub reset_parameters { my ($self, @args) = @_; my @reset_args; # currently stored stuff my $opts = $self->{'_options'}; my $params = $opts->{'_params'}; my $switches = $opts->{'_switches'}; my $translation = $opts->{'_translation'}; my $qual_param = $opts->{'_qual_param'}; my $use_dash = $opts->{'_dash'}; my $join = $opts->{'_join'}; # don't like this, b/c _set_program_args will create a bunch of # accessors with undef values, but oh well for now /maj # Is better to use hashes than arrays, to use their unique keys my %reset_args = @args; foreach my $p (@$params) { if (not exists $reset_args{"-$p"}) { $reset_args{"-$p"} = undef; } } foreach my $s (@$switches) { if (not exists $reset_args{"-$s"}) { $reset_args{"-$s"} = undef; } } while (my ($method, $value) = each %reset_args) { push(@reset_args, $method => $value); } $self->_set_program_options(\@reset_args, $params, $switches, $translation, $qual_param, $use_dash, $join); $self->parameters_changed(1); } =head2 parameters_changed() Title : parameters_changed Usage : if ($pobj->parameters_changed) {...} Function: Returns boolean true (1) if parameters have changed Returns : Boolean (0 or 1) Args : [optional] Boolean =cut sub parameters_changed { my $self = shift; return $self->{'_parameters_changed'} = shift if @_; return $self->{'_parameters_changed'}; } =head2 available_parameters() Title : available_parameters Usage : @params = $pobj->available_parameters() Function: Returns a list of the available parameters Returns : Array of parameters Args : 'params' for settable program parameters 'switches' for boolean program switches default: all =cut sub available_parameters { my $self = shift; my $subset = shift; my $opts = $self->{'_options'}; my @ret; for ($subset) { (!defined || /^a/) && do { @ret = (@{$opts->{'_params'}}, @{$opts->{'_switches'}}); last; }; m/^p/i && do { @ret = @{$opts->{'_params'}}; last; }; m/^s/i && do { @ret = @{$opts->{'_switches'}}; last; }; do { #fail $self->throw("available_parameters: unrecognized subset"); }; } return @ret; } =head2 get_parameters() Title : get_parameters Usage : %params = $pobj->get_parameters; Function: Returns list of key-value pairs of parameter => value Returns : List of key-value pairs Args : [optional] A string is allowed if subsets are wanted or (if a parameter subset is default) 'all' to return all parameters =cut sub get_parameters { my $self = shift; my $subset = shift; $subset ||= 'all'; my @ret; my $opts = $self->{'_options'}; for ($subset) { m/^p/i && do { #params only for (@{$opts->{'_params'}}) { push(@ret, $_, $self->$_) if $self->can($_) && defined $self->$_; } last; }; m/^s/i && do { #switches only for (@{$opts->{'_switches'}}) { push(@ret, $_, $self->$_) if $self->can($_) && defined $self->$_; } last; }; m/^a/i && do { # all for (@{$opts->{'_params'}},@{$opts->{'_switches'}}) { push(@ret, $_, $self->$_) if $self->can($_) && defined $self->$_; } last; }; do { $self->throw("get_parameters: unrecognized subset"); }; } return @ret; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BEDTools.pm000066400000000000000000000626361302566030400225020ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::BEDTools # # Please direct questions and support issues to # # Cared for by Dan Kortschak # # Copyright Dan Kortschak # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BEDTools - Run wrapper for the BEDTools suite of programs *BETA* =head1 SYNOPSIS # use a BEDTools program $bedtools_fac = Bio::Tools::Run::BEDTools->new( -command => 'subtract' ); $result_file = $bedtools_fac->run( -bed1 => 'genes.bed', -bed2 => 'mask.bed' ); # if IO::Uncompress::Gunzip is available... $result_file = $bedtools_fac->run( -bed1 => 'genes.bed.gz', -bed2 => 'mask.bed.gz' ); # be more strict $bedtools_fac->set_parameters( -strandedness => 1 ); # and even more... $bedtools_fac->set_parameters( -minimum_overlap => 1e-6 ); # create a Bio::SeqFeature::Collection object $features = $bedtools_fac->result( -want => 'Bio::SeqFeature::Collection' ); =head1 DEPRECATION WARNING Most executables from BEDTools v>=2.10.1 can read GFF and VCF formats in addition to BED format. This requires the use of a new input file param, shown in the following documentation, '-bgv', in place of '-bed' for the executables that can do this. This behaviour breaks existing scripts. =head1 DESCRIPTION This module provides a wrapper interface for Aaron R. Quinlan and Ira M. Hall's utilities C that allow for (among other things): =over =item * Intersecting two BED files in search of overlapping features. =item * Merging overlapping features. =item * Screening for paired-end (PE) overlaps between PE sequences and existing genomic features. =item * Calculating the depth and breadth of sequence coverage across defined "windows" in a genome. =back (see L for manuals and downloads). =head1 OPTIONS C is a suite of 17 commandline executable. This module attempts to provide and options comprehensively. You can browse the choices like so: $bedtools_fac = Bio::Tools::Run::BEDTools->new; # all bowtie commands @all_commands = $bedtools_fac->available_parameters('commands'); @all_commands = $bedtools_fac->available_commands; # alias # just for default command ('bam_to_bed') @btb_params = $bedtools_fac->available_parameters('params'); @btb_switches = $bedtools_fac->available_parameters('switches'); @btb_all_options = $bedtools_fac->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. As a number of options are mutually exclusive, and the interpretation of intent is based on last-pass option reaching bowtie with potentially unpredicted results. This module will prevent inconsistent switches and parameters from being passed. See L for details of BEDTools options. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $bedtools_fac = Bio::Tools::Run::BEDTools->new( -command => 'pair_to_bed' ); @filespec = $bedtools_fac->filespec; This example returns the following array: #bedpe #bam bed #out This indicates that the bed (C BED format) file MUST be specified, and that the out, bedpe (C BEDPE format) and bam (C binary format) file MAY be specified (Note that in this case you MUST provide ONE of bedpe OR bam, the module at this stage does not allow this information to be queried). Use these in the C call like so: $bedtools_fac->run( -bedpe => 'paired.bedpe', -bgv => 'genes.bed', -out => 'overlap' ); The object will store the programs STDERR output for you in the C attribute: handle_bed_warning($bedtools_fac) if ($bedtools_fac->stderr =~ /Usage:/); For the commands 'fasta_from_bed' and 'mask_fasta_from_bed' STDOUT will also be captured in the C attribute by default and all other commands can be forced to capture program output in STDOUT by setting the -out filespec parameter to '-'. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L Rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Dan Kortschak Email dan.kortschak adelaide.edu.au =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BEDTools; use strict; our $HAVE_IO_UNCOMPRESS; BEGIN { eval 'require IO::Uncompress::Gunzip; $HAVE_IO_UNCOMPRESS = 1'; } use IPC::Run; # Object preamble - inherits from Bio::Root::Root use lib '../../..'; use Bio::Tools::Run::BEDTools::Config; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::WrapperBase::CommandExts; use Bio::Tools::GuessSeqFormat; use Bio::SeqFeature::Generic; use Bio::SeqFeature::Collection; use Bio::SeqIO; use File::Sort qw( sort_file ); use base qw( Bio::Tools::Run::WrapperBase ); ## BEDTools our $program_name = '*bedtools'; our $default_cmd = 'bam_to_bed'; # Note: Other globals imported from Bio::Tools::Run::BEDTools::Config our $qual_param = undef; our $use_dash = 'single'; our $join = ' '; our %strand_translate = ( '+' => 1, '-' => -1, '.' => 0 ); =head2 new() Title : new Usage : my $obj = new Bio::Tools::Run::BEDTools(); Function: Builds a new Bio::Tools::Run::BEDTools object Returns : an instance of Bio::Tools::Run::BEDTools Args : =cut sub new { my ($class,@args) = @_; unless (grep /command/, @args) { push @args, '-command', $default_cmd; } my $self = $class->SUPER::new(@args); foreach (keys %command_executables) { $self->executables($_, $self->_find_executable($command_executables{$_})); } $self->want($self->_rearrange([qw(WANT)],@args)); $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI return $self; } =head2 run() Title : run Usage : $result = $bedtools_fac->run(%params); Function: Run a BEDTools command. Returns : Command results (file, IO object or Bio object) Args : Dependent on filespec for command. See $bedtools_fac->filespec and BEDTools Manual. Also accepts -want => '(raw|format|)' - see want(). Note : gzipped inputs are allowed if IO::Uncompress::Gunzip is available =cut sub run { my $self = shift; my ($ann, $bed, $bg, $bgv, $bgv1, $bgv2, $bam, $bedpe, $bedpe1, $bedpe2, $seq, $genome, $out); if (!(@_ % 2)) { my %args = @_; if ((grep /^-\w+/, keys %args) == keys %args) { ($ann, $bed, $bg, $bgv, $bgv1, $bgv2, $bam, $bedpe, $bedpe1, $bedpe2, $seq, $genome, $out) = $self->_rearrange([qw( ANN BED BG BGV BGV1 BGV2 BAM BEDPE BEDPE1 BEDPE2 SEQ GENOME OUT )], @_); } else { $self->throw("Badly formed named args: ".join(' ',@_)); } } else { if (grep /^-\w+/, @_) { $self->throw("Badly formed named args: ".join(' ',@_)); } else { $self->throw("Require named args."); } } # Sanity checks $self->executable || $self->throw("No executable!"); my $cmd = $self->command if $self->can('command'); for ($cmd) { =pod Command annotate bgv ann(s) #out =cut m/^annotate$/ && do { $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); @$ann = map { my $a = $_; $a = $self->_uncompress($a); $self->_validate_file_input(-ann => $a) || $self->throw("File '$a' not BED/GFF/VCF format."); $a; } @$ann; last; }; =pod graph_union bg_files #out =cut m/^graph_union$/ && do { @$bg = map { my $g = $_; $g = $self->_uncompress($g); $self->_validate_file_input(-bg => $g) || $self->throw("File '$a' not BedGraph format."); $g; } @$bg; last; }; =pod fasta_from_bed seq bgv #out mask_fasta_from_bed seq bgv #out =cut m/fasta_from_bed$/ && do { ($out // 0) eq '-' && $self->throw("Cannot capture results in STDOUT with sequence commands."); $seq = $self->_uncompress($seq); $self->_validate_file_input(-seq => $seq) || $self->throw("File '$seq' not fasta format."); $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); last; }; =pod bam_to_bed bam #out =cut m/^bam_to_bed$/ && do { $bam = $self->_uncompress($bam); $self->_validate_file_input(-bam => $bam) || $self->throw("File '$bam' not BAM format."); last; }; =pod bed_to_IGV bgv #out merge bgv #out sort bgv #out links bgv #out =cut m/^(?:bed_to_IGV|merge|sort|links)$/ && do { $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); }; =pod b12_to_b6 bed #out overlap bed #out group_by bed #out =cut m/^(?:b12_to_b6|overlap|group_by)$/ && do { $bed = $self->_uncompress($bed); $self->_validate_file_input(-bed => $bed) || $self->throw("File '$bgv' not BED format."); if ($cmd eq 'group_by') { my $c =(my @c)= split(",",$self->columns); my $o =(my @o)= split(",",$self->operations); unless ($c > 0 && $o == $c) { $self->throw("The command 'group_by' requires "."paired "x($o == $c)."'-columns' and '-operations' parameters"); } } last; }; =pod bed_to_bam bgv #out shuffle bgv genome #out slop bgv genome #out complement bgv genome #out =cut m/^(?:bed_to_bam|shuffle|slop|complement)$/ && do { $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bgv' not BED/GFF/VCF format."); $genome = $self->_uncompress($genome); $self->_validate_file_input(-genome => $genome) || $self->throw("File '$genome' not genome format."); if ($cmd eq 'slop') { my $l = defined $self->add_to_left; my $r = defined $self->add_to_right; my $b = defined $self->add_bidirectional; # I think I have a lisp unless (($l && $r) || ($b xor ($l || $r))) { $self->throw("The command 'slop' requires an unambiguous description of the slop you want"); } } last; }; =pod genome_coverage bed genome #out =cut m/^genome_coverage$/ && do { $bed = $self->_uncompress($bed); $self->_validate_file_input(-bed => $bed) || $self->throw("File '$bed' not BED format."); $genome = $self->_uncompress($genome); $self->_validate_file_input(-genome => $genome) || $self->throw("File '$genome' not genome format."); my ($th, $tf) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => '.bed' ); $th->close; sort_file({k => 1, I => $bed, o => $tf}); $bed = $tf; last; }; =pod window bgv1 bgv2 #out closest bgv1 bgv2 #out coverage bgv1 bgv2 #out subtract bgv1 bgv2 #out =cut m/^(?:window|closest|coverage|subtract)$/ && do { $bgv1 = $self->_uncompress($bgv1); $self->_validate_file_input(-bgv1 => $bgv1) || $self->throw("File '$bgv1' not BED/GFF/VCF format."); $bgv2 = $self->_uncompress($bgv2); $self->_validate_file_input(-bgv2 => $bgv2) || $self->throw("File '$bgv2' not BED/GFF/VCF format."); }; =pod pair_to_pair bedpe1 bedpe2 #out =cut m/^pair_to_pair$/ && do { $bedpe1 = $self->_uncompress($bedpe1); $self->_validate_file_input(-bedpe1 => $bedpe1) || $self->throw("File '$bedpe1' not BEDPE format."); $bedpe2 = $self->_uncompress($bedpe2); $self->_validate_file_input(-bedpe2 => $bedpe2) || $self->throw("File '$bedpe2' not BEDPE format."); last; }; =pod intersect bgv1|bam bgv2 #out =cut m/^intersect$/ && do { $bgv1 = $self->_uncompress($bgv1); $bam = $self->_uncompress($bam); ($bam && $self->_validate_file_input(-bam => $bam)) || ($bgv1 && $self->_validate_file_input(-bgv1 => $bgv1)) || $self->throw("File in position 1. not correct format."); $bgv2 = $self->_uncompress($bgv2); $self->_validate_file_input(-bgv2 => $bgv2) || $self->throw("File '$bgv2' not BED/GFF/VCF format."); last; }; =pod pair_to_bed bedpe|bam bgv #out bgv* signifies any of BED, GFF or VCF. ann is a bgv. NOTE: Replace 'bgv' with 'bed' unless $use_bgv is set. =cut m/^pair_to_bed$/ && do { $bedpe = $self->_uncompress($bedpe); $bam = $self->_uncompress($bam); ($bam && $self->_validate_file_input(-bam => $bam)) || ($bedpe && $self->_validate_file_input(-bedpe => $bedpe)) || $self->throw("File in position 1. not correct format."); $bgv = $self->_uncompress($bgv); $self->_validate_file_input(-bgv => $bgv) || $self->throw("File '$bed' not BED/GFF/VCF format."); last; } } my %params = ( '-ann' => $ann, '-bam' => $bam, '-bed' => $bed, '-bgv' => $bgv, '-bg' => $bg, '-bgv1' => $bgv1, '-bgv2' => $bgv2, '-bedpe' => $bedpe, '-bedpe1' => $bedpe1, '-bedpe2' => $bedpe2, '-seq' => $seq, '-genome' => $genome ); map { delete $params{$_} unless defined $params{$_} } keys %params; my $format = $self->_determine_format(\%params); my $suffix = '.'.$format; if (!defined $out) { my ($outh, $outf) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => $suffix ); $outh->close; $out = $outf; } elsif ($out ne '-') { $out .= $suffix; } else { undef $out; } $params{'-out'} = $out if defined $out; $self->_run(%params); $self->{'_result'}->{'file_name'} = $out // '-'; $self->{'_result'}->{'format'} = $format; $self->{'_result'}->{'file'} = defined $out ? Bio::Root::IO->new( -file => $out ) : undef; return $self->result; } sub _uncompress { my ($self, $file) = @_; return if !defined $file; return $file unless ($file =~ m/\.gz[^.]*$/); return $file unless (-e $file && -r _); # other people will deal with this unless ($HAVE_IO_UNCOMPRESS) { croak( "IO::Uncompress::Gunzip not available, can't expand '$file'" ); } my ($tfh, $tf) = $self->io->tempfile( -dir => $self->tempdir() ); my $z = IO::Uncompress::Gunzip->new($file); while (my $block = $z->getline) { print $tfh $block } close $tfh; return $tf } =head2 want() Title : want Usage : $bowtiefac->want( $class ) Function: make factory return $class, or 'raw' results in file or 'format' for result format All commands can return Bio::Root::IO commands returning: can return object: - BED or BEDPE - Bio::SeqFeature::Collection - sequence - Bio::SeqIO Returns : return wanted type Args : [optional] string indicating class or raw of wanted result =cut sub want { my $self = shift; return $self->{'_want'} = shift if @_; return $self->{'_want'}; } =head2 result() Title : result Usage : $bedtoolsfac->result( [-want => $type|$format] ) Function: return result in wanted format Returns : results Args : [optional] hashref of wanted type Note : -want arg does not persist between result() call when specified in result(), for persistence, use want() =cut sub result { my ($self, @args) = @_; my $want = $self->_rearrange([qw(WANT)],@args); $want ||= $self->want; my $cmd = $self->command if $self->can('command'); my $format = $self->{'_result'}->{'format'}; my $file_name = $self->{'_result'}->{'file_name'}; return $self->{'_result'}->{'format'} if (defined $want && $want eq 'format'); return $self->{'_result'}->{'file_name'} if (!$want || $want eq 'raw'); return $self->{'_result'}->{'file'} if ($want =~ m/^Bio::Root::IO/); # this will be undef if -out eq '-' for ($format) { # these are dissected more finely than seems resonable to allow easy extension m/bed/ && do { for ($want) { m/Bio::SeqFeature::Collection/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqFeature::Collection/) { $self->{'_result'}->{'object'} = $self->_read_bed; } return $self->{'_result'}->{'object'}; }; $self->warn("Cannot make '$_' for $format."); return; } last; }; m/bedpe/ && do { for ($want) { m/Bio::SeqFeature::Collection/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqFeature::Collection/) { $self->{'_result'}->{'object'} = $self->_read_bedpe; } return $self->{'_result'}->{'object'}; }; $self->warn("Cannot make '$_' for $format."); return; } last; }; m/bam/ && do { $self->warn("Cannot make '$_' for $format."); return; }; m/^(?:fasta|raw)$/ && do { for ($want) { m/Bio::SeqIO/ && do { $file_name eq '-' && $self->throw("Cannot make a SeqIO object from STDOUT."); unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqIO/) { $self->{'_result'}->{'object'} = Bio::SeqIO->new(-file => $file_name, -format => $format); } return $self->{'_result'}->{'object'}; }; $self->warn("Cannot make '$_' for $format."); return; } last; }; m/tab/ && do { $self->warn("Cannot make '$_' for $format."); return; }; m/igv/ && do { $self->warn("Cannot make '$_' for $format."); return; }; m/html/ && do { $self->warn("Cannot make '$_' for $format."); return; }; do { $self->warn("Result format '$_' not recognised - have you called run() yet?"); } } } =head2 _determine_format() Title : _determine_format( $has_run ) Usage : $bedtools-fac->_determine_format Function: determine the format of output for current options Returns : format of bowtie output Args : [optional] boolean to indicate result exists =cut sub _determine_format { my ($self, $params) = @_; my $cmd = $self->command if $self->can('command'); my $format = $format_lookup{$cmd}; #special cases - dependent on switches and files for ($cmd) { m/^intersect$/ && do { return 'bed' if !defined $$params{'-bam'} || $self->write_bed; return 'bam'; }; m/^pair_to_bed$/ && do { return 'bedpe' if !defined $$params{'-bam'} || $self->write_bedpe; return 'bam'; }; m/^fasta_from_bed$/ && do { return $self->output_tab_format ? 'tab' : 'fasta'; } } return $format; } =head2 _read_bed() Title : _read_bed() Usage : $bedtools_fac->_read_bed Function: return a Bio::SeqFeature::Collection object from a BED file Returns : Bio::SeqFeature::Collection Args : =cut sub _read_bed { my ($self) = shift; my @features; if ($self->{'_result'}->{'file_name'} ne '-') { my $in = $self->{'_result'}->{'file'}; while (my $feature = $in->_readline) { chomp $feature; push @features, _read_bed_line($feature); } } else { for my $feature (split("\cJ", $self->stdout)) { push @features, _read_bed_line($feature); } } my $collection = Bio::SeqFeature::Collection->new; $collection->add_features(\@features); return $collection; } sub _read_bed_line { my $feature = shift; my ($chr, $start, $end, $name, $score, $strand, $thick_start, $thick_end, $item_RGB, $block_count, $block_size, $block_start) = split("\cI",$feature); $strand ||= '.'; # BED3 doesn't have strand data - for 'merge' and 'complement' return Bio::SeqFeature::Generic->new( -seq_id => $chr, -primary => $name, -start => $start, -end => $end, -strand => $strand_translate{$strand}, -score => $score, -tag => { thick_start => $thick_start, thick_end => $thick_end, item_RGB => $item_RGB, block_count => $block_count, block_size => $block_size, block_start => $block_size } ); } =head2 _read_bedpe() Title : _read_bedpe() Usage : $bedtools_fac->_read_bedpe Function: return a Bio::SeqFeature::Collection object from a BEDPE file Returns : Bio::SeqFeature::Collection Args : =cut sub _read_bedpe { my ($self) = shift; my @features; if ($self->{'_result'}->{'file_name'} ne '-') { my $in = $self->{'_result'}->{'file'}; while (my $feature = $in->_readline) { chomp $feature; push @features, _read_bedpe_line($feature); } } else { for my $feature (split("\cJ", $self->stdout)) { push @features, _read_bedpe_line($feature); } } my $collection = Bio::SeqFeature::Collection->new; $collection->add_features(\@features); return $collection; } sub _read_bedpe_line { my $feature = shift; my ($chr1, $start1, $end1, $chr2, $start2, $end2, $name, $score, $strand1, $strand2, @add) = split("\cI",$feature); $strand1 ||= '.'; $strand2 ||= '.'; return Bio::SeqFeature::FeaturePair->new( -primary => $name, -seq_id => $chr1, -start => $start1, -end => $end1, -strand => $strand_translate{$strand1}, -hprimary_tag => $name, -hseqname => $chr2, -hstart => $start2, -hend => $end2, -hstrand => $strand_translate{$strand2}, -score => $score ); } =head2 _validate_file_input() Title : _validate_file_input Usage : $bedtools_fac->_validate_file_input( -type => $file ) Function: validate file type for file spec Returns : file type if valid type for file spec Args : hash of filespec => file_name =cut sub _validate_file_input { my ($self, @args) = @_; my (%args); if (grep (/^-/, @args) && (@args > 1)) { # named parms $self->throw("Wrong number of args - requires one named arg") if (@args > 2); s/^-// for @args; %args = @args; } else { $self->throw("Must provide named filespec"); } for (keys %args) { m/bam/ && do { return 'bam'; }; do { return unless ( -e $args{$_} && -r _ ); my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$args{$_}); return $guesser->guess if grep {$guesser->guess =~ m/$_/} @{$accepted_types{$_}}; } } return; } =head2 version() Title : version Usage : $version = $bedtools_fac->version() Function: Returns the program version (if available) Returns : string representing location and version of the program =cut sub version{ my ($self) = @_; my $cmd = $self->command if $self->can('command'); defined $cmd or $self->throw("No command defined - cannot determine program executable"); # new bahaviour for some BEDTools executables - breaks previous approach to getting version # $dummy can be any non-recognised parameter - '-version' works for most my $dummy = '-version'; $dummy = '-examples' if $cmd =~ /graph_union/; my ($in, $out, $err); my $dum; $in = \$dum; $out = \$self->{'stdout'}; $err = \$self->{'stderr'}; # Get program executable my $exe = $self->executable; my @ipc_args = ( $exe, $dummy ); eval { IPC::Run::run(\@ipc_args, $in, $out, $err) or die ("There was a problem running $exe : $!"); }; # We don't bother trying to catch this: version is returned as an illegal file seek my @details = split("\n",$self->stderr); (my $version) = grep /^Program: .*$/, @details; $version =~ s/^Program: //; return $version; } sub available_commands { shift->available_parameters('commands') }; sub filespec { shift->available_parameters('filespec') }; 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BEDTools/000077500000000000000000000000001302566030400221275ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BEDTools/Config.pm000066400000000000000000000366231302566030400237040ustar00rootroot00000000000000# $Id: Config.pm kortsch $ # # BioPerl module for Bio::Tools::Run::BEDTools::Config # # Please direct questions and support issues to # # Cared for by Dan Kortschak # # Copyright Dan Kortschak # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BEDTools::Config - Configuration data for bowtie commands =head1 SYNOPSIS Used internally by L. =head1 DESCRIPTION This package exports information describing BEDTools commands, parameters, switches, and input and output filetypes for individual BEDTools commands. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Dan Kortschak Email dan.kortschak adelaide.edu.au =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BEDTools::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_executables %format_lookup %command_prefixes %composite_commands @program_params @program_switches %param_translation %command_files %accepted_types ); @EXPORT_OK = qw(); our @program_commands = qw( annotate fasta_from_bed overlap bam_to_bed genome_coverage pair_to_pair bed_to_bam graph_union pair_to_bed bed_to_IGV group_by shuffle b12_to_b6 intersect slop closest links sort complement mask_fasta_from_bed subtract coverage merge window ); our %command_executables = ( 'annotate' => 'annotateBed', 'bam_to_bed' => 'bamToBed', 'bed_to_bam' => 'bedToBam', 'bed_to_IGV' => 'bedToIgv', 'b12_to_b6' => 'bed12ToBed6', 'fasta_from_bed' => 'fastaFromBed', 'mask_fasta_from_bed' => 'maskFastaFromBed', 'shuffle' => 'shuffleBed', 'window' => 'windowBed', 'closest' => 'closestBed', 'genome_coverage' => 'genomeCoverageBed', 'merge' => 'mergeBed', 'slop' => 'slopBed', 'complement' => 'complementBed', 'intersect' => 'intersectBed', 'pair_to_bed' => 'pairToBed', 'sort' => 'sortBed', 'coverage' => 'coverageBed', 'links' => 'linksBed', 'pair_to_pair' => 'pairToPair', 'subtract' => 'subtractBed', 'overlap' => 'overlap', 'group_by' => 'groupBy', 'graph_union' => 'unionBedGraphs' ); our %format_lookup = ( 'annotate' => 'bed', 'bam_to_bed' => 'bed', 'bed_to_bam' => 'bam', 'bed_to_IGV' => 'igv', 'b12_to_b6' => 'bed', 'closest' => 'bedpe', 'complement' => 'bed', 'coverage' => 'bed', 'fasta_from_bed' => 'fasta', 'genome_coverage' => 'tab', 'graph_union' => 'bg', 'group_by' => 'bed', 'intersect' => 'bed|bam', 'links' => 'html', 'mask_fasta_from_bed' => 'fasta', 'merge' => 'bed', 'overlap' => 'bed', 'pair_to_bed' => 'bedpe|bam', 'pair_to_pair' => 'bedpe', 'slop' => 'bed', 'shuffle' => 'bed', 'sort' => 'bed', 'subtract' => 'bed', 'window' => 'bedpe' ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # our %composite_commands = ( ); # prefixes only for commands that take params/switches... our %command_prefixes = ( 'annotate' => 'ann', 'bam_to_bed' => 'ate', 'bed_to_bam' => 'eta', 'bed_to_IGV' => 'eti', 'b12_to_b6' => '126', 'fasta_from_bed' => 'ffb', 'mask_fasta_from_bed' => 'mfb', 'shuffle' => 'shb', 'window' => 'wib', 'closest' => 'clb', 'genome_coverage' => 'gcb', 'merge' => 'meb', 'slop' => 'slb', 'complement' => 'cob', 'intersect' => 'inb', 'pair_to_bed' => 'ptb', 'sort' => 'sob', 'coverage' => 'cvb', 'links' => 'lib', 'pair_to_pair' => 'ptp', 'subtract' => 'sub', 'overlap' => 'ove', 'group_by' => 'grp', 'graph_union' => 'ubg' ); our @program_params = qw( command ate|tag ate|color eta|quality eti|path eti|session eti|sort eti|slop eti|image shb|exclude shb|seed wib|window_size wib|left_window_size wib|right_window_size clb|ties_policy gcb|max_depth gcb|strand meb|max_distance slb|add_bidirectional slb|add_to_left slb|add_to_right inb|minimum_overlap ptb|minimum_overlap ptb|type ptp|minimum_overlap ptp|type ptp|slop sub|minimum_overlap lib|basename lib|organism lib|genome_build ove|columns grp|group grp|columns grp|operations ubg|names ubg|filler ); our @program_switches = qw( ann|names ann|count ann|both ann|strandedness ate|write_bedpe ate|use_edit_distance ate|bam12 ate|split ate|use_edit_distance ate|cigar eta|uncompressed eta|bed12 eti|collapse eti|name ffb|use_bed_name ffb|output_tab_format ffb|strandedness gcb|bedgraph gcb|bedgraph_all gcb|split mfb|soft_mask shb|keep_chromosome wib|define_by_strand wib|same_strand wib|report_once_only wib|report_hits wib|invert clb|strandedness clb|report_distance gcb|report_pos_depth meb|strandedness meb|report_n_merged meb|report_names_merged slb|define_by_strand inb|write_bed inb|write_entry_1 inb|write_entry_2 inb|report_once_only inb|report_n_hits inb|invert_match inb|reciprocal inb|strandedness inb|write_overlap inb|write_overlap_all inb|split ptb|write_bedpe ptb|strandedness ptb|use_edit_distance ptb|write_uncompressed sob|size_asc sob|size_desc sob|chr_size_asc sob|chr_size_desc sob|chr_score_asc sob|chr_score_desc cvb|strandedness cvb|histogram cvb|depth cvb|split ptp|ignore_strand ptp|slop_strandedness ptp|no_self_hits sub|strandedness ubg|header ubg|empty ); our %param_translation = ( 'ann|names' => 'names', 'ann|counts' => 'counts', 'ann|both' => 'both', 'ann|strandedness' => 's', 'ate|write_bedpe' => 'bedpe', 'ate|use_edit_distance' => 'ed', 'ate|bam12' => 'bam12', 'ate|split' => 'split', 'ate|use_edit_distance' => 'ed', 'ate|tag' => 'tag', 'ate|color' => 'color', 'ate|cigar' => 'cigar', 'eta|quality' => 'maqp', 'eta|uncompressed' => 'ubam', 'eta|bed12' => 'bed12', 'eti|path' => 'path', 'eti|session' => 'sess', 'eti|sort' => 'sort', 'eti|collapse' => 'clps', 'eti|name' => 'name', 'eti|slop' => 'slop', 'eti|image' => 'img', 'ffb|use_bed_name' => 'names', 'ffb|output_tab_format' => 'tab', 'ffb|strandedness' => 's', 'mfb|soft_mask' => 'soft', 'shb|keep_chromosome' => 'chrom', 'shb|exclude' => 'excl', 'shb|seed' => 'seed', 'wib|define_by_strand' => 'sw', 'wib|same_strand' => 'sm', 'wib|report_once_only' => 'u', 'wib|report_hits' => 'c', 'wib|invert' => 'v', 'wib|window_size' => 'w', 'wib|left_window_size' => 'l', 'wib|right_window_size' => 'r', 'clb|strandedness' => 's', 'clb|report_distance' => 'd', 'clb|ties_policy' => 't', 'gcb|report_pos_depth' => 'd', 'gcb|max_depth' => 'max', 'gcb|bedgraph' => 'bg', 'gcb|bedgraph_all' => 'bga', 'gcb|split' => 'split', 'gcb|strand' => 'strand', 'meb|strandedness' => 's', 'meb|report_n_merged' => 'n', 'meb|report_names_merged' => 'nms', 'meb|max_distance' => 'd', 'slb|define_by_strand' => 's', 'slb|add_bidirectional' => 'b', 'slb|add_to_left' => 'l', 'slb|add_to_right' => 'r', 'inb|write_bed' => 'bed', 'inb|write_entry_1' => 'wa', 'inb|write_entry_2' => 'wb', 'inb|write_overlap' => 'wo', 'inb|write_overlap_all' => 'woa', 'inb|report_once_only' => 'u', 'inb|report_n_hits' => 'c', 'inb|invert_match' => 'v', 'inb|reciprocal' => 'r', 'inb|strandedness' => 's', 'inb|minimum_overlap' => 'f', 'inb|split' => 'split', 'ptb|write_bedpe' => 'bedpe', 'ptb|strandedness' => 's', 'ptb|minimum_overlap' => 'f', 'ptb|type' => 'type', 'ptb|use_edit_distance' => 'ed', 'ptb|write_uncompressed' => 'ubam', 'sob|size_asc' => 'sizeA', 'sob|size_desc' => 'sizeD', 'sob|chr_size_asc' => 'chrThenSizeA', 'sob|chr_size_desc' => 'chrThenSizeD', 'sob|chr_score_asc' => 'chrThenScoreA', 'sob|chr_score_desc' => 'chrThenScoreD', 'cvb|strandedness' => 's', 'cvb|histogram' => 'hist', 'cvb|depth' => 'd', 'cvb|split' => 'split', 'ptp|ignore_strand' => 'is', 'ptp|slop_strandedness' => 'ss', 'ptp|minimum_overlap' => 'f', 'ptp|type' => 'type', 'ptp|slop' => 'slop', 'ptp|no_self_hits' => 'rdn', 'sub|strandedness' => 's', 'sub|minimum_overlap' => 'f', 'lib|basename' => 'base', 'lib|organism' => 'org', 'lib|genome_build' => 'db', 'ove|columns' => 'cols', 'grp|group' => 'grp', 'grp|columns' => 'opCols', 'grp|operations' => 'ops', 'ubg|header' => 'header', 'ubg|names' => 'names', 'ubg|empty' => 'empty', 'ubg|filler' => 'filler' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_bowtie # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'annotate' => [qw( -i|bgv -files|*ann >#out )], 'bam_to_bed' => [qw( -i|bam >#out )], 'bed_to_bam' => [qw( -i|bgv -g|genome >#out )], 'bed_to_IGV' => [qw( -i|bgv >#out )], 'b12_to_b6' => [qw( -i|bed >#out )], 'fasta_from_bed' => [qw( -fi|seq -bed|bgv -fo|#out )], 'mask_fasta_from_bed' => [qw( -fi|seq -bed|bgv -fo|#out )], 'shuffle' => [qw( -i|bgv -g|genome >#out )], 'window' => [qw( -a|bgv1 -b|bgv2 >#out )], 'closest' => [qw( -a|bgv1 -b|bgv2 >#out )], 'genome_coverage' => [qw( -i|bed -g|genome >#out )], 'merge' => [qw( -i|bgv >#out )], 'slop' => [qw( -i|bgv -g|genome >#out )], 'complement' => [qw( -i|bgv -g|genome >#out )], 'intersect' => [qw( -a|#bgv1 -abam|#bam -b|bgv2 >#out )], # (bgv1|bam) required 'pair_to_bed' => [qw( -a|#bedpe -abam|#bam -b|bgv >#out )], # (bedpe|bam) required 'sort' => [qw( -i|bgv >#out )], 'coverage' => [qw( -a|bgv1 -b|bgv2 >#out )], 'links' => [qw( -i|bgv >#out )], 'pair_to_pair' => [qw( -a|bedpe1 -b|bedpe2 >#out )], 'subtract' => [qw( -a|bgv1 -b|bgv2 >#out )], 'group_by' => [qw( -i|bed >#out )], 'graph_union' => [qw( -i|*bg -g|#genome >#out )], 'overlap' => [qw( -i|bed >#out )] ); our %accepted_types = ( 'ann' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bam' => [qw()], # we need a test for this 'bed' => [qw( tab )], 'bgv' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bgv1' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bgv2' => [qw( tab vcf gff )], # BEDTools now has multiple accepted input formats: bed/gff/vcf 'bedpe' => [qw( tab )], 'bedpe1' => [qw( tab )], 'bedpe2' => [qw( tab )], 'seq' => [qw( fasta )], 'genome' => [qw( tab )], 'bg' => [qw( tab )] ); 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BWA.pm000077500000000000000000000426631302566030400215010ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::BWA # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BWA - Run wrapper for the BWA short-read assembler *BETA* =head1 SYNOPSIS # create an assembly # run BWA commands separately =head1 DESCRIPTION This module provides a wrapper interface for Heng Li's reference-directed short read assembly suite C (see L for manuals and downloads). Manipulating the alignments requires C (L) and Lincoln Stein's package C (L). There are two modes of action. =over =item * Easy assembly The first is a simple pipeline through the C commands, taking your read data in and squirting out an assembly object of type L. The pipeline is based on the one performed by C: Action maq commands ------ ------------ data conversion to fasta2bfa, fastq2bfq maq binary formats map sequence reads map to reference seq assemble, creating assemble consensus convert map & cns mapview, cns2fq files to plaintext (for B:A:IO:maq) Command-line options can be directed to the C, C, and C steps. See L below. =item * BWA command mode The second mode is direct access to C commands. To run a C command, construct a run factory, specifying the desired command using the C<-command> argument in the factory constructor, along with options specific to that command (see L): $bwafac = Bio::Tools::Run::BWA->new( -command => 'fasta2bfa' ); To execute, use the C methods. Input and output files are specified in the arguments of C (see L): $bwafac->run_bwa( -fas => "myref.fas", -bfa => "myref.bfa" ); =back =head1 OPTIONS C is complex, with many subprograms (commands) and command-line options and file specs for each. This module attempts to provide commands and options comprehensively. You can browse the choices like so: $bwafac = Bio::Tools::Run::BWA->new( -command => 'aln' ); # all maq commands @all_commands = $bwafac->available_parameters('commands'); @all_commands = $bwafac->available_commands; # alias # just for aln @aln_params = $bwafac->available_parameters('params'); @aln_switches = $bwafac->available_parameters('switches'); @aln_all_options = $bwafac->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. See L for the gory details. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $bwafac = Bio::Tools::Run::BWA->new( -command => 'aln' ); @filespec = $bwafac->filespec; This example returns the following array: fas faq >sai This indicates that the FASTA database (faq) and the FASTQ reads (faq) MUST be specified, and the STDOUT of this program (SA coordinates) MAY be slurped into a file specified in the C argument list: $bwafac->run_bwa( -fas => 'my.db.fas', -faq => 'reads.faq', -sai => 'out.sai' ); If files are not specified per the filespec, text sent to STDOUT and STDERR is saved and is accessible with C<$bwafac->stdout()> and C<$bwafac->stderr()>. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BWA; use strict; use IPC::Run; our $HAVE_IO_UNCOMPRESS; our $HAVE_SAMTOOLS; BEGIN { eval "require IO::Uncompress::Gunzip; \$HAVE_IO_UNCOMPRESS = 1"; eval "require Bio::Tools::Run::Samtools; \$HAVE_SAMTOOLS = 1"; } # Object preamble - inherits from Bio::Root::Root use Bio::Root::Root; use Bio::Tools::Run::BWA::Config; use Bio::Tools::GuessSeqFormat; use File::Basename qw(fileparse); use File::Copy; use Cwd; use base qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase ); our $program_name = 'bwa'; # name of the executable # Note: # other globals required by Bio::Tools::Run::AssemblerBase are # imported from Bio::Tools::Run::BWA::Config our $qual_param = 'quality_file'; our $use_dash = 1; our $join = ' '; # Bio::Assembly::IO::sam now workable! our $asm_format = 'sam'; =head2 new() Title : new Usage : my $obj = new Bio::Tools::Run::BWA(); Function: Builds a new Bio::Tools::Run::BWA object Returns : an instance of Bio::Tools::Run::BWA Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->parameters_changed(1); $self->_register_program_commands( \@program_commands, \%command_prefixes ); unless (grep /command/, @args) { push @args, '-command', 'run'; } $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); $self->program_name($program_name) if not defined $self->program_name(); $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI $self->_assembly_format($asm_format); return $self; } =head2 run Title : run Usage : $assembly = $bwafac->run( @args ); Function: Run the bwa assembly pipeline. Returns : Assembly results (file, IO object or Assembly object) Args : - fastq file containing single-end reads - fasta file containing the reference sequence - [optional] fastq file containing paired-end reads =cut sub run { my ($self, $rd1_file, $ref_file, $rd2_file) = @_; # Sanity checks $self->_check_executable(); unless ($HAVE_SAMTOOLS) { cluck( "Bio::Tools::Run::Samtools is not available. A .sam output alignment will be created, but must be converted to binary SAM (.bam) before it can be passed to Bio::Assembly::IO, as follows: \n\t\$ samtools view -Sb out.sam > out.bam" ); } $rd1_file or $self->throw("Fastq reads file required at arg 1"); $ref_file or $self->throw("Fasta refseq file required at arg 2"); for ($rd1_file, $ref_file, $rd2_file) { next unless $_; if (/\.gz[^.]*$/) { unless ($HAVE_IO_UNCOMPRESS) { croak( "IO::Uncompress::Gunzip not available, can't expand '$_'" ); } my ($tfh, $tf) = $self->io->tempfile; my $z = IO::Uncompress::Gunzip->new($_); while (<$z>) { print $tfh $_ } close $tfh; $_ = $tf; } } my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$rd1_file); $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 1"); $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$ref_file); $guesser->guess eq 'fasta' or $self->throw("Refseq file doesn't look like fasta at arg 2"); if ($rd2_file) { $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$rd2_file); $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 3"); } #Assemble my ($sam_file) = $self->_run($rd1_file, $ref_file, $rd2_file); if ($HAVE_SAMTOOLS) { my ($nm,$dr,$suf) = fileparse($sam_file, ".sam"); # goofy kludge for samtools... my $pwd = getcwd; chdir($dr); my $samt = Bio::Tools::Run::Samtools->new( -command => 'view', -sam_input => 1, -bam_output => 1, -refseq => $ref_file); my $bam_file = $nm.'.bam'; $samt->run( -bam => $nm.$suf, -out => $bam_file ) or croak( "Problem converting .sam file"); $samt = Bio::Tools::Run::Samtools->new( -command => 'sort' ); $samt->run( -bam => $bam_file, -pfx => $nm.'.srt' ) or croak( "Problem sorting .bam file"); move( $nm.'.srt.bam', $bam_file ); $samt = Bio::Tools::Run::Samtools->new( -command => 'index' ); $samt->run( -bam => $bam_file ); $bam_file = File::Spec->catfile($dr, $bam_file); $sam_file = $bam_file; chdir($pwd); } # Export results in desired object type my $asm = $self->_export_results($sam_file, -refdb => $ref_file, -keep_asm => 1); return $asm; } =head2 run_bwa() Title : run_bwa Usage : $obj->run_bwa( @file_args ) Function: Run a bwa command as specified during object contruction Returns : Args : a specification of the files to operate on: =cut sub run_bwa { my ($self, @args) = @_; # _translate_params will provide an array of command/parameters/switches # -- these are set at object construction # to set up the run, need to add the files to the call # -- provide these as arguments to this function my $cmd = $self->command if $self->can('command'); $self->throw("No maq command specified for the object") unless $cmd; # setup files necessary for this command my $filespec = $command_files{$cmd}; $self->throw("No command-line file specification is defined for command '$cmd'; check Bio::Tools::Run::Maq::Config") unless $filespec; # parse args based on filespec # require named args $self->throw("Named args are required") unless !(@args % 2); s/^-// for @args; my %args = @args; # validate my @req = map { my $s = $_; $s =~ s/^[012]?[<>]//; $s =~ s/[^a-zA-Z0-9_]//g; $s } grep !/[#]/, @$filespec; !defined($args{$_}) && $self->throw("Required filearg '$_' not specified") for @req; # set up redirects my ($in, $out, $err); for (@$filespec) { m/^1?>(.*)/ && do { defined($args{$1}) && ( open($out,">", $args{$1}) or $self->throw("Open for write error : $!")); next; }; m/^2>#?(.*)/ && do { defined($args{$1}) && (open($err, ">", $args{$1}) or $self->throw("Open for write error : $!")); next; }; m/^<#?(.*)/ && do { defined($args{$1}) && (open($in, "<", $args{$1}) or $self->throw("Open for read error : $!")); next; } } my $dum; $in || ($in = \$dum); $out || ($out = \$self->{'stdout'}); $err || ($err = \$self->{'stderr'}); # Get program executable my $exe = $self->executable; # Get command-line options my $options = $self->_translate_params(); # Get file specs sans redirects in correct order my @specs = map { my $s = $_; $s =~ s/[^a-zA-Z0-9_]//g; $s } grep !/[<>]/, @$filespec; my @files = @args{@specs}; # expand arrayrefs my $l = $#files; for (0..$l) { splice(@files, $_, 1, @{$files[$_]}) if (ref($files[$_]) eq 'ARRAY'); } @files = map { defined $_ ? $_ : () } @files; # squish undefs my @ipc_args = ( $exe, @$options, @files ); eval { IPC::Run::run(\@ipc_args, $in, $out, $err) or die ("There was a problem running $exe : $!"); }; if ($@) { $self->throw("$exe call crashed: $@"); } # return arguments as specified on call return @args; } =head2 stdout() Title : stdout Usage : $fac->stdout() Function: store the output from STDOUT for the run, if no file specified in run_maq() Example : Returns : scalar string Args : on set, new value (a scalar or undef, optional) =cut sub stdout { my $self = shift; return $self->{'stdout'} = shift if @_; return $self->{'stdout'}; } =head2 stderr() Title : stderr Usage : $fac->stderr() Function: store the output from STDERR for the run, if no file is specified in run_maq() Example : Returns : scalar string Args : on set, new value (a scalar or undef, optional) =cut sub stderr { my $self = shift; return $self->{'stderr'} = shift if @_; return $self->{'stderr'}; } =head1 Bio::Tools::Run::AssemblerBase overrides =head2 _check_sequence_input() No-op. =cut sub _check_sequence_input { return 1; } =head2 _check_optional_quality_input() No-op. =cut sub _check_optional_quality_input { return 1; } =head2 _prepare_input_sequences Convert input fastq and fasta to maq format. =cut sub _prepare_input_sequences { shift->throw_not_implemented; } =head2 _collate_subcmd_args() Title : _collate_subcmd_args Usage : $args_hash = $self->_collate_subcmd_args Function: collate parameters and switches into command-specific arg lists for passing to new() Returns : hash of named argument lists Args : [optional] composite cmd prefix (scalar string) [default is 'run'] =cut sub _collate_subcmd_args { my $self = shift; my $cmd = shift; my %ret; # default command is 'run' $cmd ||= 'run'; my @subcmds = @{$composite_commands{$cmd}}; my %subcmds; my $cur_options = $self->{'_options'}; # collate foreach my $subcmd (@subcmds) { # find the composite cmd form of the argument in # the current params and switches # e.g., map_max_mismatches my @params = grep /^${subcmd}_/, @{$$cur_options{'_params'}}; my @switches = grep /^${subcmd}_/, @{$$cur_options{'_switches'}}; $ret{$subcmd} = []; # create an argument list suitable for passing to new() of # the subcommand factory... foreach my $opt (@params, @switches) { my $subopt = $opt; $subopt =~ s/^${subcmd}_//; push(@{$ret{$subcmd}}, '-'.$subopt => $self->$opt) if defined $self->$opt; } } return \%ret; } =head2 _run() Title : _run Usage : $factory->_run() Function: Run a bwa assembly pipeline Returns : a text-formatted sam alignment Args : - single end read file in maq bfq format - reference seq file in maq bfa format - [optional] paired end read file in maq bfq format =cut sub _run { my ($self, $rd1_file, $ref_file, $rd2_file) = @_; my ($cmd, $filespec, @ipc_args); # Get program executable my $exe = $self->executable; my $paired = $rd1_file && $rd2_file; my $tdir = $self->tempdir(); my ($saih, $saif) = $self->io->tempfile(-template => 'saiXXXX', -dir=>$tdir); my ($sai2h, $sai2f) = $self->io->tempfile(-template => 'saiXXXX', -dir=>$tdir) if $paired; my ($samh, $samf) = $self->io->tempfile(-template => 'saiXXXX', -dir=>$tdir); $_->close for ($saih, $samh); $sai2h->close if $paired; my $subcmd_args = $self->_collate_subcmd_args(); # index the fasta file (bwa's, not samtools', index...) my $bwa = Bio::Tools::Run::BWA->new( -command => 'index', @{$subcmd_args->{idx}} ); $bwa->run_bwa( -fas => $ref_file ); # map reads to reference seqs $bwa = Bio::Tools::Run::BWA->new( -command => 'aln', @{$subcmd_args->{aln}} ); $bwa->run_bwa( -fas => $ref_file, -faq => $rd1_file, -sai => $saif ); # do paired run if nec $bwa->run_bwa( -fas => $ref_file, -faq => $rd2_file, -sai => $sai2f ) if $paired; # assemble reads $bwa = Bio::Tools::Run::BWA->new( -command => ($paired ? 'sampe' : 'samse'), @{$subcmd_args->{ ($paired ? 'smp' : 'sms' ) }} ); if ($paired) { $bwa->run_bwa( -fas => $ref_file, -sai1 => $saif, -faq1 => $rd1_file, -sai2 => $sai2f, -faq2 => $rd2_file, -sam => $samf ); } else { $bwa->run_bwa( -fas => $ref_file, -sai => $saif, -faq => $rd1_file, -sam => $samf ); } # note this returns a text-sam file-- needs conversion for B:A:IO::sam... # conversion done in run(), if Bio::Tools::Run::Samtools available. return $samf; } =head2 available_parameters() Title : available_parameters Usage : @cmds = $fac->available_commands('commands'); Function: Use to browse available commands, params, or switches Returns : array of scalar strings Args : 'commands' : all bwa commands 'params' : parameters for this object's command 'switches' : boolean switches for this object's command 'filespec' : the filename spec for this object's command 4Geeks : Overrides Bio::ParameterBaseI via Bio::Tools::Run::AssemblerBase =cut sub available_parameters { my $self = shift; my $subset = shift; for ($subset) { # get commands !defined && do { # delegate return $self->SUPER::available_parameters($subset); }; m/^c/i && do { return grep !/^run$/, @program_commands; }; m/^f/i && do { # get file spec return @{$command_files{$self->command}}; }; do { #else delegate... return $self->SUPER::available_parameters($subset); }; } } sub available_commands { shift->available_parameters('commands') }; sub filespec { shift->available_parameters('filespec') }; 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BWA/000077500000000000000000000000001302566030400211255ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BWA/Config.pm000077500000000000000000000137761302566030400227110ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::BWA::Config # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BWA::Config - Configuration data for BWA commands =head1 SYNOPSIS Used internally by L. =head1 DESCRIPTION This package exports information describing BWA commands, parameters, switches, and input and output filetypes for individual BWA commands. See L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj@fortinbras.us Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BWA::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root ); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_prefixes %composite_commands @program_params @program_switches %param_translation %command_files ); @EXPORT_OK = qw(); our @program_commands = qw( run index aln samse sampe dbwtsw ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # our %composite_commands = ( 'run' => [qw( map asm c2q )] ); # prefixes only for commands that take params/switches... our %command_prefixes = ( 'aln' => 'aln', 'dbwtsw' => 'bwt', 'index' => 'idx', 'samse' => 'sms', 'sampe' => 'smp', 'run' => 'run' ); our @program_params = qw( command idx|output_prefix idx|algorithm aln|max_edit_dist aln|max_gap_opens aln|max_gap_extns aln|deln_protect_3p aln|deln_protect_ends aln|subseq_seed aln|max_edit_dist_seed aln|n_threads aln|mm_penalty aln|gap_open_penalty aln|gap_extn_penalty aln|subopt_hit_threshold aln|trim_parameter sms|hit_limit smp|max_insert_size smp|max_read_occur bwt|match_score bwt|mm_penalty bwt|gap_open_penalty bwt|gap_extn_penalty bwt|n_threads bwt|band_width bwt|rel_min_score_threshold bwt|threshold_adj_coeff bwt|z_best bwt|max_sa_interval bwt|min_seeds_to_skip ); our @program_switches = qw( idx|color_space_idx aln|reverse_no_comp aln|no_iter_search ); our %param_translation = ( 'idx|output_prefix' => 'p', 'idx|algorithm' => 'a', 'aln|max_edit_dist' => 'n', 'aln|max_gap_opens' => 'o', 'aln|max_gap_extns' => 'e', 'aln|deln_protect_3p' => 'd', 'aln|deln_protect_ends' => 'i', 'aln|subseq_seed' => 'l', 'aln|max_edit_dist_seed' => 'k', 'aln|n_threads' => 't', 'aln|mm_penalty' => 'M', 'aln|gap_open_penalty' => 'O', 'aln|gap_extn_penalty' => 'E', 'aln|subopt_hit_threshold' => 'R', 'aln|trim_parameter' => 'q', 'sms|hit_limit' => 'n', 'smp|max_insert_size' => 'a', 'smp|max_read_occur' => 'o', 'bwt|match_score' => 'a', 'bwt|mm_penalty' => 'b', 'bwt|gap_open_penalty' => 'q', 'bwt|gap_extn_penalty' => 'r', 'bwt|n_threads' => 't', 'bwt|band_width' => 'w', 'bwt|rel_min_score_threshold' => 'T', 'bwt|threshold_adj_coeff' => 'c', 'bwt|z_best' => 'z', 'bwt|max_sa_interval' => 's', 'bwt|min_seeds_to_skip' => 'N' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_maq # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'run' => [qw( faq fas faq )], 'index' => [qw( fas )], 'aln' => [qw( fas faq >sai )], 'samse' => [qw( fas sai faq >sam )], 'sampe' => [qw( fas sai1 sai2 faq1 faq2 >sam )], 'dbwtsw' => [qw( fas faq )] ); INIT { # add subcommand params and switches for # composite commands my @sub_params; my @sub_switches; foreach my $cmd (keys %composite_commands) { foreach my $subcmd ( @{$composite_commands{$cmd}} ) { my @sub_program_params = grep /^$subcmd\|/, @program_params; my @sub_program_switches = grep /^$subcmd\|/, @program_switches; for (@sub_program_params) { m/^$subcmd\|(.*)/; push @sub_params, "$cmd\|${subcmd}_".$1; } for (@sub_program_switches) { m/^$subcmd\|(.*)/; push @sub_switches, "$cmd\|${subcmd}_".$1; } } } push @program_params, @sub_params; push @program_switches, @sub_switches; # translations for subcmd params/switches not necessary } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BlastPlus.pm000077500000000000000000000075351302566030400230000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::BlastPlus # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::BlastPlus - A wrapper for NCBI's blast+ suite =head1 SYNOPSIS Give standard usage here =head1 DESCRIPTION Blast+ is NCBI's successor to the C family of programs. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::BlastPlus; use strict; use warnings; use lib '../../..'; use Bio::Root::Root; use Bio::Tools::Run::BlastPlus::Config; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::WrapperBase::CommandExts; use base qw(Bio::Tools::Run::WrapperBase Bio::Root::Root); =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::BlastPlus(); Function: Builds a new Bio::Tools::Run::BlastPlus object Returns : an instance of Bio::Tools::Run::BlastPlus Args : =cut sub new { my ($class,@args) = @_; $program_dir ||= $ENV{BLASTPLUSDIR}; my $self = $class->SUPER::new(@args); return $self; } =head2 program_version() Title : program_version Usage : $version = $bedtools_fac->program_version() Function: Returns the program version (if available) Returns : string representing location and version of the program Note : this works around the WrapperBase::version() method conflicting with the -version parameter for SABlast (good argument for not having getter/setters for these) =cut =head2 package_version() Title : package_version Usage : $version = $bedtools_fac->version() Function: Returns the BLAST+ package version (if available) Returns : string representing BLAST+ package version (may differ from version()) =cut sub program_version { my ($self) = @_; if (!defined $self->{program_version}) { $self->_version; } $self->{program_version} || ''; } sub package_version { my ($self) = @_; if (!defined $self->{package_version}) { $self->_version; } $self->{package_version} || ''; } sub _version { my $self = shift; my ($in, $out, $err); # Get program executable my $exe = $self->executable; my @ipc_args = ( $exe, '-version'); eval { IPC::Run::run(\@ipc_args, \$in, \$out, \$err) or die ("There was a problem running $exe : $!"); }; if ($out =~ /blastdbcmd\:\s+(\S+)\nPackage\:\s+([^,]+)/xms) { @{$self}{qw(program_version package_version)} = ($1, $2); } else { $self->throw("Unknown version output: $out"); } } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BlastPlus/000077500000000000000000000000001302566030400224255ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/BlastPlus/Config.pm000077500000000000000000000203201302566030400241700ustar00rootroot00000000000000#$Id$ package Bio::Tools::Run::BlastPlus::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_prefixes %composite_commands @program_params @program_switches %param_translation %command_files $program_name $program_dir $use_dash $join ); @EXPORT_OK = qw(); # getting the parms and switches from the usage string: #$ blastp -h | perl -ne '@a = m/\[(.*?)\]/g; for $a (@a) { @b = split(/\s+/,$a); $b[0]=~s/-//; $ptr = (@b==1 ? \@s : \@p ); push @$ptr, $b[0]; } END { print "p arms\n", join("\n",@p), "\n\n", "switches\n", join("\n",@s); }' # '*' indicates a 'pseudo'-program : i.e. each # command has its own executable... our $program_name = '*blast+'; our $use_dash = 'single'; our $join = ' '; our @program_commands = qw( run blastn blastx tblastx tblastn blastp psiblast rpsblast rpstblastn makeblastdb blastdb_aliastool blastdbcmd blastdbcheck convert2blastmask dustmasker segmasker windowmasker ); # full command => prefix our %command_prefixes = ( run => 'run', blastn => 'bln', blastx => 'blx', tblastx => 'tbx', tblastn => 'tbn', blastp => 'blp', psiblast => 'psi', rpsblast => 'rps', rpstblastn => 'rpst', makeblastdb => 'mak', blastdb_aliastool => 'dba', blastdbcmd => 'dbc', blastdbcheck => 'dbk', convert2blastmask => 'c2m', dustmasker => 'dms', segmasker => 'sms', windowmasker => 'wms' ); # each elt : pfx|wrapper_parm_name our @program_params = qw( command tbn|import_search_strategy tbn|export_search_strategy tbn|db tbn|dbsize tbn|gilist tbn|negative_gilist tbn|entrez_query tbn|subject tbn|subject_loc tbn|query tbn|out tbn|evalue tbn|word_size tbn|gapopen tbn|gapextend tbn|xdrop_ungap tbn|xdrop_gap tbn|xdrop_gap_final tbn|searchsp tbn|db_gencode tbn|frame_shift_penalty tbn|max_intron_length tbn|seg tbn|soft_masking tbn|matrix tbn|threshold tbn|culling_limit tbn|best_hit_overhang tbn|best_hit_score_edge tbn|window_size tbn|query_loc tbn|outfmt tbn|num_descriptions tbn|num_alignments tbn|max_target_seqs tbn|num_threads tbn|comp_based_stats tbn|in_pssm blx|import_search_strategy blx|export_search_strategy blx|db blx|dbsize blx|gilist blx|negative_gilist blx|entrez_query blx|db_soft_mask blx|subject blx|subject_loc blx|query blx|out blx|evalue blx|word_size blx|gapopen blx|gapextend blx|xdrop_ungap blx|xdrop_gap blx|xdrop_gap_final blx|searchsp blx|frame_shift_penalty blx|max_intron_length blx|seg blx|soft_masking blx|matrix blx|threshold blx|culling_limit blx|best_hit_overhang blx|best_hit_score_edge blx|window_size blx|query_loc blx|strand blx|query_gencode blx|outfmt blx|num_descriptions blx|num_alignments blx|max_target_seqs blx|num_threads bln|import_search_strategy bln|export_search_strategy bln|task bln|db bln|dbsize bln|gilist bln|negative_gilist bln|entrez_query bln|db_soft_mask bln|subject bln|subject_loc bln|query bln|out bln|evalue bln|word_size bln|gapopen bln|gapextend bln|perc_identity bln|xdrop_ungap bln|xdrop_gap bln|xdrop_gap_final bln|searchsp bln|penalty bln|reward bln|min_raw_gapped_score bln|template_type bln|template_length bln|dust bln|filtering_db bln|window_masker_taxid bln|window_masker_db bln|soft_masking bln|culling_limit bln|best_hit_overhang bln|best_hit_score_edge bln|window_size bln|use_index bln|index_name bln|query_loc bln|strand bln|outfmt bln|num_descriptions bln|num_alignments bln|max_target_seqs bln|num_threads blp|import_search_strategy blp|export_search_strategy blp|task blp|db blp|dbsize blp|gilist blp|negative_gilist blp|entrez_query blp|db_soft_mask blp|subject blp|subject_loc blp|query blp|out blp|evalue blp|word_size blp|gapopen blp|gapextend blp|xdrop_ungap blp|xdrop_gap blp|xdrop_gap_final blp|searchsp blp|seg blp|soft_masking blp|matrix blp|threshold blp|culling_limit blp|best_hit_overhang blp|best_hit_score_edge blp|window_size blp|query_loc blp|outfmt blp|num_descriptions blp|num_alignments blp|max_target_seqs blp|num_threads blp|comp_based_stats psi|import_search_strategy psi|export_search_strategy psi|db psi|dbsize psi|gilist psi|negative_gilist psi|entrez_query psi|subject psi|subject_loc psi|query psi|out psi|evalue psi|word_size psi|gapopen psi|gapextend psi|xdrop_ungap psi|xdrop_gap psi|xdrop_gap_final psi|searchsp psi|seg psi|soft_masking psi|matrix psi|threshold psi|culling_limit psi|best_hit_overhang psi|best_hit_score_edge psi|window_size psi|query_loc psi|outfmt psi|num_descriptions psi|num_alignments psi|max_target_seqs psi|num_threads psi|comp_based_stats psi|gap_trigger psi|num_iterations psi|out_pssm psi|out_ascii_pssm psi|in_msa psi|in_pssm psi|pseudocount psi|inclusion_ethresh psi|phi_pattern rpst|import_search_strategy rpst|export_search_strategy rpst|db rpst|dbsize rpst|gilist rpst|negative_gilist rpst|entrez_query rpst|query rpst|out rpst|evalue rpst|word_size rpst|xdrop_ungap rpst|xdrop_gap rpst|xdrop_gap_final rpst|searchsp rpst|query_gencode rpst|seg rpst|soft_masking rpst|window_size rpst|query_loc rpst|strand rpst|outfmt rpst|num_descriptions rpst|num_alignments rpst|max_target_seqs rpst|num_threads mak|in mak|dbtype mak|title mak|mask_data mak|out mak|max_file_sz mak|taxid mak|taxid_map mak|logfile dba|gi_file_in dba|gi_file_out dba|db dba|dbtype dba|title dba|gilist dba|out dba|dblist dba|num_volumes dba|logfile tbx|import_search_strategy tbx|export_search_strategy tbx|db tbx|dbsize tbx|gilist tbx|negative_gilist tbx|entrez_query tbx|subject tbx|subject_loc tbx|query tbx|out tbx|evalue tbx|word_size tbx|gapopen tbx|gapextend tbx|xdrop_ungap tbx|xdrop_gap tbx|xdrop_gap_final tbx|searchsp tbx|max_intron_length tbx|seg tbx|soft_masking tbx|matrix tbx|threshold tbx|culling_limit tbx|best_hit_overhang tbx|best_hit_score_edge tbx|window_size tbx|query_loc tbx|strand tbx|query_gencode tbx|db_gencode tbx|outfmt tbx|num_descriptions tbx|num_alignments tbx|max_target_seqs tbx|num_threads dbc|db dbc|dbtype dbc|entry dbc|entry_batch dbc|pig dbc|range dbc|strand dbc|mask_sequence_with dbc|out dbc|outfmt dbc|line_length c2m|in c2m|out c2m|outfmt dms|in dms|out dms|window dms|level dms|linker dms|outfmt sms|in sms|out sms|infmt sms|outfmt sms|window sms|locut sms|hicut wms|ustat wms|in wms|out wms|checkdup wms|window wms|t_extend wms|t_thres wms|t_high wms|t_low wms|set_t_high wms|set_t_low wms|infmt wms|outfmt wms|sformat wms|convert wms|fa_list wms|mem wms|smem wms|unit wms|genome_size wms|dust wms|dust_level wms|exclude_ids wms|ids wms|text_match wms|use_ba ); # each elt : pfx|wrapper_switch_name our @program_switches = qw( tbn|h tbn|help tbn|ungapped tbn|lcase_masking tbn|parse_deflines tbn|show_gis tbn|html tbn|remote tbn|use_sw_tback tbn|version blx|h blx|help blx|ungapped blx|lcase_masking blx|parse_deflines blx|show_gis blx|html blx|remote blx|version bln|h bln|help bln|no_greedy bln|ungapped bln|lcase_masking bln|parse_deflines bln|show_gis bln|html bln|remote bln|version blp|h blp|help blp|lcase_masking blp|parse_deflines blp|show_gis blp|html blp|ungapped blp|remote blp|use_sw_tback blp|version psi|h psi|help psi|lcase_masking psi|parse_deflines psi|show_gis psi|html psi|remote psi|use_sw_tback psi|version rpst|h rpst|help rpst|ungapped rpst|lcase_masking rpst|parse_deflines rpst|show_gis rpst|html rpst|remote rpst|version mak|h mak|help mak|parse_seqids mak|hash_index mak|version dba|h dba|help dba|version tbx|h tbx|help tbx|lcase_masking tbx|parse_deflines tbx|show_gis tbx|html tbx|remote tbx|version dbc|h dbc|help dbc|info dbc|target_only dbc|get_dups dbc|ctrl_a dbc|version c2m|h c2m|help c2m|parse_seqids c2m|version dms|h dms|help dms|xmlhelp dms|parse_seqids dms|version-full sms|h sms|help sms|xmlhelp sms|parse_seqids sms|version-full wms|h wms|help wms|xmlhelp wms|parse_seqids wms|version-full wms|mk_counts ); #each pair : pfx|wrapper_opt_name => command_line_name (without dashes) # for blast+, the options are all long and mnemonic, so a param translation # isn't required. In CommandExts, a parameter name should be passed through # as-is, if a translation is not found---. our %param_translation = ( ); our %composite_commands = ( ); our %command_files = ( ); 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Bowtie.pm000066400000000000000000000637341302566030400223200ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Bowtie # # Please direct questions and support issues to # # Cared for by Dan Kortschak # # Copyright Dan Kortschak and Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Bowtie - Run wrapper for the Bowtie short-read assembler *BETA* =head1 SYNOPSIS # create an index $bowtie_build = Bio::Tools::Run::Bowtie->new(); $index = $bowtie_fac->run( 'reference.fasta', 'index_base' ); # or with named args... $index = $bowtie_fac->run( -ref => 'reference.fasta', -ind => 'index_base' ); # get the base name of the last index from an index builder $index = $bowtie_fac->result; # create an assembly $bowtie_fac = Bio::Tools::Run::Bowtie->new(); $bowtie_fac->want('Bio::Assembly::Scaffold'); $bowtie_assy = $bowtie_fac->run( 'reads.fastq', 'index_base' ); # if IO::Uncompress::Gunzip is available and with named args... $bowtie_assy = $bowtie_fac->run( -seq => 'reads.fastq.gz', -ind => 'index_base' ); # paired-end $bowtie_fac = Bio::Tools::Run::Bowtie->new(-command => 'paired', -want => 'Bio::Assembly::Scaffold'); $bowtie_assy = $bowtie_fac->run( 'reads.fastq', 'index_base', 'paired-reads.fastq' ); # be more strict $bowtie_fac->set_parameters( -max_qual_mismatch => 50 ); # create a Bio::Assembly::Scaffold object $bowtie_assy = $bowtie_fac->run( 'reads.fastq', 'index_base', 'paired-reads.fastq' ); # print consensus sequences from assembly object for $contig ($bowtie_assy->all_contigs) { print $contig->get_consensus_sequence->seq,"\n"; } # get the file object of the last assembly $io = $bowtie_fac->result( -want => 'Bio::Root::IO' ); # get a merged SeqFeature::Collection of all hits # - currently only available with SAM format $io = $bowtie_fac->result( -want => 'Bio::SeqFeature::Collection' ); #... or the file name directly $filename = $bowtie_fac->result( -want => 'raw' ); =head1 DESCRIPTION This module provides a wrapper interface for Ben Langmead and Col Trapnell's ultrafast memory-efficient short read aligner C (see L for manuals and downloads). =head1 OPTIONS C is complex, with many command-line options. This module attempts to provide and options comprehensively. You can browse the choices like so: $bowtiefac = Bio::Tools::Run::Bowtie->new( -command => 'single' ); # all bowtie commands @all_commands = $bowtiefac->available_parameters('commands'); @all_commands = $bowtiefac->available_commands; # alias # just for single @assemble_params = $bowtiefac->available_parameters('params'); @assemble_switches = $bowtiefac->available_parameters('switches'); @assemble_all_options = $bowtiefac->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. As a number of options are mutually exclusive, and the interpretation of intent is based on last-pass option reaching bowtie with potentially unpredicted results. This module will prevent inconsistent switches and parameters from being passed. See L for details of bowtie options. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $bowtiefac = Bio::Tools::Run::Bowtie->new( -command => 'paired' ); @filespec = $bowtiefac->filespec; This example returns the following array: ind seq seq2 #out This indicates that ind (C index file base name), seq (fasta/fastq),and seq2 (fasta/fastq) files MUST be specified, and that the out file MAY be specified. Use these in the C call like so: $bowtiefac->run( -ind => 'index_base', -seq => 'seq-a.fq', -seq2 => 'seq-b.fq', -out => 'align.out' ); Note that named parameters in this form allow you to specify the location of the outfile; without named parameters, the outfile is located in a tempdir and does not persist beyond the life of the object - with the exception of index creation. The object will store the programs STDOUT and STDERR output for you in the C and C attributes: handle_map_warning($bowtiefac) if ($bowtiefac->stderr =~ /warning/); =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L Rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Dan Kortschak Email dan.kortschak adelaide.edu.au =head1 CONTRIBUTORS Mark A. Jensen (maj -at- fortinbras -dot- us) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Bowtie; use strict; our $HAVE_IO_UNCOMPRESS; BEGIN { eval 'require IO::Uncompress::Gunzip; $HAVE_IO_UNCOMPRESS = 1'; } use IPC::Run; # Object preamble - inherits from Bio::Root::Root use lib '../../..'; use Bio::Tools::Run::Bowtie::Config; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::WrapperBase::CommandExts; use Bio::Tools::GuessSeqFormat; use Bio::Tools::Run::Samtools; use Bio::Seq; use File::Basename; use base qw( Bio::Tools::Run::WrapperBase Bio::Tools::Run::AssemblerBase ); ## bowtie our $program_name = '*bowtie'; our $default_cmd = 'single'; our $asm_format; # this is determined dynamically # Note: # other globals required by Bio::Tools::Run::AssemblerBase are # imported from Bio::Tools::Run::Bowtie::Config our $qual_param = undef; our $use_dash = 'mixed'; our $join = ' '; =head2 new() Title : new Usage : my $obj = new Bio::Tools::Run::Bowtie(); Function: Builds a new Bio::Tools::Run::Bowtie object Returns : an instance of Bio::Tools::Run::Bowtie Args : =cut sub new { my ($class,@args) = @_; unless (grep /command/, @args) { push @args, '-command', $default_cmd; } #default to SAM output if no other format specified and we are running an alignment my %args=@args; if ($args{'-command'} =~ m/(?:single|paired|crossbow)/) { unless (grep /(?:sam_format|concise|quiet|refout|refidx)/, @args) { push @args, ('-sam_format', 1); } } my $self = $class->SUPER::new(@args); foreach (keys %command_executables) { $self->executables($_, $self->_find_executable($command_executables{$_})); } my ($want) = $self->_rearrange([qw(WANT)],@args); $self->want($want); $asm_format = $self->_assembly_format; $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI return $self; } =head2 run() Title : run Usage : $assembly = $bowtie_assembler->run($read1_fastq_file, $index_location, $read2_fastq_file); $assembly = $bowtie_assembler->run(%params); Function: Run the bowtie assembly pipeline. Returns : Assembly results (file, IO object or Assembly object) Args : - fastq file containing single-end reads - name of the base of the bowtie index - [optional] fastq file containing paired-end reads Named params are also available with args: -seq, -seq2, -ind (bowtie index), -ref (fasta reference) and -out Note : gzipped inputs are allowed if IO::Uncompress::Gunzip is available The behaviour for locating indexes follows the definition in the bowtie manual - you may use the environment variable BOWTIE_INDEXES to specify the index path or use an 'indexes' directory under the directory where the bowtie executable is located =cut sub run { my $self = shift; my ($arg1, $arg2, $arg3); # these are useless names because the different # programs take very different arguments my ($index, $seq, $seq2, $ref, $out); # these are the meaningful names that are used # with named args if (!(@_ % 2)) { my %args = @_; if ((grep /^-\w+/, keys %args) == keys %args) { ($index, $seq, $seq2, $ref, $out) = $self->_rearrange([qw( IND SEQ SEQ2 REF OUT )], @_); } elsif (grep /^-\w+/, keys %args) { $self->throw("Badly formed named args: ".join(' ',@_)); } else { ($arg1, $arg2) = @_; } } else { if (grep /^-\w+/, @_) { $self->throw("Badly formed named args: ".join(' ',@_)); } else { ($arg1, $arg2, $arg3) = @_; } } # Sanity checks $self->_check_executable(); my $cmd = $self->command if $self->can('command'); for ($cmd) { m/(?:single|paired|crossbow)/ && do { $seq ||= $arg1; $index ||= $arg2; $seq2 ||= $arg3; $seq or $self->throw("Fasta/fastq/raw read(s) file/Bio::Seq required at arg 1/-seq"); $index or $self->throw("Bowtie index base required at arg 2/-index"); # expand gzipped files as nec. for ($seq, $seq2) { next unless $_; if (/\.gz[^.]*$/) { unless ($HAVE_IO_UNCOMPRESS) { croak( "IO::Uncompress::Gunzip not available, can't expand '$_'" ); } my ($tfh, $tf) = $self->io->tempfile; my $z = IO::Uncompress::Gunzip->new($_); while (<$z>) { print $tfh $_ } close $tfh; $_ = $tf; } } # confirm index files exist $self->_validate_file_input( -ind => $index ) or ($self->_validate_file_input( -ind => $self->io->catfile(dirname($self->executable),'indexes',$index)) and $index = $self->io->catfile(dirname($self->executable),'indexes',$index)) or ($self->_validate_file_input( -ind => $self->io->catfile($ENV{BOWTIE_INDEXES},$index)) and $index = $self->io->catfile($ENV{BOWTIE_INDEXES},$index)) or $self->throw("Incorrect filetype (expecting bowtie index) or absent file arg 2/-index"); # bowtie prepare the multiple input types $seq = $self->_prepare_input_sequences($seq); if ($cmd =~ m/^p/) { $seq2 && ($seq2 = $self->_prepare_input_sequences($seq2)); } else { $seq2 && $self->throw("Second sequence input not wanted for command: $cmd"); } # Assemble my $format = $self->_assembly_format; my $suffix = '.'.$format; if ($out) { $out .= $suffix; } else { my ($bowtieh, $bowtief) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => $suffix ); $bowtieh->close; $out = $bowtief; } my %params = ( -ind => $index, -seq => $seq, -seq2 => $seq2, -out => $out ); map { delete $params{$_} unless defined $params{$_} } keys %params; $self->_run(%params); $self->{'_result'}->{'index'} = $index; $self->{'_result'}->{'file_name'} = $out; $self->{'_result'}->{'format'} = $format; $self->{'_result'}->{'file'} = Bio::Root::IO->new( -file => $out ); return $self->result; }; m/build/ && do { $ref ||= $arg1; $index ||= $arg2; $ref or $self->throw("Fasta read(s) file/Bio::Seq required at arg 1/-ref"); $index ||= $self->io->tempdir(CLEANUP => 1).'/index'; # we want a new one each time $arg3 && $self->throw("Second sequence input not wanted for command: $cmd"); my $format = $self->_assembly_format; # expand gzipped file as nec. if ($ref =~ (m/\.gz[^.]*$/)) { unless ($HAVE_IO_UNCOMPRESS) { croak( "IO::Uncompress::Gunzip not available, can't expand '$_'" ); } my ($tfh, $tf) = $self->io->tempfile; my $z = IO::Uncompress::Gunzip->new($_); while (<$z>) { print $tfh $_ } close $tfh; $ref = $tf; } # bowtie prepare the two input types for the first argument $ref = $self->_prepare_input_sequences($ref); # Build index $self->_run( -ref => $ref, -out => $index ); $self->{'_result'}->{'format'} = $format; $self->{'_result'}->{'file_name'} = $index; return $index; }; m/inspect/ && do { $index ||= $arg1; $out ||= $arg2; $index or $self->throw("Bowtie index required at arg 1"); $self->_validate_file_input( -ind => $index ) or ($self->_validate_file_input( -ind => $self->io->catfile(dirname($self->executable),'indexes',$index)) and $index = $self->io->catfile(dirname($self->executable),'indexes',$index)) or ($self->_validate_file_input( -ind => $self->io->catfile($ENV{BOWTIE_INDEXES},$index)) and $index = $self->io->catfile($ENV{BOWTIE_INDEXES},$index)) or $self->throw("'$index' doesn't look like a bowtie index or index component is missing at arg 1/-ind"); $arg3 && $self->throw("Second sequence input not wanted for command: $cmd"); # Inspect index my $format = $self->_assembly_format; my $suffix = '.'.$format; if ($out) { $out .= $suffix; } else { my ($desch, $descf) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => $suffix ); $desch->close; $out = $descf; } $self->_run( -ind => $index, -out => $out ); $self->{'_result'}->{'file_name'} = $out; $self->{'_result'}->{'format'} = $format; $self->{'_result'}->{'file'} = Bio::Root::IO->new( -file => $out ); return $self->result; } } } =head2 want() Title : want Usage : $bowtiefac->want( $class ) Function: make factory return $class, or raw (scalar) results in file Returns : return wanted type Args : [optional] string indicating class or raw of wanted result =cut sub want { my $self = shift; return $self->{'_want'} = shift if @_; return $self->{'_want'}; } =head2 result() Title : result Usage : $bowtiefac->result( [-want => $type|$format] ) Function: return result in wanted format Returns : results Args : [optional] hashref of wanted type =cut sub result { my ($self, @args) = @_; my $want = $self->want ? $self->want : $self->want($self->_rearrange([qw(WANT)],@args)); my $cmd = $self->command if $self->can('command'); my $format = $self->{'_result'}->{'format'}; return $self->{'_result'}->{'format'} if (defined $want && $want eq 'format'); return $self->{'_result'}->{'file_name'} if (!$want || $want eq 'raw' || $cmd eq 'build'); return $self->{'_result'}->{'file'} if ($want =~ m/^Bio::Root::IO/); for ($cmd) { m/(?:single|paired|crossbow)/ && do { my $scaffold; for ($format) { m/^bowtie/i && $want =~ m/^Bio::Assembly::Scaffold/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::Assembly::Scaffold/) { $self->{'_result'}->{'object'} = $self->_export_results( $self->{'_result'}->{'file_name'}, -index => $self->{'_result'}->{'index'}, -keep_asm => 1 ); } last; }; m/^bowtie/i && $want =~ m/^Bio::SeqFeature::Collection/ && do { $self->warn("Don't know how to create a $want object for $cmd with bowtie format - try SAM format."); last; }; m/^sam/i && $want =~ m/^Bio::Assembly::Scaffold/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::Assembly::Scaffold/) { my $bamf = $self->_make_bam($self->{'_result'}->{'file_name'}, 1); my $inspector = Bio::Tools::Run::Bowtie->new( -command => 'inspect' ); my $refdb = $inspector->run($self->{'_result'}->{'index'}); $self->{'_result'}->{'object'} = $self->_export_results($bamf, -refdb => $refdb, -keep_asm => 1 ); } last; }; m/^sam/i && $want =~ m/^Bio::SeqFeature::Collection/ && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::Assembly::Scaffold/) { my $bamf = $self->_make_bam($self->{'_result'}->{'file_name'}, 0); my $convert = Bio::Tools::Run::BEDTools->new( -command => 'bam_to_bed' ); my $bedf = $convert->run( -bed => $bamf ); my $merge = Bio::Tools::Run::BEDTools->new( -command => 'merge' ); $merge->run($self->{'_result'}->{'index'}); $self->{'_result'}->{'object'} = $merge->result( -want => $want ); } last; }; do { $self->warn("Don't know how to create a $want object for $cmd."); return; } }; last; }; m/inspect/ && do { for ($want) { m/^Bio::SeqIO/ && $format eq 'fasta' && do { unless (defined $self->{'_result'}->{'object'} && ref($self->{'_result'}->{'object'}) =~ m/^Bio::SeqIO/) { $self->{'_result'}->{'object'} = Bio::SeqIO->new(-file => $self->{'_result'}->{'file'}, -format => 'fasta'); } last; }; m/^Bio::SeqIO/ && $format ne 'fasta' && do { $self->warn("Don't know how to create a $want object for names only - try -want => 'Bio::Root::IO'."); return; }; do { $self->warn("Don't know how to create a $want object for $cmd."); return; } } } } return $self->{'_result'}->{'object'}; } =head2 _determine_format() Title : _determine_format Usage : $bowtiefac->_determine_format Function: determine the format of output for current options Returns : format of bowtie output Args : =cut sub _determine_format { my ($self) = shift; my $cmd = $self->command if $self->can('command'); for ($cmd) { m/build/ && do { return 'ebwt'; }; m/inspect/ && do { $self->{'_summary'} && return 'text'; return $self->{'_names_only'} ? 'text' : 'fasta'; }; m/(?:single|paired|crossbow)/ && do { my $format = 'bowtie'; # this is our default position for (keys %format_lookup) { $format = $format_lookup{$_} if $self->{'_'.$_}; } return $format; } } } =head2 _make_bam() Title : _make_bam Usage : $bowtiefac->_make_bam( $file, $sort ) Function: make a sorted BAM format file from SAM file Returns : sorted BAM file name Args : SAM file name and boolean flag to select sorted BAM format =cut sub _make_bam { my ($self, $file, $sort) = @_; $self->throw("'$file' does not exist or is not readable") unless ( -e $file && -r _ ); # make a sorted bam file from a sam file input my ($bamh, $bamf) = $self->io->tempfile( -dir => $self->tempdir(), -suffix => '.bam' ); $bamh->close; my $samt = Bio::Tools::Run::Samtools->new( -command => 'view', -sam_input => 1, -bam_output => 1 ); $samt->run( -bam => $file, -out => $bamf ); if ($sort) { my ($srth, $srtf) = $self->io->tempfile( -dir => $self->io->tempdir(CLEANUP=>1), -suffix => '.srt' ); # shared tempdir, so make new - otherwise it is scrubbed during Bio::DB::Sam $srth->close; $samt = Bio::Tools::Run::Samtools->new( -command => 'sort' ); $samt->run( -bam => $bamf, -pfx => $srtf); return $srtf.'.bam'; } else { return $bamf; } } =head2 _validate_file_input() Title : _validate_file_input Usage : $bowtiefac->_validate_file_input( -type => $file ) Function: validate file type for file spec Returns : file type if valid type for file spec Args : hash of filespec => file_name =cut sub _validate_file_input { my ($self, @args) = @_; my (%args); if (grep (/^-/, @args)) { # named parms $self->throw("Wrong number of args - requires one named arg") if (@args > 2); s/^-// for @args; %args = @args; } else { $self->throw("Must provide named filespec"); } for (keys %args) { m/^seq|seq2|ref$/ && do { return unless ( -e $args{$_} && -r _ ); my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$args{$_}); return $guesser->guess if grep {$guesser->guess =~ m/$_/} @{$accepted_types{$_}}; }; m/^ind$/ && do { return 'ebwt' if (-e $args{$_}.'.1.ebwt' && -e $args{$_}.'.2.ebwt' && -e $args{$_}.'.3.ebwt' && -e $args{$_}.'.4.ebwt' && -e $args{$_}.'.rev.1.ebwt' && -e $args{$_}.'.rev.2.ebwt'); } } return; } =head1 Bio::Tools::Run::AssemblerBase overrides =head2 _assembly_format() Title : _assembly_format Usage : $bowtiefac->_determine_format Function: set the format of output for current options Returns : format of bowtie output Args : =cut sub _assembly_format { my $self = shift; my $format = $self->_determine_format; return $self->SUPER::_assembly_format($format); } =head2 _check_sequence_input() No-op. =cut sub _check_sequence_input { return 1; } =head2 _check_optional_quality_input() No-op. =cut sub _check_optional_quality_input { return 1; } =head2 _prepare_input_sequences() Prepare and check input sequences for bowtie. =cut sub _prepare_input_sequences { my ($self, @args) = @_; my (%args, $read); if (grep (/^-/, @args)) { # named parms $self->throw("Input args not an even number") unless !(@args % 2); %args = @args; ($read) = @args{qw( -sequence )}; } else { ($read) = @args; } # Could use the AssemblerBase routine for this, except that would not permit # an array of strings if ($self->inline) { # expect inline data if (UNIVERSAL::isa($read,'can') && $read->isa("Bio::PrimarySeqI")) { # we have a Bio::*Seq* $read=$read->seq(); } else { # we have something else if (ref($read) =~ /ARRAY/i) { my @ts; foreach my $seq (@$read) { if ($seq->isa("Bio::PrimarySeqI")) { $seq=$seq->seq(); } else { next if $read=~m/[[^:alpha:]]/; } push @ts,$seq; } $self->throw("bowtie requires at least one sequence read") unless (@ts); if (@ts>1) { $read="'".join(',',@ts)."'"; } else { ($read)=@ts; } } else { #must be a string... fail if non-alpha $self->throw("bowtie requires at least one valid sequence read") if $read=~m/[[^:alpha:]]/; } } } else { # expect file(s) - so test whether it's/they're appropriate # and make a comma-separated list of filenames my @ts = (ref($read) =~ /ARRAY/i) ? @$read : ($read); for my $file (@ts) { if ( -e $file ) { my $cmd = $self->command if $self->can('command'); my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$file); for ($guesser->guess) { m/^fasta$/ && do { $cmd =~ m/^b/ && last; ($cmd =~ m/^c/ or $self->fastq or $self->raw) and $self->throw("Fasta reads file '$file' inappropriate"); $self->fasta(1); last; }; m/^fastq$/ && do { ($cmd =~ m/^[cb]/ or $self->fasta or $self->raw) and $self->throw("Fastq reads file '$file' inappropriate"); $self->fastq(1); last; }; m/^tab$/ && do { $cmd =~ m/^c/ or $self->throw("Crossbow reads file '$file' inappropriate"); # this is unrecoverable since the object has default program defined last; }; m/^raw$/ && do { ($cmd =~ m/^[cb]/ or $self->fasta or $self->fastq) and $self->throw("Raw reads file '$file' inappropriate"); $self->raw(1); last; }; do { $self->throw("File '$file' not a recognised bowtie input filetype"); } } } else { $self->throw("Sequence read file '$file' does not exist"); } } if (@ts>1) { $read="'".join(',',@ts)."'"; } else { ($read)=@ts; } } return $read; } =head2 set_parameters() Title : set_parameters Usage : $bowtiefac->set_parameters(%params); Function: sets the parameters listed in the hash or array, maintaining sane options. Returns : true on success Args : [optional] hash or array of parameter/values. Note : This will unset conflicts and set required options, but will not prevent non-sane requests in the arguments =cut sub set_parameters { my ($self, @args) = @_; # Mutually exclusive switches/params prevented from being set to # avoid confusion resulting from setting incompatible switches. $self->throw("Input args not an even number") if (@args % 2); my %args = @args; foreach (keys %args) { my @added; my @removed; s/^-//; foreach my $conflict (@{$incompat_params{$_}}) { return if grep /$conflict/, @added; delete $args{'-'.$conflict}; $args{'-'.$conflict} = undef if $self->{'_'.$conflict}; push @removed, $conflict; } foreach my $requirement (@{$corequisite_switches{$_}}) { return if grep /$requirement/, @removed; $args{'-'.$requirement}=1 if $args{$_}; push @added, $requirement; } } return $self->SUPER::set_parameters(%args); } =head2 version() Title : version Usage : $version = $bowtiefac->version() Function: Returns the program version (if available) Returns : string representing location and version of the program =cut sub version{ my ($self) = @_; my $cmd = $self->command if $self->can('command'); defined $cmd or $self->throw("No command defined - cannot determine program executable"); my ($in, $out, $err); my $dum; $in = \$dum; $out = \$self->{'stdout'}; $err = \$self->{'stderr'}; # Get program executable my $exe = $self->executable; # Get version switch from switches, translate and dash it my $version_switch = $param_translation{"$command_prefixes{$cmd}|version"}; $version_switch = $self->_dash_switch( $version_switch ); my @ipc_args = ( $exe, $version_switch ); eval { IPC::Run::run(\@ipc_args, $in, $out, $err) or die ("There was a problem running $exe : $!"); }; if ($@) { $self->throw("$exe call crashed: $@"); } my @details = split("\n",$self->stdout); (my $version) = grep /$exe version [[:graph:]]*$/, @details; $version =~ s/version //; (my $addressing) = grep /-bit$/, @details; return $version.' '.$addressing; } sub available_commands { shift->available_parameters('commands') }; sub filespec { shift->available_parameters('filespec') }; 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Bowtie/000077500000000000000000000000001302566030400217455ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Bowtie/Config.pm000066400000000000000000000443231302566030400235160ustar00rootroot00000000000000# $Id: Config.pm kortsch $ # # BioPerl module for Bio::Tools::Run::Bowtie::Config # # Please direct questions and support issues to # # Cared for by Dan Kortschak # # Copyright Dan Kortschak and Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Bowtie::Config - Configuration data for bowtie commands =head1 SYNOPSIS Used internally by L. =head1 DESCRIPTION This package exports information describing bowtie commands, parameters, switches, and input and output filetypes for individual bowtie commands. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Dan Kortschak Email dan.kortschak adelaide.edu.au Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Bowtie::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_executables %format_lookup %command_prefixes %composite_commands @program_params @program_switches %incompat_params %corequisite_switches %param_translation %command_files %accepted_types ); @EXPORT_OK = qw(); our @program_commands = qw( single paired crossbow build inspect ); our %command_executables = ( 'single' => 'bowtie', 'paired' => 'bowtie', 'crossbow' => 'bowtie', 'build' => 'bowtie-build', 'inspect' => 'bowtie-inspect' ); # These should be in clobbering order - more delicate formats first our %format_lookup = ( 'sam_format' => 'sam', 'refidx' => 'bowtie', 'concise' => undef, 'suppress_columns' => undef, 'refout' => undef ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # our %composite_commands = ( ); # prefixes only for commands that take params/switches... our %command_prefixes = ( 'single' => 'one', 'paired' => 'par', 'crossbow' => 'crb', 'build' => 'bld', 'inspect' => 'ins' ); our @program_params = qw( command one|qualities one|skip one|upto one|trim5 one|trim3 one|max_seed_mismatches one|max_qual_mismatch one|max_quality_sum one|snp_penalty one|snp_frac one|seed_length one|max_mismatches one|max_backtracks one|max_search_ram one|report_n_alignments one|supress one|supress_random one|offset_base one|defaul_mapq one|sam_rg one|suppress_columns one|alignmed_file one|unaligned_file one|excess_file one|threads one|offrate one|random_seed par|qualities1 par|qualities2 par|min_insert_size par|max_insert_size par|max_mate_attempts bld|max_bucket_block bld|max_bucket_div bld|diff_cover bld|off_rate bld|ftabchars bld|seed bld|cutoff ins|seq_width ); our @program_switches = qw( one|fastq one|fasta one|raw one|inline one|color_space one|phred33 one|phred64 one|solexa one|solexa1_3 one|integer_qual one|no_maq_rounding one|no_forward_alignment one|no_reverse_alignment one|try_hard one|all one|best one|strata one|fix_strand_bias one|print_color one|color_quals one|color_keep_ends one|sam_format one|sam_no_head one|sam_no_sq one|concise one|time one|be_quiet one|ref_map one|ref_index one|full_ref_name one|memory_mapped_io one|shared_memory par|forward_reverse par|reverse_reverse par|forward_forward bld|fasta bld|inline bld|color_space bld|both bld|no_auto bld|packed bld|no_diff_cover bld|no_ref bld|just_ref bld|NtoA bld|big_endian bld|little_endian ins|names_only ins|summary ins|reconstruct ); # be careful of collisions here - this could do with command specification our %incompat_params = ( qualities => [qw( qualities1 qualities2 )], qualities1 => [qw( qualities )], qualities2 => [qw( qualities )], max_seed_mismatches => [qw( max_mismatches )], max_mismatches => [qw( max_seed_mismatches )], fastq => [qw( fasta raw inline )], fasta => [qw( fastq raw inline )], raw => [qw( fastq fasta inline )], inline => [qw( fastq fasta raw )], phred33 => [qw( phred64 solexa solexa1_3 integer_qual )], phred64 => [qw( phred33 solexa solexa1_3 integer_qual )], solexa => [qw( phred33 phred64 solexa1_3 integer_qual )], solexa1_3 => [qw( phred33 phred64 solexa integer_qual )], integer_qual => [qw( phred33 phred64 solexa solexa1_3 )], no_forward_alignment => [qw( no_reverse_alignment )], no_reverse_alignment => [qw( no_forward_alignment )], all => [qw( report_n_alignments )], report_n_alignments => [qw( all )], forward_reverse => [qw( reverse_reverse forward_forward )], reverse_reverse => [qw( forward_reverse forward_forward )], forward_forward => [qw( reverse_reverse forward_forward )], color_space => [qw( both )], both => [qw( color_space)] ); our %corequisite_switches = ( qualities1 => [qw( qualities2 )], qualities2 => [qw( qualities1 )], strata => [qw( best )], suppress_random => [qw( best )], snp_penalty => [qw( color_space )], snp_frac => [qw( color_space )], print_color => [qw( color_space )], color_quals => [qw( color_space )], color_keep_ends => [qw( color_space )], defaul_mapq => [qw( sam_format )], sam_no_head => [qw( sam_format )], sam_no_sq => [qw( sam_format )], sam_rg => [qw( sam_format )] ); our %param_translation = ( 'one|fastq' => 'q', 'one|fasta' => 'f', 'one|raw' => 'r', 'one|inline' => 'c', 'one|color_space' => 'C', 'one|qualities' => 'Q', 'one|skip' => 's', 'one|upto' => 'u', 'one|trim5' => '5', 'one|trim3' => '3', 'one|phred33' => 'phred33-quals', 'one|phred64' => 'phred64-quals', 'one|solexa' => 'solexa-quals', 'one|solexa1_3' => 'solexa1.3-quals', 'one|integer_qual' => 'integer-quals', 'one|max_seed_mismatches' => 'n', 'one|max_qual_mismatch' => 'e', 'one|seed_length' => 'l', 'one|no_maq_rounding' => 'nomaqround', 'one|max_mismatches' => 'v', 'one|no_forward_alignment' => 'nofw', 'one|no_reverse_alignment' => 'norc', 'one|max_backtracks' => 'maxbts', 'one|try_hard' => 'y', 'one|max_search_ram' => 'chunkmbs', 'one|report_n_alignments' => 'k', 'one|all' => 'a', 'one|supress' => 'm', 'one|supress_random' => 'M', 'one|best' => 'best', 'one|strata' => 'strata', 'one|snp_penalty' => 'snpphred', 'one|snp_frac' => 'snpfrac', 'one|print_color' => 'col-cseq', 'one|color_quals' => 'colc-cqual', 'one|color_keep_ends' => 'col-keepends', 'one|sam_format' => 'S', 'one|defaul_mapq' => 'mapq', 'one|sam_no_head' => 'sam_nohead', 'one|sam_no_sq' => 'sam_nosq', 'one|sam_rg' => 'sam-RG', 'one|suppress_columns' => 'suppress', 'one|time' => 't', 'one|offset_base' => 'B', 'one|be_quiet' => 'quiet', 'one|ref_map' => 'refout', 'one|ref_index' => 'refidx', 'one|alignmed_file' => 'al', 'one|unaligned_file' => 'un', 'one|excess_file' => 'max', 'one|full_ref_name' => 'fullref', 'one|threads' => 'p', 'one|offrate' => 'o', 'one|memory_mapped_io' => 'mm', 'one|shared_memory' => 'shmem', 'one|random_seed' => 'seed', 'one|version' => 'version', 'par|fastq' => 'q', 'par|fasta' => 'f', 'par|raw' => 'r', 'par|inline' => 'c', 'par|color_space' => 'C', 'par|qualities' => 'Q', # Don't know if bowtie will accept this - won't break if left in 'par|qualities1' => 'Q1', 'par|qualities2' => 'Q2', 'par|skip' => 's', 'par|upto' => 'u', 'par|trim5' => '5', 'par|trim3' => '3', 'par|phred33' => 'phred33-quals', 'par|phred64' => 'phred64-quals', 'par|solexa' => 'solexa-quals', 'par|solexa1_3' => 'solexa1.3-quals', 'par|integer_qual' => 'integer-quals', 'par|max_seed_mismatches' => 'n', 'par|max_qual_mismatch' => 'e', 'par|seed_length' => 'l', 'par|no_maq_rounding' => 'nomaqround', 'par|max_mismatches' => 'v', 'par|min_insert_size' => 'I', 'par|max_insert_size' => 'X', 'par|forward_reverse' => 'fr', 'par|reverse_forward' => 'rf', 'par|forward_forward' => 'ff', 'par|no_forward_alignment' => 'nofw', 'par|no_reverse_alignment' => 'norc', 'par|max_backtracks' => 'maxbts', 'par|max_mate_attempts' => 'pairtries', 'par|try_hard' => 'y', 'par|max_search_ram' => 'chunkmbs', 'par|report_n_alignments' => 'k', 'par|all' => 'a', 'par|suppress' => 'm', 'par|suppress_random' => 'M', 'par|best' => 'best', 'par|strata' => 'strata', 'par|snp_penalty' => 'snpphred', 'par|snp_frac' => 'snpfrac', 'par|print_color' => 'col-cseq', 'par|color_quals' => 'colc-cqual', 'par|color_keep_ends' => 'col-keepends', 'par|sam_format' => 'S', 'par|defaul_mapq' => 'mapq', 'par|sam_no_head' => 'sam_nohead', 'par|sam_no_sq' => 'sam_nosq', 'par|sam_rg' => 'sam-RG', 'par|suppress_columns' => 'suppress', 'par|time' => 't', 'par|offset_base' => 'B', 'par|be_quiet' => 'quiet', 'par|ref_map' => 'refout', 'par|ref_index' => 'refidx', 'par|alignmed_file' => 'al', 'par|unaligned_file' => 'un', 'par|excess_file' => 'max', 'par|full_ref_name' => 'fullref', 'par|threads' => 'p', 'par|offrate' => 'o', 'par|memory_mapped_io' => 'mm', 'par|shared_memory' => 'shmem', 'par|random_seed' => 'seed', 'par|version' => 'version', 'crb|fastq' => 'q', 'crb|fasta' => 'f', 'crb|raw' => 'r', 'crb|inline' => 'c', 'crb|color_space' => 'C', 'crb|qualities' => 'Q', 'crb|skip' => 's', 'crb|upto' => 'u', 'crb|trim5' => '5', 'crb|trim3' => '3', 'crb|phred33' => 'phred33-quals', 'crb|phred64' => 'phred64-quals', 'crb|solexa' => 'solexa-quals', 'crb|solexa1_3' => 'solexa1.3-quals', 'crb|integer_qual' => 'integer-quals', 'crb|max_seed_mismatches' => 'n', 'crb|max_qual_mismatch' => 'e', 'crb|seed_length' => 'l', 'crb|no_maq_rounding' => 'nomaqround', 'crb|max_mismatches' => 'v', 'crb|min_insert_size' => 'I', 'crb|max_insert_size' => 'X', 'crb|forward_reverse' => 'fr', 'crb|reverse_forward' => 'rf', 'crb|forward_forward' => 'ff', 'crb|no_forward_alignment' => 'nofw', 'crb|no_reverse_alignment' => 'norc', 'crb|max_backtracks' => 'maxbts', 'crb|max_mate_attempts' => 'pairtries', 'crb|try_hard' => 'y', 'crb|max_search_ram' => 'chunkmbs', 'crb|report_n_alignments' => 'k', 'crb|all' => 'a', 'crb|suppress' => 'm', 'crb|suppress_random' => 'M', 'crb|best' => 'best', 'crb|strata' => 'strata', 'crb|snp_penalty' => 'snpphred', 'crb|snp_frac' => 'snpfrac', 'crb|print_color' => 'col-cseq', 'crb|color_quals' => 'colc-cqual', 'crb|color_keep_ends' => 'col-keepends', 'crb|sam_format' => 'S', 'crb|defaul_mapq' => 'mapq', 'crb|sam_no_head' => 'sam_nohead', 'crb|sam_no_sq' => 'sam_nosq', 'crb|sam_rg' => 'sam-RG', 'crb|suppress_columns' => 'suppress', 'crb|time' => 't', 'crb|offset_base' => 'B', 'crb|be_quiet' => 'quiet', 'crb|ref_map' => 'refout', 'crb|ref_index' => 'refidx', 'crb|alignmed_file' => 'al', 'crb|unaligned_file' => 'un', 'crb|excess_file' => 'max', 'crb|full_ref_name' => 'fullref', 'crb|threads' => 'p', 'crb|offrate' => 'o', 'crb|memory_mapped_io' => 'mm', 'crb|shared_memory' => 'shmem', 'crb|random_seed' => 'seed', 'crb|version' => 'version', 'bld|fasta' => 'f', 'bld|inline' => 'c', 'bld|color_space' => 'C', 'bld|both' => 'B', 'bld|no_auto' => 'a', 'bld|packed' => 'p', 'bld|max_bucket_block' => 'bmax', 'bld|max_bucket_div' => 'bmaxdivn', 'bld|diff_cover' => 'dcv', 'bld|no_diff_cover' => 'nodc', 'bld|no_ref' => 'r', 'bld|just_ref' => '3', 'bld|off_rate' => 'o', 'bld|ftabchars' => 't', 'bld|NtoA' => 'ntoa', 'bld|big_endian' => 'big', 'bld|little_endian' => 'little', 'bld|seed' => 'seed', 'bld|cutoff' => 'cutoff', 'bld|version' => 'version', 'ins|seq_width' => 'a', 'ins|names_only' => 'n', 'ins|summary' => 's', 'ins|reconstruct' => 'e', 'ins|version' => 'version' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_bowtie # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'single' => [qw( ind seq #out )], 'paired' => [qw( ind -1|seq -2|seq2 #out )], 'crossbow' => [qw( ind -12|seq #out )], 'build' => [qw( ref #out )], 'inspect' => [qw( ind >#out )] ); our %accepted_types = ( # ind is not a single file, so not included here 'seq' => [qw( fasta fastq raw crossbow )], 'seq2' => [qw( fasta fastq raw )], 'ref' => [qw( fasta )] ); foreach (@program_params) { push @program_params, "par\|".$1 if (m/^one\|(.*)/); push @program_params, "crb\|".$1 if (m/^par\|(.*)/) && !(m/^par\|(?:fasta|fastq|raw|qualities[12])/); } foreach (@program_switches) { push @program_switches, "par\|".$1 if (m/^one\|(.*)/); push @program_switches, "crb\|".$1 if (m/^par\|(.*)/) && !(m/^par\|(?:fasta|fastq|raw)/); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Cap3.pm000066400000000000000000000213171302566030400216440ustar00rootroot00000000000000# You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Cap3 - wrapper for CAP3 =head1 SYNOPSIS use Bio::Tools::Run::Cap3; # Run Cap3 using an input FASTA file my $factory = Bio::Tools::Run::Cap3->new( -clipping_range => 150 ); my $asm_obj = $factory->run($fasta_file, $qual_file); # An assembly object is returned by default for my $contig ($assembly->all_contigs) { ... do something ... } # Read some sequences use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta'); my @seqs; while (my $seq = $sio->next_seq()) { push @seqs,$seq; } # Run Cap3 using input sequence objects and returning an assembly file my $asm_file = 'results.ace'; $factory->out_type($asm_file); $factory->run(\@seqs); =head1 DESCRIPTION Wrapper module for CAP3 program =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHORS Marc Logghe =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Cap3; use strict; use File::Copy; use base qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase); our $program_name = 'cap3'; our @program_params = (qw( band_expansion_size differences_quality_cutoff clipping_quality_cutoff max_qscore_sum extra_nof_differences max_gap_length gap_penalty_factor max_overhang_percent match_score_factor mismatch_score_factor overlap_length_cutoff overlap_identity_cutoff reverse_orientation_value overlap_score_cutoff max_word_occurrences min_correction_constraints min_linking_constraints clipping_info_file output_prefix_string clipping_range min_clip_good_reads )); our @program_switches; our %param_translation = ( 'band_expansion_size' => 'a', 'differences_quality_cutoff' => 'b', 'clipping_quality_cutoff' => 'c', 'max_qscore_sum' => 'd', 'extra_nof_differences' => 'e', 'max_gap_length' => 'f', 'gap_penalty_factor' => 'g', 'max_overhang_percent' => 'h', 'match_score_factor' => 'm', 'mismatch_score_factor' => 'n', 'overlap_length_cutoff' => 'o', 'overlap_identity_cutoff' => 'p', 'reverse_orientation_value' => 'r', 'overlap_score_cutoff' => 's', 'max_word_occurrences' => 't', 'min_correction_constraints' => 'u', 'min_linking_constraints' => 'v', 'clipping_info_file' => 'w', 'output_prefix_string' => 'x', 'clipping_range' => 'y', 'min_clip_good_reads' => 'z' ); our $qual_param; our $use_dash = 1; our $join = ' '; our $asm_format = 'ace'; =head2 new Title : new Usage : $factory->new( -overlap_length_cutoff => 35, -overlap_identity_cutoff => 98 # % } Function: Create a new Cap3 factory Returns : A Bio::Tools::Run::Cap3 object Args : Cap3 options available in this module: band_expansion_size specify band expansion size N > 10 (20) differences_quality_cutoff specify base quality cutoff for differences N > 15 (20) clipping_quality_cutoff specify base quality cutoff for clipping N > 5 (12) max_qscore_sum specify max qscore sum at differences N > 20 (200) extra_nof_differences specify clearance between no. of diff N > 10 (30) max_gap_length specify max gap length in any overlap N > 1 (20) gap_penalty_factor specify gap penalty factor N > 0 (6) max_overhang_percent specify max overhang percent length N > 2 (20) match_score_factor specify match score factor N > 0 (2) mismatch_score_factor specify mismatch score factor N < 0 (-5) overlap_length_cutoff / minimum_overlap_length specify overlap length cutoff > 20 (40) overlap_identity_cutoff / minimum_overlap_similarity specify overlap percent identity cutoff N > 65 (80) reverse_orientation_value specify reverse orientation value N >= 0 (1) overlap_score_cutoff specify overlap similarity score cutoff N > 400 (900) max_word_occurrences specify max number of word matches N > 30 (300) min_correction_constraints specify min number of constraints for correction N > 0 (3) min_linking_constraints specify min number of constraints for linking N > 0 (2) clipping_info_file specify file name for clipping information (none) output_prefix_string specify prefix string for output file names (cap) clipping_range specify clipping range N > 5 (250) min_clip_good_reads specify min no. of good reads at clip pos N > 0 (3) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); *minimum_overlap_length = \&overlap_length_cutoff; *minimum_overlap_similarity = \&overlap_identity_cutoff; $self->program_name($program_name) if not defined $self->program_name(); $self->_assembly_format($asm_format); return $self; } =head2 out_type Title : out_type Usage : $assembler->out_type('Bio::Assembly::ScaffoldI') Function: Get/set the desired type of output Returns : The type of results to return Args : Desired type of results to return (optional): 'Bio::Assembly::IO' object 'Bio::Assembly::ScaffoldI' object (default) The name of a file to save the results in =cut =head2 run Title : run Usage : $asm = $factory->run($fasta_file); Function: Run CAP3 Returns : Assembly results (file, IO object or assembly object) Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut =head2 _run Title : _run Usage : $factory->_run() Function: Make a system call and run Cap3 Returns : An assembly file Args : - FASTA file - optional QUAL file =cut sub _run { my ($self, $fasta_file, $qual_file) = @_; # Move quality file to proper place my $tmp_qual_file = "$fasta_file.qual"; if ($qual_file && not $qual_file eq $tmp_qual_file) { $tmp_qual_file = "$fasta_file.qual"; # by Cap3 convention link ($qual_file, $tmp_qual_file) or copy ($qual_file, $tmp_qual_file) or $self->throw("Could not copy file '$qual_file' to '$tmp_qual_file': $!"); } # Setup needed files and filehandles my ($output_fh, $output_file) = $self->_prepare_output_file( ); # Get program executable my $exe = $self->executable; # Get command-line options my $options = join ' ', @{$self->_translate_params()}; # Usage: cap3 File_of_reads [options] my $commandstring = "$exe $fasta_file $options"; if ($self->verbose() >= 0) { $self->debug( "$exe command = $commandstring\n" ); } open(CAP3, "$commandstring |") || $self->throw(sprintf("%s call crashed: %s %s\n", $self->program_name, $!, $commandstring)); local $/ = undef; #my ($result) = ; # standard output of the program ; close CAP3; close $output_fh; # Result files my $prefix = $self->output_prefix_string() || 'cap'; my $ace_file = "$fasta_file.$prefix.ace"; my $contigs_file = "$fasta_file.$prefix.contigs"; $qual_file = "$fasta_file.$prefix.contigs.links"; my $links_file = "$fasta_file.$prefix.contigs.qual"; my $info_file = "$fasta_file.$prefix.info"; my $singlet_file = "$fasta_file.$prefix.singlets"; # Remove all files except for the ACE file for my $file ($contigs_file, $qual_file, $links_file, $info_file, $singlet_file, $tmp_qual_file) { unlink $file; } # Move the ACE file to its final destination move ($ace_file, $output_file) or $self->throw("Could not move file '$ace_file' to '$output_file': $!"); return $output_file; } 1;bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Coil.pm000077500000000000000000000172631302566030400217540ustar00rootroot00000000000000# Wrapper module for Coil Bio::Tools::Run::Coil # # Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Coil # originally written by Marc Sohrmann (ms2@sanger.ac.uk) # Written in BioPipe by Balamurugan Kumarasamy # Please direct questions and support issues to # # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Coil - wrapper for ncoils program =head1 SYNOPSIS # Build a Coil factory my $factory = Bio::Tools::Run::Coil->new($params); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION This module is a wrapper for the B program available via L for predicting coiled coils in protein sequences. By default it looks for an executable called I and data/parameter files in the directory specified by the I environmental variable. =head1 REFERENCES Lupas, van Dyke & Stock, I, Science B<252>:1162-1164, 1991. Lupas, A., I, Meth. Enzymology B<266>:513-525, 1996. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHORS Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Coil originally written by Marc Sohrmann (ms2@sanger.ac.uk) Written in BioPipe by Balamurugan Kumarasamy # Please direct questions and support issues to # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Coil; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @COIL_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Coil; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @COIL_PARAMS=qw(PROGRAM VERBOSE QUIET SILENT); foreach my $attr ( @COIL_PARAMS) { $OK_FIELD{$attr}++; } } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'ncoils'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{COILSDIR}) if $ENV{COILSDIR}; } =head2 new Title : new Usage : $coil->new(@params) Function: creates a new Coil factory Returns: Bio::Tools::Run::Coil Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED. Use $obj->run instead. Function: Runs Coil and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs Coil and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI, or a Fasta filename. =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ) { # it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { #The argument is not a seq object but a sequence in a fasta file. #Perhaps should check here or before if this file is fasta format...if not die #Here the file does not need to be created or deleted. Its already written and may be used by other runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str =$self->executable." -f < ".$self->{'input'}." > ".$outfile; if($self->quiet || $self->verbose <=0 || $self->silent){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $str.=" 2>$null"; } my $status = system($str); $self->throw( "Coil call ($str) crashed: $? \n") unless $status==0; my $coil_parser = Bio::Tools::Coil->new(); my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (COIL, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*COIL; } else { $filehandle = $outfile; } my @coil_feat; while(my $coil_feat = $coil_parser->next_result($filehandle)){ push @coil_feat, $coil_feat; } $self->cleanup(); close($tfh1); undef $tfh1; unlink $outfile; return @coil_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/EMBOSSApplication.pm000066400000000000000000000225741302566030400242400ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::EMBOSSApplication # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::EMBOSSApplication - class for EMBOSS Applications =head1 SYNOPSIS use Bio::Factory::EMBOSS; # get an EMBOSS application object from the EMBOSS factory $factory = Bio::Factory::EMBOSS->new(); $application = $factory->program('embossversion'); # run the application with an optional hash containing parameters $result = $application->run(); # returns a string or creates a file print $result . "\n"; $water = $factory->program('water'); # here is an example of running the application # water can compare 1 seq against 1->many sequences # in a database using Smith-Waterman my $seq_to_test; # this would have a seq here my @seqs_to_check; # this would be a list of seqs to compare my $wateroutfile = 'out.water'; $water->run({-sequencea => $seq_to_test, -seqall => \@seqs_to_check, -gapopen => '10.0', -gapextend => '0.5', -outfile => $wateroutfile}); # now you might want to get the alignment use Bio::AlignIO; my $alnin = Bio::AlignIO->new(-format => 'emboss', -file => $wateroutfile); while ( my $aln = $alnin->next_aln ) { # process the alignment -- these will be Bio::SimpleAlign objects } =head1 DESCRIPTION The EMBOSSApplication class can represent EMBOSS any program. It is created by a L object. If you want to check command line options before sending them to the program set $prog-Everbose to positive integer. The ADC description of the available command line options is then parsed in and compared to input. See also L and L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing lists Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Heikki Lehvaslaiho Email heikki-at-bioperl-dot-org =head2 CONTRIBUTORS Email: jason-AT-bioperl_DOT_org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::EMBOSSApplication; use vars qw( $SEQIOLOADED $ALIGNIOLOADED ); use strict; use Data::Dumper; use Bio::Tools::Run::EMBOSSacd; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); sub new { my($class, $args) = @_; my $self = $class->SUPER::new(); $self->name($args->{'name'}); $self->verbose($args->{'verbose'}); $self->acd if $self->verbose > 0; return $self; } =head2 run Title : run Usage : $embossapplication->run($attribute_hash) Function: Runs the EMBOSS program. Returns : string or creates files for now; will return objects! Args : hash of input to the program =cut sub run { my ($self, $input) = @_; $self->io->_io_cleanup(); # test input $self->debug( Dumper($input) ); # parse ACD information $self->acd if $self->verbose > 0; # collect the options into a string my $option_string = ''; foreach my $attr (keys %{$input}) { my $attr_name = substr($attr, 1) if substr($attr, 0, 1) =~ /\W/; my $array = 0; if( defined $input->{$attr} && ref($input->{$attr}) ) { my (@pieces); if( $array = (ref($input->{$attr}) =~ /array/i) ) { foreach my $s ( @{$input->{$attr}} ) { @pieces = @{$input->{$attr}}; } } else { @pieces = ($input->{$attr}); } if( ! defined $pieces[0] ) { # we ignore for now $self->warn("specified a parameter $attr with no value"); $input->{$attr} = undef; return; } elsif( $pieces[0]->isa('Bio::PrimarySeqI') ) { unless( $SEQIOLOADED ) { require Bio::SeqIO; $SEQIOLOADED = 1; } my ($tfh,$tempfile) = $self->io->tempfile(-dir => $self->tempdir); my $out = Bio::SeqIO->new(-format => 'fasta', -fh => $tfh); foreach my $seq ( @pieces ) { $out->write_seq($seq); } $out->close(); $input->{$attr} = $tempfile; close($tfh); undef $tfh; } elsif( $pieces[0]->isa('Bio::Align::AlignI') ) { unless( $ALIGNIOLOADED ) { require Bio::AlignIO; $ALIGNIOLOADED = 1; } my ($tfh,$tempfile) = $self->io->tempfile(); my $out = Bio::AlignIO->new(-format => 'msf', -fh => $tfh); foreach my $p ( @pieces ) { $out->write_aln($p); } $input->{$attr} = $tempfile; close($tfh); undef $tfh; } } # check each argument against ACD if ($self->verbose > 0) { last unless defined $self->acd; # might not have the parser $self->throw("Attribute [$attr] not recognized!\n") unless $self->acd->qualifier($attr); } # print out debugging info $self->debug("Input attr: ". $attr_name. " => ". $input->{$attr}. "\n"); $option_string .= " " . $attr; $option_string .= " ". $input->{$attr} if defined $input->{$attr}; } # check mandatory attributes against given ones if ($self->verbose > 0) { last unless defined $self->acd; # might not have the parser # $self->acd->mandatory->print; # if ($self->name eq 'water') { # print Dumper($self->acd->mandatory); # } foreach my $attr (keys %{$self->acd->mandatory} ) { last unless defined $self->acd; # might not have the parser unless (defined $input->{$attr}) { print "-" x 38, "\n", "MISSING MANDATORY ATTRIBUTE: $attr\n", "-" x 38, "\n"; $self->acd->print($attr) and $self->throw("Program ". $self->name. " needs attribute [$attr]!\n") } } } my $runstring = join (' ', $self->name, $option_string, '-auto'); $self->debug( "Command line: ", $runstring, "\n"); return `$runstring`; } =head2 acd Title : acd Usage : $embossprogram->acd Function: finds out all the possible qualifiers for this EMBOSS application. They can be used to debug the options given. Throws : Returns : boolean Args : =cut sub acd { my ($self) = @_; unless ( $self->{'_acd'} ) { $self->{'_acd'} = Bio::Tools::Run::EMBOSSacd->new($self->name); } return $self->{'_acd'}; } =head2 name Title : name Usage : $embossprogram->name Function: sets/gets the name of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. Throws : Returns : name string Args : None =cut sub name { my ($self,$value) = @_; if (defined $value) { $self->{'_name'} = $value; } return $self->{'_name'}; } =head2 descr Title : descr Usage : $embossprogram->descr Function: sets/gets the descr of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. Throws : Returns : description string Args : None =cut sub descr { my ($self,$value) = @_; if (defined $value) { $self->{'_descr'} = $value; } return $self->{'_descr'}; } =head2 group Title : group Usage : $embossprogram->group Function: sets/gets the group of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. If the application is assigned into a subgroup use l to get it. Throws : Returns : string, group name Args : group string =cut sub group { my ($self,$value) = @_; if (defined $value) { my ($group, $subgroup) = split ':', $value; $self->{'_group'} = $group; $self->{'_subgroup'} = $subgroup; } return $self->{'_group'}; } =head2 subgroup Title : subgroup Usage : $embossprogram->subgroup Function: sets/gets the subgroup of the EMBOSS program Setting is done by the EMBOSSFactory object, you should only get it. Throws : Returns : string, subgroup name; undef if not defined Args : None =cut sub subgroup { my ($self) = @_; return $self->{'_subgroup'}; } =head2 program_dir Title : program_dir Usage : Function: Required by WrapperBase Throws : Returns : Name of directory with EMBOSS programs Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{EMBOSS_ACDROOT}); } =head2 program_path Title : program_path Usage : Function: Required by WrapperBase Throws : Returns : Full path of program Args : =cut sub program_path { my $self = shift; my $name = $self->{_name}; my $dir = Bio::Root::IO->catfile($ENV{EMBOSS_ACDROOT}); return "$dir/$name"; } =head2 executable Title : executable Usage : Function: Required by WrapperBase Throws : Returns : Name of program Args : =cut sub executable { my $self = shift; $self->{_name}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/EMBOSSacd.pm000066400000000000000000000241311302566030400225130ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::EMBOSSacd # # # Please direct questions and support issues to # # Cared for by Heikki Lehvaslaiho # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::EMBOSSacd - class for EMBOSS Application qualifiers =head1 SYNOPSIS use Bio::Factory::EMBOSS; # get an EMBOSS application object from the EMBOSS factory $factory = Bio::Factory::EMBOSS->new(); $application = $factory->program('embossversion'); # run the application with an optional hash containing parameters $result = $application->run(); # returns a string or creates a file print $result . "\n"; $water = $factory->program('water'); # here is an example of running the application # water can compare 1 seq against 1->many sequences # in a database using Smith-Waterman my $seq_to_test; # this would have a seq here my @seqs_to_check; # this would be a list of seqs to compare # (could be just 1) my $wateroutfile = 'out.water'; $water->run({ -sequencea => $seq_to_test, -seqall => \@seqs_to_check, -gapopen => '10.0', -gapextend => '0.5', -outfile => $wateroutfile}); # now you might want to get the alignment use Bio::AlignIO; my $alnin = Bio::AlignIO->new(-format => 'emboss', -file => $wateroutfile); while( my $aln = $alnin->next_aln ) { # process the alignment -- these will be Bio::SimpleAlign objects } =head1 DESCRIPTION The EMBOSSacd represents all the possible command line arguments that can be given to an EMBOSS application. Do not create this object directly. It will be created and attached to its corresponding Bio::Tools::Run::EMBOSSApplication if you set $application->verbose > 0 Call $application->acd to retrive the Bio::Tools::Run::EMBOSSApplication::EMBOSSacd object. See also L and L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing lists Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Heikki Lehvaslaiho Email: heikki-at-bioperl-dot-org Address: EMBL Outstation, European Bioinformatics Institute Wellcome Trust Genome Campus, Hinxton Cambs. CB10 1SD, United Kingdom =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::EMBOSSacd; use vars qw(@ISA %QUALIFIER_CATEGORIES $QUAL %OPT); use strict; use Data::Dumper; use Bio::Root::Root; @ISA = qw(Bio::Root::Root); BEGIN { %QUALIFIER_CATEGORIES = ( 'Mandatory qualifiers' => 'mandatory', 'Standard (Mandatory) qualifiers' => 'mandatory', 'Optional qualifiers' => 'optional', 'Additional (Optional) qualifiers'=> 'optional', 'Advanced qualifiers' => 'advanced', 'Advanced (Unprompted) qualifiers'=> 'advanced', 'Associated qualifiers' => 'associated', 'General qualifiers' => 'general', ); $QUAL; # qualifier category } =head2 new Title : new Usage : $emboss_prog->acd($prog_name); Function: Constructor for the class. Calls EMBOSS program 'acdc', converts the HTML output into XML and uses XML::Twig XML parser to write out a hash of qualifiers which is then blessed. Throws : without program name Returns : new object Args : EMBOSS program name =cut sub new { my($class, $prog) = @_; eval {require XML::Twig;}; Bio::Root::Root->warn("You need XML::Twig for EMBOSS ACD parsing") and return undef if $@; Bio::Root::Root->throw("Need EMBOSSprogram name as an argument") unless $prog; # reset global hash %OPT = (); my $version = `embossversion -auto`; my $file; if ($version lt "2.8.0") { # reading from EMBOSS program acdc stdout (prior to version 2.8.0) $file = `acdc $prog -help -verbose -acdtable 2>&1`; } else { # reading from EMBOSS program acdtable stdout (version 2.8.0 or greater) $file = `acdtable $prog -help -verbose 2>&1`; } # converting HTML -> XHTML for XML parsing $file =~ s/border/border="1"/; $file =~ s/=(\d+)/="$1"/g; $file =~ s/
/
<\/br>/g; $file =~ s/ //g; my $t = XML::Twig->new( TwigHandlers => { '/table/tr' => \&_row } ); $t->safe_parse( $file); #Bio::Root::Root->throw("XML parsing error: $@"); my %acd = %OPT; # copy to a private hash $acd{'_name'} = $prog; bless \%acd, $class; } sub _row { my ($t, $row)= @_; return if $row->text eq "(none)"; # no qualifiers in this category my $name = $row->first_child; # qualifier name my $namet = $name->text; if ($namet =~ /qualifiers$/) { # set category $QUAL = $QUALIFIER_CATEGORIES{$namet}; if( ! defined $QUAL ) { warn("-- namet is $namet\n"); } return; } my $unnamed = 0; if ($namet =~ /\(Parameter (\d+)\)/) { # unnamed parameter $unnamed = $1; $namet =~ s/\(Parameter (\d+)\)//; $namet =~ s/[\[\]]//g ; # name is in brackets } my $desc = $name->next_sibling; my $values = $desc->next_sibling; my $default = $values->next_sibling; $OPT{$namet}{'unnamed'} = $unnamed; $OPT{$namet}{'category'} = $QUAL; $OPT{$namet}{'descr'} = $desc->text; $OPT{$namet}{'values'} = $values->text; $OPT{$namet}{'default'} = $default->text; $t->purge; # to reduce memory requirements } =head2 name Title : name Usage : $embossacd->name Function: sets/gets the name of the EMBOSS program Setting is done by the EMBOSSApplication object, you should only get it. Throws : Returns : name string Args : None =cut sub name { my ($self,$value) = @_; if (defined $value) { $self->{'_name'} = $value; } return $self->{'_name'}; } =head2 print Title : print Usage : $embossacd->print; $embossacd->print('-word'); Function: Print out the qualifiers. Uses Data::Dumper to print the qualifiers into STDOUT. A valid qualifier name given as an argment limits the output. Throws : Returns : print string Args : optional qualifier name =cut sub print { my ($self, $value) = @_; if ($value and $self->{$value}) { print Dumper $self->{$value}; } else { print Dumper $self; } } =head2 mandatory Title : mandatory Usage : $acd->mandatory Function: gets a mandatory subset of qualifiers Throws : Returns : Bio::Tools::Run::EMBOSSacd object Args : none =cut sub mandatory { my ($self) = @_; my %mand; foreach my $key (keys %{$self}) { next unless $key =~ /^-/; #ignore other attributes $mand{$key} = $self->{$key} if $self->{$key}{category} eq 'mandatory'; } bless \%mand; } =head2 Qualifier queries These methods can be used test qualifier names and read values. =cut =head2 qualifier Title : qualifier Usage : $acd->qualifier($string) Function: tests for the existence of the qualifier Throws : Returns : boolean Args : string, name of the qualifier =cut sub qualifier { my ($self, $value) = @_; $self->throw("Qualifier has to start with '-'") unless $value =~ /^-/; $self->{$value} ? 1 : 0 } =head2 category Title : category Usage : $acd->category($qual_name) Function: Return the category of the qualifier Throws : Returns : 'mandatory' or 'optional' or 'advanced' or 'associated' or 'general' Args : string, name of the qualifier =cut sub category { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'category'}; } =head2 values Title : values Usage : $acd->values($qual_name) Function: Return the possible values for the qualifier Throws : Returns : string Args : string, name of the qualifier =cut sub values { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'values'}; } =head2 descr Title : descr Usage : $acd->descr($qual_name) Function: Return the description of the qualifier Throws : Returns : boolean Args : string, name of the qualifier =cut sub descr { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'descr'}; } =head2 unnamed Title : unnamed Usage : $acd->unnamed($qual_name) Function: Find if the qualifier can be left unnamed Throws : Returns : 0 if needs to be named, order number otherwise Args : string, name of the qualifier =cut sub unnamed { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'unnamed'}; } =head2 default Title : default Usage : $acd->default($qual_name) Function: Return the default value for the qualifier Throws : Returns : scalar Args : string, name of the qualifier =cut sub default { my ($self, $value) = @_; $self->throw("Not a valid qualifier name [$value]") unless $self->qualifier($value); $self->{$value}->{'default'}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/ERPIN.pm000066400000000000000000000252021302566030400217300ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::ERPIN # # Please direct questions and support issues to # # Cared for by Chris Fields # # Copyright Chris Fields # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::ERPIN - Wrapper for local execution of the ERPIN suite of programs. =head1 SYNOPSIS #run my @params = ( trset => 'BL.erpin', region => [1, 10], # Set up search strategy this way... strategy => [ 'umask' => [1, 2], 'umask' => [1, 2, 3, 4], 'umask' => [1, 2, 3, 4, 5, 6], 'nomask', 'cutoff' => [0, 10, 15, 20] ] # or use a simple string... #strategy => 'Ðumask 4 Ðadd 5 -nomask -cutoff 0 10 15', pcw => 100 ); my $factory = Bio::Tools::Run::ERPIN->new(-program =>'erpin', @params); # Pass the factory a Bio::Seq object or a file name # Returns a Bio::SearchIO object #my $search = $factory->run("B_sub.fas"); my $search = $factory->run($seq); my @feat; while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $r->query_name, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->meta, $hsp->score, )), "\n"; } } } =head1 DESCRIPTION =cut =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Chris Fields Email: cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS cjfields-at-uiuc-dot-edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::ERPIN; use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::SearchIO; use Bio::AlignIO; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # will move parameters to each program, use this for _set_params my %ERPIN_PROGS = ( cfgs => 1, erpin => 1, frandseq => 1, mstat => 1, sview => 1, tstrip => 1, epnstat => 1, ev => 1, mhistview => 1, pview => 1, tstat => 1, tview => 1, ); my %ERPIN_SWITCHES = map {$_ => 1} qw(dmp smp fwd rev fwd+rev long short mute warnings globstat locstat unifstat Eon Eoff hist chrono); # order is important here my @ERPIN_PARAMS=qw(program model file strategy dmp smp fwd rev fwd+rev long short mute warnings globstat locstat unifstat Eon Eoff hist seq1 nseq bgn len logzero tablen chrono pcw hpcw spcw sumf tset); =head2 new Title : new Usage : my $wrapper = Bio::Tools::Run::RNAMotif->new(@params) Function: creates a new RNAMotif factory Returns: Bio::Tools::Run::RNAMotif Args : list of parameters -tempfile => set tempfile flag (default 0) -outfile_name => set file to send output to (default none) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($out, $tf) = $self->_rearrange([qw(OUTFILE_NAME TEMPFILE)], @args); $self->io->_initialize_io(); if ($tf && !$out) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } else { $out ||= ''; $self->outfile_name($out); } $tf && $self->tempfile($tf); $self->_set_from_args(\@args, -methods => [@ERPIN_PARAMS], -create => 1 ); return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; return $self->program(@_); } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{ERPINDIR}) if $ENV{ERPINDIR}; } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $string = `erpin -h 2>&1`; my $v; if ($string =~ m{Version\s([\d.]+)}) { $v = $1; } return $self->{'_progversion'} = $v || $string; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs ERPIN programs and returns Bio::SearchIO Returns : Args : Must pass Bio::PrimarySeqI's or file names =cut sub run { my ($self,@seq) = @_; $self->throw ("Must define 'db', pass a file name, or a list of Bio::PrimarySeqI objects") if (!@seq); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head2 tempfile Title : tempfile Usage : $obj->tempfile(1) Function: Set tempfile flag. When set, writes output to a tempfile; this is overridden by outfile_name() if set Returns : Boolean setting (or undef if not set) Args : [OPTIONAL] Boolean =cut sub tempfile { my $self = shift; return $self->{'_tempfile'} = shift if @_; return $self->{'_tempfile'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : Args : =cut sub _run { my ($self,$file,$prog) = @_; return unless $self->executable; $self->io->_io_cleanup(); my ($str, $progname, $outfile) = ($prog || $self->executable, $self->program_name, $self->outfile_name); my $param_str = $self->_setparams($file); $str .= " $param_str"; $self->debug("ERPIN command: $str\n"); # rnamotif => SearchIO object # rmfmt -a => AlignIO object # all others sent to outfile, tempfile, or STDERR (upon verbose = 1) my $obj = ($progname eq 'erpin') ? Bio::SearchIO->new(-verbose => $self->verbose, -format => "erpin", -version => $self->version, -database => $file ) : undef; my @args; # file-based if ($outfile) { local $SIG{CHLD} = 'DEFAULT'; my $status = system($str); if($status || !-e $outfile || -z $outfile ) { my $error = ($!) ? "$! Status: $status" : "Status: $status"; $self->throw( "ERPIN call crashed: $error \n[command $str]\n"); return undef; } if ($obj && ref($obj)) { $obj->file($outfile); @args = (-file => $outfile); } # fh-based } else { open(my $fh,"$str |") || $self->throw("ERPIN call ($str) crashed: $?\n"); if ($obj && ref($obj)) { $obj->fh($fh); @args = (-fh => $fh); } else { # dump to debugging my $io; while(<$fh>) {$io .= $_;} close($fh); $self->debug($io); return 1; } } # initialize SearchIO/AlignIO...um...IO # (since file/fh set post obj construction) $obj->_initialize_io(@args) if $obj && ref($obj); return $obj || 1; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self, $file) = @_; my $progname = $self->program_name; # small sanity check $self->throw("Unknown program: $progname") if (!exists $ERPIN_PROGS{$progname} ); my $param_string; my $outfile = ($self->outfile_name) ? ' > '.$self->outfile_name : ''; my ($tset, $st) = ($self->tset, $self->strategy); $param_string = join " ", ($tset, $file, $st); $self->debug("String : $param_string\n"); $self->throw("Must have both a training set and search strategy defined!") if (!defined($tset) || !defined ($st)); my @params; foreach my $attr (@ERPIN_PARAMS) { next if $attr eq 'program' || $attr eq 'tset' || $attr eq 'strategy'; my $value = $self->$attr(); next unless ($attr eq 'file' || defined $value); my $attr_key = '-'.$attr; if (exists $ERPIN_SWITCHES{$attr}) { push @params, $attr_key; } else { if ($attr eq 'file') { push @params, $file; } else { push @params, $attr_key.' '.$value; } } } $param_string .= ' '.join ' ', @params; $param_string .= $outfile if $outfile; return $param_string; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : writes passed Seq objects to tempfile, to be used as input for program Args : =cut sub _writeSeqFile { my ($self,@seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); foreach my $s(@seq){ $in->write_seq($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Ensembl.pm000066400000000000000000000336551302566030400224530ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Ensembl # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Ensembl - A simplified front-end for setting up the registry for, and then using an Ensembl database with the Ensembl Perl API. =head1 SYNOPSIS use Bio::Tools::Run::Ensembl; # get a Bio::EnsEMBL::Gene for agene of interest my $gene = Bio::Tools::Run::Ensembl->get_gene_by_name(-species => 'human', -name => 'BRCA2'); =head1 DESCRIPTION This is a simple way of accessing the Ensembl database to retrieve gene information. Rather than learn the whole Ensembl Perl API, you only need to install it (that is, check it out from CVS: http://www.ensembl.org/info/docs/api/api_installation.html - ignore the information about BioPerl version) and then you can get information about a gene using get_gene_by_name(). For gene retrieval it is especially useful compared to direct Ensembl Perl API usage since it can use information from alternate data sources (orthologues, Swissprot, Entrez) to get your gene. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Ensembl; use strict; use Bio::WebAgent; use Bio::DB::EUtilities; use base qw(Bio::Root::Root); our $ENSEMBL_INSTALLED; our $NODB; our $LOADED_STR; our $TOTAL = 0; our $ORTHS = 0; our $SWISS = 0; our $NCBI = 0; our $BAD = 0; our $GOOD = 0; BEGIN { eval { require Bio::EnsEMBL::Registry; }; $ENSEMBL_INSTALLED = ! $@; $NODB = 0; $LOADED_STR = ''; } =head2 registry_setup Title : registry_setup Usage : Bio::Tools::Run::Ensembl->registry_setup(-host => $host, -user => $user); if (Bio::Tools::Run::Ensembl->registry_setup) {...} Function: Configure the ensembl registy to use a certain database. The database must be an Ensembl database compatible with the Ensembl Perl API, and you must have that API installed for this method to return true. Defaults to anonymous access to ensembldb.ensembl.org Or just ask if the registry is setup and the database ready to use. Returns : boolean (true if Registry loaded and ready to use) Args : -host => host name (defaults to 'ensembldb.ensembl.org') -user => username (defaults to 'anonymous') -pass => password (no default) -port => port (defaults to 3306) -db_version => version of ensembl database to use, if different from your installed Ensembl modules -verbose => boolean (1 to print messages during database connection) -no_database => boolean (1 to disable database access, causing this method to always return false) =cut sub registry_setup { return 0 unless $ENSEMBL_INSTALLED; my $class = shift; my ($host, $user, $pass, $port, $verbose, $no_db, $db_version) = $class->_rearrange([qw(HOST USER PASS PORT VERBOSE NO_DATABASE DB_VERSION)], @_); $host ||= 'ensembldb.ensembl.org'; $user ||= 'anonymous'; $NODB = $no_db if defined($no_db); return 0 if $NODB; my $load_str = $host.$user. (defined $pass ? $pass : '') . (defined $port ? $port : ''); unless ($LOADED_STR eq $load_str) { Bio::EnsEMBL::Registry->load_registry_from_db(-host => $host, -user => $user, defined $pass ? (-pass => $pass) : (), defined $port ? (-port => $port) : (), defined $db_version ? (-db_version => $db_version) : (), -verbose => $verbose); $LOADED_STR = $load_str; } return 1; } =head2 get_adaptor Title : get_adaptor Usage : my $adaptor = Bio::Tools::Run::Ensembl->get_adaptor($species, $type); Function: Get a species-specific 'core' database adaptor, optionally of a certain type. Returns : Bio::EnsEMBL::DBSQL::DBAdaptor, OR if a certain type requested, a Bio::EnsEMBL::DBSQL::${type}Adaptor Args : Bio::Species or string (species name) (REQUIRED), AND optionally string (the type of adaptor, eg. 'Gene' or 'Slice'). =cut sub get_adaptor { my ($class, $species, $type) = @_; return unless $class->registry_setup; return unless $species; if (ref($species)) { $species = $species->scientific_name; } return Bio::EnsEMBL::Registry->get_adaptor($species, 'core', $type) if $type; return Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core'); } =head2 get_gene_by_name Title : get_gene_by_name Usage : my $gene = Bio::Tools::Run::Ensembl->get_gene_by_name(); Function: Get a gene given species and a gene name. If multiple genes match this combination, tries to pick the 'best' match. Returns : Bio::EnsEMBL::Gene Args : -species => Bio::Species or string (species name), REQUIRED -name => string: gene name, REQUIRED If searching for the supplied gene name in the supplied species results in no genes, or more than one, you can choose what else is attempted in order to find just one gene: -use_orthologues => Bio::Species or string (species name), or array ref of such things: see if any of these supplied species have (unambiguously) a gene with the supplied gene name and if a (one-to-one) orthologue of that gene in that species is present in the main desired species supplied to -species, returns that orthologous gene. (default: none, do not use orthologues) -use_swiss_lookup => boolean: queries swissprot at expasy and if a suitable match is found, queries ensembl with the swissprot id. (default: 0, do not use swiss) -use_entrez_lookup => boolean: queries entrez at the NCBI server if (only) a single gene could not be found by any other method, then query ensembl with the entrez gene id. (default: 0, do not use NCBI) (Attempts proceed in this order and return as soon as one method is successful.) -strict => boolean: return undef with no warnings if more than one, or zero genes were found. (default: 0, warnings are issued and if many genes were found, one of them is returned) =cut sub get_gene_by_name { my $class = shift; return unless $class->registry_setup; my ($species, $gene_name, $use_swiss, $use_orth, $use_entrez, $strict) = $class->_rearrange([qw(SPECIES NAME USE_SWISS_LOOKUP USE_ORTHOLOGUES USE_ENTREZ_LOOKUP STRICT)], @_); $species || $class->throw("You must supply a -species"); $gene_name || $class->throw("You must supply a -name"); my $taxid; if (ref($species)) { $taxid = $species->id; $species = $species->scientific_name; } $TOTAL++; #print ". "; my $gene_adaptor = $class->get_adaptor($species, 'Gene') || return; # get the first gene that matches our query, warn if more than one did my @genes = @{$gene_adaptor->fetch_all_by_external_name($gene_name)}; my $gene = shift(@genes); # if not good enough, try again using orthologues if ($use_orth && (! $gene || @genes > 0)) { my @tests; if (ref($use_orth) && ref($use_orth) eq 'ARRAY') { @tests = @{$use_orth}; } else { @tests = ($use_orth); } my $alias_species = Bio::EnsEMBL::Registry->get_alias($species); foreach my $test_species (@tests) { $test_species = $test_species->scientific_name if ref($test_species); $test_species eq $species and next; my $test_gene = $class->get_gene_by_name(-species => $test_species, -name => $gene_name, -strict => 1) || next; my $homologue_results_ref = $test_gene->get_all_homologous_Genes(); # get the species and gene id of each homologue foreach my $result_ref (@{$homologue_results_ref}) { my ($homolog_gene, $homology, $homolog_species) = @{$result_ref}; # get_alias returns lower case, underscored version of what we get here $homolog_species = lc($homolog_species); $homolog_species =~ s/ /_/g; $homolog_species eq $alias_species or next; $homology->description eq 'UBRH' or next; $gene = $homolog_gene; $ORTHS++; last; } $gene and last; } } # if not good enough, try again using swissprot if ($use_swiss && (! $gene || @genes > 0)) { my $swiss_id; #*** swiss look up should be farmed out to some dedicated class my $swiss_name = lc($gene_name); my $swiss_species = lc($species); $swiss_species =~ s/\s/+/g; my $url = "http://www.expasy.org/cgi-bin/get-entries?db=sp&db=tr&DE=&GNc=AND&GN=$swiss_name&OC=$swiss_species&view=&num=100"; my $web_agent = Bio::WebAgent->new(); $web_agent->url($url); my $rq = HTTP::Request->new(GET=>$url); my $reply = $web_agent->request($rq); if ($reply->is_error) { $class->throw($reply->as_string()."\nError getting for url $url!\n"); } my $content = $reply->content; if ($content && $content !~ /No entries have been found/) { my @possibles = split(" 'y', -verbose => -1); my $esummary = Bio::DB::EUtilities->new(-eutil => 'esummary', -history => $esearch->next_History); eval {$esummary->parse_data;}; if (!$@) { my $ncbi_id; while (my $docsum = $esummary->next_DocSum) { my $item = $docsum->get_Item_by_name('Name'); if (lc($item->get_content) eq lc($gene_name)) { $ncbi_id = $docsum->get_id; last; } } if ($ncbi_id) { @genes = @{$gene_adaptor->fetch_all_by_external_name($ncbi_id)}; $gene = shift(@genes); $NCBI++ if ($gene && @genes == 0); } } } if (@genes > 0) { return if $strict; #$class->warn("Species '$species' had multiple matches to gene '$gene_name', using first gene '".$gene->display_id."'"); } unless ($gene) { return if $strict; $BAD++; #$class->warn("Species '$species' didn't have gene '$gene_name'"); return; } $GOOD++; return $gene; } sub _stats { print "$TOTAL | $ORTHS | $SWISS | $NCBI | good vs bad = $GOOD vs $BAD\n"; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Eponine.pm000066400000000000000000000236001302566030400224500ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by Tania Oh # # Copyright Tania Oh # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Eponine - Object for execution of the Eponine which is a mammalian TSS predictor =head1 SYNOPSIS use Bio::Tools::Run::Eponine; use strict; my $seq = "/data/seq.fa"; my $threshold = "0.999"; my @params = ( '-seq' => $seq, '-threshold' => $threshold, '-epojar' => '/usr/local/bin/eponine-scan.jar', '-java' => '/usr/local/bin/java'); my $factory = Bio::Tools::Run::Eponine->new(@params); # run eponine against fasta my $r = $factory->run($seq); my $parser = Bio::Tools::Eponine->new($r); while (my $feat = $parser->next_prediction){ #$feat contains array of SeqFeature foreach my $orf($feat){ print $orf->seqname. "\n"; } } # Various additional options and input formats are available. See # the DESCRIPTION section for details. =head1 DESCRIPTION wrapper for eponine, a mammalian TSS predictor. The environment variable EPONINEDIR must be set to point at either the directory which contains eponine-scan.jar or directly at the jar which eponine-scan classfiles. NOTE: EPONINEDIR must point at the real file not a symlink. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Email gisoht@nus.edu.sg =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Eponine; #tgot to take inmore parameters use vars qw($AUTOLOAD @ISA @EPONINE_PARAMS %EPONINE_PARAMS $EPOJAR $JAVA $PROGRAMDIR $PROGRAMNAME $PROGRAM $TMPDIR $TMPOUTFILE $DEFAULT_THRESHOLD %OK_FIELD); use strict; use Bio::Tools::Eponine; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { $DEFAULT_THRESHOLD = 50; $PROGRAMNAME = 'java'; $EPOJAR = 'eponine-scan.jar'; if( ! defined $PROGRAMDIR ) { $PROGRAMDIR = $ENV{'JAVA_HOME'} || $ENV{'JAVA_DIR'}; } if (defined $PROGRAMDIR) { foreach my $progname ( [qw(java)],[qw(bin java)] ) { my $f = Bio::Root::IO->catfile($PROGRAMDIR, @$progname); if( -e $f && -x $f ) { $PROGRAM = $f; last; } } } if( $ENV{'EPONINEDIR'} ) { if ( -d $ENV{'EPONINEDIR'} ) { $EPOJAR = Bio::Root::IO->catfile($ENV{'EPONINEDIR'}, $EPOJAR) } elsif(-e $ENV{'EPONINEDIR'}) { $EPOJAR = $ENV{'EPONINEDIR'}; } if ( ! -e $EPOJAR) { $EPOJAR =undef; } } %EPONINE_PARAMS = ('SEQ' => '/tmp/test.fa', 'THRESHOLD' => '0.999', 'EPOJAR' => '/usr/local/bin/eponine-scan.jar', 'JAVA' => '/usr/java/jre1.3.1_02/bin/java'); @EPONINE_PARAMS=qw(SEQ THRESHOLD JAVA EPOJAR); foreach my $attr ( @EPONINE_PARAMS) { $OK_FIELD{$attr}++; } } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $self->debug( "************ attr: $attr\n"); $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } sub new { my ($caller, @args) = @_; # chained new my $self = $caller->SUPER::new(@args); # so that tempfiles are cleaned up my $java; my $seq; my $threshold; my $epojar; my ($attr, $value); ($TMPDIR) = $self->tempdir(CLEANUP=>1); my $tfh; ($tfh,$TMPOUTFILE) = $self->io->tempfile(-dir => $TMPDIR); close($tfh); undef $tfh; while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/JAVA/i) { $java = $value; next; } if ($attr =~ /EPOJAR/i){ $epojar = $value; next; } if ($attr =~ /THRESHOLD/i){ $threshold = $value; next; } if ($attr =~ /SEQ/i){ $seq = $value; next; } $self->$attr($value); } $self->{'_java'} = undef; # location of java vm $self->{'_epojar'} = undef; # location of eponine-scan.jar executable JAR file. $self->{'_threshold'} = 0.999; # minimum posterior for filtering predictions $self->{'_filename'} = undef; #location of seq $seq = $EPONINE_PARAMS{'seq'} unless defined $seq; $threshold = $EPONINE_PARAMS{'threshold'} unless defined $threshold; if (! defined $epojar && defined $EPOJAR) { $epojar = $EPOJAR; } else { $epojar = $EPONINE_PARAMS{'epojar'} unless defined $epojar; } if (! defined $java && defined $PROGRAM) { $java = $PROGRAM; } else { $java = $EPONINE_PARAMS{'JAVA'} unless defined $java; } $self->filename($seq) if ($seq); if (-x $java) { # full path assumed $self->java($java); } $self->epojar($epojar) if (defined $epojar); if (defined $threshold && $threshold >=0 ){ $self->threshold($threshold); } else { $self->threshold($DEFAULT_THRESHOLD); } return $self; } =head2 java Title : java Usage : $obj->java('/usr/opt/java130/bin/java'); Function: Get/set method for the location of java VM Args : File path (optional) =cut sub executable { shift->java(@_); } sub java { my ($self, $exe,$warn) = @_; if( defined $exe ) { $self->{'_pathtojava'} = $exe; } unless( defined $self->{'_pathtojava'} ) { if( $PROGRAM && -e $PROGRAM && -x $PROGRAM ) { $self->{'_pathtojava'} = $PROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($PROGRAMNAME) ) && -x $exe ) { $self->{'_pathtojava'} = $exe; } else { $self->warn("Cannot find executable for $PROGRAMNAME") if $warn; $self->{'_pathtojava'} = undef; } } } $self->{'_pathtojava'}; } =head2 epojar Title : epojar Usage : $obj->epojar('/some/path/to/eponine-scan.jar'); Function: Get/set method for the location of the eponine-scan executable JAR Args : Path (optional) =cut sub epojar { my ($self, $location) = @_; if ($location) { unless( $location ) { $self->warn("eponine-scan.jar not found at $location: $!\n"); return; } $self->{'_epojar'} = $location ; } return $self->{'_epojar'}; } =head2 threshold Title : threshold Usage : my $threshold = $self->threshold Function: Get/Set the threshold for Eponine Returns : string Args : b/w 0.9 and 1.0 =cut sub threshold{ my ($self, $threshold) = @_; if (defined $threshold) { $self->{'_threshold'} = $threshold ; } return $self->{'_threshold'}; } =head2 run Title : run Usage : my @genes = $self->run($seq) Function: runs Eponine and creates an array of features Returns : An Array of SeqFeatures Args : A Bio::PrimarySeqI =cut sub run{ my ($self,$seq) = @_; my $infile = $self->_setinput($seq); my @tss = $self->_run_eponine($infile); return @tss; } =head2 predict_TSS Title : predict_TSS Usage : Alias for run() =cut sub predict_TSS { return shift->run(@_); } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : =cut sub _setinput { my ($self,$seq) = @_; #better be a file if(!ref $seq){ return $seq; } my ($tfh1,$inputfile) = $self->tempfile(-dir=>$TMPDIR); my $in = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'Fasta'); $in->write_seq($seq); close($tfh1); undef $tfh1; return ($inputfile); } =head2 _run_eponine Title : run_eponine Usage : $obj->_run_eponine() Function: execs the Java VM to run eponine Returns : none Args : none =cut sub _run_eponine { my ($self,$infile) = @_; my $result = $TMPOUTFILE; my @tss; #run eponine $self->debug( "Running eponine-scan\n"); my ($java,$epojar) = ( $self->java, $self->epojar); unless( defined $java && -e $java && -x $java ) { $self->warn("Cannot find java"); return; } if (! defined $epojar) { $self->warn("Don't know the name of the Eponine jar file"); return; } if (! -e $epojar) { $self->warn("Cannot find Eponine jar: $epojar - either you specified an incorrect path in\nEPONINEDIR or it was not in the current working directory"); return; } my $cmd = $self->java.' -jar '.$self->epojar.' -seq '.$infile.' -threshold '.$self->threshold." > ".$result; $self->throw("Error running eponine-scan on ".$self->filename. " \n Check your java version, it has to be version 1.2 or later. Eponine crashed ($cmd) crashed: $? \n") if (system ($cmd)); #parse results even though it's wierd.. thought parser and wrapper should be separate my $epoParser = Bio::Tools::Eponine->new(-file =>$result); while (my $tss = $epoParser->next_prediction()){ push (@tss, $tss); } return @tss; } 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/FootPrinter.pm000066400000000000000000000375231302566030400233370ustar00rootroot00000000000000# BioPerl module for FootPrinter # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::FootPrinter - wrapper for the FootPrinter program =head1 SYNOPSIS use Bio::Tools::Run::FootPrinter; my @params = (size => 10, max_mutations_per_branch => 4, sequence_type => 'upstream', subregion_size => 30, position_change_cost => 5, triplet_filtering => 1, pair_filtering => 1, post_filtering => 1, inversion_cost => 1, max_mutations => 4, tree => "~/software/FootPrinter2.0/tree_of_life" ); my $fp = Bio::Tools::Run::FootPrinter->new(@params, -verbose => 1); my $sio = Bio::SeqIO->new(-file => "seq.fa", -format => "fasta"); while (my $seq = $sio->next_seq){ push @seq, $seq; } my @fp = $fp->run(@seq); foreach my $result(@fp){ print "***************\n".$result->seq_id."\n"; foreach my $feat($result->sub_SeqFeature){ print $feat->start."\t".$feat->end."\t".$feat->seq->seq."\n"; } } =head1 DESCRIPTION From the FootPrinter manual: FootPrinter is a program that performs phylogenetic footprinting. It takes as input a set of unaligned orthologous sequences from various species, together with a phylogenetic tree relating these species. It then searches for short regions of the sequences that are highly conserved, according to a parsimony criterion. The regions identified are good candidates for regulatory elements. By default, the program searches for regions that are well conserved across all of the input sequences, but this can be relaxed to find regions conserved in only a subset of the species =head2 About Footprinter Written by Mathieu Blanchette and Martin Tompa. Available here: http://www.mcb.mcgill.ca/~blanchem/FootPrinter2.1.tar.gz =head2 Running Footprinter To run FootPrinter, you will need to set the environment variable FOOTPRINTER_DIR to where the binary is located (even if the executable is in your path). For example: setenv FOOTPRINTER_DIR /usr/local/bin/FootPrinter2.0/ =head2 Available Parameters PARAM VALUES DESCRIPTION ------------------------------------------------------------------------ tree REQUIRED, Tree in Newick Format to evaluate parsimony score REQUIRED unless tree_of_life exists in FOOTPRINTER_DIR sequence_type upstream Default upstream downstream other size 4-16 Specifies the size of the motifs sought max_mutations 0-20 maximum parsimony score allowed for the motifs max_mutations_per_branch 0-20 Allows at most a fixed number of mutations per branch of the tree losses files give span constraints so that the motifs reported are statistically significant Example files universal([6-9]|1[0-2])(loose|tight)?.config come with FootPrinter2.0. Install these in FOOTPRINTER_DIR and use by setting "losses" to "somewhat significant", "significant", or "very significant". Do not set loss_cost. loss_cost 0-20 a cost associated with losing a motif along some branch of the tre subregion_size 1-infinity penalize motifs whose position in the sequences varies too much position_change_cost 0-20 Cost for changing subregion triplet_filtering 1/0 pre-filtering step that removes from consideration any substring that does not have a sufficiently good pair of matching substrings in some pair of the other input sequences pair_filtering 1/0 Same as triplet filtering, but looks only for one match per other sequence post_filtering 1/0 when used in conjunction with the triplet filtering option, this often significantly speeds up the program, while still garanteeing optimal results indel_cost 1-5 insertions and deletions will be allowed in the motifs sought, at the given cost inversion_cost 1-5 This option allows for motifs to undergo inversions, at the given cost. An inversion reverse-complements the motif. details 1/0 Shows some of the details about the progress of the computation html 1/0 produce html output (never deleted) ps 1/0 produce postscript output (never deleted) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut package Bio::Tools::Run::FootPrinter; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @FP_SWITCHES @FP_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Cwd; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::FootPrinter; use Bio::SeqIO; # Let the code begin... @ISA = qw(Bio::Tools::Run::WrapperBase); BEGIN { @FP_PARAMS = qw(SEQUENCE_TYPE SIZE MAX_MUTATIONS MAX_MUTATIONS_PER_BRANCH LOSSES LOSS_COST TREE PROGRAM SUBREGION_SIZE POSITION_CHANGE_COST INDEL_COST INVERSION_COST ); @FP_SWITCHES = qw(TRIPLET_FILTERING PAIR_FILTERING POST_FILTERING DETAILS); @OTHER_SWITCHES = qw(QUIET HTML PS); # Authorize attribute fields foreach my $attr ( @FP_PARAMS, @FP_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'FootPrinter'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{FOOTPRINTER_DIR}) if $ENV{FOOTPRINTER_DIR}; } =head2 executable Title : executable Usage : my $exe = $footprinter->executable('FootPrinter'); Function: Finds the full path to the 'FootPrinter' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my $self = shift; my $exe = $self->SUPER::executable(@_) || return; # even if its executable, we still need the environment variable to have # been set if (! $ENV{FOOTPRINTER_DIR}) { $self->warn("Environment variable FOOTPRINTER_DIR must be set, even if the FootPrinter executable is in your path"); return; } return $exe; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new($seq) Function: creates a new wrapper Returns: Bio::Tools::Run::FootPrinter Args : self =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } if(!$self->tree && -e $ENV{FOOTPRINTER_DIR}."/tree_of_life"){ $self->tree($ENV{FOOTPRINTER_DIR}."/tree_of_life"); } unless($self->tree){ $self->debug("Phylogenetic tree not provided. FootPrinter won't be able to run without it. use \$fp->tree to set the tree file"); } return $self; } =head2 run Title : run Usage : $fp->run(@seq) Function: carry out FootPrinter Example : Returns : An array of SeqFeatures Args : An array of Bio::PrimarySeqI compliant object At least 2 are needed. =cut sub run { my ($self,@seq) = @_; #need at least 2 for comparative genomics duh. $#seq > 0 || $self->throw("Need at least two sequences"); $self->tree || $self->throw("Need to specify a phylogenetic tree using -tree option"); my $infile = $self->_setinput(@seq); my $param_string = $self->_setparams(); my @footprint_feats = $self->_run($infile,$self->tree,$param_string); return @footprint_feats; } =head2 _run Title : _run Usage : $fp->_run ($filename,$param_string) Function: internal function that runs FootPrinter Example : Returns : an array of features Args : the filename to the input sequence, filename to phylo tree and the parameter string =cut sub _run { my ($self,$infile,$tree,$param_string) = @_; my $instring; my $exe = $self->executable || return; $self->debug( "Program ".$self->executable."\n"); my $outfile = $infile.".seq.txt"; my $cmd_str = $self->executable. " $infile $tree $param_string"; $self->debug("FootPrinter command = $cmd_str"); if ($self->verbose <=0){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd_str.= " >&$null > $null"; } # will do brute-force clean up of junk files generated by FootPrinter my $cwd = cwd(); opendir(my $cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); my %ok_files; foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /^mlc\./) { $ok_files{$thing} = 1; } } closedir($cwd_dir); my $status = system($cmd_str); $self->throw("FootPrinter Call($cmd_str) crashed: $?\n") unless $status == 0 || $status==256; unless (open (FP, $outfile)) { $self->throw("Cannot open FootPrinter outfile for parsing"); } my $fp_parser = Bio::Tools::FootPrinter->new(-fh=>\*FP); my @fp_feat; while(my $fp_feat = $fp_parser->next_feature){ push @fp_feat, $fp_feat; } unless( $self->save_tempfiles ) { unlink $outfile; unlink $infile; # is this dangerous?? unlink "$infile.order.txt"; # is this dangerous?? opendir($cwd_dir, $cwd) || $self->throw("Could not open the current directory '$cwd'!"); foreach my $thing (readdir($cwd_dir)) { if ($thing =~ /^mlc\./) { unlink($thing) unless $ok_files{$thing}; } } closedir($cwd_dir); $self->cleanup(); } return @fp_feat; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for FootPrinter program Example : Returns : parameter string to be passed to FootPrinter Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @FP_PARAMS ) { $value = $self->$attr(); next if $attr=~/TREE/i; next unless (defined $value); my $attr_key = lc $attr; #put params in format expected by dba if ($attr_key eq 'losses' && $value =~ /^\s*(somewhat|very)?\s*significant\s*$/) { $value = "$ENV{FOOTPRINTER_DIR}/universal".$self->size(); if (defined $1) { if ($1 eq 'somewhat') { $value .= 'loose'; } else { # $1 eq 'very' $value .= 'tight'; } } $value .= '.config'; -f $value or $self->throw("universal losses file $value does not exist"); } $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( @FP_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } $self->html() or $param_string .= " -no_html"; $self->ps() or $param_string .= " -no_ps"; return $param_string; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : a Bio::PrimarySeqI compliant object =cut sub _setinput { my ($self,@seq) = @_; my ($tfh1,$outfile1); $outfile1 = $self->outfile_name(); if (defined $outfile1) { open($tfh1,">$outfile1"); } else { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); } my $out1 = Bio::SeqIO->new('-fh' => $tfh1, '-format' => 'Fasta'); foreach my $seq(@seq){ $seq->isa("Bio::PrimarySeqI") || $self->throw("Need a Bio::PrimarySeq compliant object for FootPrinter"); $out1->write_seq($seq); } $out1->close(); # close the SeqIO object close($tfh1); # close the fh explicitly (just in case) undef($tfh1); # really get rid of it (just in case) return ($outfile1); } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Genemark.pm000066400000000000000000000207501302566030400226070ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Genemark # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Mark Johnson # # Special thanks to Chris Fields, Sendu Bala # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Genemark - Wrapper for local execution of the GeneMark family of programs. =head1 SYNOPSIS # GeneMark.hmm (prokaryotic) my $factory = Bio::Tools::Run::Genemark->new('-program' => 'gmhmmp', '-m' => 'model.icm'); # Pass the factory Bio::Seq objects # returns a Bio::Tools::Genemark object my $genemark = $factory->run($seq); =head1 DESCRIPTION Wrapper module for the GeneMark family of programs. Should work with all flavors of GeneMark.hmm at least, although only the prokaryotic version has been tested. General information about GeneMark is available at L. Contact information for licensing inquiries is available at: L Note that GeneMark.hmm (prokaryotic at least) will only process the first sequence in a fasta file (if you run() more than one sequence at a time, only the first will be processed). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark Johnson Email: johnsonm-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Genemark; use strict; use warnings; use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Genemark; use English; use IPC::Run; # Should be okay on WIN32 (See IPC::Run Docs) use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @params = (qw(program)); our @genemark_params = (qw(i m p)); our @genemark_switches = (qw(a n r)); =head2 program_name Title : program_name Usage : $factory>program_name() Function: gets/sets the program name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->program($val) if $val; return $self->program(); } =head2 program_dir Title : program_dir Usage : $factory->program_dir() Function: gets/sets the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 new Title : new Usage : $genemark->new(@params) Function: creates a new Genemark factory Returns: Bio::Tools::Run::Genemark Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->_set_from_args( \@args, -methods => [ @params, @genemark_params, @genemark_switches, ], -create => 1, ); unless (defined($self->program())) { $self->throw('Must specify program'); } unless (defined($self->m())) { $self->throw('Must specify model'); } return $self; } =head2 run Title : run Usage : $obj->run($seq_file) Function: Runs Genemark Returns : A Bio::Tools::Genemark object Args : An array of Bio::PrimarySeqI objects =cut sub run { my ($self, @seq) = @_; unless (@seq) { $self->throw("Must supply at least one Bio::PrimarySeqI"); } foreach my $seq (@seq) { unless ($seq->isa('Bio::PrimarySeqI')) { $self->throw("Object does not implement Bio::PrimarySeqI"); } } my $program_name = $self->program_name(); my $file_name = $self->_write_seq_file(@seq); # GeneMark.hmm (prokaryotic version, anyway) ignores sequences after the # first in a fasta file if ($program_name eq 'gmhmmp') { if (@seq > 1) { $self->warn("Program $program_name processes one sequence at a time"); } } return $self->_run($file_name, $seq[0]->display_id()); } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An instance of Bio::Tools::Genemark Args : file name, sequence identifier (optional) =cut sub _run { my ($self, $seq_file_name, $seq_id) = @_; my ($temp_fh, $temp_file_name) = $self->io->tempfile(-dir=>$self->tempdir()); close($temp_fh); # IPC::Run wants an array where the first element is the executable my @cmd = ( $self->executable(), split(/\s+/, $self->_setparams()), '-o', $temp_file_name, $seq_file_name, ); my $cmd = join(' ', @cmd); $self->debug("GeneMark Command = $cmd"); # Run the program via IPC::Run so: # 1) The console doesn't get cluttered up with the program's STDERR/STDOUT # 2) We don't have to embed STDERR/STDOUT redirection in $cmd # 3) We don't have to deal with signal handling (IPC::Run should take care # of everything automagically. my ($program_stdout, $program_stderr); eval { IPC::Run::run( \@cmd, \undef, \$program_stdout, \$program_stderr, ) || die $CHILD_ERROR; }; if ($EVAL_ERROR) { $self->throw("GeneMark call crashed: $EVAL_ERROR"); } ## The prokaryotic version of GeneMark.HMM, at least, returns ## 0 (success) even when the license has expired. if ((-z $temp_file_name) && ($program_stderr =~ /license period has ended/i)) { $self->throw($program_stderr); } elsif ($program_stderr =~ /\d+ days remaining/i) { $self->warn($program_stderr); } $self->debug(join("\n", 'GeneMark STDOUT:', $program_stdout)) if $program_stdout; $self->debug(join("\n", 'GeneMark STDERR:', $program_stderr)) if $program_stderr; return Bio::Tools::Genemark->new(-file => $temp_file_name, -seqname => $seq_id); } sub _setparams { my ($self) = @_; my $param_string = $self->SUPER::_setparams( -params => [@genemark_params], -switches => [@genemark_switches], -dash => 1, ); # Kill leading and trailing whitespace $param_string =~ s/^\s+//g; $param_string =~ s/\s+$//g; return $param_string; } =head2 _write_seq_file Title : _write_seq_file Usage : obj->_write_seq_file($seq) or obj->_write_seq_file(@seq) Function: Internal(not to be used directly) Returns : Name of a temp file containing program output Args : One or more Bio::PrimarySeqI objects =cut sub _write_seq_file { my ($self, @seq) = @_; my ($fh, $file_name) = $self->io->tempfile(-dir=>$self->tempdir()); my $out = Bio::SeqIO->new(-fh => $fh , '-format' => 'Fasta'); foreach my $seq (@seq){ $out->write_seq($seq); } close($fh); $out->close(); return $file_name; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Genewise.pm000066400000000000000000000305511302566030400226240ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by # # Copyright to a FUGU Student Intern # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Genewise - Object for predicting genes in a given sequence given a protein =head1 SYNOPSIS # Build a Genewise alignment factory my $factory = Bio::Tools::Run::Genewise->new(); # Pass the factory 2 Bio:SeqI objects (in the order of query peptide # and target_genomic). # @genes is an array of Bio::SeqFeature::Gene::GeneStructure objects my @genes = $factory->run($protein_seq, $genomic_seq); # Alternatively pass the factory a profile HMM filename and a # Bio:SeqI object (in the order of query HMM and target_genomic). # Set hmmer switch first to tell genewise to expect an HMM $factory->hmmer(1); my @genes = $factory->run($hmmfile, $genomic_seq); =head1 DESCRIPTION Genewise is a gene prediction program developed by Ewan Birney http://www.sanger.ac.uk/software/wise2. =head2 Available Params: NB: These should be passed without the '-' or they will be ignored, except switches such as 'hmmer' (which have no corresponding value) which should be set on the factory object using the AUTOLOADed methods of the same name. Model [-codon,-gene,-cfreq,-splice,-subs,-indel,-intron,-null] Alg [-kbyte,-alg] HMM [-hmmer] Output [-gff,-gener,-alb,-pal,-block,-divide] Standard [-help,-version,-silent,-quiet,-errorlog] =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - FUGU Student Intern Email: fugui@worf.fugu-sg.org =head1 CONTRIBUTORS Jason Stajich jason-AT-bioperl_DOT_org Keith James kdj@sanger.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Genewise; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @GENEWISE_SWITCHES @GENEWISE_PARAMS @OTHER_SWITCHES %OK_FIELD); use Bio::SeqIO; use Bio::SeqFeature::Generic; use Bio::SeqFeature::Gene::Exon; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::SeqFeature::FeaturePair; use Bio::SeqFeature::Gene::Transcript; use Bio::SeqFeature::Gene::GeneStructure; use Bio::Tools::Genewise; use Bio::Tools::AnalysisResult; use strict; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); # Two ways to run the program ..... # 1. define an environmental variable WISEDIR # export WISEDIR =/usr/local/share/wise2.2.0 # where the wise2.2.20 package is installed # # 2. include a definition of an environmental variable WISEDIR in # every script that will use DBA.pm # $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; BEGIN { @GENEWISE_PARAMS = qw( DYMEM CODON GENE CFREQ SPLICE GENESTATS INIT SUBS INDEL INTRON NULL INSERT SPLICE_MAX_COLLAR SPLICE_MIN_COLLAR GW_EDGEQUERY GW_EDGETARGET GW_SPLICESPREAD KBYTE HNAME ALG BLOCK DIVIDE GENER U V S T G E M); @GENEWISE_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD TREV PSEUDO NOSPLICE_GTAG SPLICE_GTAG NOGWHSP GWHSP TFOR TABS BOTH HMMER ); # Authorize attribute fields foreach my $attr ( @GENEWISE_PARAMS, @GENEWISE_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'genewise'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin/") if $ENV{WISEDIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $prog = $self->executable; my $string = `$prog -version`; if( $string =~ /Version:\s+\$\s*Name:\s+(\S+)\s+\$/ ) { return $1; } elsif( $string =~ /(Version *)/i ) { return $1; } else { return undef; } } =head2 predict_genes Title : predict_genes Usage : DEPRECATED. Use $factory->run($seq1,$seq2) Function: Predict genes Returns : A Bio::Seqfeature::Gene:GeneStructure object Args : Name of a file containing a set of 2 fasta sequences in the order of peptide and genomic sequences or else 2 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 2 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub predict_genes { return shift->run(@_); } =head2 run Title : run Usage : 2 sequence objects $genes = $factory->run($seq1, $seq2); Function: run Returns : A Bio::Seqfeature::Gene:GeneStructure object Args : Names of a files each containing a fasta sequence in the order of either (peptide sequence, genomic sequence) or (profile HMM, genomic sequence). Alternatively any of the fasta sequence filenames may be substituted with a Bio::Seq object. Throws an exception if argument is not either a string (eg a filename) or Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. Also throws an exception if a profile HMM is expected (the -hmmer genewise switch has been set). =cut sub run{ my ($self, $seq1, $seq2) = @_; my ($attr, $value, $switch); $self->io->_io_cleanup(); # Create input file pointer my ($infile1,$infile2)= $self->_setinput($seq1, $seq2); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) ");} # run genewise my @genes = $self->_run($infile1,$infile2); return @genes; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: Makes actual system call to a genewise program Example : Returns : L Args : Name of a files containing 2 sequences in the order of peptide and genomic =cut sub _run { my ($self,$infile1,$infile2) = @_; my $instring; $self->debug("Program ".$self->executable."\n"); unless ( $self->executable ) { $self->throw("Cannot run Genewise unless the executable is found. Check your environment variables or make sure genewise is in your path."); } my $paramstring = $self->_setparams; my $commandstring = $self->executable." $paramstring $infile1 $infile2"; # this is to capture STDERR messages which leak out when you run programs # with open(FH, "... |"); if (($self->silent && $self->quiet) && ($^O !~ /os2|dos|amigaos/)) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 2> $null"; } my ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); $self->debug("genewise command = $commandstring"); my $status = system("$commandstring > $outfile1"); $self->throw("Genewise call $commandstring crashed: $? \n") unless $status == 0; my $genewiseParser = Bio::Tools::Genewise->new(-file=> $outfile1); my @genes; while (my $gene = $genewiseParser->next_prediction()) { push @genes, $gene; } close ($tfh1); undef ($tfh1); return @genes; } sub get_strand { my ($self,$start,$end) = @_; $start || $self->throw("Need a start"); $end || $self->throw("Need an end"); my $strand; if ($start > $end) { my $tmp = $start; $start = $end; $end = $tmp; $strand = -1; } else { $strand = 1; } return ($start,$end,$strand); } sub _setinput { my ($self, $arg1, $seq2) = @_; my ($tfh1,$tfh2,$outfile1,$outfile2); $self->throw("calling with not enough arguments") unless $arg1 && $seq2; # Not going to set _query_pep/_subject_dna_seq if you pass in a # filename unless( ref($arg1) ) { unless( -e $arg1 ) { if ($self->hmmer) { $self->throw("Argument1 was not a HMMER profile HMM file\n") } else { $self->throw("Argument1 is not a Bio::PrimarySeqI object nor file\n"); } } $outfile1 = $arg1; } else { if ($self->hmmer) { $self->throw("Argument1 was not a HMMER profile HMM file\n") } else { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new('-fh' => $tfh1, '-format' => 'fasta'); $out1->write_seq($arg1); $self->_query_pep_seq($arg1); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh1); undef $tfh1; } } unless( ref($seq2) ) { unless( -e $seq2 ) { $self->throw("Sequence2 is not a Bio::PrimarySeqI object nor file\n"); } $outfile2 = $seq2; } else { ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out2 = Bio::SeqIO->new('-fh' => $tfh2, '-format' => 'fasta'); $out2->write_seq($seq2); $self->_subject_dna_seq($seq2); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh2); undef $tfh2; } return ($outfile1,$outfile2); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self) = @_; my $param_string; foreach my $attr(@GENEWISE_PARAMS){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .= $attr_key.' '.$value; } foreach my $attr(@GENEWISE_SWITCHES){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key; } $param_string = $param_string." -genesf"; #specify the output option return $param_string; } =head2 _query_pep_seq Title : _query_pep_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_pep_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_pep_seq'} = $seq; } return $self->{'_query_pep_seq'}; } =head2 _subject_dna_seq Title : _subject_dna_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _subject_dna_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_subject_dna_seq'} = $seq; } return $self->{'_subject_dna_seq'}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Genscan.pm000066400000000000000000000136771302566030400224460ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Genscan # # Please direct questions and support issues to # # Cared for by # # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Genscan - Object for identifying genes in a given sequence given a matrix(for appropriate organisms). =head1 SYNOPSIS # Build a Genscan factory my $param = ('MATRIX'=>HumanIso.smat); my $factory = Bio::Tools::Run::Genscan->new($param); # Pass the factory a Bio::Seq object #@genes is an array of Bio::Tools::Predictions::Gene objects my @genes = $factory->run($seq); =head1 DESCRIPTION Genscan is a gene identifying program developed by Christopher Burge http://genes.mit.edu/burgelab/ By default it looks for an executable called I and data/parameter files in the directory specified by the I environmental variable. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Genscan; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @GENSCAN_PARAMS %OK_FIELD); use strict; use Bio::Seq; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Genscan; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @GENSCAN_PARAMS=qw(MATRIX VERBOSE QUIET); foreach my $attr ( @GENSCAN_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'genscan'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{GENSCANDIR}); } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 predict_genes() Title : predict_genes() Usage : DEPRECATED: use $obj->run($seq) instead Function: Runs genscan and creates an array of Genes Returns : An array of Bio::Tools::Prediction::Gene objects Args : A Bio::PrimarySeqI =cut sub predict_genes{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs genscan and creates an array of Genes Returns : An array of Bio::Tools::Prediction::Gene objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$seq) = @_; my $infile1 = $self->_writeSeqFile($seq); $self->_set_input($infile1); my @feat = $self->_run(); return @feat; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::Tools::Prediction::Gene objects Args : =cut sub _run { my ($self) = @_; my @genes; my $gene; my $str = $self->executable.' '.$self->MATRIX.' '.$self->{'input'}; if($self->verbose){ $str.=" -v "; } if($self->quiet){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(STDERR,">$null"); } unless (open(GENSCAN, "$str |")){ $self->warn("Cannot run $str"); } close(STDERR); my $genScanParser = Bio::Tools::Genscan->new(-fh=> \*GENSCAN); while( $gene = $genScanParser->next_prediction()){ push(@genes, $gene); } $self->cleanup(); return @genes; } =head2 _set_input() Title : _set_input Usage : obj->_set_input($matrixFile,$seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _set_input() { my ($self,$infile1) = @_; $self->{'input'}=$infile1; } =head2 _writeSeqFile() Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile(){ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Glimmer.pm000066400000000000000000000266451302566030400224630ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Glimmer # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Mark Johnson # # Special thanks to Chris Fields, Sendu Bala # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Glimmer - Wrapper for local execution of Glimmer, GlimmerM and GlimmerHMM. =head1 SYNOPSIS # glimmer2 my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmer3', '-model' => 'model.icm'); # glimmer3 my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmer2', '-model' => 'model.icm'); # glimmerm my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmerm'); # glimmerHMM my $factory = Bio::Tools::Run::Glimmer->new('-program' => 'glimmerHMM'); # Pass the factory Bio::Seq objects # returns a Bio::Tools::Glimmer object my $glimmer = $factory->run($seq); or my $glimmer = $factor->run(@seq); =head1 DESCRIPTION Wrapper module for the Glimmer family of programs. Should work with all currently available flavors: Glimmer, GlimmerM and GlimmerHMM. However, only Glimmer 2.X and 3.X have been tested. Glimmer is open source and available at L. GlimmerM is open source and available at L. GlimmerHMM is open source and available at L. Note that Glimmer 2.X will only process the first sequence in a fasta file (if you run() more than one sequence at a time, only the first will be processed). Note that Glimmer 3.X produces two output files. This module only passes the .predict file to Bio::Tools::Glimmer. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark Johnson Email: johnsonm-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Glimmer; use strict; use warnings; use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Glimmer; use English; use IPC::Run; # Should be okay on WIN32 (See IPC::Run Docs) use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @params = (qw(program model)); our @glimmer2_params = (qw(C L g i o p q s t w)); our @glimmer2_switches = (qw(M X f l r)); our @glimmer3_params = (qw(A C E L M P Z b g i t z)); our @glimmer3_switches = (qw(X f l o q r)); our @glimmerM_params = (qw(d g t)); our @glimmerM_switches = (qw(5 3 f r s)); our @glimmerHMM_params = (qw(d n p)); our @glimmerHMM_switches = (qw(f h v)); =head2 program_name Title : program_name Usage : $factory>program_name() Function: gets/sets the program name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->program($val) if $val; return $self->program(); } =head2 program_dir Title : program_dir Usage : $factory->program_dir() Function: gets/sets the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 model Title : model Usage : $factory>model() Function: gets/sets the name of the model (icm) file Returns: string Args : string =cut sub model { my ($self, $val) = @_; $self->{'_model'} = $val if $val; return $self->{'_model'}; } =head2 new Title : new Usage : $glimmer->new(@params) Function: creates a new Glimmer factory Returns: Bio::Tools::Run::Glimmer Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->_set_from_args( \@args, -methods => [ @params, @glimmer2_params, @glimmer2_switches, @glimmer3_params, @glimmer3_switches, @glimmerM_params, @glimmerM_switches, @glimmerHMM_params, @glimmerHMM_switches ], -create => 1, ); unless (defined($self->program())) { $self->throw('Must specify program'); } unless (defined($self->model())) { $self->throw('Must specify model'); } return $self; } =head2 run Title : run Usage : $obj->run($seq_file) Function: Runs Glimmer/GlimmerM/GlimmerHMM Returns : A Bio::Tools::Glimmer object Args : An array of Bio::PrimarySeqI objects =cut sub run{ my ($self, @seq) = @_; unless (@seq) { $self->throw("Must supply at least one Bio::PrimarySeqI"); } foreach my $seq (@seq) { unless ($seq->isa('Bio::PrimarySeqI')) { $self->throw("Object does not implement Bio::PrimarySeqI"); } } my $program_name = $self->program_name(); my $file_name = $self->_write_seq_file(@seq); my @run_args = ( $file_name ); # Glimmer 2.X ignores sequences after the first in a fasta file # Glimmer 3.X will process multiple sequences at once if ($program_name eq 'glimmer2') { if (@seq > 1) { $self->warn("Program $program_name processes one sequence at a time"); } push @run_args, $seq[0]->display_id(); push @run_args, $seq[0]->length(); } return $self->_run(@run_args); } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An instance of Bio::Tools::Glimmer Args : file name, sequence identifier (optional) =cut sub _run { my ($self, $seq_file_name, $seq_id, $seq_length) = @_; my @cmd = ( $self->executable(), $seq_file_name, $self->model(), split(/\s+/, $self->_setparams()), ); my $cmd = join(' ', @cmd); $self->debug("Glimmer Command = $cmd"); my $program_name = $self->program_name(); my ($output_fh, $output_file_name, $detail_file_name); my ($program_stdout, $program_stderr); my @ipc_args = (\@cmd, \undef); # No STDOUT option for glimmer3, it takes a # 'tag' argument, and outputs tag.predict and # tag.detail. It seems that tag can be a path, # which is handy. if ($program_name eq 'glimmer3') { my $temp_dir = $self->tempdir(); my $glimmer3_tag = "$temp_dir/glimmer3"; push @cmd, $glimmer3_tag; $output_file_name = "$glimmer3_tag.predict"; $detail_file_name = "$glimmer3_tag.detail"; push @ipc_args, \$program_stdout, \$program_stderr; } else { ($output_fh, $output_file_name) = $self->io->tempfile(-dir=>$self->tempdir()); close($output_fh); push @ipc_args, '>', $output_file_name; push @ipc_args, '2>', \$program_stderr; } # Run the program via IPC::Run so: # 1) The console doesn't get cluttered up with the program's STDERR/STDOUT # 2) We don't have to embed STDERR/STDOUT redirection in $cmd # 3) We don't have to deal with signal handling (IPC::Run should take care # of everything automagically. eval { IPC::Run::run(@ipc_args) || die $CHILD_ERROR;; }; if ($EVAL_ERROR) { $self->throw("Glimmer call crashed: $EVAL_ERROR"); } $self->debug(join("\n", 'Glimmer STDOUT:', $program_stdout)) if $program_stdout; $self->debug(join("\n", 'Glimmer STDERR:', $program_stderr)) if $program_stderr; my %parser_args = (-file => $output_file_name); # Pass along $seq_id and $seq_length if they were provided # (only should be for glimmer2). if (defined($seq_id)) { $parser_args{-seqname } = $seq_id; } if (defined($seq_length)) { $parser_args{-seqlength} = $seq_length; } # Pass along the name of extra output file, with handy information about # sequence lengths (only produced by glimmer3) if (defined($detail_file_name)) { $parser_args{-detail} = $detail_file_name; } return Bio::Tools::Glimmer->new(%parser_args); } sub _setparams { my ($self) = @_; my $param_string = $self->SUPER::_setparams( -params => [ @glimmer2_params, @glimmer3_params, @glimmerM_params, @glimmerHMM_params, ], -switches => [ @glimmer2_switches, @glimmer2_switches, @glimmerM_switches, @glimmerHMM_switches, ], -dash => 1 ); # Kill leading and trailing whitespace $param_string =~ s/^\s+//g; $param_string =~ s/\s+$//g; return $param_string; } =head2 _write_seq_file Title : _write_seq_file Usage : obj->_write_seq_file($seq) or obj->_write_seq_file(@seq) Function: Internal(not to be used directly) Returns : Name of a temp file containing program output Args : One or more Bio::PrimarySeqI objects =cut sub _write_seq_file { my ($self, @seq) = @_; my ($fh, $file_name) = $self->io->tempfile(-dir=>$self->tempdir()); my $out = Bio::SeqIO->new(-fh => $fh , '-format' => 'Fasta'); foreach my $seq (@seq){ $out->write_seq($seq); } close($fh); $out->close(); return $file_name; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Hmmer.pm000077500000000000000000000427361302566030400221410ustar00rootroot00000000000000# You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Hmmer - Wrapper for local execution of hmmalign, hmmbuild, hmmcalibrate, hmmemit, hmmpfam, hmmsearch =head1 SYNOPSIS # run hmmsearch (similar for hmmpfam) my $factory = Bio::Tools::Run::Hmmer->new(-hmm => 'model.hmm'); # Pass the factory a Bio::Seq object or a file name, returns a Bio::SearchIO my $searchio = $factory->hmmsearch($seq); while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $result->query_name, $hsp->query->start, $hsp->query->end, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->score, $hsp->evalue, $hsp->seq_str, )), "\n"; } } } # build a hmm using hmmbuild my $aio = Bio::AlignIO->new(-file => "protein.msf", -format => 'msf'); my $aln = $aio->next_aln; my $factory = Bio::Tools::Run::Hmmer->new(-hmm => 'model.hmm'); $factory->hmmbuild($aln); # calibrate the hmm $factory->calibrate(); # emit a sequence stream from the hmm my $seqio = $factory->hmmemit(); # align sequences to the hmm my $alnio = $factory->hmmalign(@seqs); =head1 DESCRIPTION Wrapper module for Sean Eddy's HMMER suite of program to allow running of hmmalign, hmmbuild, hmmcalibrate, hmmemit, hmmpfam and hmmsearch. Binaries are available at http://hmmer.janelia.org/ You can pass most options understood by the command-line programs to new(), or set the options by calling methods with the same name as the argument. In both instances, case sensitivity matters. Additional methods are hmm() to specifiy the hmm file (needed for all HMMER programs) which you would normally set in the call to new(). The HMMER programs must either be in your path, or you must set the environment variable HMMERDIR to point to their location. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email: shawnh-at-gmx.net =head1 CONTRIBUTORS Shawn Hoon shawnh-at-gmx.net Jason Stajich jason -at- bioperl -dot- org Scott Markel scott -at- scitegic -dot com Sendu Bala bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Hmmer; use strict; use Bio::SeqIO; use Bio::SearchIO; use Bio::AlignIO; use base qw(Bio::Tools::Run::WrapperBase); our $DefaultFormat = 'msf'; our $DefaultReadMethod = 'hmmer'; our %ALL = (quiet => 'q', o => 'outfile'); our @ALIGN_PARAMS = qw(mapali outformat withali o); our @ALIGN_SWITCHES = qw(m oneline q); our @BUILD_PARAMS = qw(n archpri cfile gapmax idlevel null pam pamwgt pbswitch prior swentry swexit o); our @BUILD_SWITCHES = qw(f g s A F amino binary fast hand noeff nucleic wblosum wgsc wme wnone wpb wvoronoi); our @CALIBRATE_PARAMS = qw(fixed histfile mean num sd seed cpu); our @CALIBRATE_SWITCHES = qw(); our @EMIT_PARAMS = qw(n seed o); our @EMIT_SWITCHES = qw(c q); our @PFAM_PARAMS = qw(A E T Z domE domT informat cpu); our @PFAM_SWITCHES = qw(n acc cut_ga cut_gc cut_nc forward null2 xnu); our @SEARCH_PARAMS = @PFAM_PARAMS; our @SEARCH_SWITCHES = @PFAM_SWITCHES; our %OTHER = (_READMETHOD => '_readmethod', program_name => [qw(PROGRAM program)], hmm => [qw(HMM db DB)]); # just to be explicit our @UNSUPPORTED = qw(h verbose a compat pvm); =head2 new Title : new Usage : $HMMER->new(@params) Function: Creates a new HMMER factory Returns : Bio::Tools::Run::HMMER Args : -hmm => filename # the hmm, used by all program types; if not set # here, must be set with hmm() method prior to # running anything -_READMETHOD => 'hmmer' (default) || 'hmmer_pull' # the parsing # module to use for # hmmpfam/hmmsearch Any option supported by a Hmmer program, where switches are given a true value, eg. -q => 1, EXCEPT for the following which are handled internally/ incompatible: h verbose a compat pvm WARNING: the default sequence format passed to hmmpfam is msf. If you are using a different format, you need to pass it with informat. e.g. my $factory = Bio::Tools::Run::Hmmer->new(-hmm => 'model.hmm', -informat => 'fasta'); -q is synonymous with -quiet -o is synonymous with -outfile # may be specified here, allowing run() to be used, or # it can be omitted and the corresponding method (eg. # hmmalign()) used later. -program => hmmalign|hmmbuild|hmmcalibrate|hmmemit|hmmpfam|hmmsearch =cut sub new { my($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $ALL{$_} } keys %ALL), (map { $_ => $OTHER{$_} } keys %OTHER), (map { $_ => $_ } (@ALIGN_PARAMS, @ALIGN_SWITCHES, @BUILD_PARAMS, @BUILD_SWITCHES, @CALIBRATE_PARAMS, @CALIBRATE_SWITCHES, @EMIT_PARAMS, @EMIT_SWITCHES, @PFAM_PARAMS, @PFAM_SWITCHES, @SEARCH_PARAMS, @SEARCH_SWITCHES))}, -create => 1, -case_sensitive => 1); $self->informat || $self->informat($DefaultFormat); $self->_READMETHOD || $self->_READMETHOD($DefaultReadMethod); return $self; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs one of the Hmmer programs, according to the current setting of program() (as typically set during new(-program => 'name')). Returns : A Bio::SearchIO, Bio::AlignIO, Bio::SeqIO or boolean depending on the program being run (see method corresponding to program name for details). Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub run { my $self = shift; my $program = lc($self->program_name || $self->throw("The program must already be specified")); $self->can($program) || $self->throw("'$program' wasn't a valid program"); return $self->$program(@_); } =head2 hmmalign Title : hmmalign Usage : $obj->hmmalign() Function: Runs hmmalign Returns : A Bio::AlignIO Args : list of Bio::SeqI OR Bio::Align::AlignI OR filename of file with sequences or an alignment =cut sub hmmalign { my $self = shift; $self->program_name('hmmalign'); my $input = $self->_setinput(@_); unless (defined $self->o()) { $self->q(1); } if (! $self->outformat) { $self->outformat($DefaultFormat); } return $self->_run($input); } =head2 hmmbuild Title : hmmbuild Usage : $obj->hmmbuild() Function: Runs hmmbuild, outputting an hmm to the file currently set by method hmm() or db(), or failing that, o() or outfile(), or failing that, to a temp location. Returns : true on success Args : Bio::Align::AlignI OR filename of file with an alignment =cut sub hmmbuild { my $self = shift; $self->program_name('hmmbuild'); my $input = $self->_setinput(@_); unless (defined $self->hmm()) { $self->hmm($self->o() || $self->io->tempfile(-dir => $self->tempdir)); } return $self->_run($input); } =head2 hmmcalibrate Title : hmmcalibrate Usage : $obj->hmmcalibrate() Function: Runs hmmcalibrate Returns : true on success Args : none (hmm() must be set, most likely by the -hmm option of new()), OR optionally supply an hmm filename to set hmm() and run =cut sub hmmcalibrate { my ($self, $hmm) = @_; $self->program_name('hmmcalibrate'); $self->hmm($hmm) if $hmm; $self->hmm || $self->throw("hmm() must be set first"); return $self->_run(); } =head2 hmmemit Title : hmmemit Usage : $obj->hmmemit() Function: Runs hmmemit Returns : A Bio::SeqIO Args : none (hmm() must be set, most likely by the -hmm option of new()), OR optionally supply an hmm filename to set hmm() and run =cut sub hmmemit { my ($self, $hmm) = @_; $self->program_name('hmmemit'); $self->hmm($hmm) if $hmm; $self->hmm || $self->throw("hmm() must be set first"); unless (defined $self->o()) { $self->q(1); } return $self->_run(); } =head2 hmmpfam Title : hmmpfam Usage : $obj->hmmpfam() Function: Runs hmmpfam Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub hmmpfam { my $self = shift; $self->program_name('hmmpfam'); my $input = $self->_setinput(@_); return $self->_run($input); } =head2 hmmsearch Title : hmmsearch Usage : $obj->hmmsearch() Function: Runs hmmsearch Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub hmmsearch { my $self = shift; $self->program_name('hmmsearch'); my $input = $self->_setinput(@_); return $self->_run($input); } =head2 _setinput Title : _setinput Usage : $obj->_setinput() Function: Internal(not to be used directly) Returns : filename Args : A Bio::PrimarySeqI, Bio::Align::AlignI or filename =cut sub _setinput { my ($self, @things) = @_; @things || $self->throw("At least one input is required"); my $infile; if (ref $things[0] && $things[0]->isa("Bio::PrimarySeqI") ){# it is an object $infile = $self->_writeSeqFile(@things); } elsif(ref $things[0] && $things[0]->isa("Bio::Align::AlignI")){ $infile = $self->_writeAlignFile(@things); } elsif (-e $things[0]) { $infile = $things[0]; } else { $self->throw("Unknown kind of input '@things'"); } return $infile; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : Bio::SearchIO Args : file name =cut sub _run { my ($self, $file) = @_; # Use double quotes if file path have empty spaces if ($file =~ m/ /) { $file = "\"$file\""; } my $str = $self->executable; # Use double quotes if executable path have empty spaces if ($str =~ m/ /) { $str = "\"$str\""; } $str .= $self->_setparams; $str .= ' '.$file if $file; $self->debug("HMMER command = $str"); my $progname = $self->program_name; my @in; my @verbose = (-verbose => $self->verbose); if ($progname =~ /align|build|emit/) { my $outfile = $self->o; if ($outfile || $progname eq 'hmmbuild') { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $str .= " > $null" if $self->quiet; if ($progname eq 'hmmbuild') { my $status = system($str); return $status ? 0 : 1; } else { system($str) && $self->throw("HMMER call ($str) crashed: $?\n"); @in = (-file => $outfile); } } else { open(my $fh, "$str |") || $self->throw("HMMER call ($str) crashed: $?\n"); @in = (-fh => $fh); } } elsif ($progname =~ /pfam|search/i) { open(my $fh, "$str |") || $self->throw("HMMER call ($str) crashed: $?\n"); return Bio::SearchIO->new(-fh => $fh, @verbose, -format => $self->_READMETHOD); } if ($progname eq 'hmmalign') { return Bio::AlignIO->new(@in, @verbose, -format => $self->outformat); } elsif ($progname eq 'hmmemit') { return Bio::SeqIO->new(@in, @verbose, -format => 'fasta'); } elsif ($progname =~ /calibrate/) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $str .= " > $null 2> $null" if $self->quiet; my $status = system($str); return $status ? 0 : 1; } } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my @execparams; my @execswitches; SWITCH: for ($self->program_name) { /align/ && do { @execparams = @ALIGN_PARAMS; @execswitches = @ALIGN_SWITCHES; last SWITCH; }; /build/ && do { @execparams = @BUILD_PARAMS; @execswitches = @BUILD_SWITCHES; last SWITCH; }; /calibrate/ && do { @execparams = @CALIBRATE_PARAMS; @execswitches = @CALIBRATE_SWITCHES; last SWITCH; }; /emit/ && do { @execparams = @EMIT_PARAMS; @execswitches = @EMIT_SWITCHES; last SWITCH; }; /pfam/ && do { @execparams = @PFAM_PARAMS; @execswitches = @PFAM_SWITCHES; last SWITCH; }; /search/ && do { @execparams = @SEARCH_PARAMS; @execswitches = @SEARCH_SWITCHES; last SWITCH; }; } my $param_string = $self->SUPER::_setparams(-params => \@execparams, -switches => \@execswitches, -mixed_dash => 1); my $hmm = $self->hmm || $self->throw("Need to specify either HMM file or Database"); # Use double quotes if hmm path have empty spaces if ($hmm =~ m/ /) { $hmm = "\"$hmm\""; } $param_string .= ' '.$hmm; return $param_string; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : none =cut sub program_name { my $self = shift; if (@_) { $self->{program_name} = shift; # hack so that when program_name changes, so does executable() delete $self->{'_pathtoexe'}; } return $self->{program_name} || ''; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : none =cut sub program_dir { return $ENV{HMMERDIR} if $ENV{HMMERDIR}; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : filename Args : list of Bio::SeqI =cut sub _writeSeqFile { my ($self, @seq) = @_; my ($tfh, $inputfile) = $self->io->tempfile(-dir=>$self->tempdir); $self->informat('fasta'); my $out = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); foreach my $s (@seq) { $out->write_seq($s); } $out->close(); $out = undef; close($tfh); undef $tfh; return $inputfile; } =head2 _writeAlignFile Title : _writeAlignFile Usage : obj->_writeAlignFile($seq) Function: Internal(not to be used directly) Returns : filename Args : list of Bio::Align::AlignI =cut sub _writeAlignFile{ my ($self, @align) = @_; my ($tfh, $inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $out = Bio::AlignIO->new('-fh' => $tfh, '-format' => $self->informat); foreach my $a (@align) { $out->write_aln($a); } $out->close(); $out = undef; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Infernal.pm000066400000000000000000001200111302566030400226030ustar00rootroot00000000000000# # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code # # _history # # March 2007 - first full implementation; needs some file IO tweaking between # runs but works for now # April 2008 - add 0.81 parameters (may be removed in the 1.0 release) # # July 2009 - updated for v1.0. No longer supporting pre-1.0 Infernal =head1 NAME Bio::Tools::Run::Infernal - Wrapper for local execution of cmalign, cmbuild, cmsearch, cmscore =head1 SYNOPSIS # parameters which are switches are set with any value that evals TRUE, # others are set to a specific value my $factory = Bio::Tools::Run::Infernal->new(@params); # run cmalign|cmbuild|cmsearch|cmscore|cmemit directly as a wrapper method # this resets the program flag if previously set $factory->cmsearch(@seqs); # searches Bio::PrimarySeqI's based on set cov. model # saves output to optional outfile_name, returns # Bio::SearchIO # only values which are allowed for a program are set, so one can use the same # wrapper for the following... $factory->cmalign(@seqs); # aligns Bio::PrimarySeqI's to a set cov. model, # --merge option allows two alignments generated # from the same CM to be merged. # output to outfile_name, returns Bio::AlignIO $factory->cmscore(); # scores set cov. model against Bio::PrimarySeqI, # output to outfile_name/STDOUT. $factory->cmbuild($aln); # builds covariance model based on alignment # CM to outfile_name or model_file (one is required # here), output to STDOUT. $factory->cmemit(); # emits sequence from specified cov. model; # set one if no file specified. output to # outfile_name, returns Bio::SeqIO or (if -a is set) # Bio::AlignIO $factory->cmcalibrate($file); # calibrates specified cov. model; output to # STDOUT $factory->cmstat($file); # summary stats for cov. model; set one if no file # specified; output to STDOUT # run based on the setting of the program parameter my $factory = Bio::Tools::Run::Infernal->new(-program => 'cmsearch', @params); my $search = $factory->run($seq); # using cmsearch returns a Bio::SearchIO object while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $r->query_name, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->meta, $hsp->score, )), "\n"; } } } =head1 DESCRIPTION Wrapper module for Sean Eddy's Infernal suite of programs. The current implementation runs cmsearch, cmcalibrate, cmalign, cmemit, cmbuild, cmscore, and cmstat. cmsearch will return a Bio::SearchIO, cmemit a Bio::SeqIO/AlignIO, and cmalign a Bio::AlignIO. All others send output to STDOUT. Optionally, any program's output can be redirected to outfile_name. We HIGHLY suggest upgrading to Infernal 1.0. In that spirit, this wrapper now supports parameters for Infernal 1.0 only; for wrapping older versions of Infernal we suggest using the version of Bio::Tools::Run::Infernal that came with previous versions of BioPerl-run. NOTE: Due to conflicts in the way Infernal parameters are now formatted vs. subroutine naming in Perl (specifically the inclusion of hyphens) and due to the very large number of parameters available, setting and resetting parameters via set_parameters() and reset_parameters() is required. All valid parameters can be set, but only ones valid for the executable set via program()/program_name() are used for calling the executables, the others are silently ignored. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Chris Fields Email: cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS cjfields-at-uiuc-dot-edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Infernal; use strict; use warnings; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase Bio::ParameterBaseI); use Bio::SeqIO; use Bio::SearchIO; use Bio::AlignIO; use Data::Dumper; # yes, these are the current parameters our %INFERNAL_PARAMS = ( 'A' => ['switch', '-', qw(cmbuild)], 'E' => ['param', '-', qw(cmsearch cmstat)], 'F' => ['switch', '-', qw(cmbuild)], 'Lmax' => ['param', '--', qw(cmscore)], 'Lmin' => ['param', '--', qw(cmscore)], 'T' => ['param', '-', qw(cmsearch cmstat)], 'Wbeta' => ['param', '--', qw(cmbuild)], 'Z' => ['param', '-', qw(cmsearch cmstat)], 'a' => ['switch', '-', qw(cmbuild cmemit cmscore)], 'afile' => ['param', '--', qw(cmstat)], 'ahmm' => ['param', '--', qw(cmemit)], 'all' => ['switch', '--', qw(cmstat)], 'aln-hbanded' => ['switch', '--', qw(cmsearch)], 'aln-optacc' => ['switch', '--', qw(cmsearch)], 'aln2bands' => ['switch', '--', qw(cmscore cmsearch)], 'banddump' => ['param', '--', qw(cmalign)], 'begin' => ['param', '--', qw(cmemit)], 'beta' => ['param', '--', qw(cmalign cmscore cmsearch cmstat)], 'betae' => ['param', '--', qw(cmscore)], 'betas' => ['param', '--', qw(cmscore)], 'bfile' => ['param', '--', qw(cmstat)], 'binary' => ['switch', '--', qw(cmbuild)], 'bits' => ['switch', '--', qw(cmstat)], 'bottomonly' => ['switch', '--', qw(cmsearch)], 'c' => ['switch', '-', qw(cmemit)], 'call' => ['switch', '--', qw(cmbuild)], 'cdump' => ['param', '--', qw(cmbuild)], 'cfile' => ['param', '--', qw(cmbuild)], 'checkfb' => ['switch', '--', qw(cmalign)], 'checkpost' => ['switch', '--', qw(cmalign)], 'cmL' => ['param', '--', qw(cmstat)], 'cmaxid' => ['param', '--', qw(cmbuild)], 'cmtbl' => ['param', '--', qw(cmbuild)], 'corig' => ['switch', '--', qw(cmbuild)], 'ctarget' => ['param', '--', qw(cmbuild)], 'cyk' => ['switch', '--', qw(cmalign cmbuild cmsearch)], 'devhelp' => ['switch', '--', qw(cmalign cmbuild cmcalibrate cmemit cmscore cmsearch)], 'dlev' => ['param', '--', qw(cmalign)], 'dna' => ['switch', '--', qw(cmalign cmemit cmsearch)], 'eX' => ['param', '--', qw(cmbuild)], 'eent' => ['switch', '--', qw(cmbuild)], 'efile' => ['param', '--', qw(cmstat)], 'ehmmre' => ['param', '--', qw(cmbuild)], 'elself' => ['param', '--', qw(cmbuild)], 'emap' => ['param', '--', qw(cmbuild)], 'emit' => ['switch', '--', qw(cmscore)], 'end' => ['param', '--', qw(cmemit)], 'enone' => ['switch', '--', qw(cmbuild)], 'ere' => ['param', '--', qw(cmbuild)], 'exp' => ['param', '--', qw(cmemit)], 'exp-T' => ['param', '--', qw(cmcalibrate)], 'exp-beta' => ['param', '--', qw(cmcalibrate)], 'exp-cmL-glc' => ['param', '--', qw(cmcalibrate)], 'exp-cmL-loc' => ['param', '--', qw(cmcalibrate)], 'exp-ffile' => ['param', '--', qw(cmcalibrate)], 'exp-fract' => ['param', '--', qw(cmcalibrate)], 'exp-gc' => ['param', '--', qw(cmcalibrate)], 'exp-hfile' => ['param', '--', qw(cmcalibrate)], 'exp-hmmLn-glc' => ['param', '--', qw(cmcalibrate)], 'exp-hmmLn-loc' => ['param', '--', qw(cmcalibrate)], 'exp-hmmLx' => ['param', '--', qw(cmcalibrate)], 'exp-no-qdb' => ['switch', '--', qw(cmcalibrate)], 'exp-pfile' => ['param', '--', qw(cmcalibrate)], 'exp-qqfile' => ['param', '--', qw(cmcalibrate)], 'exp-random' => ['switch', '--', qw(cmcalibrate)], 'exp-sfile' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-cglc' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-cloc' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-hglc' => ['param', '--', qw(cmcalibrate)], 'exp-tailn-hloc' => ['param', '--', qw(cmcalibrate)], 'exp-tailp' => ['param', '--', qw(cmcalibrate)], 'exp-tailxn' => ['param', '--', qw(cmcalibrate)], 'fil-E-hmm' => ['param', '--', qw(cmsearch)], 'fil-E-qdb' => ['param', '--', qw(cmsearch)], 'fil-F' => ['param', '--', qw(cmcalibrate)], 'fil-N' => ['param', '--', qw(cmcalibrate)], 'fil-Smax-hmm' => ['param', '--', qw(cmsearch)], 'fil-T-hmm' => ['param', '--', qw(cmsearch)], 'fil-T-qdb' => ['param', '--', qw(cmsearch)], 'fil-aln2bands' => ['switch', '--', qw(cmcalibrate)], 'fil-beta' => ['param', '--', qw(cmsearch)], 'fil-dfile' => ['param', '--', qw(cmcalibrate)], 'fil-gemit' => ['switch', '--', qw(cmcalibrate)], 'fil-no-hmm' => ['switch', '--', qw(cmsearch)], 'fil-no-qdb' => ['switch', '--', qw(cmsearch)], 'fil-nonbanded' => ['switch', '--', qw(cmcalibrate)], 'fil-tau' => ['param', '--', qw(cmcalibrate)], 'fil-xhmm' => ['param', '--', qw(cmcalibrate)], 'fins' => ['switch', '--', qw(cmalign cmbuild)], 'forecast' => ['param', '--', qw(cmcalibrate cmsearch)], 'forward' => ['switch', '--', qw(cmscore cmsearch)], 'g' => ['switch', '-', qw(cmsearch cmstat)], 'ga' => ['switch', '--', qw(cmsearch cmstat)], 'gapthresh' => ['param', '--', qw(cmalign cmbuild)], 'gcfile' => ['param', '--', qw(cmsearch)], 'ge' => ['switch', '--', qw(cmstat)], 'gfc' => ['switch', '--', qw(cmstat)], 'gfi' => ['switch', '--', qw(cmstat)], 'gibbs' => ['switch', '--', qw(cmbuild)], 'gtbl' => ['param', '--', qw(cmbuild)], 'gtree' => ['param', '--', qw(cmbuild)], 'h' => ['switch', '-', qw(cmalign cmbuild cmcalibrate cmemit cmscore cmsearch cmstat)], 'hbanded' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'hmm-W' => ['param', '--', qw(cmsearch)], 'hmm-cW' => ['param', '--', qw(cmsearch)], 'hmmL' => ['param', '--', qw(cmstat)], 'hsafe' => ['switch', '--', qw(cmalign cmscore)], 'ignorant' => ['switch', '--', qw(cmbuild)], 'iins' => ['switch', '--', qw(cmbuild)], 'infile' => ['param', '--', qw(cmscore)], 'informat' => ['param', '--', qw(cmalign cmbuild cmsearch)], 'inside' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'l' => ['switch', '-', qw(cmalign cmbuild cmemit cmscore)], 'lambda' => ['param', '--', qw(cmsearch)], 'le' => ['switch', '--', qw(cmstat)], 'lfc' => ['switch', '--', qw(cmstat)], 'lfi' => ['switch', '--', qw(cmstat)], 'm' => ['switch', '-', qw(cmstat)], 'matchonly' => ['switch', '--', qw(cmalign)], 'merge' => ['switch', '--', qw(cmalign)], 'mpi' => ['switch', '--', qw(cmalign cmcalibrate cmscore cmsearch)], 'mxsize' => ['param', '--', qw(cmalign cmbuild cmcalibrate cmscore cmsearch)], 'n' => ['param', '-', qw(cmbuild cmemit cmscore)], 'nc' => ['switch', '--', qw(cmsearch cmstat)], 'no-null3' => ['switch', '--', qw(cmalign cmcalibrate cmscore cmsearch)], 'no-qdb' => ['switch', '--', qw(cmsearch)], 'noalign' => ['switch', '--', qw(cmsearch)], 'nobalance' => ['switch', '--', qw(cmbuild)], 'nodetach' => ['switch', '--', qw(cmbuild)], 'nonbanded' => ['switch', '--', qw(cmalign cmbuild cmscore)], 'null' => ['param', '--', qw(cmbuild)], 'null2' => ['switch', '--', qw(cmsearch)], 'o' => ['param', '-', qw(cmalign cmsearch)], 'old' => ['switch', '--', qw(cmscore)], 'onepost' => ['switch', '--', qw(cmalign)], 'optacc' => ['switch', '--', qw(cmalign)], 'outfile' => ['param', '--', qw(cmscore)], 'p' => ['switch', '-', qw(cmalign cmsearch)], 'pad' => ['switch', '--', qw(cmscore)], 'pbegin' => ['param', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'pbswitch' => ['param', '--', qw(cmbuild)], 'pebegin' => ['switch', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'pend' => ['param', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'pfend' => ['param', '--', qw(cmalign cmcalibrate cmemit cmscore cmsearch)], 'prior' => ['param', '--', qw(cmbuild)], 'q' => ['switch', '-', qw(cmalign)], 'qdb' => ['switch', '--', qw(cmalign cmscore)], 'qdbboth' => ['switch', '--', qw(cmscore)], 'qdbfile' => ['param', '--', qw(cmstat)], 'qdbsmall' => ['switch', '--', qw(cmscore)], 'random' => ['switch', '--', qw(cmscore)], 'rdump' => ['param', '--', qw(cmbuild)], 'refine' => ['param', '--', qw(cmbuild)], 'regress' => ['param', '--', qw(cmalign cmbuild cmscore)], 'resonly' => ['switch', '--', qw(cmalign)], 'rf' => ['switch', '--', qw(cmalign cmbuild)], 'rna' => ['switch', '--', qw(cmalign cmemit cmsearch)], 'rsearch' => ['param', '--', qw(cmbuild)], 'rtrans' => ['switch', '--', qw(cmsearch)], 's' => ['param', '-', qw(cmalign cmbuild cmcalibrate cmemit cmscore)], 'sample' => ['switch', '--', qw(cmalign)], 'scoreonly' => ['switch', '--', qw(cmscore)], 'search' => ['switch', '--', qw(cmscore cmstat)], 'seqfile' => ['param', '--', qw(cmstat)], 'sfile' => ['param', '--', qw(cmstat)], 'shmm' => ['param', '--', qw(cmemit)], 'small' => ['switch', '--', qw(cmalign)], 'stall' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'sub' => ['switch', '--', qw(cmalign cmbuild cmscore)], 'sums' => ['switch', '--', qw(cmalign cmsearch)], 'tabfile' => ['param', '--', qw(cmsearch)], 'tau' => ['param', '--', qw(cmalign cmbuild cmscore cmsearch)], 'taue' => ['param', '--', qw(cmscore)], 'taus' => ['param', '--', qw(cmscore)], 'tc' => ['switch', '--', qw(cmsearch cmstat)], 'tfile' => ['param', '--', qw(cmalign cmbuild cmemit cmscore)], 'toponly' => ['switch', '--', qw(cmsearch cmstat)], 'u' => ['switch', '-', qw(cmemit)], 'v' => ['switch', '-', qw(cmbuild cmcalibrate)], 'viterbi' => ['switch', '--', qw(cmalign cmscore cmsearch)], 'wblosum' => ['switch', '--', qw(cmbuild)], 'wgiven' => ['switch', '--', qw(cmbuild)], 'wgsc' => ['switch', '--', qw(cmbuild)], 'wid' => ['param', '--', qw(cmbuild)], 'withali' => ['param', '--', qw(cmalign)], 'withpknots' => ['switch', '--', qw(cmalign)], 'wnone' => ['switch', '--', qw(cmbuild)], 'wpb' => ['switch', '--', qw(cmbuild)], 'x' => ['switch', '-', qw(cmsearch)], 'xfile' => ['param', '--', qw(cmstat)], ); our %INFERNAL_PROGRAM = ( 'cmalign' => "cmalign [-options] \n". 'cmalign [-options] --merge ', 'cmbuild' => 'cmbuild [-options] ', 'cmcalibrate' => 'cmcalibrate [-options] ', 'cmemit' => 'cmemit [-options] ', 'cmscore' => 'cmscore [-options] ', 'cmsearch' => 'cmsearch [-options] ', 'cmstat' => 'cmstat [-options] ', ); # this is a simple lookup for easy validation for passed methods our %LOCAL_PARAMS = map {$_ => 1} qw(program outfile tempfile model); =head2 new Title : new Usage : my $wrapper = Bio::Tools::Run::Infernal->new(@params) Function: creates a new Infernal factory Returns: Bio::Tools::Run::Infernal wrapper Args : list of parameters =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); # these are specific parameters we do not want passed on to set_parameters my ($program, $model, $validate, $q, $o1, $o2) = $self->_rearrange([qw(PROGRAM MODEL_FILE VALIDATE_PARAMETERS QUIET OUTFILE_NAME O)], @args); if ($o1 && $o2) { $self->warn("Only assign to either -outfile_name or -o, not both;"); } my $out = $o1 || $o2; $self->validate_parameters($validate); $q && $self->quiet($q); $program && $self->program($program); $model && $self->model_file($model); $out ||= ''; $self->outfile_name($out); $self->io->_initialize_io(); $self->set_parameters(@args); return $self; } =head2 program Title : program Usage : $obj->program() Function: Set the program called when run() is used. Synonym of program_name() Returns : String (program name) Args : String (program name) Status : Unstable (may delegate to program_name, which is the interface method) =cut sub program { my $self = shift; return $self->program_name(@_); } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; if (@_) { my $p = shift; $self->throw("Program '$p' not supported") if !exists $INFERNAL_PROGRAM{lc $p}; $self->{'_program'} = lc $p; # set up cache of valid parameters while (my ($p, $data) = each %INFERNAL_PARAMS) { my %in_exe = map {$_ => 1} @$data[2..$#{$data}]; $self->{valid_params}->{$p} = 1 if exists $in_exe{$self->{'_program'}}; } } return $self->{'_program'}; } =head2 model_file Title : model_file Usage : $obj->model_file() Function: Set the model file used when run() is called. Returns : String (file location of covariance model) Args : String (file location of covariance model) =cut sub model_file { my $self = shift; return $self->{'_model_file'} = shift if @_; return $self->{'_model_file'}; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { my ($self, $dir) = @_; if ($dir) { $self->{_program_dir} = $dir; } return Bio::Root::IO->catfile($ENV{INFERNALDIR}) || ''; } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program (uses cmsearch) Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return unless $self->executable; my $exe = $self->executable; my $string = `$exe -h 2>&1`; my $v; if ($string =~ m{Infernal\s([\d.]+)}) { $v = $1; $self->deprecated(-message => "Only Infernal 1.0 and above is supported.", -version => 1.006001) if $v < 1; } return $self->{'_progversion'} = $v || undef; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs Infernal and returns Bio::SearchIO Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI or file name =cut # TODO: update to accept multiple seqs, alignments sub run { my ($self,@seq) = @_; if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } elsif (ref $seq[0] && $seq[0]->isa("Bio::Align::AlignI") ){ # it is an object my $infile1 = $self->_writeAlignFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head1 Specific program interface methods =head2 cmsearch Title : cmsearch Usage : $obj->cmsearch($seqFile) Function: Runs Infernal cmsearch and returns Bio::SearchIO Returns : A Bio::SearchIO Args : Bio::PrimarySeqI or file name =cut sub cmsearch { my ($self,@seq) = @_; $self->program('cmsearch'); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run(-seq_files => [$infile1]); } else { return $self->_run(-seq_files => \@seq); } } =head2 cmalign Title : cmalign Usage : $obj->cmalign($seqFile) Function: Runs Infernal cmalign and returns Bio::AlignIO Returns : A Bio::AlignIO Args : Bio::PrimarySeqI or file name =cut sub cmalign { my ($self,@seq) = @_; $self->program('cmalign'); if (ref $seq[0]) { # it is an object if ($seq[0]->isa("Bio::PrimarySeqI") ){ my $infile1 = $self->_writeSeqFile(@seq); return $self->_run(-seq_files => [$infile1]); } elsif ( $seq[0]->isa("Bio::Align::AlignI") ) { if (scalar(@seq) != 2) { $self->throw("") } my $infile1 = $self->_writeAlignFile($seq[0]); my $infile2 = $self->_writeAlignFile($seq[1]); return $self->_run(-align_files => [$infile1, $infile2]); } } else { # we can maybe add a check for the file extension and try to DTRT my %params = $self->get_parameters('valid'); $params{merge} ? return $self->_run(-align_files => \@seq): return $self->_run(-seq_files => \@seq); return $self->_run(-seq_files => \@seq); } } =head2 cmemit Title : cmemit Usage : $obj->cmemit($modelfile) Function: Runs Infernal cmemit and returns Bio::AlignIO Returns : A Bio::AlignIO Args : None; set model_file() to use a specific model =cut sub cmemit { my ($self) = shift; $self->program('cmemit'); return $self->_run(@_); } =head2 cmbuild Title : cmbuild Usage : $obj->cmbuild($alignment) Function: Runs Infernal cmbuild and saves covariance model Returns : 1 on success (no object for covariance models) Args : Bio::AlignIO with structural information (such as from Stockholm format source) or alignment file name =cut sub cmbuild { my ($self,@seq) = @_; $self->program('cmbuild'); if (ref $seq[0] && $seq[0]->isa("Bio::Align::AlignI") ){# it is an object my $infile1 = $self->_writeAlignFile(@seq); return $self->_run(-align_files => [$infile1]); } else { return $self->_run(-align_files => \@seq); } } =head2 cmscore Title : cmscore Usage : $obj->cmscore($seq) Function: Runs Infernal cmscore and saves output Returns : None Args : None; set model_file() to use a specific model =cut sub cmscore { my ($self,@seq) = @_; $self->program('cmscore'); return $self->_run(); } =head2 cmcalibrate Title : cmcalibrate Usage : $obj->cmcalibrate('file') Function: Runs Infernal calibrate on specified CM Returns : None Args : None; set model_file() to use a specific model =cut sub cmcalibrate { my ($self,@seq) = @_; $self->program('cmcalibrate'); return $self->_run(); } =head2 cmstat Title : cmstat Usage : $obj->cmstat($seq) Function: Runs Infernal cmstat and saves output Returns : None Args : None; set model_file() to use a specific model =cut sub cmstat { my ($self,@seq) = @_; $self->program('cmstat'); return $self->_run(); } =head1 Bio::ParameterBaseI-specific methods These methods are part of the Bio::ParameterBaseI interface =cut =head2 set_parameters Title : set_parameters Usage : $pobj->set_parameters(%params); Function: sets the parameters listed in the hash or array Returns : None Args : [optional] hash or array of parameter/values. These can optionally be hash or array references Note : This only sets parameters; to set methods use the method name =cut sub set_parameters { my $self = shift; # circumvent any issues arising from passing in refs my %args = (ref($_[0]) eq 'HASH') ? %{$_[0]} : (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_; # set the parameters passed in, but only ones supported for the program my ($prog, $validate) = ($self->program, $self->validate_parameters); # parameter cleanup %args = map { my $a = $_; $a =~ s{^-}{}; lc $a => $args{$_} } sort keys %args; while (my ($key, $val) = each %args) { if (exists $INFERNAL_PARAMS{$key}) { my ($type, $prefix) = @{$INFERNAL_PARAMS{$key}}[0..1]; @{$self->{parameters}->{$key}} = ($type, $prefix); unshift @{$self->{parameters}->{$key}}, $type eq 'param' ? $val : $type eq 'switch' && $val ? 1 : 0; if ($validate) { my %in_exe = map {$_ => 1} @{$INFERNAL_PARAMS{$key}}[2..$#{$INFERNAL_PARAMS{$key}}]; $self->warn("Parameter $key not used for $prog") if !exists $in_exe{$key}; } } else { $self->warn("Parameter $key does not exist") if ($validate); } } } =head2 reset_parameters Title : reset_parameters Usage : resets values Function: resets parameters to either undef or value in passed hash Returns : none Args : [optional] hash of parameter-value pairs =cut sub reset_parameters { my $self = shift; delete $self->{parameters}; if (@_) { $self->set_parameters(@_); } } =head2 validate_parameters Title : validate_parameters Usage : $pobj->validate_parameters(1); Function: sets a flag indicating whether to validate parameters via set_parameters() or reset_parameters() Returns : Bool Args : [optional] value evaluating to True/False Note : Optionally implemented method; up to the implementation on whether to automatically validate parameters or optionally do so =cut sub validate_parameters { my ($self) = shift; if (@_) { $self->{validate_params} = defined $_[0] ? 1 : 0; } return $self->{validate_params}; } =head2 parameters_changed Title : parameters_changed Usage : if ($pobj->parameters_changed) {...} Function: Returns boolean true (1) if parameters have changed Returns : Boolean (0 or 1) Args : None Note : This module does not run state checks, so this always returns True =cut sub parameters_changed { 1 } =head2 available_parameters Title : available_parameters Usage : @params = $pobj->available_parameters() Function: Returns a list of the available parameters Returns : Array of parameters Args : [optional] name of executable being used; defaults to returning all available parameters =cut sub available_parameters { my ($self, $exec) = @_; my @params; if ($exec) { $self->throw("$exec is not part of the Infernal package") if !exists($INFERNAL_PROGRAM{$exec}); for my $p (sort keys %INFERNAL_PARAMS) { if (grep { $exec eq $_ } @{$INFERNAL_PARAMS{$p}}[2..$#{$INFERNAL_PARAMS{$p}}]) { push @params, $p; } } } else { @params = (sort keys %INFERNAL_PARAMS, sort keys %LOCAL_PARAMS); } return @params; } =head2 get_parameters Title : get_parameters Usage : %params = $pobj->get_parameters; Function: Returns list of set key-value pairs, parameter => value Returns : List of key-value pairs Args : [optional] 'full' - this option returns everything associated with the parameter as an array ref value; that is, not just the value but also the value, type, and prefix. Default is value only. 'valid'- same a 'full', but only returns the grouping valid for the currently set executable =cut sub get_parameters { my ($self, $option) = @_; $option ||= ''; # no option my %params; if (exists $self->{parameters}) { %params = (ref $option eq 'ARRAY') ? ( map {$_ => $self->{parameters}{$_}[0]} grep { exists $self->{parameters}{$_} } @$option) : (lc $option eq 'full') ? (%{$self->{parameters}}) : (lc $option eq 'valid') ? (map {$_ => $self->{parameters}{$_}} grep { exists $self->{valid_params}->{$_} } keys %{$self->{parameters}}) : (map {$_ => $self->{parameters}{$_}[0]} keys %{$self->{parameters}}); } else { %params = (); } return %params; } =head1 to_* methods All to_* methods are implementation-specific =cut =head2 to_exe_string Title : to_exe_string Usage : $string = $pobj->to_exe_string; Function: Returns string (command line string in this case) Returns : String Args : =cut sub to_exe_string { my ($self, @passed) = @_; my ($seqs, $aligns) = $self->_rearrange([qw(SEQ_FILES ALIGN_FILES)], @passed); if ($seqs || $aligns) { $self->throw("Seqs or alignments must be an array reference") unless ($seqs && ref($seqs) eq 'ARRAY') || ($aligns && ref($aligns) eq 'ARRAY' ); } my %args = map {$_ => []} qw(switch param input redirect); my %params = $self->get_parameters('valid'); my ($exe, $prog, $model, $outfile) = ($self->executable, $self->program_name, $self->model_file, $self->outfile_name); $self->throw("Executable not found") unless defined($exe); delete $params{o} if exists $params{o}; if (!defined($model) && $prog ne 'cmbuild') { $self->throw("model_file() not defined") } $outfile ||= ''; for my $p (sort keys %params) { if ($params{$p}[0]) { my $val = $params{$p}[1] eq 'param' ? ' '.$params{$p}[0] : ''; push @{$args{$params{$p}[1]}}, $params{$p}[2].$p.$val; } } # TODO: not sure what happens when we pass in multiple seq or alignment # filenames, may need checking if ($prog eq 'cmscore' || $prog eq 'cmstat' || $prog eq 'cmcalibrate') { push @{$args{'redirect'}}, "> $outfile" if $outfile; push @{$args{'input'}}, $model; } elsif ($prog eq 'cmsearch') { if (!defined $seqs) { $self->throw('cmsearch requires a sequence file name'); } push @{$args{'param'}}, "-o $outfile" if $outfile; push @{$args{'input'}}, ($model, @$seqs); } elsif ($prog eq 'cmalign') { if ($params{'merge'}) { $self->throw('cmalign with --merge option requires two alignment files') if !defined($aligns) || @$aligns < 2; push @{$args{'input'}}, ($model, @$aligns); } else { $self->throw('cmalign requires a sequence file') if !defined $seqs; push @{$args{'input'}}, ($model, @$seqs); } push @{$args{'param'}}, "-o $outfile" if $outfile; } elsif ($prog eq 'cmbuild') { $self->throw('cmbuild requires one alignment file') if !defined($aligns); if ($model) { push @{$args{'input'}}, ($model, @$aligns); push @{$args{'redirect'}}, "> $outfile" if $outfile; } else { push @{$args{'input'}}, ($outfile, @$aligns); } } elsif ($prog eq 'cmemit') { if (!$outfile) { $self->throw('cmemit requires an outfile_name; tempfile support not implemented yet'); } else { push @{$args{'input'}}, ($model, ,$outfile); } } # quiet! if ($self->quiet && $prog ne 'cmsearch') { if ($prog eq 'cmalign') { push @{$args{switch}}, '-q' if !exists $params{q}; } else { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; push @{$args{redirect}}, "> $null"; } } my $string = "$exe ".join(' ',(@{$args{switch}}, @{$args{param}}, @{$args{input}}, @{$args{redirect}})); $string; } ############### PRIVATE ############### #=head2 _run # # Title : _run # Usage : $obj->_run() # Function: Internal(not to be used directly) # Returns : # Args : # #=cut { my %ALLOWED = map {$_ => 1} qw(run cmsearch cmalign cmemit cmbuild cmcalibrate cmstat cmscore); sub _run { my ($self)= shift; my ($prog, $model, $out, $version) = ($self->program, $self->model_file, $self->outfile_name, $self->version); if (my $caller = (caller(1))[3]) { $caller =~ s{.*::(\w+)$}{$1}; $self->throw("Calling _run() from disallowed method") unless exists $ALLOWED{$caller}; } else { $self->throw("Can't call _run directly"); } # a model and a file must be defined for all but cmemit; cmemit must have a # file or model defined (using $file if both are defined) # relevant files are passed on to the string builder my $str = $self->to_exe_string(@_); $self->debug("Infernal command: $str\n"); my %has = $self->get_parameters('valid'); my $obj = ($prog eq 'cmsearch') ? Bio::SearchIO->new(-format => 'infernal', -version => $version, -model => $model) : ($prog eq 'cmalign' ) ? Bio::AlignIO->new(-format => 'stockholm') : ($prog eq 'cmemit' && $has{a}) ? Bio::AlignIO->new(-format => 'stockholm') : ($prog eq 'cmemit') ? Bio::SeqIO->new(-format => 'fasta') : undef; my @args; # file output if ($out) { my $status = system($str); if($status || !-e $out || -z $out ) { my $error = ($!) ? "$! Status: $status" : "Status: $status"; $self->throw( "Infernal call crashed: $error \n[command $str]\n"); return undef; } if ($obj && ref($obj)) { $obj->file($out); @args = (-file => $out); } # fh-based (no outfile) } else { open(my $fh,"$str |") || $self->throw("Infernal call ($str) crashed: $?\n"); if ($obj && ref($obj)) { $obj->fh($fh); @args = (-fh => $fh); } else { # dump to debugging my $io; while(<$fh>) {$io .= $_;} close($fh); $self->debug($io) if $io; return 1; } } $obj->_initialize_io(@args) if $obj && ref($obj); return $obj || 1; } } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile { my ($self,@seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); foreach my $s(@seq){ $in->write_seq($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } =head2 _writeAlignFile Title : _writeAlignFile Usage : obj->_writeAlignFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeAlignFile{ my ($self,@align) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::AlignIO->new('-fh' => $tfh , '-format' => 'stockholm'); foreach my $s(@align){ $in->write_aln($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } # this is a private sub used to regenerate the class data structures, # dumped to STDOUT # could probably add in a description field if needed... sub _dump_params { my %params; my %usage; for my $exec (qw(cmalign cmbuild cmcalibrate cmemit cmscore cmsearch cmstat)) { my $output = `$exec --devhelp`; if ($?) { $output = `$exec -h`; } my @lines = split("\n",$output); for my $line (@lines) { next if $line =~ /^#/; if ($line =~ /^\s*(-{1,2})(\S+)\s+(<\S+>)?/) { my %data; ($data{prefix}, my $p, $data{arg}) = ($1, $2, $3 ? 'param' : 'switch'); if (exists $params{$p}) { if ($data{prefix} ne $params{$p}{prefix}) { warn("$data{prefix} for $p in $exec doesn't match prefix for same parameter in ".$params{$p}{exec}[-1].":".$params{$p}{prefix}); } if ($data{arg} ne $params{$p}{arg}) { warn("$data{arg} for $p in $exec doesn't match arg for same parameter in ".$params{$p}{exec}[-1].":".$params{$p}{arg}); } } while (my ($key, $val) = each %data) { $params{$p}->{$key} = $val; } push @{$params{$p}->{exec}}, $exec; } elsif ($line =~ /Usage:\s*(.+)$/) { push @{$usage{$exec}}, $1; } else { #print "$line\n"; } } } # generate data structure print "our %INFERNAL_PARAMS = (\n"; for my $k (sort keys %params) { printf(" %-17s => [","'$k'"); for my $sub (qw(arg prefix exec)) { my $str = (ref($params{$k}{$sub}) eq 'ARRAY') ? "qw(".join(' ', @{$params{$k}{$sub}}).")" : "'".$params{$k}{$sub}."',"; printf("%-10s", $str); } print "],\n"; } print ");\n\n"; # generate usage data structure print "our %INFERNAL_PROGRAM = (\n"; for my $k (sort keys %usage) { printf(" %-17s => [\n","'$k'"); print ' '.join(",\n ", map {"'$_'"} @{$usage{$k}})."\n"; print " ],\n"; } print ");\n"; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/MCS.pm000077500000000000000000000230601302566030400215000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::MCS # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::MCS - Wrapper for MCS =head1 SYNOPSIS use Bio::Tools::Run::MCS; # Make a MCS factory $factory = Bio::Tools::Run::MCS->new(); # Run MCS on an alignment my @results = $factory->run($alignfilename); # or with alignment object @results = $factory->run($bio_simplalign); # look at the results foreach my $feat (@results) { my $seq_id = $feat->seq_id; my $start = $feat->start; my $end = $feat->end; my $score = $feat->score; my ($pvalue) = $feat->get_tag_values('pvalue'); my ($kind) = $feat->get_tag_values('kind'); # 'all', 'exon' or 'nonexon' } =head1 DESCRIPTION This is a wrapper for running the MCS (binCons) scripts by Elliott H Margulies. You can get details here: http://zoo.nhgri.nih.gov/elliott/mcs_doc/. MCS is used for the prediciton of transcription factor binding sites and other regions of the genome conserved amongst different species. Note that this wrapper assumes you already have alignments, so only uses MCS for the latter stages (the stages involving align2binomial.pl, generate_phyloMAX_score.pl and generate_mcs_beta.pl). You can try supplying normal MCS command-line arguments to new(), eg. $factory->new(-percentile => 95) or calling arg-named methods (excluding the initial hyphens, eg. $factory->percentile(95) to set the --percentile arg). You will need to enable this MCS wrapper to find the MCS scripts. This can be done in (at least) three ways: 1. Make sure the MCS scripts are in your path. 2. Define an environmental variable MCSDIR which is a directory which contains the MCS scripts: In bash: export MCSDIR=/home/username/mcs/ In csh/tcsh: setenv MCSDIR /home/username/mcs 3. Include a definition of an environmental variable MCSDIR in every script that will use this MCS wrapper module, e.g.: BEGIN { $ENV{MCSDIR} = '/home/username/mcs/' } use Bio::Tools::Run::MCS; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::MCS; use strict; use Cwd; use File::Spec; use Bio::AlignIO; use Bio::FeatureIO; use Bio::Annotation::SimpleValue; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'align2binomial.pl'; our $PROGRAM_DIR; # methods for the mcs args we support our @PARAMS = qw(neutral percentile mcs specificity sensitivity name); our @SWITCHES = qw(neg-score); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(ucsc gtf neutral-only fourd-align align-only ar); BEGIN { # lets add all the mcs scripts to the path so that when we call # align2binomial.pl it can find its siblings $PROGRAM_DIR = $ENV{'MCSDIR'}; $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::MCS->new() Function: creates a new MCS factory Returns : Bio::Tools::Run::MCS Args : Many options understood by MCS can be supplied as key => value pairs. These options can NOT be used with this wrapper: ucsc gtf neutral-only fourd-align align-only ar =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($align_file_or_object, Bio::Location::Atomic, [Bio::SeqFeatureI]); Function: Runs the MCS scripts on an alignment. Returns : list of Bio::SeqFeatureI feature objects (with coordinates corrected according to the supplied offset, if any) Args : The first argument represents an alignment, the optional second argument represents the chromosome, stand and end and the optional third argument represents annotation of the exons in the alignment. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The position in the genome can be provided as a Bio::Location::Atomic with start, end and seq_id set. The annnotation can be provided as an array of Bio::SeqFeatureI objects. =cut sub run { my ($self, $aln, $offset, $exon_feats) = @_; $self->_alignment($aln || $self->alignment || $self->throw("An alignment must be supplied")); return $self->_run($offset, $exon_feats); } sub _run { my ($self, $atomic, $exon_feats) = @_; my $exe = $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $offset = ''; my $start_adjust = 0; if ($atomic) { $start_adjust = $atomic->start; $offset = '--ucsc '.$atomic->seq_id.':'.$start_adjust.'-'.$atomic->end; $start_adjust--; } my $gtf_file = 'exons.gtf'; if ($exon_feats) { my $fout = Bio::FeatureIO->new(-file => ">$gtf_file", -format => 'gtf'); foreach my $feat (@{$exon_feats}) { $fout->write_feature($feat); } } my $gtf = $exon_feats ? "--gtf $gtf_file" : ''; # step '2' (http://zoo.nhgri.nih.gov/elliott/mcs_doc/node1.html) of MCS: # run align2binomial.pl to calculate individual species binomial scores my $aln_file = $self->_write_alignment; my $error_file = 'stderr'; my $binomial_file = 'align_name.binomial'; my $cmd = "align2binomial.pl $offset $gtf $aln_file > $binomial_file 2> $error_file"; #system("rm -fr $cwd/mcs_dir; cp -R $temp_dir $cwd/mcs_dir"); my $throw = system($cmd); open(my $efh, "<", $error_file) || $self->throw("Could not open error file '$error_file'"); my $error; while (<$efh>) { $error .= $_; $throw = 1 if /not divisible by 3/; } close($efh); $self->throw($error) if $throw; # step '3': run generate_phyloMAX_score.pl to combine the individual # binomial scores and generate the final Multi-species Conservation Score my $phylo_file = 'align_name.phylo'; system("generate_phyloMAX_score.pl $binomial_file > $phylo_file") && $self->throw("generate_phyloMAX_score.pl call failed: $?, $!"); # step '4': Generate MCSs from the conservation score using # generate_mcs_beta.pl my $mcs_file = 'mcs_result.stdout'; my $bed_file = 'align_name.bed'; # hardcoded in generate_mcs_beta.pl system("generate_mcs_beta.pl $offset $gtf $phylo_file > $mcs_file") && $self->throw("generate_mcs_beta.pl failed: $?, $!"); my @feats; my $fin = Bio::FeatureIO->new(-file => $bed_file, -format => 'bed'); my $source = Bio::Annotation::SimpleValue->new(-value => 'MCS'); while (my $feat = $fin->next_feature()) { # convert coords given offset if ($start_adjust) { $feat->start($feat->start + $start_adjust); $feat->end($feat->end + $start_adjust); } $feat->source($source); push(@feats, $feat); } # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string = $self->SUPER::_setparams(-params => \@PARAMS, -switches => \@SWITCHES, -dash => 1); my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Maq.pm000077500000000000000000000455121302566030400216020ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Maq # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Maq - Run wrapper for the Maq short-read assembler *BETA* =head1 SYNOPSIS # create an assembly $maq_fac = Bio::Tools::Run::Maq->new(); $maq_assy = $maq_fac->run( 'reads.fastq', 'refseq.fas' ); # if IO::Uncompress::Gunzip is available... $maq_assy = $maq_fac->run( 'reads.fastq.gz', 'refseq.gz'); # paired-end $maq_assy = $maq_fac->run( 'reads.fastq', 'refseq.fas', 'paired-reads.fastq'); # be more strict $maq_fac->set_parameters( -c2q_min_map_quality => 60 ); $maq_assy = $maq_fac->run( 'reads.fastq', 'refseq.fas', 'paired-reads.fastq'); # run maq commands separately $maq_fac = Bio::Tools::Run::Maq->new( -command => 'pileup', -single_end_quality => 1 ); $maq_fac->run_maq( -bfa => 'refseq.bfa', -map => 'maq_assy.map', -txt => 'maq_assy.pup.txt' ); =head1 DESCRIPTION This module provides a wrapper interface for Heng Li's reference-directed short read assembly suite C (see L for manuals and downloads). There are two modes of action. =over =item * EasyMaq The first is a simple pipeline through the C commands, taking your read data in and squirting out an assembly object of type L. The pipeline is based on the one performed by C: Action maq commands ------ ------------ data conversion to fasta2bfa, fastq2bfq maq binary formats map sequence reads map to reference seq assemble, creating assemble consensus convert map & cns mapview, cns2fq files to plaintext (for B:A:IO:maq) Command-line options can be directed to the C, C, and C steps. See L below. =item * BigMaq The second mode is direct access to C commands. To run a C command, construct a run factory, specifying the desired command using the C<-command> argument in the factory constructor, along with options specific to that command (see L): $maqfac->Bio::Tools::Run::Maq->new( -command => 'fasta2bfa' ); To execute, use the C methods. Input and output files are specified in the arguments of C (see L): $maqfac->run_maq( -fas => "myref.fas", -bfa => "myref.bfa" ); =back =head1 OPTIONS C is complex, with many subprograms (commands) and command-line options and file specs for each. This module attempts to provide commands and options comprehensively. You can browse the choices like so: $maqfac = Bio::Tools::Run::Maq->new( -command => 'assemble' ); # all maq commands @all_commands = $maqfac->available_parameters('commands'); @all_commands = $maqfac->available_commands; # alias # just for assemble @assemble_params = $maqfac->available_parameters('params'); @assemble_switches = $maqfac->available_parameters('switches'); @assemble_all_options = $maqfac->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. See L for the gory details. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $maqfac = Bio::Tools::Run::Maq->new( -command => 'map' ); @filespec = $maqfac->filespec; This example returns the following array: map bfa bfq1 #bfq2 2>#log This indicates that map (C binary mapfile), bfa (C binary fasta), and bfq (C binary fastq) files MUST be specified, another bfq file MAY be specified, and a log file receiving STDERR also MAY be specified. Use these in the C call like so: $maqfac->run_maq( -map => 'my.map', -bfa => 'myrefseq.bfa', -bfq1 => 'reads1.bfq', -bfq2 => 'reads2.bfq' ); Here, the C parameter was unspecified. Therefore, the object will store the programs STDERR output for you in the C attribute: handle_map_warning($maqfac) if ($maqfac->stderr =~ /warning/); STDOUT for a run is also saved, in C, unless a file is specified to slurp it according to the filespec. C STDOUT usually contains useful information on the run. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Maq; use strict; our $HAVE_IO_UNCOMPRESS; BEGIN { eval {require IO::Uncompress::Gunzip; $HAVE_IO_UNCOMPRESS = 1}; } use IPC::Run; # Object preamble - inherits from Bio::Root::Root use lib '../../..'; use Bio::Root::Root; use Bio::Tools::Run::Maq::Config; use Bio::Tools::GuessSeqFormat; use File::Basename qw(fileparse); use base qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase ); ## maq ( from tigr ) our $program_name = 'maq'; # name of the executable # Note: # other globals required by Bio::Tools::Run::AssemblerBase are # imported from Bio::Tools::Run::Maq::Config our $qual_param = 'quality_file'; our $use_dash = 1; our $join = ' '; our $asm_format = 'maq'; =head2 new() Title : new Usage : my $obj = new Bio::Tools::Run::Maq(); Function: Builds a new Bio::Tools::Run::Maq object Returns : an instance of Bio::Tools::Run::Maq Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->parameters_changed(1); $self->_register_program_commands( \@program_commands, \%command_prefixes ); unless (grep /command/, @args) { push @args, '-command', 'run'; } $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); $self->program_name($program_name) if not defined $self->program_name(); if ($^O =~ /cygwin/) { my @kludge = `PATH=\$PATH:/usr/bin:/usr/local/bin which $program_name`; chomp $kludge[0]; $self->program_name($kludge[0]); } $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI $self->_assembly_format($asm_format); return $self; } =head2 run Title : run Usage : $assembly = $maq_assembler->run($read1_fastq_file, $refseq_fasta_file, $read2_fastq_file); Function: Run the maq assembly pipeline. Returns : Assembly results (file, IO object or Assembly object) Args : - fastq file containing single-end reads - fasta file containing the reference sequence - [optional] fastq file containing paired-end reads Note : gzipped inputs are allowed if IO::Uncompress::Gunzip is available =cut sub run { my ($self, $rd1_file, $ref_file, $rd2_file) = @_; # Sanity checks $self->_check_executable(); $rd1_file or $self->throw("Fastq reads file required at arg 1"); $ref_file or $self->throw("Fasta refseq file required at arg 2"); # expand gzipped files as nec. for ($rd1_file, $ref_file, $rd2_file) { next unless $_; if (/\.gz[^.]*$/) { unless ($HAVE_IO_UNCOMPRESS) { croak( "IO::Uncompress::Gunzip not available, can't expand '$_'" ); } my ($tfh, $tf) = $self->io->tempfile; my $z = IO::Uncompress::Gunzip->new($_); while (<$z>) { print $tfh $_ } close $tfh; $_ = $tf; } } my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$rd1_file); $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 1"); $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$ref_file); $guesser->guess eq 'fasta' or $self->throw("Refseq file doesn't look like fasta at arg 2"); if ($rd2_file) { $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$rd2_file); $guesser->guess eq 'fastq' or $self->throw("Reads file doesn't look like fastq at arg 3"); } # maq format conversion ($rd1_file, $ref_file, $rd2_file) = $self->_prepare_input_sequences($rd1_file, $ref_file, $rd2_file); # Assemble my ($maq_file, $faq_file) = $self->_run($rd1_file, $ref_file, $rd2_file); # Export results in desired object type my $asm = $self->_export_results($maq_file); return $asm; } =head2 run_maq() Title : run_maq Usage : $obj->run_maq( @file_args ) Function: Run a maq command as specified during object contruction Returns : Args : a specification of the files to operate on: =cut sub run_maq { my ($self, @args) = @_; # _translate_params will provide an array of command/parameters/switches # -- these are set at object construction # to set up the run, need to add the files to the call # -- provide these as arguments to this function my $cmd = $self->command if $self->can('command'); $self->throw("No maq command specified for the object") unless $cmd; # setup files necessary for this command my $filespec = $command_files{$cmd}; $self->throw("No command-line file specification is defined for command '$cmd'; check Bio::Tools::Run::Maq::Config") unless $filespec; # parse args based on filespec # require named args $self->throw("Named args are required") unless !(@args % 2); s/^-// for @args; my %args = @args; # validate my @req = map { my $s = $_; $s =~ s/^[012]?[<>]//; $s =~ s/[^a-zA-Z0-9_]//g; $s } grep !/[#]/, @$filespec; !defined($args{$_}) && $self->throw("Required filearg '$_' not specified") for @req; # set up redirects my ($in, $out, $err); for (@$filespec) { m/^1?>(.*)/ && do { defined($args{$1}) && ( open($out,">", $args{$1}) or $self->throw("Open for write error : $!")); next; }; m/^2>#?(.*)/ && do { defined($args{$1}) && (open($err, ">", $args{$1}) or $self->throw("Open for write error : $!")); next; }; m/^<#?(.*)/ && do { defined($args{$1}) && (open($in, "<", $args{$1}) or $self->throw("Open for read error : $!")); next; } } my $dum; $in || ($in = \$dum); $out || ($out = \$self->{'stdout'}); $err || ($err = \$self->{'stderr'}); # Get program executable my $exe = $self->executable; # Get command-line options my $options = $self->_translate_params(); # Get file specs sans redirects in correct order my @specs = map { my $s = $_; $s =~ s/[^a-zA-Z0-9_]//g; $s } grep !/[<>]/, @$filespec; my @files = @args{@specs}; # expand arrayrefs my $l = $#files; for (0..$l) { splice(@files, $_, 1, @{$files[$_]}) if (ref($files[$_]) eq 'ARRAY'); } @files = map { defined $_ ? $_ : () } @files; # squish undefs my @ipc_args = ( $exe, @$options, @files ); eval { IPC::Run::run(\@ipc_args, $in, $out, $err) or die ("There was a problem running $exe : $!"); }; if ($@) { $self->throw("$exe call crashed: $@"); } # return arguments as specified on call return @args; } =head2 stdout() Title : stdout Usage : $fac->stdout() Function: store the output from STDOUT for the run, if no file specified in run_maq() Example : Returns : scalar string Args : on set, new value (a scalar or undef, optional) =cut sub stdout { my $self = shift; return $self->{'stdout'} = shift if @_; return $self->{'stdout'}; } =head2 stderr() Title : stderr Usage : $fac->stderr() Function: store the output from STDERR for the run, if no file is specified in run_maq() Example : Returns : scalar string Args : on set, new value (a scalar or undef, optional) =cut sub stderr { my $self = shift; return $self->{'stderr'} = shift if @_; return $self->{'stderr'}; } =head1 Bio::Tools::Run::AssemblerBase overrides =head2 _check_sequence_input() No-op. =cut sub _check_sequence_input { return 1; } =head2 _check_optional_quality_input() No-op. =cut sub _check_optional_quality_input { return 1; } =head2 _prepare_input_sequences Convert input fastq and fasta to maq format. =cut sub _prepare_input_sequences { my ($self, @args) = @_; my (%args, $read1, $read2, $refseq); if (grep /^-/, @args) { # named parms $self->throw("Input args not an even number") unless !(@args % 2); %args = @args; ($read1, $refseq, $read2) = @args{qw( -read1 -refseq -read2 )}; } else { ($read1, $refseq, $read2) = @args; } # just handle file input for now... $self->throw("maq requires at least one FASTQ read file and one FASTA reference sequence") unless (defined $read1 && defined $refseq); $self->throw("File cannot be found") unless ( -e $read1 && -e $refseq && (!defined $read2 || -e $read2) ); # maq needs its own fasta/fastq format. Use its own converters to # create tempfiles in bfa, bfq format. my ($ref_h, $ref_file, $rd1_h, $rd1_file, $rd2_h, $rd2_file); ($ref_h, $ref_file) = $self->io->tempfile( -dir => $self->tempdir() ); ($rd1_h, $rd1_file) = $self->io->tempfile( -dir => $self->tempdir() ); $ref_h->close; $rd1_h->close; my $fac = Bio::Tools::Run::Maq->new( -command => 'fasta2bfa' ); $fac->run_maq( -bfa => $ref_file, -fas => $refseq ); $fac->set_parameters( -command => 'fastq2bfq' ); $fac->run_maq( -bfq => $rd1_file, -faq => $read1 ); if (defined $read2) { ($rd2_h, $rd2_file) = $self->io->tempfile( -dir => $self->tempdir() ); $rd2_h->close; $fac->run_maq( -bfq => $rd2_file, -faq => $read2); } return ($rd1_file, $ref_file, $rd2_file); } =head2 _collate_subcmd_args() Title : _collate_subcmd_args Usage : $args_hash = $self->_collate_subcmd_args Function: collate parameters and switches into command-specific arg lists for passing to new() Returns : hash of named argument lists Args : [optional] composite cmd prefix (scalar string) [default is 'run'] =cut sub _collate_subcmd_args { my $self = shift; my $cmd = shift; my %ret; # default command is 'run' $cmd ||= 'run'; my @subcmds = @{$composite_commands{$cmd}}; my %subcmds; my $cur_options = $self->{'_options'}; # collate foreach my $subcmd (@subcmds) { # find the composite cmd form of the argument in # the current params and switches # e.g., map_max_mismatches my @params = grep /^${subcmd}_/, @{$$cur_options{'_params'}}; my @switches = grep /^${subcmd}_/, @{$$cur_options{'_switches'}}; $ret{$subcmd} = []; # create an argument list suitable for passing to new() of # the subcommand factory... foreach my $opt (@params, @switches) { my $subopt = $opt; $subopt =~ s/^${subcmd}_//; push(@{$ret{$subcmd}}, '-'.$subopt => $self->$opt) if defined $self->$opt; } } return \%ret; } =head2 _run() Title : _run Usage : $factory->_run() Function: Run a maq assembly pipeline Returns : depends on call (An assembly file) Args : - single end read file in maq bfq format - reference seq file in maq bfa format - [optional] paired end read file in maq bfq format =cut sub _run { my ($self, $rd1_file, $ref_file, $rd2_file) = @_; my ($cmd, $filespec, @ipc_args); # Get program executable my $exe = $self->executable; # treat run() as a separate command and duplicate the component-specific # parameters in the config globals # Setup needed files and filehandles first my $tdir = $self->tempdir(); my ($maph, $mapf) = $self->io->tempfile( -template => 'mapXXXX', -dir => $tdir ); #map my ($cnsh, $cnsf) = $self->io->tempfile( -template => 'cnsXXXX', -dir => $tdir ); #consensus my ($maqh, $maqf) = $self->_prepare_output_file(); my ($nm,$dr,$suf) = fileparse($maqf,".maq"); my $faqf = $dr.$nm.".cns.fastq"; $_->close for ($maph, $cnsh, $maqh); # Get command-line options for the component commands: my $subcmd_args = $self->_collate_subcmd_args(); # map reads to ref seq # set up subcommand options my $maq = Bio::Tools::Run::Maq->new( -command => 'map', @{$subcmd_args->{map}} ); $maq->run_maq( -map => $mapf, -bfa => $ref_file, -bfq1 => $rd1_file, -bfq2 => $rd2_file ); # assemble reads into consensus $maq = Bio::Tools::Run::Maq->new( -command => 'assemble', @{$subcmd_args->{asm}} ); $maq->run_maq( -cns => $cnsf, -bfa => $ref_file, -map => $mapf ); # convert map into plain text $maq = Bio::Tools::Run::Maq->new( -command => 'mapview' ); $maq->run_maq( -map => $mapf, -txt => $maqf ); # convert consensus into plain text fastq $maq = Bio::Tools::Run::Maq->new( -command => 'cns2fq', @{$subcmd_args->{c2q}} ); $maq->run_maq( -cns => $cnsf, -faq => $faqf ); return ($maqf, $faqf); } =head2 available_parameters() Title : available_parameters Usage : @cmds = $fac->available_commands('commands'); Function: Use to browse available commands, params, or switches Returns : array of scalar strings Args : 'commands' : all maq commands 'params' : parameters for this object's command 'switches' : boolean switches for this object's command 'filespec' : the filename spec for this object's command 4Geeks : Overrides Bio::ParameterBaseI via Bio::Tools::Run::AssemblerBase =cut sub available_parameters { my $self = shift; my $subset = shift; for ($subset) { # get commands !defined && do { # delegate return $self->SUPER::available_parameters($subset); }; m/^c/i && do { return grep !/^run$/, @program_commands; }; m/^f/i && do { # get file spec return @{$command_files{$self->command}}; }; do { #else delegate... return $self->SUPER::available_parameters($subset); }; } } sub available_commands { shift->available_parameters('commands') }; sub filespec { shift->available_parameters('filespec') }; 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Maq/000077500000000000000000000000001302566030400212325ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Maq/Config.pm000077500000000000000000000222121302566030400227770ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Maq::Config # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Maq::Config - Configuration data for maq commands =head1 SYNOPSIS Used internally by L. =head1 DESCRIPTION This package exports information describing maq commands, parameters, switches, and input and output filetypes for individual maq commands. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj@fortinbras.us Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Maq::Config; use strict; use warnings; no warnings qw(qw); use Bio::Root::Root; use Exporter; use base qw(Bio::Root::Root ); our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( @program_commands %command_prefixes %composite_commands @program_params @program_switches %param_translation %command_files ); @EXPORT_OK = qw(); our @program_commands = qw( run fasta2bfa fastq2bfq map mapmerge rmdup assemble indelpe indelsoa sol2sanger bfq2fastq mapass2maq mapview mapcheck pileup cns2fq cns2snp cns2view cns2ref cns2win fasta2csfa csmap2nt submap eland2maq export2maq ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # our %composite_commands = ( 'run' => [qw( map asm c2q )] ); # prefixes only for commands that take params/switches... our %command_prefixes = ( 'fastq2bfq' => 'q2q', 'map' => 'map', 'assemble' => 'asm', 'mapview' => 'mv', 'mapcheck' => 'mck', 'pileup' => 'pup', 'cns2fq' => 'c2q', 'cns2win' => 'c2w', 'submap' => 'sub', 'eland2maq' => 'l2m', 'export2maq' => 'x2m', 'run' => 'run' ); our @program_params = qw( command q2q|n map|adaptor_file map|first_read_length map|max_hits map|max_mismatches map|max_outer_distance map|max_outer_distance_rf map|mismatch_dump map|mismatch_posn_dump map|mismatch_thr map|mutation_rate map|second_read_length map|unmapped_dump asm|error_dep_coeff asm|het_fraction asm|max_mismatches asm|max_quality_sum asm|min_map_quality asm|num_haplotypes mck|max_mismatches mck|min_map_quality pup|max_mismatches pup|max_quality_vals pup|min_map_quality pup|site_input_file c2q|min_map_quality c2q|min_read_depth c2q|min_nbr_quality c2q|max_read_depth c2w|window_size c2w|ref_seq c2w|start_posn c2w|end_posn c2w|min_cons_quality sub|max_mismatches sub|max_quality_sum sub|min_map_quality l2q|def_qual x2m|max_outer_distance x2m|first_read_length x2m|second_read_length ); our @program_switches = qw( asm|single_end_quality asm|discard_wrong_pairs mv|omit_seq_qual mv|show_mismatch_posns mck|single_end_quality pup|single_end_quality pup|discard_wrong_pairs pup|verbose pup|show_base_posn sub|discard_wrong_pairs x2m|retain_filt_reads ); our %param_translation = ( 'q2q|n' => 'n', 'map|max_mismatches' => 'n', 'map|max_outer_distance' => 'a', 'map|max_outer_distance_rf' => 'A', 'map|first_read_length' => '1', 'map|second_read_length' => '2', 'map|mutation_rate' => 'm', 'map|adaptor_file' => 'd', 'map|unmapped_dump' => 'u', 'map|mismatch_thr' => 'e', 'map|mismatch_dump' => 'H', 'map|max_hits' => 'C', 'map|mismatch_posn_dump' => 'N', 'asm|error_dep_coeff' => 't', 'asm|het_fraction' => 'r', 'asm|max_mismatches' => 'm', 'asm|max_quality_sum' => 'Q', 'asm|min_map_quality' => 'q', 'asm|num_haplotypes' => 'N', 'mck|max_mismatches' => 'm', 'mck|min_map_quality' => 'q', 'pup|max_mismatches' => 'm', 'pup|max_quality_vals' => 'Q', 'pup|min_map_quality' => 'q', 'pup|site_input_file' => 'l', 'c2q|min_map_quality' => 'Q', 'c2q|min_read_depth' => 'd', 'c2q|min_nbr_quality' => 'n', 'c2q|max_read_depth' => 'D', 'c2w|window_size' => 'w', 'c2w|ref_seq' => 'c', 'c2w|start_posn' => 'b', 'c2w|end_posn' => 'e', 'c2w|min_cons_quality' => 'q', 'sub|max_mismatches' => 'm', 'sub|max_quality_sum' => 'Q', 'sub|min_map_quality' => 'q', 'l2q|def_qual' => 'q', 'x2m|max_outer_distance' => 'a', 'x2m|first_read_length' => '1', 'x2m|second_read_length' => '2', 'asm|single_end_quality' => 's', 'asm|discard_wrong_pairs' => 'p', 'mv|omit_seq_qual' => 'b', 'mv|show_mismatch_posns' => 'N', 'mck|single_end_quality' => 's', 'pup|single_end_quality' => 's', 'pup|discard_wrong_pairs' => 'p', 'pup|verbose' => 'v', 'pup|show_base_posn' => 'P', 'sub|discard_wrong_pairs' => 'p', 'x2m|retain_filt_reads' => 'n' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_maq # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'run' => [qw( faq fas faq )], 'fastq2bfq' => [qw( faq bfq )], 'fasta2bfa' => [qw( fas bfa )], 'map' => [qw( map bfa bfq1 #bfq2 2>#log )], 'mapmerge' => [qw( out_map *in_map )], 'rmdup' => [qw( out_map in_map )], 'assemble' => [qw( cns bfa map 2>#log )], 'indelpe' => [qw( bfa map >txt )], 'indelsoa' => [qw( bfa map >txt )], 'sol2sanger' => [qw( in_faq out_faq )], 'bfq2fastq' => [qw( bfq faq )], 'mapass2maq' => [qw( in_map out_map )], 'mapview' => [qw( map >txt )], 'mapcheck' => [qw( bfa map >txt )], 'pileup' => [qw( bfa map >txt )], 'cns2fq' => [qw( cns >faq )], 'cns2snp' => [qw( cns >txt )], 'cns2view' => [qw( cns >txt )], 'cns2ref' => [qw( cns >fas )], 'cns2win' => [qw( cns >txt )], 'fasta2csfa' => [qw( in_fas >out_fas )], 'csmap2nt' => [qw( out_map bfa in_map )], 'submap' => [qw( out_map in_map )], 'eland2maq' => [qw( map lis eld )], 'export2maq' => [qw( map lis xpt )] ); INIT { # add subcommand params and switches for # composite commands my @sub_params; my @sub_switches; foreach my $cmd (keys %composite_commands) { foreach my $subcmd ( @{$composite_commands{$cmd}} ) { my @sub_program_params = grep /^$subcmd\|/, @program_params; my @sub_program_switches = grep /^$subcmd\|/, @program_switches; for (@sub_program_params) { m/^$subcmd\|(.*)/; push @sub_params, "$cmd\|${subcmd}_".$1; } for (@sub_program_switches) { m/^$subcmd\|(.*)/; push @sub_switches, "$cmd\|${subcmd}_".$1; } } } push @program_params, @sub_params; push @program_switches, @sub_switches; # translations for subcmd params/switches not necessary } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Match.pm000066400000000000000000000171751302566030400221210ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Match # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Match - Wrapper for Transfac's match(TM) =head1 SYNOPSIS use Bio::Tools::Run::Match; # Make a Match factory $factory = Bio::Tools::Run::Match->new(-mxlib => '/path/to/matrix.dat'); # Run Match on an sequence object my @results = $factory->run($bio_seq); # look at the results foreach my $feat (@results) { my $seq_id = $feat->seq_id; my $start = $feat->start; my $end = $feat->end; my $score = $feat->score; my ($pvalue) = $feat->get_tag_values('pvalue'); } =head1 DESCRIPTION This is a wrapper for running the match(TM) program supplied with Transfac Pro distributions. You can try supplying normal match command-line arguments to new(), eg. new(-b => 1) or calling arg-named methods (excluding the initial hyphens, eg. $factory->b(1) to set the -b option to true). Histogram output isn't supported. -p is supported by using -mxprf, see the docs of new() for details. You will need to enable this match wrapper to find the match executable. This can be done in (at least) three ways: 1. Make sure match is in your path. 2. Define an environmental variable MATCHDIR which is a directory which contains the match executable: In bash: export MATCHDIR=/home/username/match/ In csh/tcsh: setenv MATCHDIR /home/username/match 3. Include a definition of an environmental variable MATCHDIR in every script that will use this match wrapper module, e.g.: BEGIN { $ENV{MATCHDIR} = '/home/username/match/' } use Bio::Tools::Run::Match; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Match; use strict; use Cwd; use File::Spec; use Bio::SeqIO; use Bio::FeatureIO; use Bio::Annotation::SimpleValue; use Bio::Tools::Match; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'match'; our $PROGRAM_DIR = $ENV{'MATCHDIR'}; # methods for the match args we support our @PARAMS = qw(mxlib mxprf imcut); # these aren't actually match args, but # are methods we use internally our @SWITCHES = qw(b u); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(H HH pp ppg pn png pr jkn i p); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Match->new() Function: creates a new MCS factory Returns : Bio::Tools::Run::MCS Args : The following args can either be supplied here or set by calling arg-named methods (eg. $factory->imcut(2) ). -mxlib => path to the matrix.dat file containing Transfac matricies -mxprf => path to a profile file | [core_thresh, [matrix_thresh]] (defaults to a standard one based on the mxlib provided if file not supplied, using core_thresh and matrix_thresh values if those are supplied instead) -imcut => floating point number, the importance cutoff -b | -u => boolean, mutually exclusive =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($bio_seqi_object); Function: Runs match on a sequence. Returns : list of Bio::SeqFeatureI feature objects Args : Bio::SeqI compliant object NB: mxlib has to have been set prior to calling run(), either as an argument to new() or by calling mxlib(). =cut sub run { my ($self, $seq) = @_; $self->mxlib || $self->throw("mxlib has to have been set first"); return $self->_run($seq); } sub _run { my ($self, $seq) = @_; my $exe = $self->executable || return; my $mxlib = File::Spec->rel2abs($self->mxlib()); my $mxprf_file = $self->mxprf(); if ($mxprf_file && -e $mxprf_file) { $mxprf_file = File::Spec->rel2abs($mxprf_file); } # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); # make the profile file if necessary if (! $mxprf_file || ! -e $mxprf_file) { my @thresh; if ($mxprf_file && ref($mxprf_file) eq 'ARRAY') { @thresh = @{$mxprf_file}; } $mxprf_file = 'mxprf'; system("$exe $mxlib ignored ignored $mxprf_file -p @thresh") && $self->throw("Something went wrong whist creating profile: $! | $?"); } # output the sequence to a fasta file my $seq_file = 'sequence.fa'; my $so = Bio::SeqIO->new(-file => ">$seq_file", -format => 'fasta'); $so->write_seq($seq); $so->close(); # run match my $result_file = 'out'; my $param_str = $self->_setparams(); my $cmd_line = "$exe $mxlib $seq_file $result_file $mxprf_file".$param_str; system($cmd_line) && $self->throw("Something went wrong whist running '$cmd_line': $! | $?"); # parse the results my $parser = Bio::Tools::Match->new(-file => $result_file); # correct the coords my @feats; while (my $feat = $parser->next_result) { push(@feats, $feat); } # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string = $self->SUPER::_setparams(-switches => \@SWITCHES, -dash => 1); my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Mdust.pm000066400000000000000000000242731302566030400221560ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Mdust # # Please direct questions and support issues to # # Cared for by Donald Jackson, donald.jackson@bms.com # # Copyright Donald Jackson # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Mdust - Perl extension for Mdust nucleotide filtering =head1 SYNOPSIS use Bio::Tools::Run::Mdust; my $mdust = Bio::Tools::Run::Mdust->new(); $mdust->run($bio_seq_object); =head1 DESCRIPTION Perl wrapper for the nucleic acid complexity filtering program B as available from TIGR via L. Takes a Bio::SeqI or Bio::PrimarySeqI object of type DNA as input. If a Bio::Seq::RichSeqI is passed then the low-complexity regions will be added to the feature table of the target object as Bio::SeqFeature::Generic items with primary tag = 'Excluded' . Otherwise a new target object will be returned with low-complexity regions masked (by N's or other character as specified by maskchar()). The mdust executable must be in a directory specified with either the PATH or MDUSTDIR environment variable. =head1 SEE ALSO L, L, L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Donald Jackson (donald.jackson@bms.com) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Mdust; require 5.005_62; use strict; use Bio::SeqIO; use Bio::SeqFeature::Generic; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; use vars qw($AUTOLOAD $PROGRAMNAME @ARGNAMES @MASKCHARS $VERSION @ISA); @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); @ARGNAMES = qw(TARGET WSIZE CUTOFF MASKCHAR COORDS TMPDIR DEBUG); $PROGRAMNAME = 'mdust'; @MASKCHARS = qw(N X L); =head2 new Title : new Usage : my $mdust = Bio::Tools::Run::Mdust->new( -target => $target_bioseq) Purpose : Create a new mdust object Returns : A Bio::Seq object Args : target - Bio::Seq object for masking - alphabet MUST be DNA. wsize - word size for masking (default = 3) cutoff - cutoff score for masking (default = 28) maskchar - character for replacing masked regions (default = N) coords - boolean - indicate low-complexity regions as Bio::SeqFeature::Generic objects with primary tag 'Excluded', do not change sequence (default 0) tmpdir - directory for storing temporary files debug - boolean - toggle debugging output, do not remove temporary files Notes : All of the arguments can also be get/set with their own accessors, such as: my $wsize = $mdust->wsize(); When processing multiple sequences, call Bio::Tools::Run::Mdust->new() once then pass each sequence as an argument to the target() or run() methods. =cut sub new { my ($proto, @args) = @_; my $pkg = ref($proto) || $proto; my %args; my $self = { wsize => undef, cutoff => undef, maskchar => undef, coords => 0, }; bless ($self, $pkg); @args{@ARGNAMES} = $self->_rearrange(\@ARGNAMES, @args); # load target first since it requires special handling $self->target($args{'TARGET'}) if ($args{'TARGET'}); # package settings $self->{'coords'} = $args{'COORDS'} if (defined $args{'COORDS'}); $self->{'tmpdir'} = $args{'TMPDIR'} || $ENV{'TMPDIR'} || $ENV{'TMP'} || '.'; # mdust options $self->{'wsize'} = $args{'WSIZE'} if (defined $args{'WSIZE'}); $self->{'cutoff'} = $args{'CUTOFF'} if (defined $args{'CUTOFF'}); $self->{'maskchar'} = $args{'MASKCHAR'} if (defined $args{'CUTOFF'}); # set debugging $self->verbose($args{'DEBUG'}); return $self; } =head2 run Title : run Usage : $mdust->run(); Purpose : Run mdust on the target sequence Args : target (optional) - Bio::Seq object of alphabet DNA for masking Returns : Bio::Seq object with masked sequence or low-complexity regions added to feature table. =cut sub run { my ($self, $target) = @_; if ($target) { $self->target($target); } return $self->_run_mdust; } sub program_dir { return Bio::Root::IO->catfile($ENV{MDUSTDIR}) if $ENV{MDUSTDIR}; } sub program_name { return $PROGRAMNAME; } sub _run_mdust { # open a pipe to the mdust command. Pass in sequence(s?) as fasta # files on STDIN, recover filtered seqs on STDOUT my ($self) = @_; my $target = $self->target or warn "No target sequence specified\n" && return undef; # make sure program is available - doesn't seem to check #my $executable = $self->executable('mdust', 1); # add options my $mdust_cmd = $self->program_path; $mdust_cmd .= " -w " . $self->wsize if (defined $self->wsize); $mdust_cmd .= " -v " . $self->cutoff if (defined $self->cutoff); $mdust_cmd .= " -m " . $self->maskchar if (defined $self->maskchar); $mdust_cmd .= " -c" if ($self->coords); print STDERR "Running mdust: $mdust_cmd\n" if ($self->debug); my $maskedfile = $self->_maskedfile; eval { my $pid = open (MDUST, "| $mdust_cmd > $maskedfile"); # bind STDIN of mdust to filehandle local $| = 1; my $seqout = Bio::SeqIO->new(-fh => \*MDUST, -format => 'Fasta'); $seqout->write_seq($target); close MDUST; # need to do this to get output to flush! }; $self->throw($@) if ($@); my $rval; if ($self->coords) { $self->_parse_coords($maskedfile); $rval = $self->target; } else { # replace original seq w/ masked seq my $seqin = Bio::SeqIO->new(-file=>$maskedfile, -format => 'Fasta'); $rval = $seqin->next_seq } unlink $maskedfile unless $self->save_tempfiles; return $rval; } =head2 target Title : target Usage : $mdust->target($bio_seq) Purpose : Set/get the target (sequence to be filtered). Returns : Target Bio::Seq object Args : Bio::SeqI or Bio::PrimarySeqI object using the DNA alphabet (optional) Note : If coordinate parsing is selected ($mdust->coords = 1) then target MUST be a Bio::Seq::RichSeqI object. Passing a RichSeqI object automatically turns on coordinate parsing. =cut sub target { my ($self, $targobj) = @_; if ($targobj) { return $self->_set_target($targobj); } else { return $self->{'target'}; } } sub _set_target { my ($self, $targobj) = @_; unless ($targobj->isa('Bio::SeqI') or ($targobj->isa('Bio::PrimarySeqI'))) { $self->throw( -text => "Target must be passed as a Bio::SeqI or Bio::PrimarySeqI object", -class => 'Bio::Root::BadParameter', -value => $targobj ); } if ($self->coords) { unless ($targobj->isa('Bio::Seq::RichSeqI')) { $self->throw( -text => "Target must be passed as a Bio::Seq::RichSeqSeqI object when coords == 1", -class => 'Bio::Root::BadParameter', -value => $targobj ); } } elsif ($targobj->isa('Bio::Seq::RichSeqI')) { $self->coords(1); } unless ($targobj->alphabet eq 'dna') { $self->throw( -text => "Target must be a DNA sequence", -class => 'Bio::Root::BadParameter', -value => $targobj ); } $self->{'target'} = $targobj; return 1; } sub _maskedfile { my ($self, $file) = @_; my $tmpdir = $self->tempdir; if ($file) { $self->{'maskedfile'} = $file; # add some sanity chex for writability? } elsif (!$self->{'maskedfile'}) { ($self->{'maskedfh'},$self->{'maskedfile'}) = $self->io->tempfile(-dir=>$self->tempdir()); } return $self->{'maskedfile'}; } sub _parse_coords { my ($self, $file) = @_; my $target = $self->target; open(FILE, $file) or die "Unable to open $file: $!"; while () { chomp; s/\r//; my ($seq, $length, $mstart, $mstop) = split(/\t/); # add masked region as a SeqFeature in target my $masked = Bio::SeqFeature::Generic->new( -start => $mstart, -end => $mstop, ); $masked->primary_tag('Excluded'); $masked->source_tag('mdust'); $target->add_SeqFeature($masked); } return 1; } =head2 maskchar Title : maskchar Usage : $mdust->maskchar('N') Purpose : Set/get the character for masking low-complexity regions Returns : True on success Args : Either N (default), X or L (lower case) =cut sub maskchar { my ($self, $maskchar) = @_; return $self->{'maskchar'} unless (defined $maskchar); unless ( grep {$maskchar eq $_} @MASKCHARS ) { $self->throw( -text => "maskchar must be one of N, X or L", -class => 'Bio::Root::BadParameter', -value => $maskchar ); } $self->{'maskchar'} = $maskchar; 1; } sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } sub AUTOLOAD { my ($self, $value) = @_; my $name = $AUTOLOAD; $name =~ s/.+:://; return if ($name eq 'DESTROY'); if (defined $value) { $self->{$name} = $value; } unless (exists $self->{$name}) { warn "Attribute $name not defined for ", ref($self), "\n" if ($self->debug); return undef; } return $self->{$name}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Meme.pm000066400000000000000000000274101302566030400217410ustar00rootroot00000000000000# BioPerl module for Meme # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Meme - Wrapper for Meme Program =head1 SYNOPSIS use Bio::Tools::Run::Meme; my $factory = Bio::Tools::Run::Meme->new(-dna => 1, -mod => 'zoops'); # return a Bio::AlignIO given Bio::PrimarySeqI objects my $alignio = $factory->run($seq1, $seq2, $seq3...); # add a Bio::Map::Prediction to the appropriate maps given Bio::Map::GeneMap # objects (predict on the full map sequences supplied) or Bio::Map::Gene # objects (predict on the full map sequences of the maps the supplied Genes # are on) or Bio::Map::PositionWithSequence objects my $prediction = $factory->run($biomap1, $biomap2, $biomap3...); =head1 DESCRIPTION This is a wrapper for running meme, a transcription factor binding site prediction program. It can be found here: http://meme.sdsc.edu/meme4/meme-download.html You can try supplying normal meme command-line arguments to new(), eg. new(-mod => 'oops') or calling arg-named methods (excluding the initial hyphen(s), eg. $factory->mod('oops') to set the -mod option to 'oops'). You will need to enable this MEME wrapper to find the meme program. During standard installation of meme you will have set up an environment variable called MEME_BIN which is used for this purpose. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut package Bio::Tools::Run::Meme; use strict; use Bio::SeqIO; use Bio::AlignIO; use Bio::Map::Prediction; use Bio::Map::Position; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'meme'; our $PROGRAM_DIR = $ENV{'MEME_BIN'}; # methods for the meme args we support our @PARAMS = qw(mod nmotifs evt nsites minsites maxsites wnsites w minw maxw wg ws bfile maxiter distance prior b plib spfuzz spmap cons maxsize p time sf); our @SWITCHES = qw(dna protein nomatrim noendgaps revcomp pal); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(h text nostatus); =head2 new Title : new Usage : $rm->new($seq) Function: creates a new wrapper Returns: Bio::Tools::Run::Meme Args : Most options understood by meme can be supplied as key => value pairs, with a boolean value for switches. -quiet can also be set to silence meme completely. These options can NOT be used with this wrapper (they are handled internally or don't make sense in this context): -h -text -nostatus =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } =head2 version Title : version Usage : n/a Function: Determine the version number of the program, which is non-discoverable for Meme Returns : undef Args : none =cut sub version { return; } =head2 run Title : run Usage : $rm->run($seq1, $seq2, $seq3...); Function: Run Meme on the sequences/Bio::Map::* set as the argument Returns : Bio::AlignIO if sequence objects supplied, OR Bio::Map::Prediction if Bio::Map::* objects supplied undef if no executable found Args : list of Bio::PrimarySeqI compliant objects, OR list of Bio::Map::GeneMap objects, OR list of Bio::Map::Gene objects, OR list of Bio::Map::PositionWithSequence objects =cut sub run { my ($self, @things) = @_; my $infile = $self->_setinput(@things); return $self->_run($infile); } =head2 _run Title : _run Usage : $rm->_run ($filename,$param_string) Function: internal function that runs meme Returns : as per run(), undef if no executable found Args : the filename to the input sequence file =cut sub _run { my ($self, $infile) = @_; my $exe = $self->executable || return; my $outfile = $infile.".out"; my $command = $exe.$self->_setparams($infile, $outfile); $self->debug("meme command = $command\n"); open(my $pipe, "$command |") || $self->throw("meme call ($command) failed to start: $? | $!"); my $error = ''; while (<$pipe>) { print unless $self->quiet; $error .= $_; } close($pipe) || ($error ? $self->throw("meme call ($command) failed: $error") : $self->throw("meme call ($command) crashed: $?")); #my $status = system($cmd_str); #$self->throw("Meme call ($cmd_str) crashed: $?\n") unless $status == 0; my $aio = Bio::AlignIO->new(-format => 'meme', -file => $outfile); unless ($self->{map_mode}) { # return directly the AlignIO return $aio; } else { # use the AlignIO meme parser to generate a Bio::Map::Prediction and # return that my $pred = Bio::Map::Prediction->new(-source => "meme"); while (my $aln = $aio->next_aln) { foreach my $seq ($aln->each_seq) { my $id = $seq->id; unless ($id) { $self->warn("Got a sequence in the alignment with no id, but I need one to determine the map"); next; } my ($uid) = $id =~ /^([^\[]+)/; my $map = Bio::Map::GeneMap->get(-uid => $uid); my ($start, $end) = ($seq->start, $seq->end); if ($seq->strand == -1) { my $length; my ($pos_s, $pos_e) = $id =~ /\[(\d+)\.\.(\d+)\]$/; if (defined($pos_s) && defined($pos_e)) { $length = $pos_e - $pos_s + 1; } else { $length = length($map->seq); } my $motif_length = $end - $start + 1; $end = $length - $start + 1; $start = $end - $motif_length + 1; } Bio::Map::Position->new(-element => $pred, -start => $start, -end => $end, -map => $map); } } delete $self->{map_mode}; return $pred; } } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for meme program Returns : parameter string to be passed to meme Args : none =cut sub _setparams { my ($self, $infile, $outfile) = @_; my $param_string = ' '.$infile; # -text and -nostatus must be set $param_string .= ' -text -nostatus'; $param_string .= $self->SUPER::_setparams(-params => \@PARAMS, -switches => \@SWITCHES, -dash => 1); $param_string .= " > $outfile"; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null" if $self->quiet || $self->verbose < 0; return $param_string; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Returns : string (file name) Args : as per run() =cut sub _setinput { my ($self, @inputs) = @_; $self->throw("At least two sequence or map objects must be supplied") unless @inputs >= 2; ref($inputs[0]) || $self->throw("Inputs must be object references"); my ($fh, $outfile) = $self->io->tempfile(-dir => $self->tempdir); my $out = Bio::SeqIO->new(-fh => $fh, '-format' => 'fasta'); my %done; foreach my $input (@inputs) { if ($input->isa('Bio::Map::MappableI')) { # we want to work on all its maps, since mappables themselves don't # have sequences push(@inputs, $input->known_maps); next; } $input->can('seq') || $self->throw("Supplied an input [$input] with no seq() method!"); if ($input->isa('Bio::Map::EntityI')) { $self->{map_mode} = 1; if ($input->isa('Bio::Map::MapI')) { # change the id of the seq so we'll know what input object it # came from later my $id = $input->unique_id; next if $done{$id}; $input->id($id); $done{$id} = 1; #*** should this be automatic in GeneMap? Anyway, we don't want # to alter users genemap id here permanently... } else { my $id = $input->id; unless ($id) { $input->id($input->map->unique_id.'['.$input->toString.']'); } } } $out->write_seq($input); } close($fh); return $outfile; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Minimo.pm000066400000000000000000000213231302566030400223030ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Minimo # # Copyright Florent E Angly # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Minimo - Wrapper for local execution of the Minimo assembler =head1 SYNOPSIS use Bio::Tools::Run::Minimo; # Run Minmo using an input FASTA file my $factory = Bio::Tools::Run::Minimo->new( -minimum_overlap_length => 35 ); my $asm_obj = $factory->run($fasta_file, $qual_file); # An assembly object is returned by default for my $contig ($assembly->all_contigs) { ... do something ... } # Read some sequences use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta'); my @seqs; while (my $seq = $sio->next_seq()) { push @seqs,$seq; } # Run Minimo using input sequence objects and returning an assembly file my $asm_file = 'results.ace'; $factory->out_type($asm_file); $factory->run(\@seqs); =head1 DESCRIPTION Wrapper module for the local execution of the DNA assembly program Minimo. Minimo is based on AMOS (http://sourceforge.net/apps/mediawiki/amos/) and implements the same conservative assembly algorithm as Minimus (http://sourceforge.net/apps/mediawiki/amos/index.php?title=Minimus). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Florent E Angly Email: florent-dot-angly-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Minimo; use strict; use IPC::Run; use File::Copy; use File::Spec; use File::Basename; use base qw( Bio::Root::Root Bio::Tools::Run::AssemblerBase ); our $program_name = 'Minimo'; # name of the executable our @program_params = (qw( qual_in good_qual bad_qual min_len min_ident aln_wiggle out_prefix ace_exp )); our @program_switches; our %param_translation = ( 'qual_in' => 'D QUAL_IN', 'good_qual' => 'D GOOD_QUAL', 'bad_qual' => 'D BAD_QUAL', 'min_len' => 'D MIN_LEN', 'min_ident' => 'D MIN_IDENT', 'aln_wiggle' => 'D ALN_WIGGLE', 'out_prefix' => 'D OUT_PREFIX', 'ace_exp' => 'D ACE_EXP' ); our $qual_param = 'qual_in'; our $use_dash = 1; our $join = '='; our $asm_format = 'ace'; =head2 new Title : new Usage : $assembler->new( -min_len => 50, -min_ident => 95 ); Function: Creates a Minimo factory Returns : A Bio::Tools::Run::Minimo object Args : Minimo options available in this module: qual_in Input quality score file good_qual Quality score to set for bases within the clear range if no quality file was given (default: 30) bad_qual Quality score to set for bases outside clear range if no quality file was given (default: 10). If your sequences are trimmed, try the same value as GOOD_QUAL. min_len / minimum_overlap_length Minimum contig overlap length (between 20 and 100 bp, default: 35) min_ident / minimum_overlap_similarity Minimum contig overlap identity percentage (between 0 and 100 %, default: 98) aln_wiggle Alignment wiggle value when determining the consensus sequence (default: 2 bp) out_prefix Prefix to use for the output file path and name =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); *minimum_overlap_length = \&min_len; *minimum_overlap_similarity = \&min_ident; $self->program_name($program_name) if not defined $self->program_name(); $self->_assembly_format($asm_format); return $self; } =head2 out_type Title : out_type Usage : $factory->out_type('Bio::Assembly::ScaffoldI') Function: Get/set the desired type of output Returns : The type of results to return Args : Desired type of results to return (optional): 'Bio::Assembly::IO' object 'Bio::Assembly::ScaffoldI' object (default) The name of a file to save the results in =cut =head2 run Title : run Usage : $factory->run($fasta_file); Function: Run TIGR Assembler Returns : - a Bio::Assembly::ScaffoldI object, a Bio::Assembly::IO object, a filename, or undef if all sequences were too small to be usable Returns : Assembly results (file, IO object or assembly object) Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut =head2 _run Title : _run Usage : $factory->_run() Function: Make a system call and run TIGR Assembler Returns : An assembly file Args : - FASTA file - optional QUAL file =cut sub _run { my ($self, $fasta_file, $qual_file) = @_; # qual_in Input quality score file # fasta_exp Export results in FASTA format (0:no 1:yes, default: 1) # ace_exp Export results in ACE format (0:no 1:yes, default: 1) # Specify that we want an ACE output file $self->ace_exp(1); # Setup needed files and filehandles first my ($output_fh, $output_file) = $self->_prepare_output_file( ); my ($stdout_fh, $stdout_file) = $self->io->tempfile( -dir => $self->tempdir() ); # Get program executable my $exe = $self->executable; # Get command-line options my $options = $self->_translate_params(); # Usage: Minimo FASTA_IN [options] # Options are of the style: -D PARAM=VAL my @program_args = ( $exe, $fasta_file, @$options); my @ipc_args = ( \@program_args, '>', $stdout_file); # Print command for debugging if ($self->verbose() >= 0) { my $cmd = ''; $cmd .= join ( ' ', @program_args ); for ( my $i = 1 ; $i < scalar @ipc_args ; $i++ ) { my $element = $ipc_args[$i]; my $ref = ref($element); my $value; if ( $ref && $ref eq 'SCALAR') { $value = $$element; } else { $value = $element; } $cmd .= " $value"; } $self->debug( "$exe command = $cmd\n" ); } # Execute command my $log_file = "$fasta_file.runAmos.log"; eval { IPC::Run::run(@ipc_args) || die("There was a problem running $exe. The ". "error message is: $!. Check the log file $log_file for possible causes."); }; if ($@) { $self->throw("$exe call crashed: $@"); } # Close filehandles close($output_fh); close($stdout_fh); # Result files my $base = $self->out_prefix(); if (not defined $base) { my $dirname = dirname($fasta_file); my $basename = basename($fasta_file); $basename =~ s/^(.+)\..+$/$1/; $base = File::Spec->catfile($dirname, $basename); } my $ace_file = "$base-contigs.ace"; my $amos_file = "$base-contigs.afg"; # Remove all files except for the ACE file for my $file ($log_file, $stdout_file, $amos_file) { unlink $file; } # Clean the ACE file $self->_clean_file($ace_file); # Move the ACE file to its final destination move ($ace_file, $output_file) or $self->throw("Could not move file ". "'$ace_file' to '$output_file': $!"); return $output_file; } =head2 _clean_file Title : _clean_file Usage : $factory->_clean_file($file) Function: Clean file in place by removing NULL characters. NULL characters can be present in the output files of AMOS 2.0.8 but they do not validate as proper sequence characters in Bioperl. Returns : 1 for success Args : Filename =cut sub _clean_file { my ($self, $file) = @_; # Set in-place file editing mode local $^I = "~"; local @ARGV = ( $file ); # Replace lines in file while (<>) { s/\x0//g; print; } return 1; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Newbler.pm000066400000000000000000000331221302566030400224510ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Newbler # # Copyright Florent E Angly # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Newbler - Wrapper for local execution of Newbler =head1 SYNOPSIS use Bio::Tools::Run::Newbler; # Run Minmo using an input FASTA file my $factory = Bio::Tools::Run::Newbler->new( -minimum_overlap_length => 35 ); my $asm_obj = $factory->run($fasta_file, $qual_file); # An assembly object is returned by default for my $contig ($assembly->all_contigs) { ... do something ... } # Read some sequences use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta'); my @seqs; while (my $seq = $sio->next_seq()) { push @seqs,$seq; } # Run Newbler using input sequence objects and returning an assembly file my $asm_file = 'results.ace'; $factory->out_type($asm_file); $factory->run(\@seqs); =head1 DESCRIPTION Wrapper module for the local execution of the proprietary DNA assembly program GS De Novo Assembler (Newbler) from Roche/454 v2.0.00.20: http://www.454.com/products-solutions/analysis-tools/gs-de-novo-assembler.asp =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Florent E Angly Email: florent-dot-angly-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Newbler; use strict; use IPC::Run; use File::Copy; use File::Path; use File::Spec; use File::Basename; use base qw( Bio::Root::Root Bio::Tools::Run::AssemblerBase ); our $program_name = 'runAssembly'; # name of the executable our @program_params = (qw( expected_depth mid_conf_file vector_trim vector_screen aln_identity_score aln_difference_score min_ovl_identity min_ovl_length seed_count seed_length seed_step out_dir )); our @program_switches = (qw( large ace ace_raw ace_trimmed no_trim in_memory no_auto_rescore no_duplicates )); our %param_translation = ( 'large' => 'large', 'ace' => 'ace', 'ace_raw' => 'ar', 'ace_trimmed' => 'at', 'expected_depth' => 'e', 'mid_conf_file' => 'mcf', 'no_trim' => 'notrim', 'vector_trim' => 'vt', 'vector_screen' => 'vs', 'aln_identity_score' => 'ais', 'aln_difference_score'=> 'ads', 'in_memory' => 'm', 'min_ovl_identity' => 'mi', 'min_ovl_length' => 'ml', 'no_auto_rescore' => 'nor', 'seed_count' => 'sc', 'seed_length' => 'sl', 'seed_step' => 'ss', 'no_duplicates' => 'ud', 'out_dir' => 'o' ); our $qual_param; our $use_dash = 1; our $join = ' '; our $asm_format = 'ace'; our $asm_variant = '454'; =head2 new Title : new Usage : $assembler->new( -min_len => 50, -min_ident => 95 ); Function: Creates a Newbler factory Returns : A Bio::Tools::Run::Newbler object Args : Newbler options available in this module (from the Newbler manual): large Shortcut some of the computationally expensive algorithms to save some time. Useful for large or complex datasets (default: off). ace_raw Output the full "raw" read sequence (default: off). ace_trimmed Output only the "trimmed" sequences (after low quality, vector and key trimming) (default: on). expected_depth Expected depth of the assembly. Filters out random-chance level events at bigger depths. 0 means to not use the expected depth information (default: 0). mid_conf_file MID configuration file for decoding the multiplex data. no_trim Disable the quality and primer trimming of the input sequences (default: off). vector_trim Specify a vector trimming database (in FASTA format) to trim the ends of input sequences. vector_screen Specify a vector screening database (in FASTA format) to remove contaminants, i.e. input reads that align against a sequence in the database. aln_identity_score Set the alignment identity score. When multiple alignments are found, it is the per-overlap column identity score used to sort the overlaps for use in the progressive alignment (default: 2). aln_difference_score Set the alignment difference score. For multiple alignments this is the per-overlap difference score used to sort the overlaps for use in the progressive multi-alignment (default: -3). in_memory Keep all sequence data in memory throughout the computation. Can speed up the computation but requires more computer memory (default: off). min_ovl_identity / minimum_overlap_similarity Minimum overlap identity, i.e. the minimum percent identity of overlaps used by the assembler (default: 40). min_ovl_length / minimum_overlap_length Minimum overlap length, i.e. the minimum length of overlaps considered by the assembler (default: 90). Warning: It seems like this parameter is not respected by the program in the current version no_auto_rescore Do not use the quality score re-scoring algorithm (default: off). seed_count Set the seed count parameter, the number of seeds required in a window before an extension is made (default: 1). seed_length Set the seed length parameter, i.e. the number of bases between seed generation locations used in the exact k-mer matching part of the overlap detection (between 6 16) (default: 16). seed_step Set the seed step parameter, i.e. the number of bases used for each seed in the exact k-mer matching part of the overlap detection (i.e. the "k" value) (default: 12). no_duplicates Treat each read as a separate read and do not group them into duplicates for assembly or consensus calling (default: off). =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); *minimum_overlap_length = \&min_ovl_length; *minimum_overlap_similarity = \&min_ovl_identity; $self->program_name($program_name) if not defined $self->program_name(); $self->_assembly_format($asm_format); $self->_assembly_variant($asm_variant); return $self; } =head2 _check_sequence_input Title : _check_sequence_input Usage : $assembler->_check_sequence_input($seqs) Function: Check that the sequence input is arrayref of sequence objects or a FASTA file, or a MIDinfo + dir, or a MIDinfo + file. If not, an error is thrown. Returns : 1 if the check passed Args : sequence input =cut sub _check_sequence_input { my ($self, $seqs) = @_; if (not $seqs) { $self->throw("Must supply sequences as a FASTA filename or a sequence object". " (Bio::PrimarySeqI or Bio::SeqI) array reference"); } else { if (ref($seqs) =~ m/ARRAY/i ) { for my $seq (@$seqs) { unless ($seq->isa('Bio::PrimarySeqI') || $seq->isa('Bio::SeqI')) { $self->throw("Not a valid Bio::PrimarySeqI or Bio::SeqI object"); } } } else { # [midinfo@]sffile|[midinfo@]projectdir|fastafile my ($mid, $file_or_dir) = ($seqs =~ m/^(.+@)?(.+)$/); if (not defined $file_or_dir) { $self->throw("Input string $seqs does not seem valid."); } else { if (not -e $file_or_dir) { $self->throw("Input file or directory '$file_or_dir' does not seem to exist."); } } } } return 1; } =head2 out_type Title : out_type Usage : $factory->out_type('Bio::Assembly::ScaffoldI') Function: Get/set the desired type of output Returns : The type of results to return Args : Desired type of results to return (optional): 'Bio::Assembly::IO' object 'Bio::Assembly::ScaffoldI' object (default) The name of a file to save the results in =cut =head2 run Title : run Usage : $factory->run($fasta_file); Function: Run TIGR Assembler Returns : - a Bio::Assembly::ScaffoldI object, a Bio::Assembly::IO object, a filename, or undef if all sequences were too small to be usable Returns : Assembly results (file, IO object or assembly object) Args : Sequence input can be: * a sequence object arrayref * a FASTA file * a SFF file and optional MID information. Example: mid2@/home/xxx/myreads.sff * the path to an run analysis directory and MID information The reads must be between 50 and 2000 bp. Newbler does not support for input quality files. See the Newbler manual for details. =cut =head2 _run Title : _run Usage : $factory->_run() Function: Make a system call and run TIGR Assembler Returns : An assembly file Args : - FASTA file, SFF file and MID, or analysis dir and MID - optional QUAL file =cut sub _run { my ($self, $fasta_file, $qual_file) = @_; # fasta_file: [midinfo@]sffile|[midinfo@]projectdir|fastafile # qual_file: not supported by newbler # Specify that we want a single ACE output file containing all contigs $self->ace(1); # Setup needed files and filehandles first my ($output_fh, $output_file) = $self->_prepare_output_file( ); # Set the output directory based on the the output file name my $output_dir = dirname($output_file); $self->out_dir($output_dir); # Set a log file my $log_file = File::Spec->catfile($output_dir, '454Log.txt'); # Get program executable my $exe = $self->executable; # Get command-line options my $options = $self->_translate_params(); # Usage: runAssembly [options] (sfffile | [regionlist:]analysisDir | readfastafile)... # where options is: [-o projdir] [-nrm] [-p (sfffile | [regionlist:]analysisDir)]... my @program_args = ( $exe, @$options, $fasta_file); my @ipc_args = ( \@program_args, '>', $log_file ); # Print command for debugging if ($self->verbose() >= 0) { my $cmd = ''; $cmd .= join ( ' ', @program_args ); for ( my $i = 1 ; $i < scalar @ipc_args ; $i++ ) { my $element = $ipc_args[$i]; my $ref = ref($element); my $value; if ( $ref && $ref eq 'SCALAR') { $value = $$element; } else { $value = $element; } $cmd .= " $value"; } $self->debug( "$exe command = $cmd\n" ); } # Execute command eval { IPC::Run::run(@ipc_args) || die("There was a problem running $exe. The ". "error message is: $!."); }; if ($@) { $self->throw("$exe call crashed: $@"); } # Close filehandles close($output_fh); # Result files my $ace_file = File::Spec->catfile($output_dir, '454Contigs.ace'); my $aln_file = File::Spec->catfile($output_dir, '454AlignmentInfo.tsv'); my $all_cont_fasta_file = File::Spec->catfile($output_dir, '454AllContigs.fna'); my $all_cont_qual_file = File::Spec->catfile($output_dir, '454AllContigs.qual'); my $large_cont_fasta_file = File::Spec->catfile($output_dir, '454LargeContigs.fna'); my $large_cont_qual_file = File::Spec->catfile($output_dir, '454LargeContigs.qual'); my $metrics_file = File::Spec->catfile($output_dir, '454NewblerMetrics.txt'); my $status_file = File::Spec->catfile($output_dir, '454ReadStatus.txt'); my $progress_file = File::Spec->catfile($output_dir, '454NewblerProgress.txt'); my $trim_file = File::Spec->catfile($output_dir, '454TrimStatus.txt'); my $sff_dir = File::Spec->catfile($output_dir, 'sff'); # Remove all files except for the ACE file for my $file ($aln_file, $all_cont_fasta_file, $all_cont_qual_file, $large_cont_fasta_file, $large_cont_qual_file, $metrics_file, $status_file, $progress_file, $trim_file, $log_file ) { unlink $file; } rmtree( $sff_dir ); # Move output file to proper location/name move ($ace_file, $output_file) or $self->throw("Could not move file ". "'$ace_file' to '$output_file': $!"); return $output_file; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phrap.pm000077500000000000000000000321701302566030400221320ustar00rootroot00000000000000# # Phrap wraper module # # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phrap - a wrapper for running Phrap =head1 SYNOPSIS use Bio::Tools::Run::Phrap; # Run Phrap using an input FASTA file my $factory = Bio::Tools::Run::Phrap->new( -penalty => -2, -raw => 1 ); my $asm_obj = $factory->run($fasta_file, $qual_file); # An assembly object is returned by default for my $contig ($assembly->all_contigs) { ... do something ... } # Read some sequences use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta'); my @seqs; while (my $seq = $sio->next_seq()) { push @seqs,$seq; } # Run Phrap using input sequence objects and returning an assembly file my $asm_file = 'results.phrap'; $factory->out_type($asm_file); $factory->run(\@seqs); =head1 DESCRIPTION Wrapper module for the Phrap assembly program Phrap is available at: http://www.phrap.org/ =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh-at-stanford.edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phrap; use strict; use File::Copy; use base qw(Bio::Root::Root Bio::Tools::Run::AssemblerBase); our $program_name = 'phrap'; our @program_params = (qw(penalty gap_init gap_ext ins_gap_ext del_gap_ext matrix minmatch maxmatch max_group_size bandwidth minscore vector_bound masklevel default_qual subclone_delim n_delim group_delim trim_start forcelevel bypasslevel maxgap repeat_stringency node_seg node_space max_subclone_size trim_penalty trim_score trim_qual confirm_length confirm_trim confirm_penalty confirm_score indexwordsize)); our @program_switches = (qw(raw word_raw revise_greedy shatter_greedy preassemble force_high retain_duplicates)); our %param_translation; our $qual_param; our $use_dash = 1; our $join = ' '; our $asm_format = 'phrap'; =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phrap->new( -penalty => -2, # parameter option and value -raw => 1 # flag (1=yes, 0=no) ); Function: Create a new Phrap factory Returns : A Bio::Tools::Run::Phrap object Args : Phrap options available in this module: Option names & default values taken from the PHRAP manual: 1. Scoring of pairwise alignments -penalty -2 Mismatch (substitution) penalty for SWAT comparisons. -gap_init penalty-2 Gap initiation penalty for SWAT comparisons. -gap_ext penalty-1 Gap extension penalty for SWAT comparisons. -ins_gap_ext gap_ext Insertion gap extension penalty for SWAT comparisons (insertion in subject relative to query). -del_gap_ext gap_ext Deletion gap extension penalty for SWAT comparisons (deletion in subject relative to query). -matrix [None] Score matrix for SWAT comparisons (if present, supersedes -penalty) -raw * Use raw rather than complexity-adjusted Smith-Waterman scores. 2. Banded search -maxmatch 30 Maximum length of matching word. For cross_match, the default value is equal to minmatch, instead of 30. -max_group_size 20 Group size (query file, forward strand words) -word_raw * Use raw rather than complexity-adjusted word length, in testing against minmatch (N.B. maxmatch always refer to raw lengths). (The default is to adjust word length to reflect complexity of matching sequence). -bandwidth 14 1/2 band width for banded SWAT searches (full width is 2 times bandwidth + 1). Decreasing bandwidth also decreases running time at the expense of sensitivity. Phrap assemblies of clones containing long tandem repeats of a short repeat unit (< 30 bp) may be more accurately assembled by decreasing -bandwidth; -bandwidth should be set such that 2 bandwidth + 1 is less than the length of a repeat unit. -bandwidth 0 can be used to find gap-free alignments. 3. Filtering of matches -minscore 30 Minimum alignment score. -vector_bound 80 Number of potential vector bases at beginning of each read. Matches that lie entirely within this region are assumed to represent vector matches and are ignored. For cross_match, the default value is 0 instead of 80. -masklevel 80 (cross_match only). A match is reported only if at least (100 - masklevel)% of the bases in its "domain" (the part of the query that is aligned) are not contained within the domain of any higher-scoring match. Special cases: -masklevel 0 report only the single highest scoring match for each query -masklevel 100 report any match whose domain is not completely contained within a higher scoring match -masklevel 101 report all matches 4. Input data interpretation -default_qual 15 Quality value to be used for each base, when no input .qual file is provided. Note that a quality value of 15 corresponds to an error rate of approximately 1 in 30 bases, i.e. relatively accurate sequence. If you are using sequence that is substantially less accurate than this and do not have phred-generated quality values you should be sure to decrease the value of this parameter. -subclone_delim . (phrap only). Subclone name delimiter: Character used to indicate end of that part of the read name that corresponds to the subclone name -n_delim 1 (phrap only). Indicates which occurrence of the subclone delimiter character denotes the end of the subclone name (so for example -subclone_delim _ -n_delim 2 means that the end of the subclone name occurs at the second occurrence of the character '_'). Must be the same for all reads! -group_delim _ (phrap only). Group name delimiter: Character used to indicate end of that part of the read name that corresponds to the group name (relevant only if option -preassemble is used); this character must occur before the subclone delimiter (else it has no effect, and the read is not assigned to a group). -trim_start 0 (phrap only). No. of bases to be removed at beginning of each read. 5. Assembly -forcelevel 0 (phrap only). Relaxes stringency to varying degree during final contig merge pass. Allowed values are integers from 0 (most stringent) to 10 (least stringent), inclusive. -bypasslevel 1 (phrap only). Controls treatment of inconsistent reads in merge. Currently allowed values are 0 (no bypasses allowed; most stringent) and 1 (a single conflicting read may be bypassed). -maxgap 30 (phrap only). Maximum permitted size of an unmatched region in merging contigs, during first (most stringent) merging pass. -repeat_stringency .95 (phrap only). Controls stringency of match required for joins. Must be less than 1 (highest stringency), and greater than 0 (lowest stringency). -revise_greedy * (phrap only). Splits initial greedy assembly into pieces at "weak joins", and then tries to reattach them to give higher overall score. Use of this option should correct some types of missassembly. -shatter_greedy * (phrap only). Breaks assembly at weak joins (as with -revise_greedy) but does not try to reattach pieces. -preassemble * (phrap only). Preassemble reads within groups, prior to merging with other groups. This is useful for example when the input data set consists of reads from two distinct but overlapping clones, and it is desired to assemble the reads from each clone separately before merging in order to reduce the risk of incorrect joins due to repeats. The preassemble merging pass is relatively stringent and not guaranteed to merge all of the reads from a group. Groups are indicated by the first part of the read name, up to the character specified by -group_delim. -force_high * (phrap only). Causes edited high-quality discrepancies to be ignored during final contig merge pass. This option may be useful when it is suspected that incorrect edits are causing a misassembly. 6. Consensus sequence construction -node_seg 8 (phrap only). Minimum segment size (for purposes of traversing weighted directed graph). -node_space 4 (phrap only). Spacing between nodes (in weighted directed graph). 7. Output Not implemented in this Perl module. 8. Miscellaneous -retain_duplicates * (phrap only). Retain exact duplicate reads, rather than eliminating them. -max_subclone_size 5000 (phrap only). Maximum subclone size -- for forward-reverse read pair consistency checks. -trim_penalty -2 (phrap only). Penalty used for identifying degenerate sequence at beginning & end of read. -trim_score 20 (phrap only). Minimum score for identifying degenerate sequence at beginning & end of read. -trim_qual 13 (phrap only). Quality value used in to define the "high-quality" part of a read, (the part which should overlap; this is used to adjust qualities at ends of reads. -confirm_length 8 (phrap only). Minimum size of confirming segment (segment starts at 3d distinct nuc following discrepancy). -confirm_trim 1 (phrap only). Amount by which confirming segments are trimmed at edges. -confirm_penalty -5 (phrap only). Penalty used in aligning against "confirming" reads. -confirm_score 30 (phrap only). Minimum alignment score for a read to be allowed to "confirm" part of another read. -indexwordsize 10 Size of indexing (hashing) words, used in finding word matches between sequences. The value of this parameter has a generally minor effect on run time and memory usage. =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); $self->program_name($program_name) if not defined $self->program_name(); $self->_assembly_format($asm_format); return $self; } =head2 out_type Title : out_type Usage : $assembler->out_type('Bio::Assembly::ScaffoldI') Function: Get/set the desired type of output Returns : The type of results to return Args : Desired type of results to return (optional): 'Bio::Assembly::IO' object 'Bio::Assembly::ScaffoldI' object (default) The name of a file to save the results in =cut =head2 run Title : run Usage : $asm = $factory->run($fasta_file) Function: Run Phrap Returns : Assembly results (file, IO object or assembly object) Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut =head2 _run Title : _run Usage : $factory->_run() Function: Make a system call and run Phrap Returns : An assembly file Args : - FASTA file - optional QUAL file =cut sub _run { my ($self, $fasta_file, $qual_file) = @_; # Move quality file to proper place my $tmp_qual_file = "$fasta_file.qual"; if ($qual_file && not -f $tmp_qual_file) { $tmp_qual_file = "$fasta_file.qual"; # by Cap3 convention link ($qual_file, $tmp_qual_file) or copy ($qual_file, $tmp_qual_file) or $self->throw("Could not copy file '$qual_file' to '$tmp_qual_file': $!"); } # Setup needed files and filehandles my ($output_fh, $output_file) = $self->_prepare_output_file( ); # Get program executable my $exe = $self->executable; # Get command-line options my $options = join ' ', @{$self->_translate_params()}; # Usage: phrap seq_file1 [seq_file2 ...] [-option value] [-option value] ... my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; my $str = "$exe $options $fasta_file 1> $output_file 2> $null"; if ($self->verbose() >= 0) { $self->debug( "$exe command = $str\n" ); }; my $status = system($str); $self->throw( "Phrap call ($str) crashed: $? \n") unless $status==0; close($output_fh); # Result files my $log_file = "$fasta_file.log"; my $contigs_file = "$fasta_file.contigs"; my $problems_file = "$fasta_file.problems"; my $problems_qual_file = "$fasta_file.problems.qual"; my $contigs_qual_file = "$fasta_file.contigs.qual"; my $singlets_file = "$fasta_file.singlets"; # Remove all files except for the PHRAP file for my $file ($log_file, $contigs_file, $problems_file, $problems_qual_file, $contigs_qual_file, $singlets_file, $tmp_qual_file) { unlink $file; } return $output_file; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/000077500000000000000000000000001302566030400216075ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/FastTree.pm000066400000000000000000000241171302566030400236670ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::FastTree # # Please direct questions and support issues to # # Copyright Brian Osborne # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::FastTree =head1 SYNOPSIS # Build a FastTree factory $factory = Bio::Tools::Run::Phylo::FastTree->new(-quiet => 1, -fastest => 1); # Get an alignment my $alignio = Bio::AlignIO->new( -format => 'fasta', -file => '219877.cdna.fasta'); my $alnobj = $alignio->next_aln; # Analyze the aligment and get a Tree my $tree = $factory->run($alnobj); =head1 DESCRIPTION Get a Bio::Tree object given a protein or DNA alignment. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I Do not contact the module maintainer directly. Many experienced experts at bioperl-l will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Brian Osborne Email briano@bioteam.net =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::FastTree; use strict; use Bio::Seq; use Bio::SeqIO; use Bio::TreeIO; use Bio::AlignIO; use Bio::Root::IO; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @FastTree_PARAMS = qw(log cat n intree intree1 constraints sprlength topm close refresh constraintWeight spr mlacc nni mlnni seed matrix gtrrates gtrfreq makematrix ); our @FastTree_SWITCHES = qw(quiet nopr nt fastest slow nosupport gtr wag quote noml nome gamma mllen slownni nocat notoo 2nd no2nd nj bionj top notop nomatrix rawdist ); our $PROGRAM_NAME = 'FastTree'; =head2 new Title : new Usage : my $treebuilder = Bio::Tools::Run::Phylo::FastTree->new(); Function: Constructor Returns : Bio::Tools::Run::Phylo::FastTree Args : -outfile_name => $outname =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args( \@args, -methods => [ @FastTree_PARAMS, @FastTree_SWITCHES ], -create => 1 ); my ($out) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME)], @args ); $self->outfile_name( $out || '' ); $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory Returns: string Args : =cut sub program_dir { undef; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ( $self, $value ) = @_; $self->{'error_string'} = $value if ( defined $value ); $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1`; $string =~ /FastTree\s+version\s+([\d\.]+)/; return $1 || undef; } =head2 run Title : run Usage : $factory->run($stockholm_file) OR $factory->run($align_object) Function: Runs FastTree to generate a tree Returns : Bio::Tree::Tree object Args : File name for your input alignment in stockholm format, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ($self, $in) = @_; if (ref $in && $in->isa("Bio::Align::AlignI")) { $in = $self->_write_alignfile($in); } elsif (! -e $in) { $self->throw("When not supplying a Bio::Align::AlignI object, you must supply a readable filename"); } $self->_run($in); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: Runs the application Returns : Tree object Args : Alignment file name =cut sub _run { my ( $self, $file ) = @_; # If -nt is not set check the alphabet of the input $self->_alphabet($file) if ( ! $self->nt ); my $exe = $self->executable || return; my $param_str = $self->arguments . " " . $self->_setparams($file); my $command = "$exe $param_str"; $self->debug("FastTree command = $command"); my $status = system($command); my $outfile = $self->outfile_name(); if ( !-e $outfile || -z $outfile ) { $self->warn("FastTree call had status of $status: $? [command $command]\n"); return undef; } my $treeio = Bio::TreeIO->new( -format => 'newick', -file => $outfile ); my $tree = $treeio->next_tree; # if bootstraps were enabled, the bootstraps are the ids; convert to # bootstrap and no id # if ($self->boot) { # my @nodes = $tree->get_nodes; # my %non_internal = map { $_ => 1 } ($tree->get_leaf_nodes, $tree->get_root_node); # foreach my $node (@nodes) { # next if exists $non_internal{$node}; # $node->bootstrap && next; # protect ourselves incase the parser improves # $node->bootstrap($node->id); # $node->id(''); # } # } $tree; } =head2 _write_alignfile Title : _write_alignfile Usage : Internal function, not to be called directly Function: Create an alignment file Returns : filename Args : Bio::Align::AlignI =cut sub _write_alignfile { my ( $self, $align ) = @_; my ( $tfh, $tempfile ) = $self->io->tempfile( -dir => $self->tempdir ); my $out = Bio::AlignIO->new( -file => ">$tempfile", -format => 'phylip' ); $out->write_aln($align); $out->close(); undef($out); close($tfh); undef($tfh); die "Alignment file $tempfile was not created" if ( ! -e $tempfile ); $tempfile; } =head2 _alphabet Title : _alphabet Usage : my $alphabet = $self->_alphabet; Function: Get the alphabet of the input alignment, defaults to 'dna' Returns : 'dna' or 'protein' Args : Alignment file =cut sub _alphabet { my ($self,$file) = @_; if ( $file ) { if ( -e $file ) { my $in = Bio::AlignIO->new(-file => $file); my $aln = $in->next_aln; # arbitrary, the first one my $seq = $aln->get_seq_by_pos(1); my $alphabet = $seq->alphabet; $self->{_alphabet} = $alphabet; $self->nt(1) if ( $alphabet eq 'dna' ); } else { die "File $file can not be found"; } } # default is 'dna' return $self->{'_alphabet'} || 'dna'; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for FastTree program Example : Returns : parameter string to be passed to FastTree Args : name of calling object =cut sub _setparams { my ($self,$infile) = @_; my ( $attr, $value, $param_string ); $param_string = ''; my $laststr; for $attr (@FastTree_PARAMS) { $value = $self->$attr(); next unless ( defined $value ); my $attr_key = lc $attr; $attr_key = ' -' . $attr_key; $param_string .= $attr_key . ' ' . $value; } for $attr (@FastTree_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; $attr_key = ' -' . $attr_key; $param_string .= $attr_key; } # Set default output file if no explicit output file has been given if ( ! $self->outfile_name ) { my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $self->tempdir() ); close($tfh); undef $tfh; $self->outfile_name($outfile); } $param_string .= " $infile > " . $self->outfile_name; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 2> $null" if ( $self->quiet() || $self->verbose < 0 ); $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $FastTree->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $FastTree->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Gerp.pm000077500000000000000000000230021302566030400230420ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Gerp # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Gerp - Wrapper for GERP =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Gerp; # Make a Gerp factory $factory = Bio::Tools::Run::Phylo::Gerp->new(); # Run Gerp with an alignment and tree file my $parser = $factory->run($alignfilename, $treefilename); # or with alignment object and tree object (which needs branch lengths) $parser = $factory->run($bio_simplalign, $bio_tree_tree); # (mixtures of the above are possible) # look at the results while (my $feat = $parser->next_result) { my $start = $feat->start; my $end = $feat->end; my $rs_score = $feat->score; my $p_value = ($feat->annotation->get_Annotations('p-value'))[0]->value; } =head1 DESCRIPTION This is a wrapper for running the GERP (v2) programs 'gerpcol' and 'gerpelem' by Eugene Davydov (originally Gregory M. Cooper et al.). You can get details here: http://mendel.stanford.edu/sidowlab/. GERP can be used for phylogenetic footprinting/ shadowing (it finds 'constrained elements in multiple alignments'). You can try supplying normal gerpcol/gerpelem command-line arguments to new(), eg. $factory-Enew(-e =E 0.05) or calling arg-named methods, eg. $factory-Ee(0.05). The filename-related args (t, f, x) are handled internally by the run() method. This wrapper currently only supports running GERP on a single alignment at a time (ie. F isn't used at all, nor are multiple fs possible). You will need to enable this GERP wrapper to find the GERP executables. This can be done in (at least) three ways: 1. Make sure gerpcol and gerpelem are in your path. 2. Define an environmental variable GERPDIR which is a directory which contains the GERP executables: In bash: export GERPDIR=/home/username/gerp/ In csh/tcsh: setenv GERPDIR /home/username/gerp 3. Include a definition of an environmental variable GERPDIR in every script that will use this GERP wrapper module, e.g.: BEGIN { $ENV{GERPDIR} = '/home/username/gerp/' } use Bio::Tools::Run::Phylo::Gerp; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Gerp; use strict; use Cwd; use File::Spec; use File::Basename; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Phylo::Gerp; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'gerpcol'; our $PROGRAM_DIR; # methods for the gerp args we support our @COLPARAMS = qw(r n s); our @ELEMPARAMS = qw(l L t d p b a c r e); our @SWITCHES = qw(v); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(h t f F x); BEGIN { # lets add all the gerp executables to the path $PROGRAM_DIR = $ENV{'GERPDIR'}; $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { my $self = shift; if (@_) { $self->{program_name} = shift } return $self->{program_name} || $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Gerp->new() Function: creates a new GERP factory Returns : Bio::Tools::Run::Phylo::Gerp Args : Most options understood by GERP can be supplied as key => value pairs. These options can NOT be used with this wrapper: h, t, f, F and x =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@COLPARAMS, @ELEMPARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 run Title : run Usage : $parser = $factory->run($align_file, $tree_file); -or- $parser = $factory->run($align_object, $tree_object); Function: Runs GERP on an alignment. Returns : Bio::Tools::Phylo::Gerp parser object, containing the results Args : The first argument represents an alignment, the second argument a phylogenetic tree with branch lengths. The alignment can be provided as a MAF format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. In all cases, the alignment sequence names must correspond to node ids in the tree. Multi-word species names should have the spaces replaced with underscores (eg. Homo_sapiens) =cut sub run { my ($self, $aln, $tree) = @_; $self->_alignment($aln || $self->throw("An alignment must be supplied")); $self->_tree($tree || $self->throw("A phylo tree must be supplied")); # check node and seq names match $self->_check_names; return $self->_run; } sub _run { my $self = shift; $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); foreach my $prog ('gerpcol', 'gerpelem') { delete $self->{'_pathtoexe'}; $self->program_name($prog); my $exe = $self->executable || $self->throw("'$prog' executable not found"); my $command = $exe.$self->_setparams($prog); $self->debug("gerp command = $command\n"); #eval { # local $SIG{ALRM} = sub { die "alarm\n" }; # alarm 60; # system($command) && $self->throw("gerp call ($command) failed: $! | $?"); # alarm 0; #}; #die if $@ && $@ ne "alarm\n"; #if ($@) { # die "Gerp timed out\n"; #} # # system("rm -fr $cwd/gerp_dir; cp -R $temp_dir $cwd/gerp_dir"); open(my $pipe, "$command |") || $self->throw("gerp call ($command) failed to start: $? | $!"); my $error = ''; my $warning = ''; while (<$pipe>) { if ($self->quiet) { $error .= $_; $warning .= $_ if /warning/i; } else { print; } } close($pipe) || ($error ? $self->throw("gerp call ($command) failed: $error") : $self->throw("gerp call ($command) crashed: $?")); # (throws most likely due to seg fault in gerpelem when ~25000 entries # in rates file, not much I can do about it!) $self->warn("GERP: ".$warning) if $warning; } #system("rm -fr $cwd/gerp_dir; cp -R $temp_dir $cwd/gerp_dir"); my $result_file = $self->{align_base}.'.rates.elems'; my $parser = Bio::Tools::Phylo::Gerp->new(-file => $result_file); # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return $parser; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my ($self, $prog) = @_; my $param_string; if ($prog eq 'gerpcol') { my $align_file = $self->_write_alignment; $param_string .= ' -f '.$align_file; $self->{align_base} = basename($align_file); $param_string .= ' -t '.$self->_write_tree; $param_string .= $self->SUPER::_setparams(-params => \@COLPARAMS, -switches => \@SWITCHES, -dash => 1); } else { $param_string .= ' -f '.$self->{align_base}.'.rates'; $param_string .= $self->SUPER::_setparams(-params => \@ELEMPARAMS, -switches => \@SWITCHES, -dash => 1); } $param_string .= " 2>&1"; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Gumby.pm000066400000000000000000000316431302566030400232370ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Gumby # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Gumby - Wrapper for gumby =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Gumby; # Make a Gumby factory $factory = Bio::Tools::Run::Phylo::Gumby->new(); # Run gumby with an alignment and tree file my @results = $factory->run($alignfilename, $treefilename); # or with alignment object and tree objects @results = $factory->run($bio_simplalign, $bio_tree_tree); # or with alignment object and Bio::DB::Taxonomy object @results = $factory->run($bio_simplalign, $bio_db_taxonomy); # specify the positions of exons in (at least) one of the alignment sequences # to get better results $factory->econs(1); $factory->annots($gff_filename); @results = $factory->run($alignfilename, $treefilename); # or using feature objects $factory->annots(@bio_seqfeature_objects); @results = $factory->run($alignfilename, $treefilename); # (mixtures of all the above are possible) # look at the results foreach my $feat (@results) { my $seq_id = $feat->seq_id; my $start = $feat->start; my $end = $feat->end; my $score = $feat->score; my ($pvalue) = $feat->get_tag_values('pvalue'); my ($kind) = $feat->get_tag_values('kind'); # 'all', 'exon' or 'nonexon' } =head1 DESCRIPTION This is a wrapper for running the gumby application by Shyam Prabhakar. You can get details here: http://pga.lbl.gov/gumby/. Gumby is used for phylogenetic footprinting/ shadowing. You can try supplying normal gumby command-line arguments to new(), eg. $factory->new(-ratio => 2); or calling arg-named methods (excluding the initial hyphen, eg. $factory->econs(1); to set the -econs arg). You will need to enable this Gumby wrapper to find the gumby program. This can be done in (at least) three ways: 1. Make sure the gumby executable is in your path. 2. Define an environmental variable GUMBYDIR which is a directory which contains the gumby application: In bash: export GUMBYDIR=/home/username/gumby/ In csh/tcsh: setenv GUMBYDIR /home/username/gumby 3. Include a definition of an environmental variable GUMBYDIR in every script that will use this Gumby wrapper module, e.g.: BEGIN { $ENV{GUMBYDIR} = '/home/username/gumby/' } use Bio::Tools::Run::Phylo::Gumby; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Gumby; use strict; use Cwd; use File::Spec; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::GFF; use Bio::Tools::Phylo::Gumby; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'gumby'; our $PROGRAM_DIR = $ENV{'GUMBYDIR'}; # methods for the gumby args we support our @PARAMS = qw(annots ratio base plen prob); our @SWITCHES = qw(econs); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(o minseq blklen); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Gumby->new() Function: creates a new Gumby factory Returns : Bio::Tools::Run::Phylo::Gumby Args : Most options understood by gumby can be supplied as key => value pairs. These options can NOT be used with this wrapper: o minseq blklen =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], -create => 1); return $self; } =head2 annots Title : annots Usage : $factory->annots(@gff_filenames) Function: Specify annotation files for gumby to use Returns : string of absolute filepaths to gff files Args : list of gff filenames (can be relative), where the first column corresponds to a sequence id from the alignment that will be supplied to run() OR list of Bio::SeqFeatureI objects, which have seq_id() values that will correspond to the sequence ids from the alignment that will be supplied to run() (the objects will be grouped by seq_id and output to gff files for use by gumby; filepaths to those tempfiles will be returned). Note that all features must have source, seq_id and primary_tag set or none will be used. NB: feature coordinates must be relative to the parts of the sequences in the alignment you will supply, as though numbering started at 1 for each each sequence in the alignment. There is currently no automatic correction for this. =cut sub annots { my $self = shift; if (@_) { my @files; my %feats; foreach my $thing (@_) { if (ref($thing) && $thing->isa('Bio::SeqFeatureI')) { my $seq_id = $thing->seq_id || $self->throw("Supplied a feature with no seq_id"); push(@{$feats{$seq_id}}, $thing); } elsif (-e $thing) { push(@files, File::Spec->rel2abs($thing)); } else { $self->throw("'$thing' was not a Bio::SeqFeatureI or a file"); } } if (keys %feats) { my $temp_dir = $self->tempdir; while (my ($seq_id, $feats) = each %feats) { my $temp_file = File::Spec->catfile($temp_dir, $seq_id.'.gff'); $temp_file = File::Spec->rel2abs($temp_file); my $gffout = Bio::Tools::GFF->new(-file => ">$temp_file", -gff_version => 2); $gffout->write_feature(@{$feats}); push(@files, $temp_file); } } $self->{annots} = \@files; } if (defined $self->{annots}) { return join(' ', @{$self->{annots}}); } return; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs gumby on an alignment. Returns : list of Bio::SeqFeature::Annotated (one per prediction and sequence) Args : The first argument represents an alignment, the second argument a species tree. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases, the alignment sequence names must correspond to node ids in the species tree. Multi-word species names should have the spaces removed to form the sequence names, eg. Homosapiens. Underscores may also be used for either or both of sequence and node ids ('Homo_sapiens'), but underscores will be removed internally. NB: Gumby treats each sequence in the alignment as starting at position 1. This method returns results with the coordinates corrected so they match the coordinates of your input alignment. Eg. if 'Homo_sapiens' sequence had the range 20..60 in your alignment, the first Gumby result might be 1..5 which is corrected to 20..24. =cut sub run { my ($self, $aln, $tree) = @_; ($aln && $tree) || $self->throw("alignment and tree must be supplied"); $aln = $self->_alignment($aln); $tree = $self->_tree($tree); $tree->force_binary; # adjust seq & node ids to remove spaces and underscores (eg. if tree # generated from taxonomy/ user input bad names) foreach my $thing ($tree->get_leaf_nodes, $aln->each_seq) { my $id = $thing->id; $id =~ s/_aligned//; #*** dubious custom-handling for the allowed case of mlagan adding _aligned to id (according to gumby author) $id =~ s/[ _]//g; $thing->id($id); } my $new_aln = $aln->new; foreach my $seq ($aln->each_seq) { $new_aln->add_seq($seq); } $self->_alignment($new_aln); #*** at some stage we want to revert the ids back to original... # check node and seq names match $self->_check_names; return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $tree_file = 'tree_file'; my $aln_file = $self->_write_alignment; # generate a gumby-friendly tree file my $tree = $self->_tree; $tree = $tree->simplify_to_leaves_string; open(my $tfhandle, '>', $tree_file) || $self->throw("Could not write to tree file '$tree_file'"); print $tfhandle $tree, "\n"; close($tfhandle); my $command = $exe.$self->_setparams($aln_file, $tree_file); $self->debug("gumby command = $command\n"); open(PIPE, "$command |") || $self->throw("gumby call ($command) failed to start: $? | $!"); my $error = ''; while () { print unless $self->quiet; if (/^ERROR: (.+)/ || /^mbgumbel\(\): (.+)/) { $error .= $1; } } close(PIPE) || ($error ? $self->warn("gumby call ($command) failed: $error") : $self->throw("gumby call ($command) crashed: $?")); my $aln = $self->_alignment(); my %offsets; foreach my $seq ($aln->each_seq) { $offsets{lc($seq->id)} = $seq->start - 1; } my @feats = (); foreach my $file ('out_all.align', 'out_exon.align', 'out_nonexon.align') { -e $file || next; my $parser = Bio::Tools::Phylo::Gumby->new(-file => $file); while (my @results = $parser->next_result) { foreach my $result (@results) { my $this_adjust = $offsets{lc($result->seq_id)}; $result->start($result->start + $this_adjust); $result->end($result->end + $this_adjust); } push(@feats, @results); } unlink($file); } # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $tree_file) = @_; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; my $param_string = ' '.$tree_file; $param_string .= ' '.$aln_file; $param_string .= $self->SUPER::_setparams(-params => \@PARAMS, -switches => \@SWITCHES, -dash => 1,); $param_string .= ' -o out'; $param_string .= ' 2>&1'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/000077500000000000000000000000001302566030400227105ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/Base.pm000066400000000000000000000417431302566030400241310ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::Base # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::Base - Hyphy wrapping base methods =head1 SYNOPSIS FIXME =head1 DESCRIPTION HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::Base; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values Valid and default values are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. =cut our $PROGRAMNAME = 'HYPHYMP'; our $PROGRAM; BEGIN { if( defined $ENV{'HYPHYDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'HYPHYDIR'},$PROGRAMNAME). ($^O =~ /mswin/i ?'.exe':'');; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters (this needs to be specified per child class). Returns an empty array in the base class. Args : None =cut sub valid_values { return (); } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{HYPHYDIR}) if $ENV{HYPHYDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy object Returns : Bio::Tools::Run::Phylo::Hyphy Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my $versionstring = $self->version(); return $self; } =head2 prepare Title : prepare Usage : my $rundir = $hyphy->prepare($aln); Function: prepare the analysis using the default or updated parameters the alignment parameter must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$aln,$tree) = @_; $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run hyphy"); return 0; } my ($tempdir) = $self->tempdir(); my ($tempseqFH,$tempalnfile); if( ! ref($aln) && -e $aln ) { $tempalnfile = $aln; } else { ($tempseqFH,$tempalnfile) = $self->io->tempfile('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); $aln->set_displayname_flat(1); my $alnout = Bio::AlignIO->new('-format' => 'fasta', '-fh' => $tempseqFH); $alnout->write_aln($aln); $alnout->close(); undef $alnout; close($tempseqFH); } $self->{'_params'}{'tempalnfile'} = $tempalnfile; # setting a new temp file to hold the run output for debugging $self->{'run_output'} = "$tempdir/run_output"; my $outfile = $self->outfile_name; if ($outfile eq "") { $outfile = "$tempdir/results.out"; $self->outfile_name($outfile); } my ($temptreeFH,$temptreefile); if( ! ref($tree) && -e $tree ) { $temptreefile = $tree; } else { ($temptreeFH,$temptreefile) = $self->io->tempfile('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } $self->{'_params'}{'temptreefile'} = $temptreefile; $self->create_wrapper; $self->{_prepared} = 1; return $tempdir; } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $redirect = "stdinRedirect"; my ($self,$batchfile) = @_; my $tempdir = $self->tempdir; $self->update_ordered_parameters; #check version of HYPHY: my $versionstring = $self->version(); $versionstring =~ /.*?(\d+\.\d+).*/; my $version = $1; my $wrapper = "$tempdir/wrapper.bf"; open(WRAPPER, ">", $wrapper) or $self->throw("cannot open $wrapper for writing"); print WRAPPER qq{$redirect = {};\n\n}; my $counter = sprintf("%02d", 0); foreach my $elem (@{ $self->{'_orderedparams'} }) { my ($param,$val) = each %$elem; if ($val eq "") { $val = "$tempdir/$param"; # any undefined parameters must be temporary output files. } print WRAPPER qq{$redirect ["$counter"] = "$val";\n}; $counter = sprintf("%02d",$counter+1); } # This next line is for BatchFile: if ((ref ($self)) =~ m/BatchFile/) { print WRAPPER "\nExecuteAFile ($batchfile, $redirect);\n"; } else { # Not exactly sure what version of HYPHY caused this change, # but Github source changes suggest that it was sometime # after version 0.9920060501 was required. $batchfile =~ s/"//g; # remove any extra quotes in the batchfile name. if ($version >= 0.9920060501) { print WRAPPER qq{\nExecuteAFile (HYPHY_LIB_DIRECTORY + "TemplateBatchFiles" + DIRECTORY_SEPARATOR + "$batchfile", stdinRedirect);\n}; } else { print WRAPPER qq{\nExecuteAFile (HYPHY_BASE_DIRECTORY + "TemplateBatchFiles" + DIRECTORY_SEPARATOR + "$batchfile", stdinRedirect);\n}; } } close(WRAPPER); $self->{'_wrapper'} = $wrapper; } =head2 run Title : run Usage : my ($rc,$results) = $BatchFile->run(); Function: run the Hyphy analysis using the specified batchfile and its ordered parameters Returns : Return code, Hash Args : none =cut sub run { my ($self) = @_; my $aln = $self->alignment; my $tree = $self->tree; unless (defined($self->{'_prepared'})) { $self->prepare($aln,$tree); } my $rc = 1; my $results = ""; my $commandstring; my $exe = $self->executable(); unless ($exe && -e $exe && -x _) { $self->throw("unable to find or run executable for 'HYPHY'"); } #runs the HYPHY command $commandstring = $exe . " BASEPATH=" . $self->program_dir . " " . $self->{'_wrapper'}; my $pid = open(RUN, "-|", "$commandstring") or $self->throw("Cannot open exe $exe"); my $waiting = waitpid $pid,0; # waitpid will leave a nonzero error in $? if the HYPHY command crashes, so we should bail gracefully. my $error = $? & 127; if ($error != 0) { $self->throw("Error: " . $self->program_name . " ($waiting) quit unexpectedly with signal $error"); } #otherwise, return the results and exit with 1 so that the parent knows we were successful. while (my $line = ) { $results .= "$line"; } close(RUN); # process the errors from $? and set the error values. $rc = $? >> 8; if (($results =~ m/error/i) || ($rc == 0)) { # either the child process had an error, or HYPHY put one in the output. $rc = 0; $self->warn($self->program_name . " reported error $rc - see error_string for the program output"); $results =~ m/(error.+)/is; $self->error_string($1); } # put these run results into the temp run output file: open (OUT, ">", $self->{'run_output'}); print OUT $results; close OUT; return ($rc,$results); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $hyphy->alignment($aln); Function: Get/Set the L object Returns : L object Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment { my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || !$aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to alignment(): you specified a " . ref($aln)); return; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $hyphy->tree($tree); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( !ref($tree) || !$tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to tree(): you specified a " . ref($tree)); return; } else { $self->{'_tree'} = $tree; } } return $self->{'_tree'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters { my ($self) = @_; # we're returning a copy of this return %{ $self->{'_params'} }; } =head2 set_parameter Title : set_parameter Usage : $hyphy->set_parameter($param,$val); Function: Sets a hyphy parameter, will be validated against the valid values. The checks can be ignored if one turns off param checks like this: $hyphy->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter { my ($self,$param,$value) = @_; # FIXME - add validparams checking $self->{'_params'}{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $obj->set_default_parameters(); Function: (Re)set the default parameters from the defaults (the first value in each array in the valid_values() array) Returns : none Args : none =cut sub set_default_parameters { my ($self) = @_; my @validvals = $self->valid_values(); foreach my $elem (@validvals) { keys %$elem; #reset hash iterator my ($param,$val) = each %$elem; if (ref($val)=~/ARRAY/i ) { unless (ref($val->[0])=~/HASH/i) { push @{ $self->{'_orderedparams'} }, {$param, $val->[0]}; } else { $val = $val->[0]; } } if ( ref($val) =~ /HASH/i ) { my $prevparam; while (defined($val)) { last unless (ref($val) =~ /HASH/i); last unless (defined($param)); $prevparam = $param; ($param,$val) = each %{$val}; push @{ $self->{'_orderedparams'} }, {$prevparam, $param}; push @{ $self->{'_orderedparams'} }, {$param, $val} if (defined($val)); } } elsif (ref($val) !~ /HASH/i && ref($val) !~ /ARRAY/i) { push @{ $self->{'_orderedparams'} }, {$param, $val}; } } } =head2 update_ordered_parameters Title : update_ordered_parameters Usage : $hyphy->update_ordered_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub update_ordered_parameters { my ($self) = @_; for (my $i=0; $i < scalar(@{$self->{'_orderedparams'}}); $i++) { my ($param,$val) = each %{$self->{'_orderedparams'}[$i]}; if (exists $self->{'_params'}{$param}) { $self->{'_orderedparams'}[$i] = {$param, $self->{'_params'}{$param}}; } else { $self->{'_orderedparams'}[$i] = {$param, $val}; } } } =head2 outfile_name Title : outfile_name Usage : my $outfile = $hyphy->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name { my $self = shift; if( @_ ) { return $self->{'_params'}->{'outfile'} = shift @_; } return $self->{'_params'}->{'outfile'}; } =head2 version Title : version Usage : $obj->version() Function: Returns the version string from HYPHY Returns : string Args : none =cut sub version { my $self = shift; my $tempdir = $self->tempdir; if (defined $self->{'_version'}) { return $self->{'_version'}; } # if it's not already defined, write out a small batchfile to return the version string, then clean up. my $versionbf = "$tempdir/version.bf"; open(WRAPPER, ">", $versionbf) or $self->throw("cannot open $versionbf for writing"); print WRAPPER qq{GetString (versionString, HYPHY_VERSION, 2);\nfprintf (stdout, versionString);}; close(WRAPPER); my $exe = $self->executable(); unless ($exe && -e $exe && -x _) { $self->throw("unable to find or run executable for 'HYPHY'"); } my $commandstring = $exe . " BASEPATH=" . $self->program_dir . " " . $versionbf; open(RUN, "$commandstring |") or $self->throw("Cannot open exe $exe"); my $output = ; close(RUN); unlink $versionbf; $self->{'_version'} = $output; return $output; } =head2 hyphy_lib_dir Title : hyphy_lib_dir Usage : $obj->hyphy_lib_dir() Function: Returns the HYPHY_LIB_DIRECTORY from HYPHY Returns : string Args : none =cut sub hyphy_lib_dir { my $self = shift; if (defined $self->{'_hyphylibdir'}) { return $self->{'_hyphylibdir'}; } # if it's not already defined, write out a small batchfile to return the version string, then clean up. my $hyphylibdirbf = $self->io->catfile($self->tempdir,"hyphylibdir.bf"); open(WRAPPER, ">", $hyphylibdirbf) or $self->throw("cannot open $hyphylibdirbf for writing"); print WRAPPER qq{fprintf (stdout, HYPHY_LIB_DIRECTORY);}; close(WRAPPER); my $exe = $self->executable(); unless ($exe && -e $exe && -x _) { $self->throw("unable to find or run executable for 'HYPHY'"); } my $commandstring = $exe . " BASEPATH=" . $self->program_dir . " " . $hyphylibdirbf; open(RUN, "$commandstring |") or $self->throw("Cannot open exe $exe"); my $output = ; close(RUN); unlink $hyphylibdirbf; $self->{'_hyphylibdir'} = $output; return $output; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/BatchFile.pm000066400000000000000000000232541302566030400250750ustar00rootroot00000000000000=head1 NAME Bio::Tools::Run::Phylo::Hyphy::BatchFile - Wrapper for custom execution of Hyphy batch files =head1 SYNOPSIS my $aln = Bio::Align::AlignI->new(); my $treeio = Bio::TreeIO->new(-format => "nexus", -file => "$tree_file"); my $tree = $treeio->next_tree(); my $bf_exec = Bio::Tools::Run::Phylo::Hyphy::BatchFile->new(-params => {'bf' => "hyphybatchfile.bf", 'order' => ["Universal", "Custom", $aln, "001001", $tree]}); $bf_exec->set_parameter('3', "012012"); my ($rc,$parser) = $bf_exec->run(); =head1 DESCRIPTION This module creates a generic interface to processing of HBL files in HyPhy ([Hy]pothesis Testing Using [Phy]logenies), a package by Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. Instances of this module require only a link to the batch file and an ordered list of parameters, as described in the HyPhy documentation "SelectionAnalyses.pdf." =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Daisie Huang Email daisieh@zoology.ubc.ca =head1 CONTRIBUTORS Additional contributors names and emails here =cut package Bio::Tools::Run::Phylo::Hyphy::BatchFile; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'tempalnfile' => undef }, # aln file goes here {'temptreefile' => undef }, # tree file goes here ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::BatchFile->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::BatchFile object Returns : Bio::Tools::Run::Phylo::Hyphy::BatchFile Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) this hashref should include 'bf' => custombatchfile.bf 'order' => [array of ordered parameters] -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 update_ordered_parameters Title : update_ordered_parameters Usage : $BatchFile->update_ordered_parameters(); Function: updates all of the parameters needed for the ordered input redirect in HBL. Returns : nothing Args : none =cut sub update_ordered_parameters { my ($self) = @_; unless (defined ($self->{'_params'}{'order'})) { $self->throw("No ordered parameters for HYPHY were defined."); } for (my $i=0; $i< scalar @{$self->{'_params'}{'order'}}; $i++) { my $item = @{$self->{'_params'}{'order'}}[$i]; #FIXME: update_ordered_parameters should be more flexible. It should be able to tell what type of object $item is and, if necessary, create a temp file for it. if (ref ($item) =~ m/Bio::SimpleAlign/) { $item = $self->{'_params'}{'tempalnfile'}; } elsif (ref ($item) =~ m/Bio::Tree::Tree/) { $item = $self->{'_params'}{'temptreefile'}; } $self->{'_orderedparams'}[$i] = {$i, $item}; } $self->SUPER::update_ordered_parameters(); } =head2 run Title : run Usage : my ($rc,$results) = $BatchFile->run(); Function: run the Hyphy analysis using the specified batchfile and its ordered parameters Returns : Return code, Hash Args : none =cut sub run { my $self = shift; my ($rc, $results) = $self->SUPER::run(); my $outfile = $self->outfile_name(); open(OUTFILE, ">", $outfile) or $self->throw("cannot open $outfile for writing"); print OUTFILE $results; close(OUTFILE); return ($rc,$results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: Creates the wrapper file for the batchfile specified in the hash, saves it to the hash as '_wrapper'. Returns : nothing Args : none =cut sub create_wrapper { my $self = shift; my $batchfile = $self->batchfile; unless (defined($batchfile)) { $self->throw("No batchfile specified, couldn't create wrapper."); } unless (-f $batchfile) { # check to see if maybe this batchfile is a template batchfile my $new_bf = $self->io->catfile($self->hyphy_lib_dir,"TemplateBatchFiles",$batchfile); $new_bf =~ s/\"//g; if (-f $new_bf) { $self->batchfile($new_bf); } else { $self->throw ("Specified batchfile $batchfile not found."); return; } } $self->SUPER::create_wrapper('"' . $self->batchfile . '"'); } =head2 set_parameter Title : set_parameter Usage : $hyphy->set_parameter($param,$val); Function: Sets the named parameter $param to $val if it is a non-numeric parameter If $param is a number, sets the corresponding value of the ordered redirect array (starts from 1). Returns : boolean if set was successful Args : $param => name of the parameter $value => value to set the parameter to =cut sub set_parameter { my ($self,$param,$value) = @_; if ($param =~ /\d+/) { $self->{'_params'}{'order'}[$param-1] = $value; } else { $self->{'_params'}{$param} = $value; } return 1; } =head2 batchfile Title : batchfile Usage : $hyphy->batchfile($bf_name); Function: Gets/sets the batchfile that is run by $hyphy. Returns : The batchfile path. Args : $bf_name => path of new batchfile =cut sub batchfile { my ($self,$bf) = @_; if (defined $bf) { $self->set_parameter('bf', $bf); } if ($self->{'_params'}{'bf'}) { return $self->{'_params'}{'bf'}; } else { $self->warn ("Batchfile was requested but no batchfile was found."); } return; } =head2 make_batchfile_with_contents Title : make_batchfile_with_contents Usage : $hyphy->make_batchfile_with_contents($bf_string); Function: Creates a temporary file with the specified string of contents for the batchfile. Returns : The batchfile path. Args : $bf_string => contents for the batchfile =cut sub make_batchfile_with_contents { my ($self,$bf_string) = @_; my $temp_bf = $self->io->catfile($self->tempdir,"temp.bf"); open (BF, ">", $temp_bf) or $self->throw("cannot open $temp_bf for writing"); print BF "$bf_string\n"; close BF; return $self->batchfile($temp_bf); } =head2 set_default_parameters Title : set_default_parameters Usage : $BatchFile->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the valid_values) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters { my ($self,$keepold) = @_; unless (defined $keepold) { $keepold = 0; } my @validvals = $self->valid_values(); for (my $i=0; $i< scalar (@validvals); $i++) { my $elem = $validvals[$i]; keys %$elem; #reset hash iterator my ($param,$val) = each %$elem; # skip if we want to keep old values and it is already set if (ref($val)=~/ARRAY/i ) { $self->{'_orderedparams'}[$i] = {$param, $val->[0]}; } else { $self->{'_orderedparams'}[$i] = {$param, $val}; } #FIXME: for alignment and treefile, this should default to the ones in params. } } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/FEL.pm000066400000000000000000000170731302566030400236640ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::FEL # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::FEL - Wrapper around the Hyphy FEL analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::FEL; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $fel = Bio::Tools::Run::Phylo::Hyphy::FEL->new(); $fel->alignment($aln); $fel->tree($tree); my ($rc,$results) = $fel->run(); =head1 DESCRIPTION This is a wrapper around the FEL analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. This module will generate the correct list of options for interfacing with TemplateBatchFiles/Ghostrides/Wrapper.bf. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::FEL; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 Default Values Valid and default values for FEL are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. INCOMPLETE DOCUMENTATION OF ALL METHODS =cut =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'New/Restore' => [ "New Analysis", "Restore"]}, {'tempalnfile' => undef }, # aln file goes here {'Model Options' => [ { "Custom" => '010010' }, { "Default" => undef } ] }, {'temptreefile' => undef }, # tree file goes here {'Model Fit Results' => [ $null] }, # Windows have NUL instead of /dev/null {'dN/dS bias parameter' => [ { "Estimate dN/dS only" => undef }, { "Neutral" => undef }, { "Estimate" => undef }, { "Estimate + CI" => undef }, { "User" => '3' } ] }, {'Ancestor Counting' => [ 'Two rate FEL','Single Ancestor Counting','Weighted Ancestor Counting', 'Sample Ancestal States','Process Sampled Ancestal States', 'One rate FEL','Rate Distribution', 'Full site-by-site LRT','Multirate FEL'] }, {'Significance level' => '0.05' }, {'Branch Options' => ['Internal Only','All','A Subtree only','Custom subset'] }, {'outfile' => undef }, # outfile goes here ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::FEL->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::FEL object Returns : Bio::Tools::Run::Phylo::Hyphy::FEL Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $fel->run($aln); Function: run the fel analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, Hash Args : L object, L object [optional] =cut sub run { my $self = shift; my ($rc, $run_results) = $self->SUPER::run(); my $results ={}; my $outfile = $self->outfile_name(); open(OUTFILE, "$outfile") or $self->throw("cannot open $outfile for reading"); my $readed_header = 0; my @elems; while () { if ($readed_header) { # FEL results are csv my @values = split("\,",$_); for my $i (0 .. (scalar(@values)-1)) { $elems[$i] =~ s/\n//g; push @{$results->{$elems[$i]}}, $values[$i]; } } else { @elems = split("\,",$_); $readed_header = 1; } } return ($rc, $results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "QuickSelectionDetection.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/Modeltest.pm000066400000000000000000000157441302566030400252210ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::Modeltest # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::Modeltest - Wrapper around the Hyphy Modeltest analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::Modeltest; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $modeltest = Bio::Tools::Run::Phylo::Hyphy::Modeltest->new(); $modeltest->alignment($aln); $modeltest->tree($tree); my ($rc,$results) = $modeltest->run(); =head1 DESCRIPTION This is a wrapper around the Modeltest analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. This module will generate the correct list of options for interfacing with TemplateBatchFiles/Modeltest.bf. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::Modeltest; use vars qw(@ISA); use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 Default Values Valid and default values for Modeltest are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. =cut =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { return ( {'tempalnfile' => undef }, # aln file goes here {'temptreefile' => undef }, # tree file goes here {'Number of Rate Classes' => [ '4' ] }, {'Model Selection Method' => [ 'Both', 'Hierarchical Test', 'AIC Test'] }, {'Model rejection level' => '0.05' }, {'hieoutfile' => undef }, {'aicoutfile' => undef } ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::Modeltest->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::Modeltest object Returns : Bio::Tools::Run::Phylo::Hyphy::Modeltest Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $modeltest->run($aln); Function: run the modeltest analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, hash containing the "Hierarchical Testing" and "AIC" results, both as hashes. Args : L object, L object [optional] =cut sub run { my $self = shift; my ($rc, $run_results) = $self->SUPER::run(); my $results = {}; my @run_result_array = split (/\n/, $run_results); my $line = shift @run_result_array; my $current_model = "error"; # if this stays "error" when you're trying to add results for a model, something's wrong. while (defined $line) { if ($line =~ m/Hierarchical Testing based model \((.*)\)/) { $current_model = "Hierarchical Testing"; $results->{$current_model}{'model_name'} = $1; } elsif ($line =~ m/AIC based model \((.*)\)/) { $current_model = "AIC"; $results->{$current_model}{'model_name'} = $1; } elsif ($line =~ m/Model String:(\d+)/) { $results->{$current_model}{'model_string'} = $1; } elsif ($line =~ m/Model Options: (.+)/) { $results->{$current_model}{'model_options'} = $1; } elsif ($line =~ m/Equilibrium Frequencies Option: (.+)/) { $results->{$current_model}{'eq_freq_option'} = $1; } $line = shift @run_result_array; } return ($rc,$results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "ModelTest.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/REL.pm000066400000000000000000000145061302566030400236760ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::REL # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::REL - Wrapper around the Hyphy REL analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::REL; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $rel = Bio::Tools::Run::Phylo::Hyphy::REL->new(); $rel->alignment($aln); $rel->tree($tree); my ($rc,$results) = $rel->run(); =head1 DESCRIPTION This is a wrapper around the REL analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. This module will generate the correct list of options for interfacing with TemplateBatchFiles/Ghostrides/Wrapper.bf. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::REL; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'tempalnfile' => undef }, # aln file goes here {'temptreefile' => undef }, # tree file goes here {'Model' => [ "Null for Test 1", "Null for Test 2", "Alternative"]}, {'outfile' => undef } # site-by-site conditional probabilities go to this file ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::REL->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::REL object Returns : Bio::Tools::Run::Phylo::Hyphy::REL Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $rel->run($aln); Function: run the rel analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, Hash Args : L object, L object [optional] =cut sub run { my $self = shift; my ($rc,$run_results) = $self->SUPER::run(); my $results = {}; my $outfile = $self->outfile_name(); open(OUTFILE, "$outfile") or $self->throw("cannot open $outfile for reading"); my $readed_header = 0; my @elems; while () { if ($readed_header) { # REL results are csv my @values = split("\,",$_); for my $i (0 .. (scalar(@values)-1)) { $elems[$i] =~ s/\n//g; push @{$results->{$elems[$i]}}, $values[$i]; } } else { @elems = split("\,",$_); $readed_header = 1; } } return ($rc,$results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "YangNielsenBranchSite2005.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Hyphy/SLAC.pm000066400000000000000000000173261302566030400240010ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Hyphy::SLAC # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Hyphy::SLAC - Wrapper around the Hyphy SLAC analysis =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Hyphy::SLAC; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/hyphy1.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'newick', -file => 't/data/hyphy1.tree'); my $slac = Bio::Tools::Run::Phylo::Hyphy::SLAC->new(); $slac->alignment($aln); $slac->tree($tree); my ($rc,$results) = $slac->run(); =head1 DESCRIPTION This is a wrapper around the SLAC analysis of HyPhy ([Hy]pothesis Testing Using [Phy]logenies) package of Sergei Kosakowsky Pond, Spencer V. Muse, Simon D.W. Frost and Art Poon. See http://www.hyphy.org for more information. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Hyphy::SLAC; use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Hyphy::Base; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::Phylo::Hyphy::Base); =head2 Default Values Valid and default values for SLAC are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. INCOMPLETE DOCUMENTATION OF ALL METHODS =cut =head2 valid_values Title : valid_values Usage : $factory->valid_values() Function: returns the possible parameters Returns: an array holding all possible parameters. The default values are always the first one listed. These descriptions are essentially lifted from the python wrapper or provided by the author. Args : None =cut sub valid_values { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; return ( {'geneticCode' => [ "Universal","VertebratemtDNA","YeastmtDNA","Mold/ProtozoanmtDNA", "InvertebratemtDNA","CiliateNuclear","EchinodermmtDNA","EuplotidNuclear", "Alt.YeastNuclear","AscidianmtDNA","FlatwormmtDNA","BlepharismaNuclear"]}, {'New/Restore' => [ "New Analysis", "Restore"]}, {'tempalnfile' => undef }, # aln file goes here {'Model Options' => [ { "Custom" => '010010' }, { "Default" => undef } ] }, {'temptreefile' => undef }, # tree file goes here {'Model Fit Results' => [ $null] }, # Windows have NUL instead of /dev/null {'dN/dS bias parameter' => [ { "Estimate dN/dS only" => undef }, { "Neutral" => undef }, { "Estimate" => undef }, { "Estimate + CI" => undef }, { "User" => '3' } ] }, {'Ancestor Counting' => [ 'Single Ancestor Counting','Weighted Ancestor Counting', 'Sample Ancestal States','Process Sampled Ancestal States', 'One rate FEL','Two rate FEL','Rate Distribution', 'Full site-by-site LRT','Multirate FEL'] }, {'SLAC Options' => ['Full tree','Tips vs Internals'] }, {'Treatment of Ambiguities' => ['Resolved','Averaged'] }, {'Test Statistic' => ['Approximate','Simulated Null'] }, {'Significance level' => '0.05' }, {'Output options' => 'Export to File' }, #we force a tsv file here {'outfile' => undef }, # outfile goes here {'Rate class estimator' => [ 'Skip','Count'] }, ); } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Hyphy::SLAC->new(); Function: Builds a new Bio::Tools::Run::Phylo::Hyphy::SLAC object Returns : Bio::Tools::Run::Phylo::Hyphy::SLAC Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of parameters (all passed to set_parameter) -executable => where the hyphy executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : my ($rc,$results) = $slac->run($aln); Function: run the slac analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, hash Args : L object, L object [optional] =cut sub run { my $self = shift; my $results = {}; my ($rc, $run_output) = $self->SUPER::run(); my $outfile = $self->outfile_name(); open(OUTFILE, "$outfile") or $self->throw("cannot open $outfile for reading"); my $readed_header = 0; my @elems; while (my $line = ) { if ($readed_header) { # SLAC results are tsv my @values = split("\t",$line); for my $i (0 .. (scalar(@values)-1)) { $elems[$i] =~ s/\n//g; push @{$results->{$elems[$i]}}, $values[$i]; } } else { @elems = split("\t",$line); $readed_header = 1; } } return ($rc, $results); } =head2 create_wrapper Title : create_wrapper Usage : $self->create_wrapper Function: It will create the wrapper file that interfaces with the analysis bf file Example : Returns : Args : =cut sub create_wrapper { my $self = shift; my $batchfile = "QuickSelectionDetection.bf"; $self->SUPER::create_wrapper($batchfile); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/LVB.pm000066400000000000000000000305161302566030400225750ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::LVB # # Created by Daniel Barker, based on ProtPars.pm by Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::LVB - Object for using the LVB program to create an array of L objects from a nucleotide multiple alignment file or a nucleotide SimpleAlign object. Works with LVB version 2.1. =head1 SYNOPSIS use Bio::Tools::Run::Phylo::LVB; # Create a SimpleAlign object. # NOTE. Aligning nucleotide sequence directly, as below, makes # sense for non-coding nucleotide sequence (e.g., structural RNA # genes, introns, ITS). For protein-coding genes, to prevent # Clustal intronducing frameshifts one should instead align the # translations of the genes, then convert the multiple alignment # to nucleotide by referring to the corresponding transcript # sequences (e.g., using EMBOSS tranalign). use Bio::Tools::Run::Alignment::Clustalw; $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new(quiet => 1); $inputfilename = "/Users/daniel/nuc.fa"; $aln = $aln_factory->align($inputfilename); # Create the tree or trees. $tree_factory = Bio::Tools::Run::Phylo::LVB->new(quiet => 1); @trees = $tree_factory->run($aln); # Or one can pass in a file name containing a nucleotide multiple # alignment in Phylip 3.6 format: $tree_factory = Bio::Tools::Run::Phylo::LVB->new(quiet => 1); $tree = $tree_factory->run("/Users/daniel/nuc.phy"); =head1 DESCRIPTION Wrapper for LVB, which uses a simulated annealing heuristic search to seek parsimonious trees from a nucleotide multiple alignment. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 PARAMETERS FOR LVB COMPUTATION =head2 FORMAT Title : FORMAT Description : (optional) When running LVB from a Phylip 3.6-format multiple alignment file, this specifies the layout of the file. It may be "interleaved" or "sequential". FORMAT is automatically set to "interleaved" if running from a SimpleAlign object. Defaults to "interleaved". =head2 GAPS Title : GAPS Description : (optional) LVB can treat gaps represented in the multiple alignment by "-" as either "fifthstate" or "unknown". "fifthstate" regards "-" as equivalent to "O", which is an unambiguous character state distinct from all nucleotides. "unknown" regards "-" as equivalent to "?", which is as an ambiguous site that may contain "A" or "C" or "G" or "T" or "O". Defaults to "unknown". =head2 SEED Title : SEED Description : (optional) This specifies the random number seed for LVB. SEED must be an integer in the range 0 to 900000000 inclusive. If no seed is specified, LVB takes a seed from the system clock. By default, no seed is specified. =head2 DURATION Title : DURATION Description : (optional) This specifies the duration of the analysis, which may be "fast" or "slow". "slow" causes LVB to perform a more thorough and more time-consuming search than "fast". Defaults to "slow". =head2 BOOTSTRAPS Title : BOOTSTRAPS Description : (optional) This specifies the number of bootstrap replicates to use, which must be a positive integer. Set bootstraps to 0 for no bootstrapping. Defaults to 0. =head1 AUTHOR Daniel Barker =head1 CONTRIBUTORS Email jason-AT-bioperl_DOT_org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::LVB; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @LVB_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Cwd; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Root::IO; use File::Copy; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable the LVB program. # You can set the path to the program through doing: # my @params('executable'=>'/usr/local/bin/lvb'); # my $lvb_factory = Bio::Tools::Run::Phylo::LVB->new(@params); # BEGIN { # NOTE. The order of the members of @LVB_PARAMS is vital! @LVB_PARAMS = qw(FORMAT GAPS SEED DURATION BOOTSTRAPS); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@LVB_PARAMS, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : ->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'lvb'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns undef Args : =cut sub program_dir { return undef; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); # set defaults $self->FORMAT("interleaved"); $self->GAPS("unknown"); $self->SEED(""); $self->DURATION("slow"); $self->BOOTSTRAPS(0); # re-set with user's values where specified my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 run Title : run Usage : $inputfilename = '/Users/daniel/nuc.phy'; @trees = $factory->run($inputfilename); Function: Create one or more LVB trees from a SimpleAlign object or a file containing a Phylip 3.6-format nucleotide multiple alignment. Example : Returns : Array of L objects Args : Name of a file containing a nucleotide multiple alignment in Phylip 3.6 format, or a SimpleAlign object =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for lvb. Probably bad input data in $input !");} # Create parameter string to pass to lvb program my $param_string = $self->_setparams(); # run lvb my @trees = $self->_run($infilename,$param_string); } =head2 create_tree Title : create_tree Usage : $inputfilename = '/Users/daniel/nuc.phy'; @trees = $factory->create_tree($inputfilename); Function: Create one or more LVB trees from a SimpleAlign object or a file containing a Phylip 3.6-format nucleotide multiple alignment. Example : Returns : Array of L objects Args : Name of a file containing a nucleotide multiple alignment in Phylip 3.6 format, or a SimpleAlign object =cut sub create_tree{ return shift->run(@_); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to lvb program Example : Returns : Array of Bio::Tree objects Args : Name of a file containing a multiple alignment in Phylip 3.6 format and a parameter string to be passed to LVB =cut sub _run { my ($self,$infile,$param_string) = @_; return unless( $self->executable ); my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $param_string; $self->debug( "Program ".$self->executable || ''."\n"); # create LVB's working copy of the input file, which must be named "infile" # NOTE, we cut trailing spaces since they can cause trouble with LVB 2.1 my $lvb_infile = $self->tempdir . "/infile"; open(LVB_SUB_RUN_TMP_IN_FH, "$infile"); open(LVB_SUB_RUN_TMP_OUT_FH, ">$lvb_infile"); while () { s/ +$//; print LVB_SUB_RUN_TMP_OUT_FH or $self->throw("output error on $lvb_infile"); } chdir($self->tempdir); #open a pipe to run lvb to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(LVB_PIPE,"|".$self->executable.">$null"); } else { open(LVB_PIPE,"|".$self->executable); } print LVB_PIPE $instring; close(LVB_PIPE); chdir($curpath); #get the results my $treefile = $self->tempdir . "/outtree"; $self->throw("LVB did not create treefile correctly") unless (-e $treefile); #create the trees my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick'); my @trees = (); while (my $tree = $in->next_tree()) { push @trees, $tree; } unless ( $self->save_tempfiles ) { # Clean up the temporary files created along the way... unlink $lvb_infile; unlink $treefile; } return @trees; } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for lvb program Example : Returns : name of file containing a multiple alignment in Phylip 3.6 format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh); # If $input is not a reference it better be the name of a # file with the sequence/ # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } # $input may be a SimpleAlign Object if ($input->isa("Bio::Align::AlignI")) { # Open temporary file for both reading & writing of BioSeq array ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip',idlength=>$10); $alnIO->write_aln($input); $alnIO->close(); close($tfh); $tfh = undef; unless ($self->format() =~ /^interleaved$/i) { $self->warn("resetting LVB format to interleaved"); $self->format("interleaved"); } return $alnfilename; } return 0; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for lvb program Example : Returns : parameter string to be passed to LVB Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr (@LVB_PARAMS) { $value = $self->$attr(); if ($attr =~/SEED/i) { $value = "" unless defined $value; $param_string .= "$value\n"; } elsif ($attr =~ /BOOTSTRAPS/i) { $value = 0 unless defined $value; $param_string .= "$value\n"; } else { # we want I for "interleaved" or S for "sequential", # U for "unknown" or F for "fifthstate", # F for "fast" or S for "slow" $param_string .= uc(substr $value, 0, 1) . "\n"; } } return $param_string; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Molphy/000077500000000000000000000000001302566030400230575ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Molphy/ProtML.pm000066400000000000000000000451371302566030400246040ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Molphy::ProtML # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Molphy::ProtML - A wrapper for the Molphy pkg app ProtML =head1 SYNOPSIS use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Molphy::ProtML; my %args = ( 'models' => 'jtt', 'search' => 'quick', 'other' => [ '-information', '-w'] ); my $verbose = 0; # change to 1 if you want some debugging output my $protml = Bio::Tools::Run::Phylo::Molphy::ProtML->new(-verbose => $verbose, -flags => \%args); die("cannot find the protml executable") unless $protml->executable; # read in a previously built protein alignment my $in = Bio::AlignIO->new(-format => 'clustalw', -file => 't/data/cel-cbr-fam.aln'); my $aln = $in->next_aln; $protml->alignment($aln); my ($rc,$results) = $protml->run(); # This may be a bit of overkill, but it is possible we could # have a bunch of results and $results is a # Bio::Tools::Phylo::Molphy object my $r = $results->next_result; # $r is a Bio::Tools::Phylo::Molphy::Result object my @trees; while( my $t = $r->next_tree ) { push @trees, $t; } print "search space is ", $r->search_space, "\n"; "1st tree score is ", $tree[0]->score, "\n"; my $out = Bio::TreeIO->new(-file => ">saved_MLtrees.tre", -format => "newick"); $out->write_tree($tree[0]); $out = undef; =head1 DESCRIPTION This is a wrapper for the exe from the Molphy (MOLecular PHYlogenetics) package by Jun Adachi & Masami Hasegawa. The software can be downloaded from L. Note that PHYLIP (Joe Felsenstein) also provides a version of protml which this module is currently NOT prepared to handle. Use the package available directly from MOLPHY authors if you want to use the module in its present implementation (extensions are welcomed!). The main components are the protml and nucml executables which are used to build maximum likelihood (ML) phylogenetic trees based on either protein or nucleotide sequences. Here are the valid input parameters, we have added a longhand version of the parameters to help you understand what each one does. Either the longhand or the original Molphy parameter will work. Bioperl Molphy Description Longhand parameter Model (one of these): --------------- jtt j Jones, Taylor & Thornton (1992) jtt-f jf JTT w/ frequencies dayhoff d Dahoff et al. (1978) dayhoff-f d dayhoff w/ frequencies mtrev24 m mtREV24 Adachi & Hasegwa (1995) mtrev24-f mf mtREV24 w/ frequencies poisson p Poisson proportional pf Proportional rsr r Relative Substitution Rate rsr-f rf RSR w/ frequencies frequencies f data frequencies Search Strategy (one of these): ---------------- usertrees u User trees (must also supply a tree) rearrangement R Local rearrangement lbp RX Local boostrap prob exhaustive e Exhaustive search star s Star decomposition search (may not be ML) quick q Quick Add OTU search (may not be ML) distance D ML Distance matrix --> NJDIST (need to supply NJDIST tree) Others (can be some or all of these): --------------- norell-bp b No RELL-BP minimumevolution M Minimum evolution sequential S Sequence is in Sequential format _OR_ interleaved I Sequence is in Interleaved format verbose v Verbose messages directed to STDERR information i Output some information (tree vals) w More some extra information (transition matricies, etc) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-AT-bioperl_DOT_org =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Molphy::ProtML; use vars qw(@ISA $PROGRAMNAME $PROGRAM $MINNAMELEN %VALIDVALUES %VALIDFLAGS); use strict; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Phylo::Molphy; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); BEGIN { $MINNAMELEN = 25; %VALIDFLAGS = ( 'models' => { # models jtt => 'j', # Jones, Taylor & Thornton (1992) 'jtt-f' => 'jf', # jtt w/ frequencies dayhoff => 'd', # Dahoff et al. (1978) 'dayhoff-f' => 'df', # dayhoff w/ frequencies mtrev24 => 'm', # Adachi & Hasegwa (1995) 'mtrev24-f' => 'mf', # mtREV24 w/ frequencies poisson => 'p', # Poisson proportional => 'pf', # Proportional rsr => 'r', # Relative Substitution Rate 'rsr-f' => 'rf', # RSR w/ frequencies frequencies => 'f', # data frequencies }, 'search' => { # search strategy usertrees => 'u', # must also supply tree rearrangement => 'R', # local rearrangement lbp => 'RX', # local boostrap prob exhaustive => 'e', # exhaustive star => 's', # star decomposition search (may not be ML) quick => 'q', # quick add OTU search (may not be ML) distance => 'D', # ML Distance matrix --> NJDIST }, 'others' => { # others 'norell-bp' => 'b', sequential => 'S', # sequential format interleaved => 'I', # interleaved format minimumevolution => 'M', # minimum evolution verbose => 'v', # verbose to stderr information => 'i', # output some information w => 'w', # some extra information } ); # this will allow for each of the parameters to also accept the original # protML params my @toadd; foreach my $type ( keys %VALIDFLAGS ) { my @keys = keys %{ $VALIDFLAGS{$type} }; for my $k ( @keys ) { my $v = $VALIDFLAGS{$type}->{$k}; $VALIDFLAGS{$type}->{$v} = $v; } } %VALIDVALUES = (num_retained => sub { my $a = shift; if( $a =~ /^\d+$/) { return 'n'; }}, # should be a number percent_retained => sub { my $a = shift; if( $a =~ /^\d+$/ && $a >= 0 && $a <= 100) { return 'P'; }} ); } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'protml'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{MOLPHYDIR}) if $ENV{MOLPHYDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Molphy::ProtML->new(); Function: Builds a new Bio::Tools::Run::Phylo::Molphy::ProtML object Returns : Bio::Tools::Run::Phylo::Molphy::ProtML Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of PAML parameters (all passed to set_parameter) -executable => where the protml executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{'_protmlparams'} = {}; $self->{'_protmlflags'} = {}; my ($aln, $tree, $st, $flags, $params, $exe) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES FLAGS PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree ); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); if( defined $flags ) { if( ref($flags) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { foreach my $type ( keys %$flags ) { if( $type =~ /other/i ) { foreach my $flag ( @{$flags->{$type}} ) { $self->set_flag('others', $flag) ; } } else { $self->set_flag($type, $flags->{$type}) ; } } } } if( defined $params ) { if( ref($flags) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 run Title : run Usage : $protml->run(); Function: run the protml analysis using the default or updated parameters the alignment parameter must have been set Returns : Bio::Tools::Phylo::Molphy Args : =cut sub run { my ($self) = @_; unless ( $self->save_tempfiles ) { $self->cleanup(); } my $align = $self->alignment(); if( ! $align ) { $self->warn("must have provided a valid alignment object"); return -1; } if( $align->get_seq_by_pos(1)->alphabet ne 'protein' ) { $self->warn("Must have provided a valid protein alignment"); return -1; } my %params = $self->get_parameters; my %flags = $self->get_flags(); my $cmdstring = $self->executable; if( ! defined $flags{'search'} ) { $self->warn("Must have set a valid 'search' flag to run protml this is one of ".join(",", keys %{$VALIDFLAGS{'search'}})); return; } my $tree = $self->tree; for my $t ( keys %flags ) { if( $t eq 'others' ) { $cmdstring .= " " . join(" ", map { '-'.$_ } keys %{$flags{$t}}); } else { next if $flags{$t} eq 'u'; $cmdstring .= " -".$flags{$t}; } } while( my ($param,$val) = each %params ) { $cmdstring .= " \-$param $val"; } my ($tmpdir) = $self->tempdir(); my ($tempseqFH,$tempseqfile) = $self->io->tempfile ('DIR' => $tmpdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'phylip', '-fh' => $tempseqFH, '-interleaved' => 0, '-idlinebreak' => 1, '-idlength' => $MINNAMELEN > $align->maxdisplayname_length() ? $MINNAMELEN : $align->maxdisplayname_length() +1); $alnout->write_aln($align); $alnout->close(); $alnout = undef; close($tempseqFH); $tempseqFH = undef; $cmdstring .= " $tempseqfile"; if( $tree && defined $flags{'search'} eq 'u' ) { my ($temptreeFH,$temptreefile) = $self->io->tempfile ('DIR' => $tmpdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); $cmdstring .= " $temptreefile"; } $self->debug( "cmdstring is $cmdstring\n"); unless( open(PROTML, "$cmdstring |") ) { $self->warn("Cannot run $cmdstring"); return undef; } my $parser= Bio::Tools::Phylo::Molphy->new(-fh => \*PROTML); return (1,$parser); } =head2 alignment Title : alignment Usage : $protml->align($aln); Function: Get/Set the Bio::Align::AlignI object Returns : Bio::Align::AlignI object Args : [optional] Bio::Align::AlignI Comment : We could potentially add support for running directly on a file but we shall keep it simple See also : L, L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function"); return undef; } $self->{'_alignment'} = $aln; } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $protml->tree($tree, %params); Function: Get/Set the Bio::Tree::TreeI object Returns : Bio::Tree::TreeI Args : [optional] $tree => Bio::Tree::TreeI, Comment : We could potentially add support for running directly on a file but we shall keep it simple See also : L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head2 get_flags Title : get_flags Usage : my @params = $protml->get_flags(); Function: returns the list of flags Returns : array of flag names coded in the way that Args : none =cut sub get_flags{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_protmlflags'} }; } =head2 set_flag Title : set_flag Usage : $protml->set_parameter($type,$val); Function: Sets a protml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $protml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $type => name of the parameter This can be one of 'search', 'model', 'other' $value => flag value See also: L =cut sub set_flag{ my ($self,$type,$param) = @_; $type = lc($type); while( substr($type,0,1) eq '-') { # handle multiple '-' substr($type,0,1,''); } if( ! defined $type || ! defined $param ) { $self->debug("Must supply a type and param when setting flag"); return 0; } if( ! $VALIDFLAGS{$type} ) { $self->warn("$type is an unrecognized type"); } $param = lc($param); while( substr($param,0,1) eq '-') { # handle multiple '-' substr($param,0,1,''); } if(! $self->no_param_checks && ! defined $VALIDFLAGS{$type}->{$param} ) { $self->warn("unknown flag ($type) $param will not be set unless you force by setting no_param_checks to true"); return 0; } if($type eq 'others' ) { $self->{'_protmlflags'}->{$type}->{$VALIDFLAGS{$type}->{$param} || $param} = 1; } else { $self->{'_protmlflags'}->{$type} = $VALIDFLAGS{$type}->{$param} || $param; } return 1; } =head2 get_parameters Title : get_parameters Usage : my %params = $protml->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_protmlparams'} }; } =head2 set_parameter Title : set_parameter Usage : $protml->set_parameter($param,$val); Function: Sets a protml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $protml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; $param = lc($param); $param =~ s/^\-//; if(! $self->no_param_checks && ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } my $paramflag = $VALIDVALUES{$param}->($value); if( $paramflag ) { $self->{'_protmlparams'}->{$paramflag} = $value; } else { print "value $value was not valid for param $param\n"; return 0; } return 1; } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $protml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $protml->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Njtree/000077500000000000000000000000001302566030400230365ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Njtree/Best.pm000066400000000000000000000354511302566030400243010ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Njtree::Best # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Njtree::Best - Wrapper around the Njtree (Njtree/phyml) best program. =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Njtree::Best; use Bio::AlignIO; use Bio::TreeIO; my $alignio = Bio::AlignIO->new(-format => 'fasta', -file => 't/data/njtree_aln2.nucl.mfa'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new( -format => 'nhx', -file => 't/data/species_tree_njtree.nh'); my $tree = $treeio->next_tree; my $njtree_best = Bio::Tools::Run::Phylo::Njtree::Best->new(); $njtree_best->alignment($aln); $njtree_best->tree($tree); my $nhx_tree = $njtree_best->run(); =head1 DESCRIPTION This is a wrapper around the best program of Njtree by Li Heng. See http://treesoft.sourceforge.net/njtree.shtml for more information. Wrapper for the calculation of a reconciled phylogenetic tree with inferred duplication tags from amultiple sequence alignment and a species tree using NJTREE. =head2 Helping the module find your executable You will need to enable NJTREEDIR to find the njtree program. This can be done in (at least) three ways: 1. Make sure the njtree executable is in your path (i.e. 'which njtree' returns a valid program 2. define an environmental variable NJTREEDIR which points to a directory containing the 'njtree' app: In bash export NJTREEDIR=/home/progs/treesoft/njtree or In csh/tcsh setenv NJTREEDIR /home/progs/treesoft/njtree 3. include a definition of an environmental variable NJTREEDIR in every script that will BEGIN {$ENV{NJTREEDIR} = '/home/progs/treesoft/njtree'; } use Bio::Tools::Run::Phylo::Njtree::Best; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Njtree::Best; use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM @NJTREE_BEST_PARAMS @NJTREE_BEST_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @NJTREE_BEST_PARAMS = qw(C p F c k a d l L b); @NJTREE_BEST_SWITCHES = qw(P S A r D s g N); # Authorize attribute fields foreach my $attr ( @NJTREE_BEST_PARAMS, @NJTREE_BEST_SWITCHES ) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'njtree'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{NJTREEDIR}) if $ENV{NJTREEDIR}; } =head2 new Title : new Usage : my $njtree_best = Bio::Tools::Run::Phylo::Njtree::Best->new(); Function: Builds a new Bio::Tools::Run::Phylo::Njtree::Best Returns : Bio::Tools::Run::Phylo::Njtree::Best Args : -alignment => the Bio::Align::AlignI object -tree => the Bio::Tree::TreeI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -executable => where the njtree executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); return $self; } =head2 prepare Title : prepare Usage : my $rundir = $njtree_best->prepare(); Function: prepare the njtree_best analysis using the default or updated parameters the alignment parameter and species tree must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$aln,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("Must have supplied a valid alignment file in order to run njtree_best"); return 0; } if( ! $tree ) { $self->warn("Must have supplied a valid species tree file in order to run njtree_best"); return 0; } my ($tempdir) = $self->tempdir(); my $tempalnFH; if( ! ref($aln) && -e $aln ) { $self->{_tempalnfile} = $aln; } else { ($tempalnFH,$self->{_tempalnfile}) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'fasta', '-fh' => $tempalnFH); $aln->set_displayname_flat(1); $alnout->write_aln($aln); $alnout->close(); undef $alnout; close($tempalnFH); } my ($temptreeFH); if( ! ref($tree) && -e $tree ) { $self->{_temptreefile} = $tree; } else { ($temptreeFH,$self->{_temptreefile}) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } $self->{_prepared} = 1; $self->{_njtree_best_params} = $self->_setparams(); return $tempdir; } =head2 run Title : run Usage : my $nhx_tree = $njtree_best->run(); Function: run the njtree_best analysis using the default or updated parameters the alignment parameter must have been set Returns : L object [optional] Args : L object L object =cut sub run { my ($self,$aln,$tree) = @_; $self->prepare($aln,$tree) unless (defined($self->{_prepared})); my ($rc,$nhx_tree) = (1); my ($tmpdir) = $self->tempdir(); my $outfile = $self->outfile_name; { my $commandstring; my $exit_status; #./njtree best [other_params] -f species_file.nh -p tree -o inputfile.best.nhx inputfile.nucl.mfa my $njtree_executable = $self->executable; $commandstring = $njtree_executable." best "; $commandstring .= $self->{_njtree_best_params}; $commandstring .= " -f $self->{_temptreefile} -p tree -o "; unless ($self->outfile_name ) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } $commandstring .= $self->outfile_name; $commandstring .= " $self->{_tempalnfile} "; $self->throw("unable to find or run executable for 'njtree'") unless $njtree_executable && -e $njtree_executable && -x _; open(RUN, "$commandstring |") or $self->throw("Cannot run $commandstring"); my @output = ; $exit_status = close(RUN); $self->error_string(join('',@output)); if( (grep { /^\[ /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $nhx_tree = Bio::TreeIO->new(-file => "$tmpdir/$outfile", -format => 'nhx'); }; if( $@ ) { $self->warn($self->error_string); } } unless ( $self->save_tempfiles ) { $self->cleanup(); } return ($rc,$nhx_tree); } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = $attr; # aliasing $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; $string =~ /Version\:\s+(\d+.\d+.\d+)/m; return $1 || undef; } =head2 alignment Title : alignment Usage : $njtree_best->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment { my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $njtree_best->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, [optional] %parameters => hash of tree-specific parameters Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head2 check_names Title : check_names Usage : Function: Example : Returns : Args : =cut sub check_names { my $self = shift; my $tree = $self->tree; my $aln = $self->alignment; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run njtree_best"); return 0; } if( ! $tree ) { $self->warn("must have supplied a valid species tree file in order to run njtree_best"); return 0; } foreach my $leaf ($tree->get_leaf_nodes) { my $id = $leaf->id; $id =~ s/\-\*.+//; # njtree does not consider anything after a \-\* $self->{_treeids}{$id} = 1; } foreach my $seq ($aln->each_seq) { my $id = $seq->id; $id =~ s/.+\_//; # njtree only looks at the right side of the \_ $self->{_alnids}{$id} = 1; } foreach my $alnid (keys %{$self->{_alnids}}) { $self->{_unmappedids}{$alnid} = 1 unless (defined($self->{_treeids}{$alnid})); } if (defined($self->{_unmappedids})) { my $count = scalar(keys%{$self->{_unmappedids}}); my $unmapped = join(",",keys %{$self->{_unmappedids}}); $self->warn("$count unmapped ids between the aln and the tree $unmapped"); } } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for njtree_best program Example : Returns : parameter string to be passed to njtree_best during align or profile_align Args : name of calling object =cut sub _setparams { my ($self) = @_; my ($attr, $value,$param_string); $param_string = ''; my $laststr; for $attr ( @NJTREE_BEST_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = $attr; $attr_key = ' -'.$attr_key; $param_string .= $attr_key .' '.$value; } for $attr ( @NJTREE_BEST_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = $attr; $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } return $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $njtree_best->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $njtree_best->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/PAML/000077500000000000000000000000001302566030400223405ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/PAML/Baseml.pm000066400000000000000000000473331302566030400241130ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::PAML::Baseml # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::PAML::Baseml - Wrapper aroud the PAML program baseml =head1 SYNOPSIS use Bio::Tools::Run::Phylo::PAML::Baseml; use Bio::AlignIO; my $alignio = Bio::AlignIO->new(-format => 'phylip', -file => 't/data/gf-s85.phylip'); my $aln = $alignio->next_aln; my $bml = Bio::Tools::Run::Phylo::PAML::Baseml->new(); $bml->alignment($aln); my ($rc,$parser) = $bml->run(); while( my $result = $parser->next_result ) { my @otus = $result->get_seqs(); my $MLmatrix = $result->get_MLmatrix(); # 0 and 1 correspond to the 1st and 2nd entry in the @otus array } =head1 DESCRIPTION This is a wrapper around the baseml program of PAML (Phylogenetic Analysis by Maximum Likelihood) package of Ziheng Yang. See http://abacus.gene.ucl.ac.uk/software/paml.html for more information. This module will generate a proper baseml.ctl file and will run the program in a separate temporary directory to avoid creating temp files all over the place and will cleanup after itself.. The values you can feed to the configuration file are documented here. 'noisy' => [ 0..3,9], 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much 'runmode' => [0..5], # for runmode # 0: use the provided tree structure(s) in treefile # 1,2: mean heuristic search by star-decomposition alg # 2: starts from star tree while 1 reads a multifurcating # tree from treefile and ties to estimate the best # bifurcating tree # 3: stepwise addition # 4: NNI perturbation with the starting tree # Tree search DOES NOT WORK WELL so estimate a tree # using other programs first 'model' => '0', # for model # 0: JC69 (uncorrected) # 1: K80 (transitions/transversion weighted differently) # 2: F81 # 3: F84 # 4: HKY85 # 5: T92 (Tamura 92) # 6: TN93 (Tajima-Nei) correct for multiple substitutions # 7: REV (aka GTR) # 8: UNREST # 9: REVu #10: UNRESTu # See Yang 1994 JME 39:105-111 # model 8 special case of the REV model # model 9 is special case of unrestricted model # can also supply special rate parameters # so for example (from pamlDOC.pdf # $model = '8 [2 (CT) (AG)]'; # TN93 # $model = '8 [2 (TA AT TG CA CG) (AG)]'; # TN93 # $model = '9 [1 (TC CT AG GA)]; # K80 # $model = '9 [0]'; # JC69 # $model = '9 [11 (TA) (TG) (CT) (CA) (CG) (AT) (AC) (AG) (GT) (GC) (GA)], 'outfile' => 'mlb', 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa 'kappa' => '2.5', # initial or fixed kappa 'fix_alpha'=> [1,0], # 0: estimate gamma shape param # 1: fix it at alpha 'alpha' => '0', # initial of fixed alpha # 0: infinity (constant rate) 'Malpha' => [0,1], # different alphas for genes 'fix_rho'=> [1,0], # 0: estimate gamma shape param # 1: fix it at alpha 'rho' => '0', # initial of fixed alpha # 0: infinity (constant rate) 'ncatG' => '5', # number of categories in the dD,AdG, or nparkK models of rates 'nparK' => [0..4], # rate-class models # 1:rk 2:rk&fK # 3:rK&MK(1/K) 4:rK&MK 'nhomo' => [0..4], # 0 & 1: homogeneous, # 2: kappa for brances # 3:N1 4:N2 'getSE' => [0,1], 'RateAncestor' => [1,0,2], # rates (alpha > 0) or # ancestral states 'cleandata' => [1,0], # remove sites with # ambiguity data (1:yes or 0:no) 'fix_blength' => [-1,0,1,2], # 0: ignore, -1: random, # 1: initial, 2: fixed # 'icode' => [ 0..10], # (with RateAncestor=1. #try "GC" in data,model=4,Mgene=4) 'ndata' => [5,1..10], 'clock' => [0..3], # 0: no clock, 1: clock, 2: local clock, 3: CombinedAnalysis 'Small_Diff' => '1e-6', #underflow issues? =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 CONTRIBUTORS Sendu Bala - bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::PAML::Baseml; use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM); use strict; use Cwd; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Phylo::PAML; use base qw(Bio::Tools::Run::Phylo::PhyloBase); BEGIN { $MINNAMELEN = 25; $PROGRAMNAME = 'baseml' . ($^O =~ /mswin/i ?'.exe':''); if( defined $ENV{'PAMLDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'PAMLDIR'},$PROGRAMNAME); } # valid values for parameters, the default one is always # the first one in the array # much of the documentation here is lifted directly from the baseml.ctl # example file provided with the package %VALIDVALUES = ( 'noisy' => [ 0..3,9], 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much 'runmode' => [0..5], # for runmode # 0: use the provided tree structure(s) in treefile # 1,2: mean heuristic search by star-decomposition alg # 2: starts from star tree while 1 reads a multifurcating # tree from treefile and ties to estimate the best # bifurcating tree # 3: stepwise addition # 4: NNI perturbation with the starting tree # Tree search DOES NOT WORK WELL so estimate a tree # using other programs first 'model' => [5, 0..8], # for model # 0: JC69 (uncorrected) # 1: K80 (transitions/transversion weighted differently) # 2: F81 # 3: F84 # 4: HKY85 # 5: T92 (Tamura 92) # 6: TN93 (Tajima-Nei) correct for multiple substitutions # 7: REV (aka GTR) # 8: UNREST # See Yang 1994 JME 39:105-111 # model 8 special case of the REV model # model 9 is special case of unrestricted model # can also supply special rate parameters # so for example (from pamlDOC.pdf # $model = '8 [2 (CT) (AG)]'; # TN93 # $model = '8 [2 (TA AT TG CA CG) (AG)]'; # TN93 # $model = '9 [1 (TC CT AG GA)]; # K80 # $model = '9 [0]'; # JC69 # $model = '9 [11 (TA) (TG) (CT) (CA) (CG) (AT) (AC) (AG) (GT) (GC) (GA)], 'outfile' => 'mlb', 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa 'kappa' => '2.5', # initial or fixed kappa 'fix_alpha'=> [1,0], # 0: estimate gamma shape param # 1: fix it at alpha 'alpha' => '0', # initial of fixed alpha # 0: infinity (constant rate) 'Malpha' => [0,1], # different alphas for genes 'fix_rho'=> [1,0], # 0: estimate gamma shape param # 1: fix it at alpha 'rho' => '0', # initial of fixed alpha # 0: infinity (constant rate) 'ncatG' => '5', # number of categories in the dD,AdG, or nparkK models of rates 'nparK' => [0..4], # rate-class models # 1:rk 2:rk&fK # 3:rK&MK(1/K) 4:rK&MK 'nhomo' => [0..4], # 0 & 1: homogeneous, # 2: kappa for brances # 3:N1 4:N2 'getSE' => [0,1], 'RateAncestor' => [0,1,2], # rates (alpha > 0) or # ancestral states 'cleandata' => [1,0], # remove sites with # ambiguity data (1:yes or 0:no) 'fix_blength' => [0,-1,1,2], # 0: ignore, -1: random, # 1: initial, 2: fixed 'icode' => [ 0..10], # (with RateAncestor=1. #try "GC" in data,model=4,Mgene=4) 'ndata' => [1..10], 'clock' => [0..3], # 0: no clock, 1: clock, 2: local clock, 3: CombinedAnalysis 'Small_Diff' => '1e-6', #underflow issues? 'Mgene' => [0..4], # 0:rates, 1:separate; 2:diff pi, 3:diff kapa, 4:all diff ); } =head2 program_name Title : program_name Usage : $obj->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PAMLDIR}) if $ENV{PAMLDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::PAML::Baseml->new(); Function: Builds a new Bio::Tools::Run::Phylo::PAML::Baseml object Returns : Bio::Tools::Run::Phylo::PAML::Baseml Args : -alignment => the L object -tree => the L object if you want to use runmode 0 or 1 -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln,$tree,$st) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); return $self; } =head2 run Title : run Usage : $yn->run(); Function: run the Baseml analysis using the default or updated parameters the alignment parameter must have been set Returns : 3 values, $rc = 1 for success, 0 for errors hash reference of the Yang calculated Ka/Ks values this is a set of pairwise observations keyed as sequencenameA->sequencenameB->datatype hash reference same as the previous one except it for the Nei and Gojobori calculated Ka,Ks,omega values Args : optionally, a value appropriate for alignment() and one for tree() NB : Since Baseml doesn't handle spaces in tree node ids, if a tree is in use spaces will be converted to underscores in both the tree node ids and alignment sequence ids. =cut sub run { my ($self, $aln, $tree) = @_; $aln = $self->alignment($aln) if $aln; $tree = $self->tree($tree) if $tree; $aln ||= $self->alignment(); $tree ||= $self->tree(); my %params = $self->get_parameters; if( ! $aln ) { $self->warn("must have supplied a valid aligment file in order to run baseml"); return 0; } if ((defined $params{runmode} && ($params{runmode} == 0 || $params{runmode} == 1)) && ! $tree) { $self->warn("must have supplied a tree in order to run baseml in runmode 0 or 1"); return 0; } # replace spaces with underscores in ids, since baseml really doesn't like # spaces (actually, the resulting double quotes) in tree ids if ($tree) { my $changed = 0; foreach my $thing ($aln->each_seq, $tree ? $tree->get_leaf_nodes : ()) { my $id = $thing->id; if ($id =~ / /) { $id =~ s/\s+/_/g; $thing->id($id); $changed = 1; } } if ($changed) { my $new_aln = $aln->new; foreach my $seq ($aln->each_seq) { $new_aln->add_seq($seq); } $aln = $new_aln; $aln = $self->alignment($aln); $tree = $self->tree($tree); } # check node and seq names match $self->_check_names; } # output the alignment and tree to tempfiles my $tempseqfile = $self->_write_alignment('phylip', -interleaved => 0, -idlinebreak => 1, -line_length => 60, -wrap_sequential => 1, -idlength => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1); $tree = $self->_write_tree() if $tree; # now let's print the baseml.ctl file. # many of the these programs are finicky about what the filename is # and won't even run without the properly named file. Ack my $tmpdir = $self->tempdir(); my $baseml_ctl = "$tmpdir/baseml.ctl"; open(my $mlfh, ">$baseml_ctl") or $self->throw("cannot open $baseml_ctl for writing"); print $mlfh "seqfile = $tempseqfile\n"; print $mlfh "treefile = $tree\n" if $tree; my $outfile = $self->outfile_name; print $mlfh "outfile = $outfile\n"; while( my ($param,$val) = each %params ) { next if $param eq 'outfile'; print $mlfh "$param = $val\n"; } close($mlfh); my ($rc,$parser) = (1); { my $cwd = cwd(); my $exit_status; chdir($tmpdir); my $ynexe = $self->executable(); $self->throw("unable to find executable for 'baseml'") unless $ynexe; open(my $run, "$ynexe |"); my @output = <$run>; $exit_status = close($run); $self->error_string(join('', grep { /\berr(or)?: /io } @output)); if ($self->error_string || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $parser = Bio::Tools::Phylo::PAML->new(-file => "$tmpdir/mlb", -dir => "$tmpdir"); }; if( $@ ) { $self->warn($self->error_string); } chdir($cwd); } if( $self->verbose > 0 ) { open(my $in, "$tmpdir/mlb"); while(<$in>) { $self->debug($_); } close($in); } return ($rc,$parser); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { chomp($value); $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $baseml->alignment($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment{ my $self = shift; return $self->_alignment(@_); } sub tree { my $self = shift; return $self->_tree(@_); } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_basemlparams'} }; } =head2 set_parameter Title : set_parameter Usage : $baseml->set_parameter($param,$val); Function: Sets a baseml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if on turns of param checks like this: $baseml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $paramname => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; if( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not set unless you force by setting no_param_checks to true"); return 0; } if( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { my %allowed = map { $_ => 1 } @{ $VALIDVALUES{$param} }; unless ( exists $allowed{$value} ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } $self->{'_basemlparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $baseml->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values NB : using this isn't an especially good idea! You don't need to do anything to end up using default parameters: hence 'default'! =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_basemlparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_basemlparams'}->{$param} = $val->[0]; } else { $self->{'_basemlparams'}->{$param} = $val; } } } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $baseml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name { my $self = shift; if( @_ ) { return $self->{'_basemlparams'}->{'outfile'} = shift @_; } unless (defined $self->{'_basemlparams'}->{'outfile'}) { $self->{'_basemlparams'}->{'outfile'} = 'mlb'; } return $self->{'_basemlparams'}->{'outfile'}; } =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $baseml->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/PAML/Codeml.pm000066400000000000000000000637061302566030400241150ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::PAML::Codeml # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::PAML::Codeml - Wrapper aroud the PAML program codeml =head1 SYNOPSIS use Bio::Tools::Run::Phylo::PAML::Codeml; use Bio::AlignIO; my $alignio = Bio::AlignIO->new(-format => 'phylip', -file => 't/data/gf-s85.phylip'); my $aln = $alignio->next_aln; my $codeml = Bio::Tools::Run::Phylo::PAML::Codeml->new(); $codeml->alignment($aln); my ($rc,$parser) = $codeml->run(); my $result = $parser->next_result; my $MLmatrix = $result->get_MLmatrix(); print "Ka = ", $MLmatrix->[0]->[1]->{'dN'},"\n"; print "Ks = ", $MLmatrix->[0]->[1]->{'dS'},"\n"; print "Ka/Ks = ", $MLmatrix->[0]->[1]->{'omega'},"\n"; =head1 DESCRIPTION This is a wrapper around the codeml program of PAML (Phylogenetic Analysis by Maximum Likelihood) package of Ziheng Yang. See http://abacus.gene.ucl.ac.uk/software/paml.html for more information. This module is more about generating the properl codeml.ctl file and will run the program in a separate temporary directory to avoid creating temp files all over the place. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl-dot-org =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::PAML::Codeml; use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM); use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Phylo::PAML; use Cwd; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values Valid and default values for codeml programs are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the example codeml.ctl file and pamlDOC documentation provided by the author. B specifies the equilibrium codon frequencies in codon substitution model. These frequencies can be assumed to be equal (1/61 each for the standard genetic code, B = 0), calculated from the average nucleotide frequencies (B = 1), from the average nucleotide frequencies at the three codon positions (B = 2), or used as free parameters (B = 3). The number of parameters involved in those models of codon frequencies is 0, 3, 9, and 60 (under the universal code), for B = 0, 1, 2, and 3 respectively. B specifies whether equal amino acid distances are assumed (= 0) or Grantham's matrix is used (= 1) (Yang et al. 1998). B = -2 performs ML estimation of dS and dN in pairwise comparisons. The program will collect estimates of dS and dN into the files 2ML.dS and 2ML.dN. Since many users seem interested in looking at dN /dS ratios among lineages, examination of the tree shapes indicated by branch lengths calculated from the two rates may be interesting although the analysis is ad hoc. If your species names have no more than 10 characters, you can use the output distance matrices as input to Phylip programs such as neighbor without change. Otherwise you need to edit the files to cut the names short. B concerns assumptions about the dN/dS rate ratios among branches (Yang 1998; Yang and Nielsen 1998). B =0 means a single dN/dS ratio for all lineages (branches), 1 means one ratio for each branch (free ratio model), and 2 means arbitrary number of rations (such as the 2-ratios or 3-ratios models. with B =2, you may specify the omega ratios for the branches using branch labels (read about the tree structure file in the document). This option seems rather easy to use. Otherwise, the program will ask the user to input a branch mark for the dN/dS ratio assumed for each branch. This should be an integral number between 0 to k - 1 if k different dN/dS ratios (omega_0 - omega_k - 1) are assumed for the branches of the tree. B note basically, doing this interactively is not going to work very well, so this module is really focused around using the 0 or 1 parameters. Read the program documentation if you'd like some more detailed instructions. B specifies models that allow the dN/dS ratio (omega) to vary among sites (Nielsen and Yang 1998, Yang et al. 2000) B = m corresponds to model Mm in Yang et al (2000). The variable B is used to specify the number of categories in the omega distribution under some models. The values of ncatG() used to perform our analyses are 3 for M3 (discrete), 5 for M4 (freq), 10 for the continuous distributions (M5: gamma, M6: 2gamma, M7: beta, M8:beta&w, M9:beta&gamma, M10: beta&gamma+1, M11:beta&normal>1, and M12:0&2normal>1, M13:3normal>0). This means M8 will have 11 site classes (10 from the beta distribution plus 1 additional class). The posterior probabilities for site classes as well as the expected omega values for sites are listed in the file rst, which may be useful to pinpoint sites under positive selection, if they exist. To make it easy to run several B models in one go, the executable L can be used, which asks you how many and which models to run at the start of the program. The number of categories used will then match those used in Yang et al(2000). As noted in that paper, some of the models are hard to use, in particular, M12 and M13. Recommended models are 0 (one-ratio), 1 (neutral), 2 (selection), 3 (discrete), 7 (beta), and 8 (beta&omega ). Some of the models like M2 and M8 are noted to be prone to the problem of multiple local optima. You are advised to run the program at least twice, once with a starting omega value E1 and a second time with a value E1, and use the results corresponding to the highest likelihood. The continuous neutral and selection models of Nielsen and Yang (1998) are not implemented in the program. B for genetic code and these correspond to 1-11 in the genbank transl table. 0:universal code 1:mamalian mt 2:yeast mt 3:mold mt, 4:invertebrate mt 5:ciliate nuclear 6:echinoderm mt 7:euplotid mt 8:alternative yeast nu. 9:ascidian mt 10:blepharisma nu B For codon sequences, ancestral reconstruction is not implemented for the models of variable dN/dS ratios among sites. The output under codon-based models usually shows the encoded amino acid for each codon. The output under "Prob of best character at each node, listed by site" has two posterior probabilities for each node at each codon (amino acid) site. The first is for the best codon. The second, in parentheses, is for the most likely amino acid under the codon substitution model. This is a sum of posterior probabilities across synonymous codons. In theory it is possible although rare for the most likely amino acid not to match the most likely codon. B for codon sequences (seqtype = 1): The codon frequencies in each sequence are counted and listed in a genetic code table, together with their sums across species. Each table contains six or fewer species. For data of multiple genes (option G in the sequence file), codon frequencies in each gene (summed over species) are also listed. The nucleotide distributions at the three codon positions are also listed. The method of Nei and Gojobori (1986) is used to calculate the number of synonymous substitutions per synonymous site (dS ) and the number of nonsynonymous substitutions per nonsynonymous site (dN ) and their ratio (dN /dS ). These are used to construct initial estimates of branch lengths for the likelihood analysis but are not MLEs themselves. Note that the estimates of these quantities for the a- and b-globin genes shown in Table 2 of Goldman and Yang (1994), calculated using the MEGA package (Kumar et al., 1993), are not accurate. Results of ancestral reconstructions (B = 1) are collected in the file rst. Under models of variable dN/dS ratios among sites (NSsites models), the posterior probabilities for site classes as well as positively selected sites are listed in rst. INCOMPLETE DOCUMENTATION OF ALL METHODS =cut BEGIN { $MINNAMELEN = 25; $PROGRAMNAME = 'codeml' . ($^O =~ /mswin/i ?'.exe':''); if( defined $ENV{'PAMLDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'PAMLDIR'},$PROGRAMNAME). ($^O =~ /mswin/i ?'.exe':'');; } # valid values for parameters, the default one is always # the first one in the array # much of the documentation here is lifted directly from the codeml.ctl # example file provided with the package %VALIDVALUES = ( 'outfile' => 'mlc', 'noisy' => [ 0..3,9], 'verbose' => [ 1,0,2], # 0:concise, 1:detailed, 2:too much # (runmode) 0:user tree, 1:semi-autmatic, 2:automatic # 3:stepwise addition, 4,5:PerturbationNNI # -2:pairwise 'runmode' => [ -2, 0..5], 'seqtype' => [ 1..3], # 1:codons, 2:AAs, 3:codons->AAs 'CodonFreq' => [ 2, 0,1,3,4,5,6,7], # 0:1/61 each, 1:F1X4, # 2:F3X4, 3:codon table # (aaDist) 0:equal, +:geometric, -:linear, # 1-6:G1974,Miyata, c,p,v,a 'aaDist' => [ 0,'+','-', 1..6], # (aaRatefile) only used for aa seqs # with model=empirical(_F) # default is usually 'wag.dat', also # dayhoff.dat, jones.dat, mtmam.dat, or your own 'aaRatefile' => 'wag.dat', # (model) models for codons # 0: one, 1:b, 2:2 or more dN/dS ratios for branches 'model' => [0..3,7], # (NSsites) number of S sites # 0: one w;1:neutral;2:selection; 3:discrete;4:freqs; # 5:gamma;6:2gamma;7:beta;8:beta&w;9:betaγ # 10:beta&gamma+1; 11:beta&normal>1; 12:0&2normal>1; # 13:3normal>0 'NSsites' => [0..13], # (icode) genetic code # 0:universal code # 1:mamalian mt # 2:yeast mt # 3:mold mt, # 4:invertebrate mt # 5:ciliate nuclear # 6:echinoderm mt # 7:euplotid mt # 8:alternative yeast nu. # 9:ascidian mt #10:blepharisma nu # these correspond to 1-11 in the genbank transl table 'icode' => [ 0..10], 'Mgene' => [0,1], # 0:rates, 1:separate 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa 'kappa' => '2', # initial or fixed kappa 'fix_omega'=> [0,1], # 0: estimate omega, 1: fix omega 'omega' => '1', # initial or fixed omega for # codons or codon-base AAs 'fix_alpha'=> [1,0], # 0: estimate gamma shape param # 1: fix it at alpha 'alpha' => '0.', # initial or fixed alpha # 0: infinity (constant rate) 'Malpha' => [0,1], # different alphas for genes 'ncatG' => [1..10], # number of categories in # dG of NSsites models # (clock) # 0: no clock, 1: global clock, 2: local clock # 3: TipDate 'clock' => [0..3], # (getSE) Standard Error: # 0:don't want them, 1: want S.E. 'getSE' => [0,1], # (RateAncestor) # 0,1,2 rates (alpha>0) or # ancestral states (1 or 2) 'RateAncestor' => [1,0,2], 'Small_Diff' => '.5e-6', # (cleandata) remove sites with ambiguity data # 1: yes, 0:no 'cleandata' => [0,1], # this is the number of datasets in # the file - we would need to change # our api to allow >1 alignment object # to be referenced at time 'ndata' => 1, # (method) # 0: simultaneous,1: 1 branch at a time 'method' => [0,1], # allow branch lengths to be fixed # 0 ignore # -1 use random starting points # 1 use the branch lengths in initial ML iteration # 2 branch lengths are fixed 'fix_blength' => [0,-1,1,2], ); } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'codeml'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PAMLDIR}) if $ENV{PAMLDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::PAML::Codeml->new(); Function: Builds a new Bio::Tools::Run::Phylo::PAML::Codeml object Returns : Bio::Tools::Run::Phylo::PAML::Codeml Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -branchlengths => 0: ignore any branch lengths found on the tree 1: use as initial values 2: fix branch lengths -params => a hashref of PAML parameters (all passed to set_parameter) -executable => where the codeml executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{'_branchLengths'} = 0; my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE BRANCHLENGTHS)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree, branchLengths => ($ubl || 0) ); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 prepare Title : prepare Usage : my $rundir = $codeml->prepare($aln); Function: prepare the codeml analysis using the default or updated parameters the alignment parameter must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare{ my ($self,$aln,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run codeml"); return 0; } my ($tempdir) = $self->tempdir(); my ($tempseqFH,$tempseqfile); if( ! ref($aln) && -e $aln ) { $tempseqfile = $aln; } else { ($tempseqFH,$tempseqfile) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'phylip', '-fh' => $tempseqFH, '-interleaved' => 0, '-idlength' => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1); $alnout->write_aln($aln); $alnout->close(); undef $alnout; close($tempseqFH); } # now let's print the codeml.ctl file. # many of the these programs are finicky about what the filename is # and won't even run without the properly named file. Ack my $codeml_ctl = "$tempdir/codeml.ctl"; open(CODEML, ">$codeml_ctl") or $self->throw("cannot open $codeml_ctl for writing"); print CODEML "seqfile = $tempseqfile\n"; my $outfile = $self->outfile_name; print CODEML "outfile = $outfile\n"; if( $tree ) { my ($temptreeFH,$temptreefile); if( ! ref($tree) && -e $tree ) { $temptreefile = $tree; } else { ($temptreeFH,$temptreefile) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } print CODEML "treefile = $temptreefile\n"; } my %params = $self->get_parameters; while( my ($param,$val) = each %params ) { next if $param eq 'outfile'; print CODEML "$param = $val\n"; } close(CODEML); # my ($rc,$parser) = (1); # { # my $cwd = cwd(); # my $exit_status; # chdir($tempdir); # } return $tempdir; } =head2 run Title : run Usage : my ($rc,$parser) = $codeml->run($aln,$tree); Function: run the codeml analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, L Args : L object, L object [optional] =cut sub run { my ($self) = shift;; my $outfile = $self->outfile_name; my $tmpdir = $self->prepare(@_); my ($rc,$parser) = (1); { my $cwd = cwd(); my $exit_status; chdir($tmpdir); my $codemlexe = $self->executable(); $self->throw("unable to find or run executable for 'codeml'") unless $codemlexe && -e $codemlexe && -x _; my $run; if( $self->{'_branchLengths'} ) { open($run, "echo $self->{'_branchLengths'} | $codemlexe |") or $self->throw("Cannot open exe $codemlexe"); } else { open($run, "$codemlexe |") or $self->throw("Cannot open exe $codemlexe"); } my @output = <$run>; $exit_status = close($run); $self->error_string(join('',@output)); if( (grep { /\berr(or)?: /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $parser = Bio::Tools::Phylo::PAML->new(-file => "$tmpdir/$outfile", -verbose => $self->verbose, -dir => "$tmpdir"); }; if( $@ ) { $self->warn($self->error_string); } chdir($cwd); } return ($rc,$parser); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $codeml->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $codeml->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, [optional] %parameters => hash of tree-specific parameters: branchLengths: 0, 1 or 2 out Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; if ( defined $params{'_branchLengths'} ) { my $ubl = $params{'_branchLengths'}; if ($ubl !~ m/^(0|1|2)$/) { $self->throw("The branchLengths parameter to tree() must be 0 (ignore), 1 (initial values) or 2 (fixed values) only"); } $self->{'_branchLengths'} = $ubl; } } return $self->{'_tree'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_codemlparams'} }; } =head2 set_parameter Title : set_parameter Usage : $codeml->set_parameter($param,$val); Function: Sets a codeml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $codeml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; unless (defined $self->{'no_param_checks'} && $self->{'no_param_checks'} == 1) { if ( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } if ( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep { $value eq $_ } @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } } $self->{'_codemlparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $codeml->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_codemlparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_codemlparams'}->{$param} = $val->[0]; } else { $self->{'_codemlparams'}->{$param} = $val; } } } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut sub no_param_checks{ my ($self,$value) = @_; if( defined $value) { $self->{'no_param_checks'} = $value; } return $self->{'no_param_checks'}; } =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name { my $self = shift; if( @_ ) { return $self->{'_codemlparams'}->{'outfile'} = shift @_; } unless (defined $self->{'_codemlparams'}->{'outfile'}) { $self->{'_codemlparams'}->{'outfile'} = 'mlc'; } return $self->{'_codemlparams'}->{'outfile'}; } =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/PAML/Evolver.pm000066400000000000000000000530651302566030400243310ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::PAML::Evolver # # based on the Bio::Tools::Run::Phylo::PAML::Codeml # by Jason Stajich # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::PAML::Evolver - Wrapper aroud the PAML program evolver =head1 SYNOPSIS use Bio::Tools::Run::Phylo::PAML::Evolver; my $evolver = Bio::Tools::Run::Phylo::PAML::Evolver->new(); # Get a $tree object somehow $evolver->tree($tree); # FIXME: evolver generates a tree (first run with option 1 or 2)? # One or more alns are created my @alns = $evolver->run(); #### # Or with all the data coming from a previous PAML run my $parser = Bio::Tools::Phylo::PAML->new ( -file => "$inputfile", ); my $result = $parser->next_result(); my $tree = $result->next_tree; $evolver->tree($tree); my @codon_freqs = $result->get_CodonFreqs(); $evolver->set_CodonFreqs(\@codon_freqs); my $val = $evolver->prepare(); # FIXME: something similar for nucleotide frequencies: # Option (5) Simulate nucleotide data sets (use MCbase.dat)? # FIXME: something similar for aa parameters: # Option (7) Simulate amino acid data sets (use MCaa.dat)? # FIXME: With an initial RootSeq.txt =head1 DESCRIPTION This is a wrapper around the evolver program of PAML (Phylogenetic Analysis by Maximum Likelihood) package of Ziheng Yang. See http://abacus.gene.ucl.ac.uk/software/paml.html for more information. This module is more about generating the properl MCmodel.ctl file and will run the program in a separate temporary directory to avoid creating temp files all over the place. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.shtml - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://bioperl.org/bioperl-bugs/ =head1 AUTHOR - Albert Vilella Email avilella-AT-gmail-DOT-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::PAML::Evolver; use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM); use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::SeqIO; use Bio::TreeIO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Phylo::PAML; use Cwd; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values Valid and default values for evolver programs are listed below. The default values are always the first one listed. These descriptions are essentially lifted from the example MCcodon.dat file and pamlDOC documentation provided by the author. Stub: B specifies something. B specifies something else. INCOMPLETE DOCUMENTATION OF ALL METHODS =cut BEGIN { $MINNAMELEN = 25; $PROGRAMNAME = 'evolver' . ($^O =~ /mswin/i ?'.exe':''); if( defined $ENV{'PAMLDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'PAMLDIR'},$PROGRAMNAME). ($^O =~ /mswin/i ?'.exe':'');; } # valid values for parameters, the default one is always # the first one in the array # much of the documentation here is lifted directly from the MCcodon.dat # example file provided with the package # Evolver calls time for seed: SetSeed(i==-1?(int)time(NULL):i); my $rand = int(time); # my $rand = int(rand(999999)); %VALIDVALUES = ( # FIXME: there should be a 6-7-8 option that fits MCcodon or MCbase or MCaa 'outfmt' => [0,1], # 0 * 0:paml format (mc.paml); 1:paup format (mc.paup) # random number seed (odd number) # FIXME: set seed to null here and ask for it later? 'seed' => "$rand", # numseq can actually be calculated from the tree external nodes # nucleotide sites 'nuclsites' => '1000', # replicates 'replicates' => '1', # tree length; use -1 if tree has absolute branch lengths # Note that tree length and branch lengths under the codon model are # measured by the expected number of nucleotide substitutions per codon # (see Goldman & Yang 1994). For amino acid models, they are defined as # the expected number of amino acid changes per amino acid site. 'tree_length' => '1.5', # omega # FIXME: if one wants to call for different omegas (NSsites), # right now it has to be done like: # $evolver->set_parameter(omega,"3\n0.2\t0.3\t0.5\n0.5\t0.9\t3.2\n"); # 3 * number of site classes, followed by frequencies and omega's. # 0.6 0.3 0.1 # Freqs # 0.1 0.8 3.2 # Omegas 'omega' => '0.3', # kappa 'kappa' => '5', # FIXME: this only for MCbase.dat ? # model: 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85, 5:T92, 6:TN93, 7:REV # FIXME: this applies to only some models? # 10 5 1 2 3 * kappa or rate parameters in model # FIXME: this applies to only MCbase.dat ? # 0.5 4 * <#categories for discrete gamma> ); # end of validvalues } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'evolver'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PAMLDIR}) if $ENV{PAMLDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::PAML::Evolver->new(); Function: Builds a new Bio::Tools::Run::Phylo::PAML::Evolver object Returns : Bio::Tools::Run::Phylo::PAML::Evolver -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object (FIXME: optional if this is done in a first run) -params => a hashref of PAML parameters (all passed to set_parameter) -executable => where the evolver executable resides See also: L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); # $self->{'_branchLengths'} = 0; my ($tree, $st, $params, $exe) = $self->_rearrange([qw(TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 prepare Title : prepare Usage : my $rundir = $evolver->prepare($aln); Function: prepare the evolver analysis using the default or updated parameters the alignment parameter must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$aln,$tree) = @_; # FIXME: To consider: to have save_tempfiles always TRUE by default # or simply never delete unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; my ($tempdir) = $self->tempdir(); # FIXME: # If multiple replicates, evolver gives: # A file with a concatenation of sequential phylips separated by a # double return which gets correctly parsed by AlignIO next_aln # Or a concatenation of paup entries with tags separating them which # wont get correctly parsed with current AlignIO (failed with # nexus) # FIXME: To consider: force phylip outfmt and split the files if # replicates > 1 # if( ! ref($aln) && -e $aln ) { # $tempseqfile = $aln; # } else { # ($tempseqFH,$tempseqfile) = $self->io->tempfile # ('-dir' => $tempdir, # UNLINK => ($self->save_tempfiles ? 0 : 1)); # my $alnout = Bio::AlignIO->new('-format' => 'phylip', # '-fh' => $tempseqFH, # '-interleaved' => 0, # '-idlength' => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1); # $alnout->write_aln($aln); # $alnout->close(); # undef $alnout; # close($tempseqFH); # } # now let's print the MCcodon.dat file. # many of the these programs are finicky about what the filename is # and won't even run without the properly named file. Ack # FIXME: we should do the appropriate here if we are simulating codons, nts o aa. my $evolver_ctl = "$tempdir/MCcodon.dat"; my $evolverfh; open($evolverfh, ">$evolver_ctl") or $self->throw("cannot open $evolver_ctl for writing"); # FIXME: params follow an order in the control file, they are not a hash. Do we have an # clean example of this in bioperl-run? my %params = $self->get_parameters; print $evolverfh "$params{outfmt}\n"; print $evolverfh "$params{seed}\n"; # FIXME: call get_leaf_nodes to count only leafs - relates to newick onlyleafids bug # my $numseq = scalar($tree->get_nodes); my $numseq = scalar($tree->get_leaf_nodes); print $evolverfh "$numseq "; print $evolverfh "$params{nuclsites} "; print $evolverfh "$params{replicates}\n\n"; print $evolverfh "$params{tree_length}\n"; # FIXME: do #1:#n branch tagging magic here # FIXME: this pre flush stuff is for appending mode my $treeout = Bio::TreeIO->new ('-format' => 'newick', '-fh' => $evolverfh, -PRE =>'>>', '-flush', ); # $treeout->bootstrap_style('nointernalids'); $treeout->write_tree($tree); # Appending mode to add more control file contents here open($evolverfh, ">>$evolver_ctl") or $self->throw("cannot open $evolver_ctl for writing"); print $evolverfh "\n$params{omega}\n"; print $evolverfh "$params{kappa}\n"; # Print codon freqs here or defaults (below) my @codon_freqs = $self->get_CodonFreqs(); foreach my $firstbase (@codon_freqs) { foreach my $element (@$firstbase) { print $evolverfh " $element"; } print $evolverfh "\n"; } # FIXME: codon freqs or nt freqs should always come from an object? # Silly printing the default codonfreqs in the default # MCcodon.dat provided by PAML unless (@codon_freqs) { print $evolverfh "0.00983798 0.01745548 0.00222048 0.01443315\n", "0.00844604 0.01498576 0.00190632 0.01239105\n", "0.01064012 0.01887870 0 0\n", "0.00469486 0.00833007 0 0.00688776\n", "0.01592816 0.02826125 0.00359507 0.02336796\n", "0.01367453 0.02426265 0.00308642 0.02006170\n", "0.01722686 0.03056552 0.00388819 0.02527326\n", "0.00760121 0.01348678 0.00171563 0.01115161\n", "0.01574077 0.02792876 0.00355278 0.02309304\n", "0.01351366 0.02397721 0.00305010 0.01982568\n", "0.01702419 0.03020593 0.00384245 0.02497593\n", "0.00751178 0.01332811 0.00169545 0.01102042\n", "0.02525082 0.04480239 0.00569924 0.03704508\n", "0.02167816 0.03846344 0.00489288 0.03180369\n", "0.02730964 0.04845534 0.00616393 0.04006555\n", "0.01205015 0.02138052 0.00271978 0.01767859\n"; } print $evolverfh "\n// end of file.\n"; close($evolverfh); # FIXME: what do we return in prepare? # return } =head2 run Title : run Usage : my ($rc,$parser) = $evolver->run(); Function: run the evolver analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, L Args : L object, L object [optional] =cut sub run { my $self = shift; # FIXME: We should look for the stuff we prepared in the prepare method here my $rc = (1); { my $exit_status; my ($tmpdir) = $self->tempdir(); chdir($tmpdir); my $evolverexe = $self->executable(); $self->throw("unable to find or run executable for 'evolver'") unless $evolverexe && -e $evolverexe && -x _; open(RUN, "$evolverexe 6 MCcodon.dat |") or $self->throw("Cannot open exe $evolverexe"); my @output = ; $exit_status = close(RUN); $self->error_string(join('',@output)); if ( (grep { /\berr(or)?: /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } # FIXME - hardcoded mc.paml unless ($self->indel) { my $in = Bio::AlignIO->new('-file' => "$tmpdir/mc.paml", '-format' => 'phylip'); my $aln = $in->next_aln(); $self->alignment($aln); } } return $rc; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysis run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $evolver->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $evolver->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, [optional] %parameters => hash of tree-specific parameters: branchLengths: 0, 1 or 2 out Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; # FIXME: I think we dont need this in Evolver # if ( defined $params{'_branchLengths'} ) { # my $ubl = $params{'_branchLengths'}; # if ($ubl !~ m/^(0|1|2)$/) { # $self->throw("The branchLengths parameter to tree() must be 0 (ignore), 1 (initial values) or 2 (fixed values) only"); # } # $self->{'_branchLengths'} = $ubl; # } } return $self->{'_tree'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_evolverparams'} }; } =head2 set_parameter Title : set_parameter Usage : $evolver->set_parameter($param,$val); Function: Sets a evolver parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $evolver->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; unless ($self->{'no_param_checks'} == 1) { if ( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } if ( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep { $value eq $_ } @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } } $self->{'_evolverparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $evolver->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_evolverparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_evolverparams'}->{$param} = $val->[0]; } else { $self->{'_evolverparams'}->{$param} = $val; } } } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut sub no_param_checks{ my ($self,$value) = @_; if( defined $value) { $self->{'no_param_checks'} = $value; } return $self->{'no_param_checks'}; } =head2 set_CodonFreqs Title : set_CodonFreqs Usage : $obj->set_CodonFreqs($newval) Function: Get/Set the Codon Frequence table Returns : value of set_CodonFreqs (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub set_CodonFreqs{ my $self = shift; return $self->{'_codonfreqs'} = shift if @_; return $self->{'_codonfreqs'}; } =head2 get_CodonFreqs Title : get_CodonFreqs Usage : my @codon_freqs = $evolver->get_CodonFreqs() Function: Get the Codon freqs Returns : Array Args : none =cut sub get_CodonFreqs{ my ($self) = @_; return @{$self->{'_codonfreqs'} || []}; } =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $evolver->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $evolver->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } =head2 indel Title : indel Usage : $obj->indel($newval) Function: this is only useful if using evolver_indel instead of main evolver package: Exploring the Relationship between Sequence Similarity and Accurate Phylogenetic Trees Brandi L. Cantarel, Hilary G. Morrison and William Pearson Example : Returns : value of indel (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub indel{ my $self = shift; return $self->{'indel'} = shift if @_; return $self->{'indel'}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/PAML/Yn00.pm000066400000000000000000000317611302566030400234340ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::PAML::Yn00 # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::PAML::Yn00 - Wrapper aroud the PAML program yn00 =head1 SYNOPSIS use Bio::Tools::Run::Phylo::PAML::Yn00; use Bio::AlignIO; my $alignio = Bio::AlignIO->new(-format => 'phylip', -file => 't/data/gf-s85.phylip'); my $aln = $alignio->next_aln; my $yn = Bio::Tools::Run::Phylo::PAML::Yn00->new(); $yn->alignment($aln); my ($rc,$parser) = $yn->run(); while( my $result = $parser->next_result ) { my @otus = $result->get_seqs(); my $MLmatrix = $result->get_MLmatrix(); # 0 and 1 correspond to the 1st and 2nd entry in the @otus array my $dN = $MLmatrix->[0]->[1]->{dN}; my $dS = $MLmatrix->[0]->[1]->{dS}; my $kaks =$MLmatrix->[0]->[1]->{omega}; print "Ka = $dN Ks = $dS Ka/Ks = $kaks\n"; } =head1 DESCRIPTION This is a wrapper around the yn00 (method of Yang and Nielsen, 2000) program of PAML (Phylogenetic Analysis by Maximum Likelihood) package of Ziheng Yang. See http://abacus.gene.ucl.ac.uk/software/paml.html for more information. This module will generate a proper yn00.ctl file and will run the program in a separate temporary directory to avoid creating temp files all over the place and will cleanup after itself. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::PAML::Yn00; use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM); use strict; use Cwd; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Phylo::PAML; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values See the L module for documentation of the default values. =cut BEGIN { $MINNAMELEN = 25; $PROGRAMNAME = 'yn00' . ($^O =~ /mswin/i ?'.exe':''); if( defined $ENV{'PAMLDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'PAMLDIR'},$PROGRAMNAME); } # valid values for parameters, the default one is always # the first one in the array # much of the documentation here is lifted directly from the codeml.ctl # example file provided with the package %VALIDVALUES = ( 'noisy' => [ 0..3,9], 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much 'weighting' => [0,1], # weighting pathways between codons 'commonf3x4' => [0,1], # use same f3x4 for all sites # (icode) genetic code # 0:universal code # 1:mamalian mt # 2:yeast mt # 3:mold mt, # 4:invertebrate mt # 5:ciliate nuclear # 6:echinoderm mt # 7:euplotid mt # 8:alternative yeast nu. # 9:ascidian mt #10:blepharisma nu # these correspond to 1-11 in the genbank transl table 'icode' => [ 0..10], 'ndata' => [1..10], ); } =head2 program_name Title : program_name Usage : $yn00->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $yn00->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PAMLDIR}) if $ENV{PAMLDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::PAML::Yn00->new(); Function: Builds a new Bio::Tools::Run::Phylo::PAML::Yn00 object Returns : Bio::Tools::Run::Phylo::PAML::Yn00 Args : -alignment => the L object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln,$st) = $self->_rearrange([qw(ALIGNMENT SAVE_TEMPFILES)], @args); defined $aln && $self->alignment($aln); defined $st && $self->save_tempfiles($st); $self->set_default_parameters(); return $self; } =head2 run Title : run Usage : $yn->run(); Function: run the yn00 analysis using the default or updated parameters the alignment parameter must have been set Returns : 3 values, $rc = 1 for success, 0 for errors hash reference of the Yang calculated Ka/Ks values this is a set of pairwise observations keyed as sequencenameA->sequencenameB->datatype hash reference same as the previous one except it for the Nei and Gojobori calculated Ka,Ks,omega values Args : none =cut sub run{ my ($self,$aln) = @_; ($aln) ||= $self->alignment(); if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run yn00"); return 0; } my ($tmpdir) = $self->tempdir(); my ($tempseqFH,$tempseqfile); if( ! ref($aln) && -e $aln ) { $tempseqfile = $aln; } else { ($tempseqFH,$tempseqfile) = $self->io->tempfile ('-dir' => $tmpdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'phylip', '-fh' => $tempseqFH, '-interleaved' => 0, '-idlength' => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1); $alnout->write_aln($aln); $alnout->close(); undef $alnout; close($tempseqFH); undef $tempseqFH; } # now let's print the yn.ctl file. # many of the these programs are finicky about what the filename is # and won't even run without the properly named file. Ack my $yn_ctl = "$tmpdir/yn00.ctl"; open(YN, ">$yn_ctl") or $self->throw("cannot open $yn_ctl for writing"); print YN "seqfile = $tempseqfile\n"; my $outfile = $self->outfile_name; print YN "outfile = $outfile\n"; my %params = $self->get_parameters; while( my ($param,$val) = each %params ) { print YN "$param = $val\n"; } close(YN); my ($rc,$parser) = (1); { my $cwd = cwd(); my $exit_status; chdir($tmpdir); my $ynexe = $self->executable(); $self->throw("unable to find executable for 'yn'") unless $ynexe; open(RUN, "$ynexe |"); my @output = ; $exit_status = close(RUN); $self->error_string(join('',@output)); if( (grep { /\berr(or)?: /io } @output) || !$exit_status ) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $parser = Bio::Tools::Phylo::PAML->new(-file => "$tmpdir/mlc", -dir => "$tmpdir"); }; if( $@ ) { $self->warn($self->error_string); } chdir($cwd); } if( $self->verbose > 0 ) { open(IN, "$tmpdir/mlc"); while() { $self->debug($_); } } unless ( $self->save_tempfiles ) { unlink("$yn_ctl"); $self->cleanup(); } return ($rc,$parser); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $codeml->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function"); return undef; } $self->{'_alignment'} = $aln; } return $self->{'_alignment'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_codemlparams'} }; } =head2 set_parameter Title : set_parameter Usage : $codeml->set_parameter($param,$val); Function: Sets a codeml parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if on turns of param checks like this: $codeml->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $paramname => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; if( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not set unless you force by setting no_param_checks to true"); return 0; } if( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep {$value} @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } $self->{'_codemlparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $codeml->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_codemlparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_codemlparams'}->{$param} = $val->[0]; } else { $self->{'_codemlparams'}->{$param} = $val; } } } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phast/000077500000000000000000000000001302566030400226665ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phast/PhastCons.pm000066400000000000000000000363621302566030400251400ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Phast::PhastCons # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phast::PhastCons - Wrapper for footprinting using phastCons =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phast::PhastCons; # Make a PhastCons factory $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(); # Pass the factory an alignment and the corresponding species tree $align_filename = 't/data/apes.multi_fasta'; $species_tree_filename = 't/data/apes.newick'; @features = $factory->run($align_filename, $species_tree_filename); # or get a Bio::Align::AlignI (SimpleAlign) object from somewhere, and # generate the species tree automatically using a Bio::DB::Taxonomy database $tdb = Bio::DB::Taxonomy->new(-source => 'entrez'); @features = $factory->run($aln_obj, $tdb); # @features is an array of Bio::SeqFeature::Annotated, one feature per # alignment sequence and prediction =head1 DESCRIPTION This is a wrapper for running the phastCons application by Adam Siepel. You can get details here: http://compgen.bscb.cornell.edu/~acs/software.html phastCons is used for phylogenetic footprinting/ shadowing. Currently the interface is extremely simplified, allowing only one analysis method. The focus here is on ease of use, allowing phastCons to estimate as many parameters as possible and having it output just the 'most conserved' blocks it detects. You can, however, try supplying normal phastCons arguments to new(), or calling arg-named methods (excluding initial hyphens and converting others to underscores, eg. $factory-Eindels_only(1) to set the --indels-only arg). The particular analysis carried out here is to: 1. Use phyloFit to generate a tree model for initialization of the nonconserved model from the supplied alignment (all data) and species tree 2. Run phastCons in 'training' mode for parameter estimation using all the alignment data and the model from step 1 3. Run phastCons with the trees from step 2 to discover the most conserved regions See the 'HowTo' at http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html for details on how to improve results. WARNING: the API is likely to change in the future to allow for alternative analysis types. You will need to enable this phastCons wrapper to find the phast programs (at least phastCons and phyloFit). This can be done in (at least) three ways: 1. Make sure the phastCons and phyloFit executables are in your path. 2. Define an environmental variable PHASTDIR which is a directory which contains the phastCons and phyloFit applications: In bash: export PHASTDIR=/home/username/phast/bin In csh/tcsh: setenv PHASTDIR /home/username/phast/bin 3. Include a definition of an environmental variable PHASTDIR in every script that will use this PhastCons wrapper module, e.g.: BEGIN { $ENV{PHASTDIR} = '/home/username/phast/bin' } use Bio::Tools::Run::Phylo::Phast::PhastCons; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phast::PhastCons; use strict; use Cwd; use File::Basename; use Clone qw(clone); use Bio::AlignIO; use Bio::Tools::Run::Phylo::Phast::PhyloFit; use Bio::FeatureIO; use Bio::Annotation::SimpleValue; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'phastCons'; our $PROGRAM_DIR = $ENV{'PHASTDIR'}; # methods and their synonyms from the phastCons args we support our %PARAMS = (rho => 'R', nrates => 'k', transitions => 't', target_coverage => 'C', expected_length => ['E', 'expected_lengths'], lnl => 'L', log => 'g', max_micro_indel => 'Y', indel_params => 'D', lambda => 'l', extrapolate => 'e', hmm => 'H', catmap => 'c', states => 'S', reflect_strand => 'U', require_informative => 'M', not_informative => 'F'); our %SWITCHES = (quiet => 'q', indels => 'I', indels_only => 'J', FC => 'X', coding_potential => 'p', ignore_missing => 'z'); # just to be explicit, args we don't support (yet) or we handle ourselves our %UNSUPPORTED = (estimate_trees => 'T', estimate_rho => 'O', gc => 'G', msa_format => 'i', score => 's', no_post_probs => 'n', seqname => 'N', refidx => 'r', idpref => 'P', help => 'h', alias => 'A', most_conserved => ['V', 'viterbi']); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(@params) Function: Creates a new PhastCons factory Returns : Bio::Tools::Run::Phylo::Phast::PhastCons Args : Optionally, provide any of the following (defaults are not to use, see the same-named methods for information on what each option does): { -target_coverage => number between 0 and 1 AND -expected_length => int } -rho => number between 0 and 1 -quiet => boolean (turn on or off program output to console) Most other options understood by phastCons can be supplied as key => value pairs in this way. Options that don't normally take a value should be given a value of 1. You can type the keys as you would on the command line (eg. '--indels-only' => 1) or with only a single hyphen to start and internal hyphens converted to underscores (eg. -indels_only => 1) to avoid having to quote the key. These options can NOT be used with this wrapper currently: estimate_trees / T estimate_rho / O gc / G msa_format / i score / s no_post_probs / n seqname / N idpref / P help / h alias / A most_conserved / V / viterbi refidx / r =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS), (map { $_ => $SWITCHES{$_} } keys %SWITCHES)}, -create => 1); return $self; } =head2 target_coverage Title : target_coverage Usage : $factory->target_coverage(0.25); Function: Constrain transition parameters such that the expected fraction of sites in conserved elements is the supplied value. Returns : number (default undef) Args : None to get, number (between 0 and 1) to set =cut sub target_coverage { my ($self, $num) = @_; if (defined ($num)) { ($num > 0 && $num < 1) || $self->throw("target_coverage value must be between 0 and 1, exclusive"); $self->{coverage} = $num; } return $self->{coverage} || return; } =head2 expected_length Title : expected_length Usage : $factory->expected_length(5); Function: Set transition probabilities such that the expected length of a conserved element is the supplied value. target_coverage() must also be set. Returns : int (default undef) Args : None to get, int to set =cut # created automatically =head2 rho Title : rho Usage : $factory->rho(0.3); Function: Set the *scale* (overall evolutionary rate) of the model for the conserved state to be the supplied number times that of the model for the non-conserved state (default 0.3). Returns : number (default undef) Args : None to get, number (between 0 and 1) to set =cut sub rho { my ($self, $num) = @_; if (defined ($num)) { ($num > 0 && $num < 1) || $self->throw("rho value must be between 0 and 1, exclusive"); $self->{rho} = $num; } return $self->{rho} || return; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs phastCons on an alignment to find the most conserved regions ('footprinting'). Returns : array of Bio::SeqFeature::Annotated (one feature per alignment sequence and prediction) Args : The first argument represents an alignment, the second argument a species tree. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases, the alignment sequence names must correspond to node ids in the species tree. Multi-word species names should be joined with underscores to form the sequence names, eg. Homo_sapiens =cut sub run { my ($self, $aln, $tree) = @_; ($aln && $tree) || $self->throw("alignment and tree must be supplied"); my $aln_obj = $self->_alignment($aln); $tree = $self->_tree($tree); # if aln was a file, set the alignment id to match file name if (-e $aln) { my $aln_id = basename($aln); ($aln_id) = $aln_id =~ /^([^\.]+)/; $aln_obj->id($aln_id); } return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; # use phyloFit to generate tree model initialization (?) using species tree # and alignment my $pf = Bio::Tools::Run::Phylo::Phast::PhyloFit->new(-verbose => $self->verbose, -quiet => $self->quiet); my $init_mod = $pf->run($self->_alignment, $self->_tree) || $self->throw("phyloFit failed to work as expected, is it installed?"); # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $aln_file = $self->_write_alignment; # do training for parameter estimation my $command = $exe.$self->_setparams($aln_file, $init_mod); $self->debug("phastCons training command = $command\n"); system($command) && $self->throw("phastCons training call ($command) crashed: $?"); # do the final analysis $command = $exe.$self->_setparams($aln_file); $self->debug("phastCons command = $command\n"); system($command) && $self->throw("phastCons call ($command) crashed: $?"); # read in most_cons.bed as the result my $bedin = Bio::FeatureIO->new(-format => 'bed', -file => 'most_cons.bed'); # cd back to orig dir chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); my @feats = (); my $aln = $self->_alignment; while (my $feat = $bedin->next_feature) { $feat->source_tag('phastCons'); my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'predicted', -value => 1); $feat->annotation->add_Annotation($sv); # $feat->type('TF_binding_site'); causes seg fault in subsequent clone() # features are in zero-based alignment coords; make a feature for each # alignment sequence foreach my $seq ($aln->each_seq) { my $clone = clone($feat); # $clone->type('TF_binding_site'); causes massive slowdown if you later store/retrieve these features from Bio::DB::SeqFeature database # give it the correct id $clone->seq_id($seq->id); # check and correct the coords (sequence may not have the feature) my $sloc = $seq->location_from_column($feat->start + 1) || next; my $eloc = $seq->location_from_column($feat->end + 1) || next; $clone->start($sloc->start - 1); $clone->end($eloc->end - 1); push(@feats, $clone); } } return @feats; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment file name for result production, AND filename of phyloFit generated init.mod file to estimate trees =cut sub _setparams { my ($self, $aln_file, $init_mod) = @_; my $param_string = $self->SUPER::_setparams(-params => [keys %PARAMS], -switches => [keys %SWITCHES], -double_dash => 1, -underscore_to_dash => 1); $param_string .= ' --no-post-probs'; my $aln_id = $self->_alignment->id; $param_string .= " --seqname $aln_id --idpref $aln_id" if $aln_id; $param_string .= ' --refidx 0'; my $input = ' --msa-format FASTA '.$aln_file; if ($init_mod) { $param_string .= ' --estimate-trees mytrees '.$input.' '.$init_mod; } else { $param_string .= $input.' --most-conserved most_cons.bed --score mytrees.cons.mod,mytrees.noncons.mod'; } return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phast/PhyloFit.pm000066400000000000000000000243631302566030400247720ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Phast::PhyloFit # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phast::PhyloFit - Wrapper for phyloFit =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phast::PhyloFit; # Make a PhyloFit factory $factory = Bio::Tools::Run::Phylo::Phast::PhastCons->new(); # Generate an init.mod file for use by phastCons my $init_file = $factory->run($alignment, $tree); =head1 DESCRIPTION This is a wrapper for running the phyloFit application by Adam Siepel. You can get details here: http://compgen.bscb.cornell.edu/~acs/software.html Currently the interface is extremely simplified. Only the --tree form of usage is allowed (not --init-model), which means a tree must be supplied with the alignment (to run()). You can try supplying normal phyloFit arguments to new(), or calling arg-named methods (excluding initial hyphens and converting others to underscores, eg. $factory-Egaps_as_bases(1) to set the --gaps-as-bases arg). WARNING: the API may change in the future to allow for greater flexability and access to more phyloFit features. You will need to enable this PhyloFit wrapper to find the phast programs (at least phyloFit itself). This can be done in (at least) three ways: 1. Make sure the phyloFit executable is in your path. 2. Define an environmental variable PHASTDIR which is a directory which contains the phyloFit application: In bash: export PHASTDIR=/home/username/phast/bin In csh/tcsh: setenv PHASTDIR /home/username/phast/bin 3. Include a definition of an environmental variable PHASTDIR in every script that will use this PhyloFit wrapper module, e.g.: BEGIN { $ENV{PHASTDIR} = '/home/username/phast/bin' } use Bio::Tools::Run::Phylo::Phast::PhyloFit; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phast::PhyloFit; use strict; use Cwd; use File::Spec; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'phyloFit'; our $PROGRAM_DIR = $ENV{'PHASTDIR'}; # methods and their synonyms from the phastCons args we support our %PARAMS = (subst_mod => 's', min_informative => 'I', precision => 'p', log => 'l', ancestor => 'A', nrates => 'k', alpha => 'a', rate_constants => 'K', features => 'g', catmap => 'c', do_cats => 'C', reverse_groups => 'R'); our %SWITCHES = (gaps_as_bases => 'G', quiet => 'q', EM => 'E', init_random => 'r', estimate_freqs => 'F', markov => 'N', non_overlapping => 'V'); # just to be explicit, args we don't support (yet) or we handle ourselves our %UNSUPPORTED = (msa_format => 'i', out_root => 'o', tree => 't', help => 'h', lnl => 'L', init_model => 'M', scale_only => 'B', scale_subtree => 'S', no_freqs => 'f', no_rates => 'n', post_probs => 'P', expected_subs => 'X', expected_total_subs => 'Z', column_probs => 'U', windows => 'w', windows_explicit => 'v'); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Phast::PhyloFit->new() Function: creates a new PhyloFit factory Returns : Bio::Tools::Run::Phylo::Phast::PhyloFit Args : Most options understood by phastCons can be supplied as key => value pairs. Options that don't normally take a value should be given a value of 1. You can type the keys as you would on the command line (eg. '--gaps-as-bases' => 1) or with only a single hyphen to start and internal hyphens converted to underscores (eg. -gaps_as_bases => 1) to avoid having to quote the key. These options can NOT be used with this wrapper currently: msa_format / i out_root / o tree / t help / h lnl / L init_model / M scale_only / B scale_subtree / S no_freqs / f no_rates / n post_probs / P expected_subs / X expected_total_subs / Z column_probs / U windows / w windows_explicit / v =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS), (map { $_ => $SWITCHES{$_} } keys %SWITCHES)}, -create => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs phyloFit on an alignment. Returns : filename of init.mod file produced Args : The first argument represents an alignment, the second argument a species tree. The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases, the alignment sequence names must correspond to node ids in the species tree. Multi-word species names should be joined with underscores to form the sequence names, eg. Homo_sapiens =cut sub run { my ($self, $aln, $tree) = @_; ($aln && $tree) || $self->throw("alignment and tree must be supplied"); $self->_alignment($aln); $tree = $self->_tree($tree); $tree->force_binary; # adjust tree node ids to convert spaces to underscores (eg. if tree # generated from taxonomy) foreach my $node ($tree->get_leaf_nodes) { my $id = $node->id; $id =~ s/ /_/g; $node->id($id); } # check node and seq names match $self->_check_names; return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; # cd to a temp dir my $temp_dir = $self->tempdir; my $cwd = Cwd->cwd(); chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); my $aln_file = $self->_write_alignment; my $tree_file = $self->_write_tree; #...phyloFit --tree "(human,(mouse,rat))" --msa-format FASTA --out-root init alignment.fa my $command = $exe.$self->_setparams($aln_file, $tree_file); $self->debug("phyloFit command = $command\n"); system($command) && $self->throw("phyloFit call ($command) crashed: $?"); # cd back again chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); return File::Spec->catfile($temp_dir, 'init.mod'); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $tree_file) = @_; my $param_string = ' --tree '.$tree_file; $param_string .= ' --msa-format FASTA'; $param_string .= ' --out-root init'; # --min-informative defaults to 50, but must not be greater than the number # of bases in the alignment my $aln = $self->_alignment; my $length = $aln->length; my $min_informative = $self->min_informative || 50; if ($length < $min_informative) { $self->min_informative($length); } $param_string .= $self->SUPER::_setparams(-params => [keys %PARAMS], -switches => [keys %SWITCHES], -double_dash => 1, -underscore_to_dash => 1); $param_string .= ' '.$aln_file; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/000077500000000000000000000000001302566030400230545ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/Base.pm000066400000000000000000000112711302566030400242660ustar00rootroot00000000000000# $Id $ # # BioPerl module for Bio::Tools::Run::Phylo::Phylip::Base # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::Base - Base object for Phylip modules =head1 SYNOPSIS # Do not use directly # This module is for setting basic data sets for the Phylip wrapper # modules =head1 DESCRIPTION This module is just a base object for Bioperl Phylip wrappers. IMPORTANT PHYLIP VERSION ISSUES By default we assume you have Phylip 3.6 installed, if you have installed Phylip 3.5 you need to set the environment variable PHYLIPVERSION =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Phylip::Base; use vars qw(@ISA %DEFAULT %FILENAME); use strict; BEGIN { eval { require File::Spec }; if( $@) { Bio::Root::RootI->throw("Must have installed File::Spec to run Bio::Tools::Run::Phylo::Phylip tools"); } } use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::Phylo::Phylip::PhylipConf; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { %DEFAULT = ( 'VERSION' => $ENV{'PHYLIPVERSION'} || '3.6', ); %FILENAME = %Bio::Tools::Run::Phylo::Phylip::PhylipConf::FileName; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Phylip::Base->new(); Function: Builds a new Bio::Tools::Run::Phylo::Phylip::Base object Returns : an instance of Bio::Tools::Run::Phylo::Phylip::Base Args : =cut =head2 outfile Title : outfile Usage : $obj->outfile($newval) Function: Get/Set default PHYLIP outfile name ('outfile' usually) Changing this is only necessary when you have compiled PHYLIP to use a different filename for the default 'outfile' This will not change the default output filename by PHYLIP Returns : value of outfile Args : newvalue (optional) =cut sub outfile{ my $self = shift; $self->{'_outfile'} = shift if @_; return $self->{'_outfile'} || $FILENAME{$self->version}{'OUTFILE'} } =head2 treefile Title : treefile Usage : $obj->treefile($newval) Function: Get/Set the default PHYLIP treefile name ('treefile' usually) Returns : value of treefile Args : newvalue (optional) =cut sub treefile{ my $self = shift; $self->{'_treefile'} = shift if @_; return $self->{'_treefile'} || $FILENAME{$self->version}{'TREEFILE'}; } =head2 fontfile Title : fontfile Usage : $obj->fontfile($newval) Function: Get/Set the fontfile Returns : value of fontfile (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub fontfile{ my $self = shift; return $self->{'fontfile'} = shift if @_; return $self->{'fontfile'} ; } =head2 plotfile Title : plotfile Usage : $obj->plotfile($newval) Function: Get/Set the plotfile Returns : value of plotfile (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub plotfile { my $self = shift; return $self->{'plotfile'} = shift if @_; return $self->{'plotfile'} || $FILENAME{$self->version}{'PLOTFILE'}; } =head2 version Title : version Usage : $obj->version($newval) Function: Get/Set the version Returns : value of version (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub version { my $self = shift; return $self->{'version'} = shift if @_; return $self->{'version'} || $DEFAULT{'VERSION'}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/Consense.pm000066400000000000000000000415001302566030400251670ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::Consense # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::Consense - Wrapper for the phylip program Consense =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::Consense; use Bio::Tools::Run::Phylo::Phylip::SeqBoot; use Bio::Tools::Run::Phylo::Phylip::ProtDist; use Bio::Tools::Run::Phylo::Phylip::Neighbor; use Bio::Tools::Run::Phylo::Phylip::DrawTree; #first get an alignment my $aio= Bio::AlignIO->new(-file=>$ARGV[0],-format=>"clustalw"); my $aln = $aio->next_aln; # To prevent truncation of sequence names by PHYLIP runs, use set_displayname_safe my ($aln_safe, $ref_name)=$aln->set_displayname_safe(); #next use seqboot to generate multiple aligments my @params = ('datatype'=>'SEQUENCE','replicates'=>10); my $seqboot_factory = Bio::Tools::Run::Phylo::Phylip::SeqBoot->new(@params); my $aln_ref= $seqboot_factory->run($aln); Or, for long sequence names: my $aln_ref= $seqboot_factory->run($aln_safe); #next build distance matrices and construct trees my $pd_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(); my $ne_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(); foreach my $a (@{$aln_ref}){ my $mat = $pd_factory->create_distance_matrix($a); push @tree, $ne_factory->create_tree($mat); } #now use consense to get a final tree my $con_factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(); #you may set outgroup either by the number representing the order in #which species are entered or by the name of the species $con_factory->outgroup(1); $con_factory->outgroup('HUMAN'); my $tree = $con_factory->run(\@tree); # Restore original sequence names, after ALL phylip runs: my @nodes = $tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } #now draw the tree my $draw_factory = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(); my $image_filename = $draw_factory->draw_tree($tree); =head1 DESCRIPTION Wrapper for phylip consense program Taken from phylip documentation... CONSENSE reads a file of computer-readable trees and prints out (and may also write out onto a file) a consensus tree. At the moment it carries out a family of consensus tree methods called the M[l] methods (Margush and McMorris, 1981). These include strict consensus and majority rule consensus. Basically the consensus tree consists of monophyletic groups that occur as often as possible in the data. More documentation on using Consense and setting parameters may be found in the phylip package. VERSION Support This wrapper currently supports v3.5 of phylip. There is also support for v3.6 although this is still experimental as v3.6 is still under alpha release and not all functionalities maybe supported. =head1 PARAMETERS FOR Consense =head2 TYPE Title : TYPE Description : (optional) Only available in phylip v3.6 This program supports 3 types of consensus generation MRe : Majority Rule (extended) Any set of species that appears in more than 50% of the trees is included. The program then considers the other sets of species in order of the frequency with which they have appeared, adding to the consensus tree any which are compatible with it until STRICT: A set of species must appear in all input trees to be included in the strict consensus tree. MR : A set of species is included in the consensus tree if it is present in more than half of the input trees. Ml : The user is asked for a fraction between 0.5 and 1, and the program then includes in the consensus tree any set of species that occurs among the input trees more than that fraction of then time. The Strict consensus and the Majority Rule consensus are extreme cases of the M[l] consensus, being for fractions of 1 and 0.5 respectively usage: my $factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(-type=>"Ml 0.7"); Defaults to MRe =head2 ROOTED Title: ROOTED Description: (optional) toggles between the default assumption that the input trees are unrooted trees and the selection that specifies that the tree is to be treated as a rooted tree and not re-rooted. Otherwise the tree will be treated as outgroup-rooted and will be re-rooted automatically at the first species encountered on the first tree (or at a species designated by the Outgroup option) usage: my $factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(-rooted=>1); Defaults to unrooted =head2 OUTGROUP Title : OUTGROUP Description : (optional) It is in effect only if the Rooted option selection is not in effect. The trees will be re-rooted with a species of your choosing. usage my $factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(-outgroup=>2); Defaults to first species encountered. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::Consense; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @CONSENSE_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use IO::String; use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the Consense program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use Consense.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('executable'=>'/usr/local/bin/consense'); # my $Consense_factory = Bio::Tools::Run::Phylo::Phylip::Consense->new(@params); # BEGIN { @CONSENSE_PARAMS = qw(TYPE OUTGROUP ROOTED); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@CONSENSE_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $obj->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'consense'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.treefile'; $tree= $Consense_factory->run($inputfilename); or $tree= $consense_factory->run(\@tree); Function: Create bootstrap sets of alignments Example : Returns : a L Args : either a file containing trees in newick format or an array ref of L Throws an exception if argument is not either a string (eg a filename) or a Bio::Tree::TreeI object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) { $self->throw("Problems setting up for Consense. Probably bad input data in $input !"); } # Create parameter string to pass to Consense program my $param_string = $self->_setparams(); # run Consense my $aln = $self->_run($infilename,$param_string); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to Consense program Example : Returns : an array ref of Args : Name of a file containing a set of tree in newick format and a parameter string to be passed to Consense =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } my $tmpdir = $self->tempdir; chdir($self->tempdir); # open a pipe to run Consense to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(Consense,"| ".$self->executable .">$null"); } else { open(Consense,"| ".$self->executable); } $instring = $infile."\n".$param_string; $self->debug( "Program ".$self->executable." $instring\n"); print Consense $instring; close(Consense); # get the results my $outfile = $self->io->catfile($self->tempdir,$self->treefile); chdir($curpath); $self->throw("Consense did not create files correctly ($outfile)") unless (-e $outfile); #parse the alignments my @aln; my $tio = Bio::TreeIO->new(-file=>$outfile,-format=>"newick"); my $tree = $tio->next_tree; # Clean up the temporary files created along the way... unlink $outfile unless $self->save_tempfiles; return $tree; } sub _set_names_from_tree { my ($self,$tree) = @_; my $newick; my $ios = IO::String->new($newick); my $tio = Bio::TreeIO->new(-fh=>$ios,-format=>'newick'); $tio->write_tree($tree); my @names = $newick=~/(\w+):\d+/g; my %names; for(my $i=0; $i < $#names; $i++){ $names{$names[$i]} = $i+1; } $self->names(\%names); return; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for Consense program Example : Returns : name of file containing a trees in newick format Args : an array ref of Bio::Tree::Tree object or input file name =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$tfh); # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} my $tio = Bio::TreeIO->new(-file=>$alnfilename,-format=>'newick'); my $tree = $tio->next_tree; $self->_set_names_from_tree($tree); return $alnfilename; } # $input may be a SimpleAlign Object my @input = ref($input) eq "ARRAY" ? @{$input} : ($input); ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $treeIO = Bio::TreeIO->new(-fh => $tfh, -format=>'newick'); foreach my $tree(@input){ $tree->isa('Bio::Tree::TreeI') || $self->throw('Expected a Bio::TreeI object'); $treeIO->write_tree($tree); } #get the species names in order, using the first one $self->_set_names_from_tree($input[0]); $treeIO->close(); close($tfh); undef $tfh; return $alnfilename; } =head2 names() Title : names Usage : $tree->names(\%names) Function: get/set for a hash ref for storing names in matrix with rank as values. Example : Returns : hash reference Args : hash reference =cut sub names { my ($self,$name) = @_; if($name){ $self->{'_names'} = $name; } return $self->{'_names'}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Consense program Example : Returns : parameter string to be passed to Consense Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $rooted = 0; #for case where type is Ml my $Ml = 0; my $frac = 0.5; my %menu = %{$Menu{$self->version}->{'CONSENSE'}}; foreach my $attr ( @CONSENSE_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/ROOTED/i){ $rooted = 1; $param_string .= $menu{'ROOTED'}; } elsif($attr =~/OUTGROUP/i){ if($rooted == 1){ $self->warn("Outgroup option cannot be used with a rooted tree"); next; } if($value !~/^\d+$/){ # is a name my %names = %{$self->names}; $names{$value} || $self->throw("Outgroup $value not found"); $value = $names{$value}; } $param_string .=$menu{'OUTGROUP'}."$value\n"; } elsif($attr=~/TYPE/i){ if($value=~/Ml/i){ ($value,$frac) = split(/\s+/,$value); #default if not given $frac ||= 0.5; if($frac <= 0.5 || $frac > 1){ $self->warn("fraction given is out of range 0.5no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $Consense->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a Consense run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/DrawGram.pm000066400000000000000000000257221302566030400251260ustar00rootroot00000000000000# $Id $ # # BioPerl module for Bio::Tools::Run::Phylo::Phylip::DrawGram # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::DrawGram - use Phylip DrawTree program to draw phylograms or phenograms =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::DrawGram; my $drawfact = Bio::Tools::Run::Phylo::Phylip::DrawGram->new(); my $treeimage = $drawfact->run($tree); =head1 DESCRIPTION This is a module for automating drawing of trees through Joe Felsenstein's Phylip suite. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Phylip::DrawGram; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME $FONTFILE @DRAW_PARAMS @OTHER_SWITCHES %OK_FIELD %DEFAULT); use strict; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the neighbor program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use DrawGram.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/drawgram'); # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::DrawGram->new(@params) BEGIN { %DEFAULT = ('PLOTTER' => 'P', 'SCREEN' => 'N'); $DEFAULT{'FONTFILE'} = Bio::Root::IO->catfile($ENV{'PHYLIPDIR'},"font1") if $ENV{'PHYLIPDIR'}; $PROGRAMNAME = 'drawgram'; @DRAW_PARAMS = qw(PLOTTER SCREEN TREESTYLE USEBRANCHLENS LABEL_ANGLE HORIZMARGINS VERTICALMARGINS SCALE TREEDEPTH STEMLEN TIPSPACE ANCESTRALNODES FONT); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@DRAW_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Phylip::DrawGram->new(); Function: Builds a new Bio::Tools::Run::Phylo::Phylip::DrawGram object Returns : an instance of Bio::Tools::Run::Phylo::Phylip::DrawGram Args : The available DrawGram parameters =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } $self->plotter($DEFAULT{'PLOTTER'}) unless $self->plotter; $self->screen($DEFAULT{'SCREEN'}) unless $self->screen; $self->fontfile($DEFAULT{'FONTFILE'}) unless $self->fontfile; return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 run Title : run Usage : my $file = $app->run($treefile); Function: Draw a tree Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub run{ my ($self,$input) = @_; # Create input file pointer my ($infilename) = $self->_setinput($input); if (!$infilename) { $self->throw("Problems setting up for drawgram. Probably bad input data in $input !"); } # Create parameter string to pass to neighbor program my $param_string = $self->_setparams(); # run drawgram my $plotfile = $self->_run($infilename,$param_string); return $plotfile; } =head2 draw_tree Title : draw_tree Usage : my $file = $app->draw_tree($treefile); Function: This method is deprecated. Please use run instead. Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub draw_tree{ return shift->run(@_); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to drawgram program Example : Returns : Bio::Tree object Args : Name of a file the tree to draw in newick format and a parameter string to be passed to drawgram =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile . "\n"; if( ! defined $self->fontfile ) { $self->throw("You must have defined a fontfile"); } if( -e $self->io->catfile($curpath,'fontfile') ) { $instring .= $self->io->catfile($curpath,'fontfile')."\n"; } elsif( File::Spec->file_name_is_absolute($self->fontfile) ) { $instring .= $self->io->catfile($self->fontfile)."\n"; } else { $instring .= $self->io->catfile($curpath,$self->fontfile)."\n"; } chdir($self->tempdir); $instring .= $param_string; $self->debug( "Program ".$self->executable." $param_string\n"); # open a pipe to run drawgram to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(DRAW,"|".$self->executable.">$null"); } else { open(DRAW,"|".$self->executable); } print DRAW $instring; close(DRAW); chdir($curpath); #get the results my $plotfile = $self->io->catfile($self->tempdir,$self->plotfile); $self->throw("drawgram did not create plotfile correctly ($plotfile)") unless (-e $plotfile); return $plotfile; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for drawing program Example : Returns : filename containing tree in newick format Args : Bio::Tree::TreeI object =cut sub _setinput { my ($self, $input) = @_; my $treefile; unless (ref $input) { # check that file exists or throw $treefile = $input; unless (-e $input) {return 0;} } elsif ($input->isa("Bio::Tree::TreeI")) { # Open temporary file for both reading & writing of BioSeq array my $tfh; ($tfh,$treefile) = $self->io->tempfile(-dir=>$self->tempdir); my $treeIO = Bio::TreeIO->new(-fh => $tfh, -format=>'newick'); $treeIO->write_tree($input); $treeIO->close(); close($tfh); $tfh = undef; } return $treefile; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for drawgram program Example : Returns : parameter string to be passed to drawgram Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my ($hmargin,$vmargin); my %menu = %{$Menu{$self->version}->{'DRAWGRAM'}}; foreach my $attr ( @DRAW_PARAMS) { $value = $self->$attr(); next unless defined $value; my @vals; if( ref($value) ) { ($value,@vals) = @$value; } $attr = uc($attr); if( ! exists $menu{$attr} ) { $self->warn("unknown parameter $attr, known params are ". join(",",keys %menu). "\n"); } if( ref ($menu{$attr}) !~ /HASH/i ) { unless( @vals ) { $param_string .= $menu{$attr}; } else { $param_string .= sprintf($menu{$attr},$value,@vals); } next; } my $seen = 0; for my $stype ( keys %{$menu{$attr}} ) { if( $value =~ /$stype/i ) { $param_string .= sprintf($menu{$attr}->{$stype},@vals); $seen = 1; last; } } unless( $seen ) { $self->warn("Unknown requested attribute $attr, $value is not known\n"); } } $param_string .="Y\n"; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $dragram->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a DrawGram run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/DrawTree.pm000066400000000000000000000271101302566030400251300ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::Phylo::Phylip::DrawTree # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::DrawTree - use Phylip DrawTree program to draw trees =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::DrawTree; my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(); my $treeimagefile = $drawfact->run($tree); =head1 DESCRIPTION This is a module for automating drawing of trees through Joe Felsenstein's Phylip suite. To set parameters with option you need to pass in an array reference or a string, depending on the parameter. For example: $treedraw->HORIZMARGINS(['2.00','2.5']); $treedraw->ANCESTRALNODES('C'); $treedraw->TREESTYLE('PHEN'); $treedraw->USEBRANCHLENS('N'); This can be a brittle module as the menus change in PHYLIP. It should support phylip 3.6 but no guarantees. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Phylo::Phylip::DrawTree; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME $FONTFILE @DRAW_PARAMS @OTHER_SWITCHES %OK_FIELD %DEFAULT); use strict; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics use Bio::Tools::Run::Phylo::Phylip::Base; use Cwd; @ISA = qw( Bio::Tools::Run::Phylo::Phylip::Base ); use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); # You will need to enable the neighbor program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use DrawTree.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/drawgram'); # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params) BEGIN { %DEFAULT = ('PLOTTER' => 'P', 'SCREEN' => 'N'); $DEFAULT{'FONTFILE'} = Bio::Root::IO->catfile($ENV{'PHYLIPDIR'},"font1") if $ENV{'PHYLIPDIR'}; $PROGRAMNAME="drawtree"; if (defined $ENV{'PHYLIPDIR'}) { $PROGRAMDIR = $ENV{'PHYLIPDIR'} || ''; $PROGRAM = Bio::Root::IO->catfile($PROGRAMDIR, $PROGRAMNAME.($^O =~ /mswin/i ?'.exe':'')); $DEFAULT{'FONTFILE'} = Bio::Root::IO->catfile($ENV{'PHYLIPDIR'},"font1"); } else { $PROGRAM = $PROGRAMNAME; } @DRAW_PARAMS = qw(PLOTTER SCREEN LABEL_ANGLE ROTATION TREEARC ITERATE SCALE HORIZMARGINS VERTICALMARGINS FONT ); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@DRAW_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $obj->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $drawfact->program_dir() Function: returns the program directory, obtained from ENV variable. Returns : string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(); Function: Builds a new Bio::Tools::Run::Phylo::Phylip::DrawTree object Returns : an instance of Bio::Tools::Run::Phylo::Phylip::DrawTree Args : The available DrawGram parameters =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } $self->plotter($DEFAULT{'PLOTTER'}) unless $self->plotter; $self->screen($DEFAULT{'SCREEN'}) unless $self->screen; $self->fontfile($DEFAULT{'FONTFILE'}) unless $self->fontfile; return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 run Title : run Usage : my $file = $app->run($treefile); Function: Draw a tree Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub run{ my ($self,$input) = @_; # Create input file pointer my ($infilename) = $self->_setinput($input); if (!$infilename) { $self->throw("Problems setting up for drawgram. Probably bad input data in $input !"); } # Create parameter string to pass to neighbor program my $param_string = $self->_setparams(); # run drawgram my $plotfile = $self->_run($infilename,$param_string); return $plotfile; } =head2 draw_tree Title : draw_tree Usage : my $file = $app->draw_tree($treefile); Function: This method is deprecated. Please use run method. Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub draw_tree{ return shift->run(@_); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to drawgram program Example : Returns : Bio::Tree object Args : Name of a file the tree to draw in newick format and a parameter string to be passed to drawgram =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile . "\n"; if( ! defined $self->fontfile ) { $self->throw("You must have defined a fontfile"); } if( -e $self->io->catfile($curpath,'fontfile') ) { $instring .= $self->io->catfile($curpath,'fontfile')."\n"; } elsif( File::Spec->file_name_is_absolute($self->fontfile) ) { #$instring .= $self->io->catfile($self->tempdir,$self->fontfile)."\n"; $instring .= $self->io->catfile($self->fontfile)."\n"; } else { $instring .= $self->io->catfile($curpath,$self->fontfile)."\n"; } chdir($self->tempdir); $instring .= $param_string; $self->debug( "Program ".$self->executable." $param_string\n"); # open a pipe to run drawgram to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(DRAW,"|".$self->executable.">$null"); } else { open(DRAW,"|".$self->executable); } print DRAW $instring; close(DRAW); chdir($curpath); #get the results my $plotfile = $self->io->catfile($self->tempdir,$self->plotfile); $self->throw("drawgram did not create plotfile correctly ($plotfile)") unless (-e $plotfile); return $plotfile; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for drawing program Example : Returns : filename containing tree in newick format Args : Bio::Tree::TreeI object =cut sub _setinput { my ($self, $input) = @_; my $treefile; unless (ref $input) { # check that file exists or throw $treefile = $input; unless (-e $input) {return 0;} } elsif ($input->isa("Bio::Tree::TreeI")) { # Open temporary file for both reading & writing of BioSeq array my $tfh; ($tfh,$treefile) = $self->io->tempfile(-dir=>$self->tempdir); my $treeIO = Bio::TreeIO->new(-fh => $tfh, -format=>'newick'); $treeIO->write_tree($input); $treeIO->close(); close($tfh); undef $tfh; } return $treefile; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for drawgram program Example : Returns : parameter string to be passed to drawgram Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my ($hmargin,$vmargin); my %menu = %{$Menu{$self->version}->{'DRAWTREE'}}; foreach my $attr ( @DRAW_PARAMS) { $value = $self->$attr(); next unless defined $value; my @vals; if( ref($value) ) { ($value,@vals) = @$value; } $attr = uc($attr); if( ! exists $menu{$attr} ) { $self->warn("unknown parameter $attr, known params are ". join(",",keys %menu). "\n"); } if( ref ($menu{$attr}) !~ /HASH/i ) { unless( @vals ) { $param_string .= $menu{$attr}; } else { $param_string .= sprintf($menu{$attr},$value,@vals); } next; } my $seen = 0; for my $stype ( keys %{$menu{$attr}} ) { if( $value =~ /$stype/i ) { $param_string .= sprintf($menu{$attr}->{$stype},@vals); $seen = 1; last; } } unless( $seen ) { $self->warn("Unknown requested attribute $attr, $value is not known\n"); } } $param_string .="Y\n"; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $dragram->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a DrawTree run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a Bio::Root::IO object Returns : Bio::Root::IO object Args : none See L =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/Neighbor.pm000066400000000000000000000414651302566030400251610ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::Neighbor # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::Neighbor - Wrapper for the phylip program neighbor by Joseph Felsenstein for creating a phylogenetic tree(either through Neighbor or UPGMA) based on protein distances based on amino substitution rate. 14 Nov 2002 Shawn Works with Phylip version 3.6 =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object. # Create the Distance Matrix # using a default PAM matrix and id name lengths limit of 30 note to # use id name length greater than the standard 10 in neighbor, you # will need to modify the neighbor source code $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); my $matrix = $protdist_factory->run($aln); #Create the tree passing in the distance matrix @params = ('type'=>'NJ','outgroup'=>2,'lowtri'=>1, 'upptri'=>1,'subrep'=>1); my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); #you can set your outgroup using either a number specifying #the rank in the matrix or you can just use the name of the #species $neighbor_factory->outgroup('ENSP00001'); #or $neighbor_factory->outgroup(1); my ($tree) = $neighbor_factory->run($matrix); # Alternatively, one can create the tree by passing in a file name # containing a phylip formatted distance matrix(using protdist) my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); my ($tree) = $neighbor_factory->run('/home/shawnh/prot.dist'); # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds original names # Step 2. Run ProtDist and Neighbor: $matrix = $protdist_factory-> creat_distance_matrix($aln_safe); # Use $aln_safe instead of $aln $tree = $neighbor_factory->run($matrix); # Step 3. Retrieve orgininal OTU names: use Bio::Tree::Tree; my @nodes=$tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } =head1 PARAMTERS FOR NEIGHBOR COMPUTATION =cut =head2 TYPE Title : TYPE Description : (optional) This sets the type of tree to construct, using neighbor joining or UPGMA. NJ Neighbor Joining UPGMA UPGMA Usage : @params = ('type'=>'X');#where X is one of the values above Defaults to NJ For more information on the usage of the different models, please refer to the documentation found in the phylip package. =head2 OUTGROUP (*ONLY AVAILABLE FOR NEIGHBOR JOINING) Title : OUTGROUP Description : (optional) This option selects the species to be used as the outgroup Acceptable Values: integer Usage : @params = ('outgroup'=>'X'); where X is an positive integer not more than the number of sequences Defaults to 1 =head2 LOWTRI Title : LOWTRI Description : (optional) This indicates that the distance matrix is input in Lower-triangular form (the lower-left half of the distance matrix only, without the zero diagonal elements) Usage : @params = ('lowtri'=>'X'); where X is either 1 or 0 Defaults to 0 =head2 UPPTRI Title : UPPTRI Description : (optional) This indicates that the distance matrix is input in upper-triangular form (the upper-right half of the distance matrix only, without the zero diagonal elements.) Usage : @params = ('upptri'=>'X'); where X is either 1 or 0 Defaults to 0 =head2 SUBREP Title : SUBREP Description : (optional) This is the Subreplication option. It informs the program that after each distance will be provided an integer indicating that the distance is a mean of that many replicates. Usage : @params = ('subrep'=>'X'); where X is either 1 or 0 Defaults to 0 =head2 JUMBLE Title : JUMBLE Description : (optional) This enables you to tell the program to use a random number generator to choose the input order of species. seed: an integer between 1 and 32767 and of the form 4n+1 which means that it must give a remainder of 1 when divided by 4. Each different seed leads to a different sequence of addition of species. By simply changing the random number seed and re-running programs one can look for other, and better trees. iterations: Usage : @params = ('jumble'=>'17); where 17 is the random seed Defaults to no jumble =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 CONTRIBUTORS Email:jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::Neighbor; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @NEIGHBOR_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the neighbor program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable PHYLIPDIR in # every script that will use Neighbor.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/neighbor'); # my $neighbor_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params); # BEGIN { $PROGRAMNAME="neighbor"; if (defined $ENV{PHYLIPDIR}) { $PROGRAMDIR = $ENV{PHYLIPDIR} || ''; $PROGRAM = Bio::Root::IO->catfile($PROGRAMDIR, $PROGRAMNAME.($^O =~ /mswin/i ?'.exe':'')); } else { $PROGRAM = $PROGRAMNAME; } @NEIGHBOR_PARAMS = qw(TYPE OUTGROUP LOWTRI UPPTRI SUBREP JUMBLE MULTIPLE); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@NEIGHBOR_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'neighbor'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } if (! defined $self->idlength){ $self->idlength(10); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.dist'; $tree = $neighborfactory->run($inputfilename); or $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); $matrix = $protdist_factory->create_distance_matrix($aln); $tree= $neighborfactory->run($matrix); Function: a Bio:Tree from a protein distance matrix created by protidst Example : Returns : Bio::Tree Args : Name of a file containing a protein distance matrix in Phylip format or a hash ref to a matrix Throws an exception if argument is not either a string (eg a filename) or a Hash. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($temp,$infilename, $seq); my ($attr, $value, $switch); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for neighbor. Probably bad input data in $input !");} # Create parameter string to pass to neighbor program my $param_string = $self->_setparams(); # run neighbor my @tree = $self->_run($infilename,$param_string); return wantarray ? @tree: \@tree; } =head2 create_tree Title : create_tree Usage : my $file = $app->create_tree($treefile); Function: This method is deprecated. Please use run method. Returns : File containing the rendered tree Args : either a Bio::Tree::TreeI OR filename of a tree in newick format =cut sub create_tree{ return shift->run(@_); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to neighbor program Example : Returns : Bio::Tree object Args : Name of a file containing protein distances in Phylip format and a parameter string to be passed to neighbor =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile."\n$param_string"; $self->debug( "Program ".$self->executable."\n"); chdir($self->tempdir); #open a pipe to run neighbor to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(NEIGHBOR,"|".$self->executable.">$null"); } else { open(NEIGHBOR,"|".$self->executable); } print NEIGHBOR $instring; close(NEIGHBOR); chdir($curpath); #get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); my $treefile = $self->io->catfile($self->tempdir,$self->treefile); $self->throw("neighbor did not create tree correctly (expected $treefile) ") unless (-e $treefile); my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick'); my @tree; while (my $tree = $in->next_tree){ push @tree, $tree; } # Clean up the temporary files created along the way... unless ( $self->save_tempfiles ) { unlink $outfile; unlink $treefile; } return @tree; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for neighbor program Example : Returns : name of file containing the protein distance matrix in Phylip format Args : name of file created by protdist or ref to hash created by Bio::Tools:Run::Phylo::Phylip::ProtDist =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh); #If $input is not a filename it better be a HASF reference # a phy formatted alignment file created by protdist unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } my @input = ref($input) eq "ARRAY" ? @{$input} : ($input); ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $input_count = 0; foreach my $input(@input){ if ($input->isa("Bio::Matrix::PhylipDist")){ # Open temporary file for both reading & writing of distance matrix print $tfh $input->print_matrix; $input_count++; } } $self->_input_nbr($input_count); close($tfh); #get names from the first matrix, to be used in outgroup ordering my %names; $input = shift @input; #set the species names my @names = @{$input->names}; for(my $i=0; $i<= $#names; $i++){ $names{$names[$i]} = $i+1; } $self->names(\%names); return $alnfilename; } sub _input_nbr { my ($self,$val) = @_; if($val){ $self->{'_input_nbr'} = $val; } return $self->{'_input_nbr'}; } =head2 names() Title : names Usage : $tree->names(\%names) Function: get/set for a hash ref for storing names in matrix with rank as values. Example : Returns : hash reference Args : hash reference =cut sub names { my ($self,$name) = @_; if($name){ $self->{'_names'} = $name; } return $self->{'_names'}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for neighbor program Example : Returns : parameter string to be passed to neighbor Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $type =""; my $version = $self->version; my %menu = %{$Menu{$version}->{'NEIGHBOR'}}; foreach my $attr ( @NEIGHBOR_PARAMS) { $value = $self->$attr(); next unless (defined $value && $value); if ($attr =~/TYPE/i){ if ($value=~/UPGMA/i){ $type = "UPGMA"; $param_string .= $menu{'TYPE'}{'UPGMA'}; } } elsif($attr =~ /OUTGROUP/i){ if ($type ne "UPGMA"){ if($value !~/^\d+$/){ # is a name so find the rank my %names = %{$self->names}; $names{$value} || $self->throw("Outgroup $value not found"); $value = $names{$value}; } $param_string .= $menu{'OUTGROUP'}."$value\n"; } else { $self->throw("Can't set outgroup using UPGMA. Use Neighbor-Joining instead"); } } elsif ($attr =~ /JUMBLE/i){ $self->throw("Unallowed value for random seed, need odd number") unless ($value =~ /\d+/ && ($value % 2 == 1)); $param_string .=$menu{'JUMBLE'}."$value\n"; } elsif($attr=~/MULTIPLE/i){ $param_string.=$menu{'MULTIPLE'}."$value\n"; #version 3.6 needs a random seed if($version eq "3.6"){ $param_string .= (2 * int(rand(10000)) + 1)."\n"; } } else{ $param_string .= $menu{uc $attr}; } } if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){ $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n"; } $param_string .=$menu{'SUBMIT'}; return $param_string; } =head2 outfile Title : outfile Usage : $obj->outfile($newval) Function: Get/Set default PHYLIP outfile name ('outfile' usually) Returns : value of outfile Args : newvalue (optional) =cut =head2 treefile Title : treefile Usage : $obj->treefile($newval) Function: Get/Set the default PHYLIP treefile name ('treefile' usually) Returns : value of treefile Args : newvalue (optional) =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/PhylipConf.pm000066400000000000000000000172161302566030400254740ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::PhylipConf # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::PhylipConf =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phylip::PhylipConf; my %menu = %{$Bio::Tools::Run::Phylo::Phylip::PhylipConf::Menu->{$version}->{'PROTDIST'}}; =head1 DESCRIPTION A configuration for managing menu configuration differences between version 3.5 and 3.6 =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 CONTRIBUTORS Email:jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::PhylipConf; use strict; use Exporter; use vars qw(@ISA %Menu %FileName $RESOLUTIONX $RESOLUTIONY @EXPORT_OK); use base 'Exporter'; $RESOLUTIONX = 300; $RESOLUTIONY = 300; @EXPORT_OK = qw(%FileName %Menu); %FileName = ( "3.5"=>{'OUTFILE'=>'outfile', 'TREEFILE'=>'treefile', 'PLOTFILE'=>'plotfile', }, "3.6"=>{'OUTFILE'=>'outfile', 'TREEFILE'=>'outtree', 'PLOTFILE'=>'plotfile', }, ); %Menu = ( "3.5" => { 'PROTDIST' => { 'MODEL' => { 'CAT' =>"P\nP\n", 'KIMURA'=>"P\n", }, 'GENCODE'=> { 'ALLOWED'=>"UMVFY", 'OPTION' =>"C\n", }, 'CATEGORY'=>{ 'ALLOWED'=>"CHG", 'OPTION' =>"A\n", }, 'PROBCHANGE'=>"E\n", 'TRANS' =>"T\n", 'FREQ' =>"F\n", 'SUBMIT' =>"Y\n", 'MULTIPLE' =>"M\n", }, 'NEIGHBOR'=>{ 'TYPE' => { 'UPGMA'=>"N\n", }, 'OUTGROUP'=>"O\n", 'LOWTRI' =>"L\n", 'UPPTRI' =>"R\n", 'SUBREP' =>"S\n", 'JUMBLE' =>"J\n", 'SUBMIT' =>"Y\n", 'MULTIPLE' =>"M\n", }, 'PROTPARS'=>{ 'THRESHOLD'=>"T\n", 'JUMBLE' =>"J\n", 'OUTGROUP' =>"O\n", 'SUBMIT' =>"Y\n", }, 'SEQBOOT'=>{ 'DATATYPE' =>{ 'SEQUENCE'=>"", 'MORPH' =>"D\n", 'REST' =>"D\nD\n", 'GENEFREQ'=>"D\nD\nD\n", }, 'ALLELES' => "A\n", 'PERMUTE' => { 'BOOTSTRAP'=>"", 'JACKKNIFE'=>"J\n", 'PERMUTE' =>"J\nJ\n", }, 'REPLICATES'=>"R\n", 'SUBMIT' =>"Y\n", }, 'CONSENSE'=>{ 'ROOTED' => "R\n", 'OUTGROUP' => "O\n", 'SUBMIT' =>"Y\n", }, }, "3.6"=>{ 'PROTDIST'=>{ 'MODEL' => { 'PMB' =>"P\n", 'PAM' =>"P\nP\n", 'KIMURA' =>"P\nP\nP\n", 'CAT' =>"P\nP\nP\nP\n", 'JTT' =>"Y\n", }, 'GENCODE'=> { 'ALLOWED'=>"UMVFY", 'OPTION' =>"U\n", }, 'CATEGORY'=> { 'ALLOWED'=>"CHG", 'OPTION' =>"A\n", }, 'PROBCHANGE'=>"E\n", 'TRANS' =>"T\n", 'FREQ' =>"F\n", 'WEIGHTS' =>"W\n", 'SUBMIT' => "Y\n", 'MULTIPLE' =>"M\nD\n", }, 'NEIGHBOR' => { 'TYPE' => { 'UPGMA'=>"N\n", }, 'OUTGROUP'=>"O\n", 'LOWTRI' =>"L\n", 'UPPTRI' =>"R\n", 'SUBREP' =>"S\n", 'JUMBLE' =>"J\n", 'SUBMIT' =>"Y\n", 'MULTIPLE' =>"M\n", }, 'PROTPARS' => { 'THRESHOLD'=>"T\n", 'JUMBLE' =>"J\n", 'OUTGROUP' =>"O\n", 'SUBMIT' =>"Y\n", }, 'DRAWGRAM' => { 'SCALE' => "R\n", 'HORIZMARGINS' => "M\n%.2f\n%.2f\n", 'VERTICALMARGINS' => "M\n%.2f\n%.2f", 'SCREEN' => { 'Y|YES|1' => "V\nX\n", 'N|NO|0' => "V\nN\n", }, 'FONT' => "F\n%s\n", 'PAGES' => { 'L|PAGES|SIZE' => "#\nL\n%d\n%d\nM\n", 'P|PHYSICAL' => "#\nP\n%.4f\n%.4f\nM\n", 'O|OVERLAP' => "#\nO\n%.4f\n%.4f\nM\n", }, 'PLOTTER' => { 'P|POSTSCRIPT' => "P\nL\n", 'PICT' => "P\nM\n", "HP|PCL|LaserJect" => "P\nJ\n", "BMP" => "P\nW\n$RESOLUTIONX\n$RESOLUTIONY", "FIG" => "P\nF\n", "IDRAW" => "P\nA\n", "VRML" => "P\nZ\n", "PCX" => "P\nP\n3\n", }, 'ANCESTRALNODES' => { 'I|INTER|INTERMEDIETE' => "A\nI\n", 'W|WEIGHTED' => "A\nW\n", 'C|CENT|CENTERED' => "A\nC\n", 'N|INNER|INNERMOST' => "A\nN\n", 'V' => "A\nV\n", }, 'TREESTYLE' => { 'C|CLAD|CLADOGRAM' => "S\nC\n", 'P|PHEN|PHENOGRAM' => "S\nP\n", 'V|CURV|CURVOGRAM' => "S\nV\n", 'E|EURO|EUROGRAM' => "S\nE\n", 'S|SWOOP|SWOOPOGRAM' => "S\nS\n", 'O|CIRC|CIRCULAR' => "S\nO\n", }, 'TIPSPACE' => "C\n%.4f\n", 'STEMLEN' => "T\n%.4f\n", 'TREEDEPTH' => "D\n%.4f\n", 'LABEL_ANGLE' => "L\n%.4f\n", 'USEBRANCHLENS' => { '1|Y|YES' => "", '0|N|NO' => "B\n", }, }, 'DRAWTREE' => { 'SCREEN' => { 'Y|YES|1' => "V\nX\n", 'N|NO|0' => "V\nN\n", }, 'PLOTTER' => { 'L|P|POSTSCRIPT' => "P\nL\n", 'PICT' => "P\nM\n", "HP|PCL|LaserJect" => "P\nJ\n", "BMP" => "P\nW\n$RESOLUTIONX\n$RESOLUTIONY", "FIG" => "P\nF\n", "IDRAW" => "P\nA\n", "VRML" => "P\nZ\n", "PCX" => "P\nP\n3\n", }, 'LABEL_ANGLE' => { 'F|FIXED' => "L\nF\n%d\n", 'R|RADIAL' => "L\nR\n", 'A|ALONG' => "L\nA\n", 'M|MIDDLE' => "L\nM\n", }, 'ROTATION' => "R\n%d\n", 'ITERATE' => { 'E|EQUAL|DAYLIGHT' => "", 'N|NBODY|N-BODY' => "I\n", 'NO|FALSE' => "I\nI\n", }, 'TREEARC' => "I\nI\nA\n%d\n", 'SCALE' => "S\n%.2f\n", 'PAGES' => { 'L|PAGES|SIZE' => "#\nL\n%d\n%d\nM\n", 'P|PHYSICAL' => "#\nP\n%.4f\n%.4f\nM\n", 'O|OVERLAP' => "#\nO\n%.4f\n%.4f\nM\n", }, 'HORIZMARGINS' => "M\n%.2f\n%.2f\n", 'VERTICALMARGINS' => "M\n%.2f\n%.2f", }, 'SEQBOOT'=>{ 'DATATYPE' => { 'SEQUENCE'=> "", 'MORPH' =>"D\n", 'REST' =>"D\nD\n", 'GENEFREQ'=>"D\nD\nD\n", }, 'ALLELES' => "A\n", 'PERMUTE' => { 'BOOTSTRAP'=>"", 'JACKKNIFE'=>"J\n", 'PERMUTE' =>"J\nJ\n", }, 'REPLICATES'=>"R\n", 'SUBMIT' =>"Y\n", }, 'CONSENSE'=>{ 'TYPE' => { 'MRE' =>"", 'STRICT' =>"C\n", 'MR' =>"C\nC\n", 'ML' =>"C\nC\nC\n", }, 'ROOTED' => "R\n", 'OUTGROUP' => "O\n", 'SUBMIT' =>"Y\n", }, }, ); 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/ProtDist.pm000066400000000000000000000443441302566030400251730ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip program protdist =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object. # Create the Distance Matrix using a default PAM matrix and id name # lengths limit of 30 note to use id name length greater than the # standard 10 in protdist, you will need to modify the protdist source # code @params = ('MODEL' => 'PAM'); $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); my ($matrix) = $protdist_factory->run($aln); # an array of Bio::Matrix::PhylipDist matrix #finding the distance between two sequences my $distance = $matrix->get_entry('protein_name_1','protein_name_2'); my @column = $matrix->get_column('protein_name_1'); my @row = $martrix->get_row('protein_name_1'); my @diag = $matrix->get_diagonal(); print $matrix->print_matrix; #Alternatively, one can create the matrix by passing in a file #name containing a multiple alignment in phylip format $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); my ($matrix) = $protdist_factory->run('/home/shawnh/prot.phy'); # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds original names # Step 2. Run ProtDist and Neighbor: ($matrix) = $protdist_factory-> create_distance_matrix($aln_safe); # Use $aln_safe instead of $aln ($tree) = $neighbor_factory->run($matrix); # Step 3. Retrieve orgininal OTU names: use Bio::Tree::Tree; my @nodes=$tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } =head1 DESCRIPTION Wrapper for protdist Joseph Felsentein for creating a distance matrix comparing protein sequences from a multiple alignment file or a L object and returns a L object; VERSION Support This wrapper currently supports v3.5 of phylip. There is also support for v3.6. =head1 PARAMETERS FOR PROTDIST COMPUTATION =head2 MODEL Title : MODEL Description : (optional) This sets the model of amino acid substitution used in the calculation of the distances. 3 different models are supported: PAM Dayhoff PAM Matrix(default) KIMURA Kimura's Distance CAT Categories Distance Usage: @params = ('model'=>'X');#where X is one of the values above Defaults to PAM For more information on the usage of the different models, please refer to the documentation defaults to Equal (0.25,0.25,0.25,0.25) found in the phylip package. Additional models in PHYLIP 3.6 PMB - Henikoff/Tillier PMB matrix JTT - Jones/Taylor/Thornton =head2 MULTIPLE Title : MULTIPLE Description: (optional) This allows multiple distance matrices to be generated from multiple MSA. Usage: @params = ('MULTIPLE'=>100) where the value specifyies the number of aligments given. =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH THE Categories Distance MODEL* =head2 GENCODE Title : GENCODE Description : (optional) This option allows the user to select among various nuclear and mitochondrial genetic codes. Acceptable Values: U Universal M Mitochondrial V Vertebrate mitochondrial F Fly mitochondrial Y Yeast mitochondrial Usage : @params = ('gencode'=>'X'); where X is one of the letters above Defaults to U =head2 CATEGORY Title : CATEGORY Description : (optional) This option sets the categorization of amino acids all have groups: (Glu Gln Asp Asn), (Lys Arg His), (Phe Tyr Trp) plus: G George/Hunt/Barker: (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr Pro) C Chemical: (Cys Met), (Val Leu Ileu Gly Ala Ser Thr), (Pro) H Hall: (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr), (Pro) Usage : @params = ('category'=>'X'); where X is one of the letters above Defaults to G =head2 PROBCHANGE Title : PROBCHANGE Description : (optional) This option sets the ease of changing category of amino acid. (1.0 if no difficulty of changing,less if less easy. Can't be negative) Usage : @params = ('probchange'=>X) where 0<=X<=1 Defaults to 0.4570 =head2 TRANS Title : TRANS Description : (optional) This option sets transition/transversion ratio can be any positive number Usage : @params = ('trans'=>X) where X >= 0 Defaults to 2 =head2 FREQ Title : FREQ Description : (optional) This option sets the frequency of each base (A,C,G,T) The sum of the frequency must sum to 1. For example A,C,G,T = (0.25,0.5,0.125,0.125) Usage : @params = ('freq'=>('W','X','Y','Z') where W + X + Y + Z = 1 Defaults to Equal (0.25,0.25,0.25,0.25) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::ProtDist; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROTDIST_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Bio::Tools::Phylo::Phylip::ProtDist; use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the protdist program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable CLUSTALDIR in # every script that will use Clustal.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/protdist'); # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params); # BEGIN { @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS WEIGHTS FREQ MULTIPLE); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'protdist'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.phy'; $matrix= $prodistfactory->run($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $protdistfactory->align($seq_array_ref); $matrix = $protdistfactory->run($aln); Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file Example : Returns : L Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");} # Create parameter string to pass to protdist program my $param_string = $self->_setparams(); # run protdist my @mat = $self->_run($infilename,$param_string); return wantarray ? @mat:\@mat; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to protdist program Example : Returns : Bio::Tree object Args : Name of a file containing a set of multiple alignments in Phylip format and a parameter string to be passed to protdist =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile."\n$param_string"; $self->debug( "Program ".$self->executable." $instring\n"); chdir($self->tempdir); #open a pipe to run protdist to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(PROTDIST,"|".$self->executable .">$null"); } else { open(PROTDIST,"|".$self->executable); } print PROTDIST $instring; close(PROTDIST); # get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); chdir($curpath); $self->throw("protdist did not create matrix correctly ($outfile)") unless (-e $outfile); #Create the distance matrix here my $parser = Bio::Tools::Phylo::Phylip::ProtDist->new(-file=>$outfile); my @matrix; while (my $mat = $parser->next_matrix){ push @matrix, $mat; } # Clean up the temporary files created along the way... unlink $outfile unless $self->save_tempfiles; return @matrix; } =head2 create_distance_matrix Title : create_distance_matrix Usage : my $file = $app->create_distance_matrix($treefile); Function: This method is deprecated. Please use run method. Returns : L Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub create_distance_matrix{ return shift->run(@_); } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for protdist program Example : Returns : name of file containing a multiple alignment in Phylip format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$tfh); # suffix is used to distinguish alignment files from an align obkect #If $input is not a reference it better be the name of a file with the sequence/ # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } my @input = ref $input eq 'ARRAY' ? @{$input} : ($input); # $input may be a SimpleAlign Object ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip', -idlength=>$self->idlength()); my $input_count = 0; foreach my $input(@input){ if ($input->isa("Bio::SimpleAlign")){ # Open temporary file for both reading & writing of BioSeq array $alnIO->write_aln($input); } $input_count++; } $alnIO->close(); close($tfh); $tfh = undef; $self->_input_nbr($input_count); return $alnfilename; } sub _input_nbr { my ($self,$val) = @_; if($val){ $self->{'_input_nbr'} = $val; } return $self->{'_input_nbr'}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for protdist program Example : Returns : parameter string to be passed to protdist Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my %menu = %{$Menu{$self->version}->{'PROTDIST'}}; foreach my $attr ( @PROTDIST_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/MODEL/i){ if ($value=~/CAT/i){ $cat = 1; } $param_string .= $menu{'MODEL'}{$value}; } if($attr=~/MULTIPLE/i){ $param_string.=$menu{'MULTIPLE'}."$value\n"; } if ($cat == 1){ if($attr =~ /GENCODE/i){ my $allowed = $menu{'GENCODE'}{'ALLOWED'}; $self->throw("Unallowed value for genetic code") unless ($value =~ /[$allowed]/); $param_string .= $menu{'GENCODE'}{'OPTION'}."$value\n"; } if ($attr =~/CATEGORY/i){ my $allowed = $menu{'CATEGORY'}{'ALLOWED'}; $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[$allowed]/); $param_string .= $menu{'CATEGORY'}{'OPTION'}."$value\n"; } if ($attr =~/PROBCHANGE/i){ if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){ $param_string .= $menu{'PROBCHANGE'}."$value\n"; } else { $self->throw("Unallowed value for probability change category"); } } if ($attr =~/TRANS/i){ if (($value=~/\d+/) && ($value >=0)){ $param_string .=$menu{'TRANS'}."$value\n"; } } if ($attr =~ /FREQ/i){ my @freq = split(",",$value); if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25) $param_string .=$menu{'FREQ'}.$freq[0]."\n"; } elsif ($#freq == 3) {#must have 4 digits for each base $param_string .=$menu{'FREQ'}; foreach my $f (@freq){ $param_string.="$f\n"; } } else { $self->throw("Unallowed value for base frequencies"); } } } } #set multiple option is not set and there are more than one sequence if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){ $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n"; } $param_string .=$menu{'SUBMIT'}; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $protdist->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a ProtDist run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/ProtPars.pm000066400000000000000000000315531302566030400251730ustar00rootroot00000000000000# $Id$ # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtPars # # Created by Shawn Hoon # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::ProtPars - Object for creating a L object from a multiple alignment file or a SimpleAlign object 14 Nov 2002 Shawn Works with Phylip version 3.6 =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object. #Create the Tree #using a threshold value of 30 and id name lengths limit of 30 #note to use id name length greater than the standard 10 in protpars, # you will need to modify the protpars source code $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2); $tree = $tree_factory->run($aln); #Or one can pass in a file name containing a multiple alignment #in phylip format: $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars->new(idlength=>30,threshold=>10); $tree = $tree_factory->run("/usr/users/shawnh/COMPARA/prot.phy"); # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds original names # Step 2. Run ProtPars: $tree = $protpars_factory->run($aln_safe); # Use $aln_safe instead of $aln # Step 3. Retrieve orgininal OTU names: use Bio::Tree::Tree; my @nodes=$tree->get_nodes(); foreach my $nd (@nodes){ $nd->id($ref_name->{$nd->id_output}) if $nd->is_Leaf; } =head1 PARAMTERS FOR PROTPARS COMPUTATION =head2 THRESHOLD Title : THRESHOLD Description : (optional) This sets a threshold such that if the number of steps counted in a character is higher than the threshold, it will be taken to be the threshold value rather than the actual number of steps. You should use a positive real number greater than 1. Please see the documetation from the phylip package for more information. =head2 OUTGROUP Title : OUTGROUP Description : (optional) This specifies which species is to be used to root the tree by having it become the outgroup. Input values are integers specifying which species to use. Defaults to 1 =head2 JUMBLE Title : JUMBLE Description : (optional) This enables you to tell the program to use a random number generator to choose the input order of species. Input values is of the format: seed,iterations eg 17,10 seed: an integer between 1 and 32767 and of the form 4n+1 which means that it must give a remainder of 1 when divided by 4. Each different seed leads to a different sequence of addition of species. By simply changing the random number seed and re-running programs one can look for other, and better trees. iterations: For a value of 10, this will tell the program to try ten different orders of species in constructing the trees, and the results printed out will reflect this entire search process (that is, the best trees found among all 10 runs will be printed out, not the best trees from each individual run). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 CONTRIBUTORS Email jason-AT-bioperl_DOT_org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phylip::ProtPars; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROTPARS_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Cwd; use Bio::AlignIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the protpars program. This # can be done in (at least) two ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable CLUSTALDIR in # every script that will use Clustal.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('program'=>'/usr/local/bin/protdist'); # my $protpars_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars->new(@params); # BEGIN { @PROTPARS_PARAMS = qw(THRESHOLD JUMBLE OUTGROUP); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@PROTPARS_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'protpars'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.phy'; $tree = $factory->run($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $factory->run($seq_array_ref); $tree = $treefactory->run($aln); Function: Create a protpars tree from a SimpleAlign object Example : Returns : L object Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for protpars. Probably bad input data in $input !");} # Create parameter string to pass to protpars program my $param_string = $self->_setparams(); # run protpars my $aln = $self->_run($infilename,$param_string); } =head2 create_tree Title : create_tree Usage : $inputfilename = 't/data/prot.phy'; $tree = $factory->create_tree($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $factory->align($seq_array_ref); $tree = $treefactory->create_tree($aln); Function: Create a protpars tree from a SimpleAlign object Example : Returns : L object Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub create_tree{ return shift->run(@_); } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to protpars program Example : Returns : Bio::Tree object Args : Name of a file containing a set of multiple alignments in Phylip format and a parameter string to be passed to protpars =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } $instring = $infile."\n$param_string"; $self->debug( "Program ".$self->executable."\n"); chdir($self->tempdir); #open a pipe to run protpars to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(PROTPARS,"|".$self->executable.">$null"); } else { open(PROTPARS,"|".$self->executable); } print PROTPARS $instring; close(PROTPARS); chdir($curpath); #get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); my $treefile = $self->io->catfile($self->tempdir,$self->treefile); $self->throw("Protpars did not create treefile correctly") unless (-e $treefile); #create the tree my $in = Bio::TreeIO->new(-file => $treefile, '-format' => 'newick'); my $tree = $in->next_tree(); unless ( $self->save_tempfiles ) { # Clean up the temporary files created along the way... unlink $treefile; unlink $outfile; } return $tree; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for protpars program Example : Returns : name of file containing a multiple alignment in Phylip format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input, $suffix) = @_; my ($alnfilename,$infilename, $temp, $tfh,$input_tmp,$input_fh); # If $input is not a reference it better be the name of a # file with the sequence/ # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } # $input may be a SimpleAlign Object if ($input->isa("Bio::Align::AlignI")) { # Open temporary file for both reading & writing of BioSeq array ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip',idlength=>$self->idlength()); $alnIO->write_aln($input); $alnIO->close(); close($tfh); $tfh = undef; return $alnfilename; } return 0; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for protpars program Example : Returns : parameter string to be passed to protpars Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; my %menu = %{$Menu{$self->version}->{'PROTPARS'}}; for $attr ( @PROTPARS_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/JUMBLE/i){ my ($seed,$itr) = split(",",$value); $param_string .=$menu{'JUMBLE'}."$seed\n$itr\n"; } else { $param_string .= $menu{uc $attr}."$value\n"; } } $param_string .="Y\n"; return $param_string; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phylip/SeqBoot.pm000066400000000000000000000356651302566030400250050ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Phylo::Phylip::SeqBoot # # Created by # # Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phylip::SeqBoot - Wrapper for the phylip program SeqBoot =head1 SYNOPSIS #Create a SimpleAlign object @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); $inputfilename = 't/data/cysprot.fa'; $aln = $factory->align($inputfilename); # $aln is a SimpleAlign object. # Use seqboot to generate bootstap alignments my @params = ('datatype'=>'SEQUENCE','replicates'=>100); my $seq = Bio::Tools::Run::Phylo::Phylip::SeqBoot->new(@params); my $aln_ref = $seq->run($aln); my $aio = Bio::AlignIO->new(-file=>">alignment.bootstrap",-format=>"phylip"); foreach my $ai(@{$aln_ref}){ $aio->write_aln($ai); } # To prevent PHYLIP from truncating sequence names: # Step 1. Shelf the original names: my ($aln_safe, $ref_name)= # $aln_safe has serial names $aln->set_displayname_safe(); # $ref_name holds orginal names # Step 2. Run PHYLIP programs: $aln_ref = $seq->run($aln_safe); # Use $aln_safe instead of $aln # Step 3. Retrieve orgininal names $aio = Bio::AlignIO->new( -file=>">alignment.bootstrap", -format=>"fasta"); # FASTA output to view full names foreach my $ai(@{$aln_ref}){ my $new_aln=$ai->restore_displayname($ref_name); # Restore names $aio->write_aln($new_aln); } =head1 DESCRIPTION Wrapper for seqboot from the phylip package by Joseph Felsentein. Taken from phylip doc... "SEQBOOT is a general boostrapping tool. It is intended to allow you to generate multiple data sets that are resampled versions of the input data set. SEQBOOT can handle molecular sequences, binary characters, restriction sites, or gene frequencies." More documentation on using seqboot and setting parameters may be found in the phylip package. VERSION Support This wrapper currently supports v3.5 of phylip. There is also support for v3.6 although this is still experimental as v3.6 is still under alpha release and not all functionalities maybe supported. =head1 PARAMETERS FOR SEQBOOT =head2 MODEL Title : DATATYPE Description : (optional) This program supports 3 different datatypes SEQUENCE: Molecular Sequences MORPH : Discrete Morphological Characters REST : Restriction Sites GENEFREQ: Gene Frequencies Defaults to SEQUENCE =head2 PERMUTE Title: PERMUTE Description: (optional) 3 different resampling methods are available: BOOTSTRAP : creating a new data set by sampling N characters randomly with replacement The resulting data set has the same size as the original, but some characters have been left out and others are duplicated JACKKNIFE : Delete-half-jackknifing. It involves sampling a random half of the characters, and including them in the data but dropping the others The resulting data sets are half the size of the original, and no characters are duplicated. PERMUTE : Permuting species within characters. It involves permuting the columns of the data matrix separately. This produces data matrices that have the same number and kinds of characters but no taxonomic structure. Defaults to BOOTSTRAP =head2 REPLICATES Title : REPLICATES Description : (optional) This options allows the user to set the number of replicate data sets. Most statisticians would be happiest with 1000 to 10,000 replicates in a bootstrap, but 100 gives a good rough picture Defaults to 100 =head2 ALLELES Title : ALLELES Description : (optional) This option is to be used with gene frequencies datatype option to specify that all alleles at each locus are in the input file. Defaults to NULL =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' package Bio::Tools::Run::Phylo::Phylip::SeqBoot; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SEQBOOT_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::TreeIO; use Bio::Tools::Run::Phylo::Phylip::Base; use Bio::Tools::Run::Phylo::Phylip::PhylipConf qw(%Menu); use Bio::Matrix::PhylipDist; use Cwd; # inherit from Phylip::Base which has some methods for dealing with # Phylip specifics @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base); # You will need to enable the SeqBoot program. This # can be done in (at least) 3 ways: # # 1. define an environmental variable PHYLIPDIR: # export PHYLIPDIR=/home/shawnh/PHYLIP/bin # # 2. include a definition of an environmental variable CLUSTALDIR in # every script that will use Clustal.pm. # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin'; # # 3. You can set the path to the program through doing: # my @params('executable'=>'/usr/local/bin/seqboot'); # my $SeqBoot_factory = Bio::Tools::Run::Phylo::Phylip::SeqBoot->new(@params); # BEGIN { @SEQBOOT_PARAMS = qw(DATATYPE PERMUTE BLOCKSIZE REPLICATES READWEIGHTS READCAT); @OTHER_SWITCHES = qw(QUIET); foreach my $attr(@SEQBOOT_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : >program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'seqboot'; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR}; } sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } if ($attr =~ /IDLENGTH/i){ $self->idlength($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 idlength Title : idlength Usage : $obj->idlength ($newval) Function: Returns : value of idlength Args : newvalue (optional) =cut sub idlength{ my $self = shift; if( @_ ) { my $value = shift; $self->{'idlength'} = $value; } return $self->{'idlength'}; } =head2 run Title : run Usage : $inputfilename = 't/data/prot.phy'; $matrix= $seqboot_factory->run($inputfilename); or $seq_array_ref = \@seq_array; @seq_array is array of Seq objs $aln = $clustalw_factory->align($seq_array_ref); $aln_ref = $SeqBootfactory->run($aln); Function: Create bootstrap sets of alignments Example : Returns : an array ref of L Args : Name of a file containing a multiple alignment in Phylip format or an SimpleAlign object Throws an exception if argument is not either a string (eg a filename) or a Bio::SimpleAlign object. If argument is string, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self,$input) = @_; my ($infilename); # Create input file pointer $infilename = $self->_setinput($input); if (!$infilename) {$self->throw("Problems setting up for seqboot. Probably bad input data in $input !");} # Create parameter string to pass to SeqBoot program my $param_string = $self->_setparams(); # run SeqBoot my $aln = $self->_run($infilename,$param_string); return $aln; } ################################################# =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to SeqBoot program Example : Returns : an array ref of Args : Name of a file containing a set of multiple alignments in Phylip format and a parameter string to be passed to SeqBoot =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; my $curpath = cwd; unless( File::Spec->file_name_is_absolute($infile) ) { $infile = $self->io->catfile($curpath,$infile); } #odd random seed my $rand = (2 * int(rand(10000)) + 1); if ($self->version == 3.5){ $instring = $infile."\n$rand\n$param_string"; } else { $instring = $infile."\n$param_string$rand\n"; } $self->debug( "Program ".$self->executable." $instring\n"); chdir($self->tempdir); #open a pipe to run SeqBoot to bypass interactive menus if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; open(SeqBoot,"|".$self->executable .">$null"); } else { open(SeqBoot,"|".$self->executable); } print SeqBoot $instring; close(SeqBoot); # get the results my $outfile = $self->io->catfile($self->tempdir,$self->outfile); chdir($curpath); $self->throw("SeqBoot did not create files correctly ($outfile)") unless (-e $outfile); #parse the alignments my @aln; my @parse_params; push @parse_params, ('-interleaved' => 1) if $self->version == 3.6; my $aio = Bio::AlignIO->new(-file=>$outfile,-format=>"phylip", @parse_params); while (my $aln = $aio->next_aln){ push @aln, $aln; } # Clean up the temporary files created along the way... unlink $outfile unless $self->save_tempfiles; return \@aln; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input file for SeqBoot program Example : Returns : name of file containing a multiple alignment in Phylip format Args : SimpleAlign object reference or input file name =cut sub _setinput { my ($self, $input) = @_; my ($alnfilename,$tfh); # a phy formatted alignment file unless (ref $input) { # check that file exists or throw $alnfilename= $input; unless (-e $input) {return 0;} return $alnfilename; } my @input = ref($input) eq 'ARRAY' ? @{$input}: ($input); ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir); my $alnIO = Bio::AlignIO->new(-fh => $tfh, -format=>'phylip', -idlength=>$self->idlength()); foreach my $input(@input){ # $input should be a Bio::Align::AlignI $input->isa("Bio::Align::AlignI") || $self->throw("Expecting a Bio::Align::AlignI object"); # Open temporary file for both reading & writing of BioSeq array $alnIO->write_aln($input); } $alnIO->close(); close($tfh); return $alnfilename; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for SeqBoot program Example : Returns : parameter string to be passed to SeqBoot Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); #do nothing for now $self = shift; my $param_string = ""; my $cat = 0; my $gene_freq = 0; my %menu = %{$Menu{$self->version}->{'SEQBOOT'}}; foreach my $attr ( @SEQBOOT_PARAMS) { $value = $self->$attr(); next unless (defined $value); if ($attr =~/REPLICATES/i){ if( $value !~ /(\d+(\.\d+)?)/ ) { $self->warn("Expected a number in $attr\n"); next; } $param_string .= $menu{'REPLICATES'}."$value\n"; } elsif($attr=~/DATATYPE/i){ $gene_freq = 1 if $value =~/GENEFREQ/i; $param_string .= $menu{'DATATYPE'}{uc $value}; } else { if($attr =~/ALLELES/i){ if(!$gene_freq){ $self->warn("Alleles options only be used with alleles option"); return; } $param_string .=$menu{uc $attr}; } } } $param_string .= $menu{'SUBMIT'}; return $param_string; } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $SeqBoot->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory after a SeqBoot run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/PhyloBase.pm000077500000000000000000000205501302566030400240400ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::PhyloBase # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::PhyloBase- base module for phylo wrappers =head1 SYNOPSIS use base qw(Bio::Tools::Run::Phylo::PhyloBase); =head1 DESCRIPTION For use by Bio::Tools::Run::Phylo modules as a base in place of Bio::Tools::Run::WrapperBase. This is based on WrapperBase but provides additional phylo-related private helper subs. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::PhyloBase; use strict; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::WrapperBase); =head2 _alignment Title : _alignment Usage : $aln = $obj->_alignment() Function: Get/set an alignment object, generating one from a file if desired. Returns : Bio::Align::AlignI (probably a Bio::SimpleAlign) Args : none to get OR filename & input format of the alignment file (latter defaults to guess) to set from file OR Bio::Align::AlignI to set =cut sub _alignment { my ($self, $thing, $format) = @_; if (ref($thing) && $thing->isa('Bio::Align::AlignI')) { $self->{_align_obj} = $thing; } elsif ($thing && -e $thing) { my $align_in = Bio::AlignIO->new(-verbose => $self->verbose, -file => $thing, $format ? (-format => $format) : ()); my $aln = $align_in->next_aln || $self->throw("Alignment file '$thing' had no alignment!"); $align_in->close(); $self->{_align_obj} = $aln; } return $self->{_align_obj}; } =head2 _write_alignment Title : _write_alignment Usage : $obj->_write_alignment() Function: Writes the alignment object returned by _alignment() out in the desired format to a temp file. Returns : filename Args : string to desribe format (default 'fasta'), any other options to pass to AlignIO =cut sub _write_alignment { my ($self, $format, @options) = @_; my $align = $self->_alignment || $self->throw("_write_alignment called when _alignment had not been set"); $format ||= 'fasta'; my ($tfh, $tempfile) = $self->io->tempfile(-dir => $self->tempdir); my $out = Bio::AlignIO->new(-verbose => $self->verbose, '-fh' => $tfh, '-format' => $format, @options); $align->set_displayname_flat; $out->write_aln($align); $out->close(); $out = undef; close($tfh); undef $tfh; return $tempfile; } =head2 _tree Title : _tree Usage : $tree = $obj->_tree() Function: Get/set a tree object, generating one from a file/database if desired Returns : Bio::Tree::TreeI Args : none to get, OR to set: OR filename & input format of the tree file (latter defaults to guess) to set from file OR Bio::Tree::TreeI OR Bio::DB::Taxonomy when _alignment() has been set and where sequences in the alignment have ids matching species in the taxonomy database =cut sub _tree { my ($self, $thing, $format) = @_; if ($thing) { my $tree; if (ref($thing) && $thing->isa('Bio::Tree::TreeI')) { $tree = $thing; } elsif (ref($thing) && $thing->isa('Bio::DB::Taxonomy')) { # get all the alignment sequence names my @species_names = $self->_get_seq_names; $tree = $thing->get_tree(@species_names); # convert node ids to their seq_ids for correct output with TreeIO foreach my $node ($tree->get_nodes) { my $seq_id = $node->name('supplied'); $seq_id = $seq_id ? shift @{$seq_id} : ($node->node_name ? $node->node_name : $node->id); $node->id($seq_id); } } elsif (-e $thing) { my $tree_in = Bio::TreeIO->new(-verbose => $self->verbose, -file => $thing, $format ? (-format => $format) : ()); $tree = $tree_in->next_tree || $self->throw("Tree file '$thing' had no tree!"); $tree_in->close; } $self->{_tree_obj} = $tree || $self->throw("'$thing' supplied but unable to generate a tree from it"); } return $self->{_tree_obj}; } =head2 _write_tree Title : _write_tree Usage : $obj->_write_tree() Function: Writes the tree object returned by _tree() out in the desired format to a temp file. Returns : filename Args : string to desribe format (default 'newick') =cut sub _write_tree { my ($self, $format) = @_; my $tree = $self->_tree || $self->throw("_write_tree called when _tree had not been set"); $format ||= 'newick'; my ($tfh, $tempfile) = $self->io->tempfile(-dir => $self->tempdir); my $out = Bio::TreeIO->new(-verbose => $self->verbose, -fh => $tfh, -format => $format); $out->write_tree($tree); $out->close(); $out = undef; close($tfh); undef $tfh; return $tempfile; } =head2 _get_seq_names Title : _get_seq_names Usage : @names = $obj->_get_seq_names() Function: Get all the sequence names (from id()) of the sequenes in the alignment. _alignment() must be set prior to calling this. Returns : list of strings (seq ids) Args : none =cut sub _get_seq_names { my $self = shift; my $aln = $self->_alignment || $self->throw("_get_seq_names called when _alignment had not been set"); my @names; foreach my $seq ($aln->each_seq) { push(@names, $seq->id); } return @names; } =head2 _get_node_names Title : _get_node_names Usage : @names = $obj->_get_node_names() Function: Get all the node names (from id()) of the nodes in the tree. _tree must be set prior to calling this. Returns : list of strings (node ids) Args : none =cut sub _get_node_names { my $self = shift; my $tree = $self->_tree || $self->throw("_get_node_names called when _tree had not been set"); my @names; foreach my $node ($tree->get_leaf_nodes) { push(@names, $node->id); } return @names; } =head2 _check_names Title : _check_names Usage : if ($obj->_check_names) { ... } Function: Determine if all sequences in the alignment file have a corresponding node in the tree file. _alignment() and _tree() must be set prior to calling this. Returns : boolean (will also warn about the specific problems when returning false) Args : none =cut sub _check_names { my $self = shift; my @seq_names = $self->_get_seq_names; my %node_names = map { $_ => 1 } $self->_get_node_names; # (not interested in tree nodes that don't map to sequence, since we # expect the tree to have internal nodes not represented by sequence) foreach my $name (@seq_names) { $self->{_unmapped}{$name} = 1 unless defined $node_names{$name}; } if (defined($self->{_unmapped})) { my $count = scalar(keys %{$self->{_unmapped}}); my $unmapped = join(", ", keys %{$self->{_unmapped}}); $self->warn("$count unmapped ids between the supplied alignment and tree: $unmapped"); return 0; } return 1; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Phyml.pm000066400000000000000000000705011302566030400232410ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Phyml # # Please direct questions and support issues to # # Cared for by Heikki Lehvaslaiho # # Copyright Heikki Lehvaslaiho # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Phyml - Wrapper for rapid reconstruction of phylogenies using Phyml =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Phyml; # Make a Phyml factory $factory = Bio::Tools::Run::Phylo::Phyml->new(-verbose => 2); # it defaults to protein alignment # change parameters $factory->model('Dayhoff'); # Pass the factory an alignment and run $inputfilename = 't/data/protpars.phy'; $tree = $factory->run($inputfilename); # $tree is a Bio::Tree::Tree object. # or set parameters at object creation my %args = ( -data_type => 'dna', -model => 'HKY', -kappa => 4, -invar => 'e', -category_number => 4, -alpha => 'e', -tree => 'BIONJ', -opt_topology => '0', -opt_lengths => '1', ); $factory = Bio::Tools::Run::Phylo::Phyml->new(%args); # if you need the output files do $factory->save_tempfiles(1); $factory->tempdir($workdir); # and get a Bio::Align::AlignI (SimpleAlign) object from somewhere $tree = $factory->run($aln); =head1 DESCRIPTION This is a wrapper for running the phyml application by Stephane Guindon and Olivier Gascuel. You can download it from: http://atgc.lirmm.fr/phyml/ =head2 Installing After downloading, you need to rename a the copy of the program that runs under your operating system. I.e. C into C. You will need to help this Phyml wrapper to find the C program. This can be done in (at least) three ways: =over =item 1. Make sure the Phyml executable is in your path. Copy it to, or create a symbolic link from a directory that is in your path. =item 2. Define an environmental variable PHYMLDIR which is a directory which contains the 'phyml' application: In bash: export PHYMLDIR=/home/username/phyml_v2.4.4/exe In csh/tcsh: setenv PHYMLDIR /home/username/phyml_v2.4.4/exe =item 3. Include a definition of an environmental variable PHYMLDIR in every script that will use this Phyml wrapper module, e.g.: BEGIN { $ENV{PHYMLDIR} = '/home/username/phyml_v2.4.4/exe' } use Bio::Tools::Run::Phylo::Phyml; =back =head2 Running This wrapper has been tested with PHYML v2.4.4 and v.3.0. It may work with recent Phyml releases using a date format for the format, but the wrapper hasn't been extensively tested in these cases, so for the moment only the simpler numbered versions are supported. In its current state, the wrapper supports only input of one MSA and output of one tree. It can easily be extended to support more advanced capabilities of C. Two convienience methods have been added on top of the standard BioPerl WrapperBase ones: stats() and tree_string(). You can call them to after running the phyml program to retrieve into a string the statistics and the tree in Newick format. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ =head1 AUTHOR - Heikki Lehvaslaiho heikki at bioperl dot org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Phyml; use strict; use Bio::AlignIO; use File::Copy; use File::Spec; use Bio::TreeIO; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'phyml'; our $PROGRAM_DIR = $ENV{'PHYMLDIR'}; # valid substitution model names our $models; # DNA map { $models->{0}->{$_} = 1 } qw(JC69 K2P F81 HKY F84 TN93 GTR); # protein map { $models->{1}->{$_} = 1 } qw(JTT MtREV Dayhoff WAG); our $models3; # DNA map { $models3->{'nt'}->{$_} = 1 } qw(HKY85 JC69 K80 F81 F84 TN93 GTR ); # protein map { $models3->{'aa'}->{$_} = 1 } qw(LG WAG JTT MtREV Dayhoff DCMut RtREV CpREV VT Blosum62 MtMam MtArt HIVw HIVb ); =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Phyml->new(@params) Function: creates a new Phyml factory Returns : Bio::Tools::Run::Phylo::Phyml Args : Optionally, provide any of the following (default in []): -data_type => 'dna' or 'protein', [protein] -dataset_count => integer, [1] -model => 'HKY'... , [HKY|JTT] -kappa => 'e' or float, [e] -invar => 'e' or float, [e] -category_number => integer, [1] -alpha => 'e' or float (int v3),[e] -tree => 'BIONJ' or your own, [BION] -bootstrap => integer [123] -opt_topology => boolean [1] -opt_lengths => boolean [1] -no_memory_check => boolean [1] -program_name => string =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); # for consistency with other run modules, allow params to be dashless my %args = @args; while ( my ( $key, $val ) = each %args ) { if ( $key !~ /^-/ ) { delete $args{$key}; $args{ '-' . $key } = $val; } } my ( $data_type, $data_format, $dataset_count, $model, $freq, $kappa, $invar, $category_number, $alpha, $tree, $opt_topology, $opt_lengths, $opt, $search, $rand_start, $rand_starts, $rand_seed, $no_memory_check, $bootstrap, $program_name ) = $self->_rearrange( [ qw( DATA_TYPE DATA_FORMAT DATASET_COUNT MODEL FREQ KAPPA INVAR CATEGORY_NUMBER ALPHA TREE OPT_TOPOLOGY OPT_LENGTHS OPT SEARCH RAND_START RAND_STARTS RAND_SEED NO_MEMORY_CHECK BOOTSTRAP PROGRAM_NAME ) ], %args ); $self->data_type($data_type) if $data_type; $self->data_format($data_format) if $data_format; $self->dataset_count($dataset_count) if $dataset_count; $self->model($model) if $model; $self->freq($freq) if $freq; $self->kappa($kappa) if $kappa; $self->invar($invar) if $invar; $self->category_number($category_number) if $category_number; $self->alpha($alpha) if $alpha; $self->tree($tree) if $tree; $self->opt_topology($opt_topology) if $opt_topology; $self->opt_lengths($opt_lengths) if $opt_lengths; $self->opt($opt) if $opt; $self->search($search) if $search; $self->rand_start($rand_start) if $rand_start; $self->rand_starts($rand_starts) if $rand_starts; $self->rand_seed($rand_seed) if $rand_seed; $self->no_memory_check($no_memory_check) if $no_memory_check; $self->bootstrap($bootstrap) if $bootstrap; $self->program_name($program_name) if $program_name; return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { my ( $self, $value ) = @_; if ( defined($value) ) { if ( $value =~ /^$PROGRAM_NAME[-a-z]*$/ ) { $PROGRAM_NAME = $value; } else { $self->throw("$value is not a valid program name"); } } $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 version Title : version Usage : exit if $prog->version < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none Phyml before 3.0 did not display the version. Assume 2.44 when can not determine it. Some releases do not state version number, only date, so the version might have to be inferred from this date. =cut sub version { my $self = shift; return $self->{'_version'} if defined $self->{'_version'}; my $exe = $self->executable || return; my $string = substr `$exe -h`, 0, 40; my ($version) = $string =~ /PhyML v([\d+\.]+)/; if ( !$version ) { $string =~ /PhyML\s+(\d{8})/; # 3 was released August 2008 $version = 3 if ( $1 && $1 >= 20080801 ); } $self->{'_version'} = $version; $version ? ( return $version ) : return '2.44'; } =head2 run Title : run Usage : $factory->run($aln_file); $factory->run($align_object); Function: Runs Phyml to generate a tree Returns : Bio::Tree::Tree object Args : file name for your input alignment in a format recognised by AlignIO, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ( $self, $in ) = @_; if ( ref $in && $in->isa("Bio::Align::AlignI") ) { $in = $self->_write_phylip_align_file($in); } elsif ( !-e $in ) { $self->throw( "When not supplying a Bio::Align::AlignI object, " . "you must supply a readable filename" ); } elsif ( -e $in ) { copy( $in, $self->tempdir ); my $name = File::Spec->splitpath($in); # name is the last item in the array $in = File::Spec->catfile( $self->tempdir, $name ); } return $self->_run($in); } =head2 stats Title : stats Usage : $factory->stats; Function: Returns the contents of the phyml '_phyml_stat.txt' output file Returns : string with statistics about the run, undef before run() Args : none =cut sub stats { my $self = shift; return $self->{_stats}; } =head2 tree_string Title : tree_string Usage : $factory->tree_string; $factory->run($align_object); Function: Returns the contents of the phyml '_phyml_tree.txt' output file Returns : string with tree in Newick format, undef before run() Args : none =cut sub tree_string { my $self = shift; return $self->{_tree}; } =head2 Getsetters These methods are used to set and get program parameters before running. =head2 data_type Title : data_type Usage : $phyml->data_type('nt'); Function: Sets sequence alphabet to 'dna' (nt in v3) or 'aa' If leaved unset, will be set automatically Returns : set value, defaults to 'protein' Args : None to get, 'dna' ('nt') or 'aa' to set. =cut sub data_type { my ( $self, $value ) = @_; if ( $self->version && $self->version >= 3 ) { if ( defined $value ) { if ( $value eq 'nt' ) { $self->{_data_type} = 'nt'; } else { $self->{_data_type} = 'aa'; } } return 'aa' unless defined $self->{_data_type}; } else { if ( defined $value ) { if ( $value eq 'dna' ) { $self->{_data_type} = '0'; } else { $self->{_data_type} = '1'; } } return '1' unless defined $self->{_data_type}; } return $self->{_data_type}; } =head2 data_format Title : data_format Usage : $phyml->data_format('s'); Function: Sets PHYLIP format to 'i' interleaved or 's' sequential Returns : set value, defaults to 'i' Args : None to get, 'i' or 's' to set. =cut sub data_format { my ( $self, $value ) = @_; if ( defined $value ) { $self->throw("PHYLIP format must be 'i' or 's'") unless $value eq 'i' or $value eq 's'; $self->{_data_format} = $value; } return $self->{_data_format} || 'i'; } =head2 dataset_count Title : dataset_count Usage : $phyml->dataset_count(3); Function: Sets dataset number to deal with Returns : set value, defaults to 1 Args : None to get, positive integer to set. =cut sub dataset_count { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid positive integer [$value]" unless $value =~ /^[-+]?\d*$/ and $value > 0; $self->{_dataset_count} = $value; } return $self->{_dataset_count} || 1; } =head2 model Title : model Usage : $phyml->model('HKY'); Function: Choose the substitution model to use. One of JC69 | K2P | F81 | HKY | F84 | TN93 | GTR (DNA) JTT | MtREV | Dayhoff | WAG (amino acids) v3.0: HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR (DNA) LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV | CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb (amino acids) Returns : Name of the model, v2.4.4 defaults to {HKY|JTT} Args : None to get, string to set. =cut sub model { my ( $self, $value ) = @_; if ( defined($value) ) { if ( $self->version && $self->version >= 3 ) { unless ( $value =~ /\d{6}/ ) { $self->throw( "Not a valid model name [$value] for current data type (alphabet)" ) unless $models3->{ $self->data_type }->{$value}; } } else { $self->throw( "Not a valid model name [$value] for current data type (alphabet)" ) unless $models->{ $self->data_type }->{$value}; } $self->{_model} = $value; } if ( $self->{_model} ) { return $self->{_model}; } if ( $self->version && $self->version >= 3 ) { if ( $self->data_type eq 'aa' ) { return 'LG'; # protein } else { return 'HKY85'; # DNA } } else { if ( $self->data_type ) { return 'JTT'; # protein } else { return 'HKY'; # DNA } } } =head2 kappa Title : kappa Usage : $phyml->kappa(4); Function: Sets transition/transversion ratio, leave unset to estimate Returns : set value, defaults to 'e' Args : None to get, float or integer to set. =cut sub kappa { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d*\.?\d*$/ or $value eq 'e'; $self->{_kappa} = $value; } return 'e' unless defined $self->{_kappa}; return 'e' if $self->{_kappa} eq 'e'; return sprintf( "%.1f", $self->{_kappa} ); } =head2 invar Title : invar Usage : $phyml->invar(.3); Function: Sets proportion of invariable sites, leave unset to estimate Returns : set value, defaults to 'e' Args : None to get, float or integer to set. =cut sub invar { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d*\.\d*$/ or $value eq 'e'; $self->{_invar} = $value; } return 'e' unless defined $self->{_invar}; return 'e' if $self->{_invar} eq 'e'; return sprintf( "%.1f", $self->{_invar} ); } =head2 category_number Title : category_number Usage : $phyml->category_number(4); Function: Sets number of relative substitution rate categories Returns : set value, defaults to 1 Args : None to get, integer to set. =cut sub category_number { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid postive integer [$value]" unless $value =~ /^[+]?\d*$/ and $value > 0; $self->{_category_number} = $value; } return $self->{_category_number} || 1; } =head2 alpha Title : alpha Usage : $phyml->alpha(1.0); Function: Sets gamma distribution parameter, leave unset to estimate Returns : set value, defaults to 'e' Args : None to get, float or integer to set. =cut sub alpha { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d*\.?\d*$/ or $value eq 'e'; $self->{_alpha} = $value; } return 'e' unless defined $self->{_alpha}; return 'e' if $self->{_alpha} eq 'e'; return sprintf( "%.1f", $self->{_alpha} ) || 'e'; } =head2 tree Title : tree Usage : $phyml->tree('/tmp/tree.nwk'); Function: Sets starting tree, leave unset to estimate a distance tree Returns : set value, defaults to 'BIONJ' Args : None to get, newick tree file name to set. =cut sub tree { my ( $self, $value ) = @_; if ( defined $value ) { die "Invalid number [$value]" unless -e $value or $value eq 'BIONJ'; $self->{_tree} = $value; } return $self->{_tree} || 'BIONJ'; } =head2 v2 options These methods can be used with PhyML v2* only. =head2 opt_topology Title : opt_topology Usage : $factory->opt_topology(1); Function: Choose to optimise the tree topology Returns : 1 or 0. Default is 1. Args : None to get, boolean to set. v2.* only =cut sub opt_topology { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [opt_topology] for to PhyML v3") if $self->version && $self->version >= 3; if ( defined($value) ) { if ($value) { $self->{_opt_topology} = 1; } else { $self->{_opt_topology} = 0; } } return $self->{_opt_topology} || 1; } =head2 opt_lengths Title : opt_lengths Usage : $factory->opt_lengths(0); Function: Choose to optimise branch lengths and rate parameters Returns : 1 or 0. Default is 1. Args : None to get, boolean to set. v2.* only =cut sub opt_lengths { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [opt_lengths] for PhyML v3") if $self->version && $self->version >= 3; if ( defined($value) ) { if ($value) { $self->{_opt_lengths} = 1; } else { $self->{_opt_lengths} = 0; } } return $self->{_opt_lengths} || 1; } =head2 v3 options These methods can be used with PhyML v3* only. =head2 freq Title : freq Usage : $phyml->freq(e); $phyml->freq("0.2, 0.6, 0.6, 0.2"); Function: Sets nucleotide frequences or asks residue to be estimated according to two models: e or d Returns : set value, Args : None to get, string to set. v3 only. =cut sub freq { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [freq] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid value [$value]" unless $value =~ /^[\d\. ]$/ or $value eq 'e' or $value eq 'd'; $self->{_freq} = $value; } return $self->{_freq}; } =head2 opt Title : opt Usage : $factory->opt(1); Function: Optimise tree parameters: tlr|tl|tr|l|n Returns : {value|n} (default n) Args : None to get, string to set. v3.* only =cut sub opt { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [opt] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { $self->{_opt} = $value if $value =~ /tlr|tl|tr|l|n/; } return $self->{_opt} || 'n'; } =head2 search Title : search Usage : $factory->search(SPR); Function: Tree topology search operation algorithm: NNI|SPR|BEST Returns : string (defaults to NNI) Args : None to get, string to set. v3.* only =cut sub search { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [search] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { $self->{_search} = $value if $value =~ /NNI|SPR|BEST/; } return $self->{_search} || 'NNI'; } =head2 rand_start Title : rand_start Usage : $factory->rand_start(1); Function: Sets the initial SPR tree to random. Returns : boolean (defaults to false) Args : None to get, boolean to set. v3.* only; only meaningful if $prog-Esearch is 'SPR' =cut sub rand_start { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [rand_start] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { if ($value) { $self->{_rand_start} = 1; } else { $self->{_rand_start} = 0; } } return $self->{_rand_start}; } =head2 rand_starts Title : rand_starts Usage : $factory->rand_starts(10); Function: Sets the number of initial random SPR trees Returns : integer (defaults to 1) Args : None to get, integer to set. v3.* only; only valid if $prog-Esearch is 'SPR' =cut sub rand_starts { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [rand_starts] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d+$/; $self->{_rand_starts} = $value; } return $self->{_rand_starts} || 1; } =head2 rand_seed Title : rand_seed Usage : $factory->rand_seed(1769876); Function: Seeds the random number generator Returns : random integer Args : None to get, integer to set. v3.* only; only valid if $prog-Esearch is 'SPR' Uses perl rand() to initialize if not explicitely set. =cut sub rand_seed { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [rand_seed] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^[-+]?\d+$/; $self->{_rand_seed} = $value; } return $self->{_rand_seed} || int rand 1000000; } =head2 no_memory_check Title : no_memory_check Usage : $factory->no_memory_check(1); Function: Returns : boolean (defaults to false) Args : None to get, integer to set. =cut sub no_memory_check { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [no_memory_check] prior to PhyML v3") if $self->version < 3; if ( defined($value) ) { if ($value) { $self->{_no_memory_check} = 1; } else { $self->{_no_memory_check} = 0; } } return $self->{_no_memory_check} || 0; } =head2 bootstrap Title : bootstrap Usage : $factory->bootstrap(100); Function: Set number of bootstraps Returns : Args : None to get, integer to set. =cut sub bootstrap { my ( $self, $value ) = @_; $self->throw("Not a valid parameter [bootstrap] prior to PhyML v3") if $self->version < 3; if ( defined $value ) { die "Invalid number [$value]" unless $value =~ /^\d+$/; $self->{_bootstrap} = $value; } return $self->{_bootstrap}; } =head2 command Title : command Usage : $factory->command(...); Function: Returns : string Args : None to get, integer to set. =cut sub command { my ( $self, $value ) = @_; if ( defined($value) ) { if ($value =~ /$PROGRAM_NAME/ ) { $self->{_command} = $value; } else { $self->throw("$value is not a $PROGRAM_NAME command"); } } return $self->{_command} || ''; } =head2 Internal methods These methods are private and should not be called outside this class. =cut sub _run { my ( $self, $file ) = @_; my $exe = $self->executable || return; my $command; my $output_stat_file; if ( $self->version >= 3 ) { $command = $exe . " -i $file" . $self->_setparams; $output_stat_file = '_phyml_stats.txt'; } else { $command = $exe . " $file " . $self->arguments . $self->_setparams; $output_stat_file = '_phyml_stat.txt'; } $self->command($command); $self->debug("Phyml command = $command\n"); `$command`; # stats { my $stat_file = $file . $output_stat_file; open( my $FH_STAT, "<", $stat_file ) || $self->throw( "Phyml call ($command) did not give an output [$stat_file]: $?"); local $/; $self->{_stats} .= <$FH_STAT>; } #print $self->{stats}; # tree my $tree_file = $file . '_phyml_tree.txt'; { open( my $FH_TREE, "<", $tree_file ) || $self->throw("Phyml call ($command) did not give an output: $?"); local $/; $self->{_tree} .= <$FH_TREE>; } open( my $FH_TREE, "<", $tree_file ) || $self->throw("Phyml call ($command) did not give an output: $?"); my $treeio = Bio::TreeIO->new( -format => 'nhx', -fh => $FH_TREE ); my $tree = $treeio->next_tree; # could be faster to parse the tree only if needed? return $tree; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string; if ( $self->version >= 3 ) { # version 3 or higher $param_string = ' -d ' . $self->data_type; $param_string .= ' -q ' if $self->data_format eq 's'; $param_string .= ' -n ' . $self->dataset_count if $self->dataset_count > 1; $param_string .= ' -b ' . $self->bootstrap if $self->bootstrap; # $param_string .= ' 0'; # no bootstrap sets $param_string .= ' -m ' . $self->model; $param_string .= ' -f ' . $self->freq if $self->freq; if ( $self->data_type eq 'dna' ) { $param_string .= ' -t ' . $self->kappa; } $param_string .= ' -v ' . $self->invar; $param_string .= ' -c ' . $self->category_number; $param_string .= ' -a ' . $self->alpha; $param_string .= ' -u ' . $self->tree if $self->tree ne 'BIONJ'; $param_string .= ' -o ' . $self->opt if $self->opt; $param_string .= ' -s ' . $self->search; if ( $self->search eq 'SPR' ) { $param_string .= ' --rand_start ' if $self->rand_start; $param_string .= ' --n_rand_starts ' . $self->rand_starts if $self->rand_starts; $param_string .= ' --r_seed ' . $self->rand_seed; } $param_string .= ' --no_memory_check ' if $self->no_memory_check; } else { # version 2 $param_string = ' ' . $self->data_type; $param_string .= ' ' . $self->data_format; $param_string .= ' ' . $self->dataset_count; $param_string .= ' 0'; # no bootstrap sets $param_string .= ' ' . $self->model; unless ( $self->data_type ) { # only for DNA $param_string .= ' ' . $self->kappa; } $param_string .= ' ' . $self->invar; $param_string .= ' ' . $self->category_number; $param_string .= ' ' . $self->alpha; $param_string .= ' ' . $self->tree; $param_string .= ' ' . $self->opt_topology; $param_string .= ' ' . $self->opt_lengths; } return $param_string; } =head2 _write_phylip_align_file Title : _write_phylip_align_file Usage : obj->__write_phylip_align_file($aln) Function: Internal (not to be used directly) Writes the alignment into the tmp directory in PHYLIP interlieved format Returns : filename Args : Bio::Align::AlignI =cut sub _write_phylip_align_file { my ( $self, $align ) = @_; my $tempfile = File::Spec->catfile( $self->tempdir, "aln$$.phylip" ); $self->data_format('i'); my $out = Bio::AlignIO->new( -file => ">$tempfile", -format => 'phylip', -interleaved => 1, -longid => 1 ); $out->write_aln($align); $out->close(); $out = undef; return $tempfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/QuickTree.pm000066400000000000000000000207001302566030400240400ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::QuickTree # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::QuickTree - Wrapper for rapid reconstruction of phylogenies using QuickTree =head1 SYNOPSIS use Bio::Tools::Run::Phylo::QuickTree; # Make a QuickTree factory @params = (); $factory = Bio::Tools::Run::Phylo::QuickTree->new(@params); # Pass the factory an alignment $inputfilename = 't/data/cysprot.stockholm'; $tree = $factory->run($inputfilename); # $tree is a Bio::Tree::Tree object. # or get a Bio::Align::AlignI (SimpleAlign) object from somewhere $tree = $factory->run($aln); =head1 DESCRIPTION This is a wrapper for running the QuickTree application by Kevin Howe. You can download it here: http://www.sanger.ac.uk/Software/analysis/quicktree/ Currently only input with alignments and output of trees is supported. (Ie. no support for distance matrix in/out.) You will need to enable this QuickTree wrapper to find the quicktree program. This can be done in (at least) three ways: 1. Make sure the QuickTree executable is in your path. 2. Define an environmental variable QUICKTREEDIR which is a directory which contains the 'quicktree' application: In bash: export QUICKTREEDIR=/home/username/quicktree_1.1/bin In csh/tcsh: setenv QUICKTREEDIR /home/username/quicktree_1.1/bin 3. Include a definition of an environmental variable QUICKTREEDIR in every script that will use this QuickTree wrapper module, e.g.: BEGIN { $ENV{QUICKTREEDIR} = '/home/username/quicktree_1.1/bin' } use Bio::Tools::Run::Phylo::QuickTree; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::QuickTree; use strict; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::WrapperBase); our $PROGRAM_NAME = 'quicktree'; our $PROGRAM_DIR = $ENV{'QUICKTREEDIR'}; =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::QuickTree->new(@params) Function: creates a new QuickTree factory Returns : Bio::Tools::Run::Phylo::QuickTree Args : Optionally, provide any of the following (default in []): -upgma => boolean # Use the UPGMA method to construct the tree [0] -kimura => boolean # Use the kimura translation for pairwise # distances [0] -boot => int # Calculate bootstrap values with n iterations [0] =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); # for consistency with other run modules, allow params to be dashless my %args = @args; while (my ($key, $val) = each %args) { if ($key !~ /^-/) { delete $args{$key}; $args{'-'.$key} = $val; } } my ($upgma, $kimura, $boot) = $self->_rearrange([qw(UPGMA KIMURA BOOT)], %args); $self->upgma(1) if $upgma; $self->kimura(1) if $kimura; $self->boot($boot) if $boot; return $self; } =head2 upgma Title : upgma Usage : $factory->upgma(1); Function: Choose to use the UPGMA method to construct the tree. Returns : boolean (default 0) Args : None to get, boolean to set. =cut sub upgma { my ($self, $bool) = @_; if (defined ($bool)) { $self->{upgma} = $bool; } return $self->{upgma} || 0; } =head2 kimura Title : kimura Usage : $factory->kimura(1); Function: Choose to use the kimura translation for pairwise distances. Returns : boolean (default 0) Args : None to get, boolean to set. =cut sub kimura { my ($self, $bool) = @_; if (defined ($bool)) { $self->{kimura} = $bool; } return $self->{kimura} || 0; } =head2 boot Title : boot Usage : $factory->boot(100); Function: Choose to calculate bootstrap values with the supplied number of iterations. Returns : int (default 0) Args : None to get, int to set. =cut sub boot { my ($self, $int) = @_; if (defined ($int)) { $self->{boot} = $int; } return $self->{boot} || 0; } =head2 run Title : run Usage : $factory->run($stockholm_file); $factory->run($align_object); Function: Runs QuickTree to generate a tree Returns : Bio::Tree::Tree object Args : file name for your input alignment in stockholm format, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ($self, $in) = @_; if (ref $in && $in->isa("Bio::Align::AlignI")) { $in = $self->_writeAlignFile($in); } elsif (! -e $in) { $self->throw("When not supplying a Bio::Align::AlignI object, you must supply a readable filename"); } return $self->_run($in); } sub _run { my ($self, $file)= @_; my $exe = $self->executable || return; my $param_str = $self->arguments." ".$self->_setparams; my $command = $exe." $param_str ".$file; $self->debug("QuickTree command = $command"); open(my $result, "$command |") || $self->throw("QuickTree call ($command) crashed: $?"); my $treeio = Bio::TreeIO->new(-format => 'nhx', -fh => $result); my $tree = $treeio->next_tree; close($result); # if bootstraps were enabled, the bootstraps are the ids; convert to # bootstrap and no id if ($self->boot) { my @nodes = $tree->get_nodes; my %non_internal = map { $_ => 1 } ($tree->get_leaf_nodes, $tree->get_root_node); foreach my $node (@nodes) { next if exists $non_internal{$node}; $node->bootstrap && next; # protect ourselves incase the parser improves $node->bootstrap($node->id); $node->id(''); } } return $tree; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : none =cut sub _setparams { my $self = shift; my $param_string = '-in a -out t'; $param_string .= ' -upgma' if $self->upgma; $param_string .= ' -kimura' if $self->kimura; $param_string .= ' -boot '.$self->boot if $self->boot; return $param_string; } =head2 _writeAlignFile Title : _writeAlignFile Usage : obj->_writeAlignFile($seq) Function: Internal(not to be used directly) Returns : filename Args : Bio::Align::AlignI =cut sub _writeAlignFile{ my ($self, $align) = @_; my ($tfh, $tempfile) = $self->io->tempfile(-dir=>$self->tempdir); my $out = Bio::AlignIO->new('-fh' => $tfh, '-format' => 'stockholm'); $out->write_aln($align); $out->close(); $out = undef; close($tfh); undef $tfh; return $tempfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Raxml.pm000066400000000000000000000257241302566030400232420ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Raxml # # Please direct questions and support issues to # # Copyright Brian Osborne # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Raxml =head1 SYNOPSIS # Build a Raxml factory $factory = Bio::Tools::Run::Phylo::Raxml->new(-p => 100); # Get an alignment my $alignio = Bio::AlignIO->new( -format => 'fasta', -file => '219877.cdna.fasta'); my $alnobj = $alignio->next_aln; # Analyze the aligment and get a Tree my $tree = $factory->run($alnobj); =head1 DESCRIPTION Get a Bio::Tree object using raxml given a protein or DNA alignment. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I Do not contact the module maintainer directly. Many experienced experts at bioperl-l will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Brian Osborne Email briano@bioteam.net =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Raxml; use strict; use File::Basename; use File::Spec; use Bio::Seq; use Bio::SeqIO; use Bio::TreeIO; use Bio::AlignIO; use Bio::Root::IO; use Cwd; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @Raxml_PARAMS = qw(s n m a A b B c e E f g G i I J o p P q r R S t T w W x z N); our @Raxml_SWITCHES = qw(SSE3 PTHREADS PTHREADS-SSE3 HYBRID HYBRID-SSE3 F h k K M j U v X y C d D); our $PROGRAM_NAME = 'raxml'; # Specify some model if none is specified my $DEFAULTAAMODEL = 'PROTCATDAYHOFF'; my $DEFAULTNTMODEL = 'GTRCAT'; =head2 new Title : new Usage : my $treebuilder = Bio::Tools::Run::Phylo::Raxml->new(); Function: Constructor Returns : Bio::Tools::Run::Phylo::Raxml Args : Same as those used to run raxml. For example: $factory = Bio::Tools::Run::Phylo::Raxml->new(-p => 100, -SSE3 => 1) =cut sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args( \@args, -case_sensitive => 1, -methods => [ @Raxml_PARAMS, @Raxml_SWITCHES ], -create => 1 ); my ($out,$quiet) = $self->SUPER::_rearrange( [qw(OUTFILE_NAME QUIET)], @args ); $self->outfile_name( $out || '' ); $self->quiet( $quiet ) if $quiet; $self; } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory Returns: string Args : =cut sub program_dir { undef; } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ( $self, $value ) = @_; $self->{'error_string'} = $value if ( defined $value ); $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe -v 2>&1`; $string =~ /raxml\s+version\s+([\d\.]+)/i; return $1 || undef; } =head2 quiet Title : quiet Usage : Function: get or set value for 'quiet' Example : Returns : Args : the value =cut sub quiet { my ( $self, $value ) = @_; $self->{'_quiet'} = $value if ( defined $value ); $self->{'_quiet'}; } =head2 run Title : run Usage : $factory->run($stockholm_file) OR $factory->run($align_object) Function: Runs Raxml to generate a tree Returns : Bio::Tree::Tree object Args : File name for your input alignment in stockholm format, OR Bio::Align::AlignI compliant object (eg. Bio::SimpleAlign). =cut sub run { my ($self, $in) = @_; if (ref $in && $in->isa("Bio::Align::AlignI")) { $in = $self->_write_alignfile($in); } elsif (! -e $in) { $self->throw("When not supplying a Bio::Align::AlignI object, you must supply a readable filename"); } $self->_run($in); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: Runs the application Returns : Tree object Args : Alignment file name =cut sub _run { my ( $self, $file ) = @_; my $exe = $self->executable || return; my $param_str = $self->arguments . " " . $self->_setparams($file); my $command = "$exe $param_str"; $self->debug("Raxml command = $command"); my $status = system($command); # raxml creates tree files with names like "RAxML_bestTree.ABDBxjjdfg3" # if rapid bootstrapping was enabled, also a tree with RAxML_bipartitions.ABDBxjjdfg3 # with support values is created, which then should be returned my $outfile = $self->f() eq 'a' ? 'RAxML_bipartitions.' : 'RAxML_bestTree.'; $outfile .= $self->outfile_name; $outfile = File::Spec->catfile( ($self->w), $outfile ) if $self->w; if ( !-e $outfile || -z $outfile ) { $self->warn("Raxml call had status of $status: $? [command $command] \n"); return undef; } my $treeio = Bio::TreeIO->new( -file => $outfile ); my $tree = $treeio->next_tree; # if bootstraps were enabled, the bootstraps are the ids; convert to # bootstrap and no id # if ($self->boot) { # my @nodes = $tree->get_nodes; # my %non_internal = map { $_ => 1 } ($tree->get_leaf_nodes, $tree->get_root_node); # foreach my $node (@nodes) { # next if exists $non_internal{$node}; # $node->bootstrap && next; # protect ourselves incase the parser improves # $node->bootstrap($node->id); # $node->id(''); # } # } $tree; } =head2 _write_alignfile Title : _write_alignfile Usage : Internal function, not to be called directly Function: Create an alignment file Returns : filename Args : Bio::Align::AlignI =cut sub _write_alignfile { my ( $self, $align ) = @_; my ( $tfh, $tempfile ) = $self->io->tempfile( -dir => '.' ); my $out = Bio::AlignIO->new( -file => ">$tempfile", -format => 'phylip' ); $out->write_aln($align); $out->close(); undef($out); close($tfh); undef($tfh); die "Alignment file $tempfile was not created" if ( ! -e $tempfile ); $tempfile; } =head2 _alphabet Title : _alphabet Usage : my $alphabet = $self->_alphabet; Function: Get the alphabet of the input alignment, defaults to 'dna' Returns : 'dna' or 'protein' Args : Alignment file =cut sub _alphabet { my ( $self, $file ) = @_; if ($file) { if ( -e $file ) { my $in = Bio::AlignIO->new( -file => $file ); my $aln = $in->next_aln; # arbitrary, the first one my $seq = $aln->get_seq_by_pos(1); my $alphabet = $seq->alphabet; $self->{_alphabet} = $alphabet; } else { die "File $file can not be found"; } } # default is 'dna' return $self->{'_alphabet'} || 'dna'; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Raxml program Example : Returns : parameter string to be passed to Raxml Args : name of calling object =cut sub _setparams { my ( $self, $infile ) = @_; my $param_string = ''; # If 'model' is not set with '-m' check the alphabet of the input, # then specify the default model if ( !$self->m ) { my $model = ( $self->_alphabet($infile) eq 'dna' ) ? $DEFAULTNTMODEL : $DEFAULTAAMODEL; $self->m($model); } # Set default output file if no explicit output file has been given. # Raxml insists that the output file name not contain '/' and its # output directory is set using the '-w' argument. if ( !$self->outfile_name ) { my $dir = getcwd(); $self->w($dir); my ( $tfh, $outfile ) = $self->io->tempfile( -dir => $dir ); close($tfh); undef $tfh; $outfile = basename($outfile); $self->outfile_name($outfile); } for my $attr (@Raxml_PARAMS) { my $value = $self->$attr(); next unless ( defined $value ); $param_string .= ' -' . $attr . ' ' . $value . ' '; } for my $attr (@Raxml_SWITCHES) { my $value = $self->$attr(); next unless ($value); $param_string .= ' -' . $attr . ' '; } $param_string .= "-s $infile -n " . $self->outfile_name; my $null = File::Spec->devnull(); $param_string .= " > $null 2> $null" if ( $self->quiet() || $self->verbose < 0 ); $param_string; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $Raxml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $Raxml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; # Needed to keep compiler happy __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/SLR.pm000066400000000000000000000637161302566030400226220ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::SLR # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::SLR - Wrapper around the SLR program =head1 SYNOPSIS use Bio::Tools::Run::Phylo::SLR; use Bio::AlignIO; use Bio::TreeIO; use Bio::SimpleAlign; my $alignio = Bio::AlignIO->new (-format => 'fasta', -file => 't/data/219877.cdna.fasta'); my $aln = $alignio->next_aln; my $treeio = Bio::TreeIO->new (-format => 'newick', -file => 't/data/219877.tree'); my $tree = $treeio->next_tree; my $slr = Bio::Tools::Run::Phylo::SLR->new(); $slr->alignment($aln); $slr->tree($tree); # $rc = 1 for success, 0 for errors my ($rc,$results) = $slr->run(); my $positive_sites = $results->{'positive'}; print "# Site\tNeutral\tOptimal\tOmega\t", "lower\tupper\tLRT_Stat\tPval\tAdj.Pval\tResult\tNote\n"; foreach my $positive_site (@$positive_sites) { print $positive_site->[0], "\t", $positive_site->[1], "\t", $positive_site->[2], "\t", $positive_site->[3], "\t", $positive_site->[4], "\t", $positive_site->[5], "\t", $positive_site->[6], "\t", $positive_site->[7], "\t", $positive_site->[8], "\t", "positive\n"; } =head1 DESCRIPTION This is a wrapper around the SLR program. See http://www.ebi.ac.uk/goldman/SLR/ for more information. This module is more about generating the proper ctl file and will run the program in a separate temporary directory to avoid creating temp files all over the place. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut #' keep my emacs happy # Let the code begin... package Bio::Tools::Run::Phylo::SLR; use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM); use strict; use Bio::Root::Root; use Bio::AlignIO; use Bio::TreeIO; use Bio::SimpleAlign; use Bio::Tools::Run::WrapperBase; use Cwd; use File::Spec; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); =head2 Default Values INCOMPLETE DOCUMENTATION OF ALL METHODS seqfile [incodon] File from which to read alignment of codon sequences. The file should be in PAML format. treefile [intree] File from which tree should be read. The tree should be in Nexus format outfile [slr.res] File to which results are written. If the file already exists, it will be overwritten. reoptimise [1] Should the branch lengths, omega and kappa be reoptimized? 0 - no 1 - yes. kappa [2.0] Value for kappa. If 'reoptimise' is specified, the value given will be used as am initial estimate, omega [0.1] Value for omega (dN/dS). If 'reoptimise' is specified, the value given will be used as an initial estimate. codonf [0] How codon frequencies are estimated: 0: F61/F60 Estimates used are the empirical frequencies from the data. 1: F3x4 The frequencies of nucleotides at each codon position are estimated from the data and then multiplied together to get the frequency of observing a given codon. The frequency of stop codons is set to zero, and all other frequencies scaled appropriately. 2: F1x4 Nucleotide frequencies are estimated from the data (not taking into account at which position in the codon it occurs). The nucleotide frequencies are multiplied together to get the frequency of observing and then corrected for stop codons. freqtype [0] How codon frequencies are incorporated into the substitution matrix. 0: q_{ij} = pi_{j} s_{ij} 1: q_{ij} = \sqrt(pi_j/pi_i) s_{ij} 2: q_{ij} = \pi_{n} s_{ij}, where n is the nucleotide that the subsitution is to. 3: q_{ij} = s_{ij} / pi_i Option 0 is the tradition method of incorporating equilibrium frequencies into subsitution matrices (Felsenstein 1981; Goldman and Yang, 1994) Option 1 is described by Goldman and Whelan (2002), in this case with the additional parameter set to 0.5. Option 2 was suggested by Muse and Gaut (1994). Option 3 is included as an experiment, originally suggested by Bret Larget. it does not appear to describe evolution very successfully and should not be used for analyses. Kosakovsky-Pond has repeatedly stated that he finds incorporating codon frequencies in the manner of option 2 to be superior to option 0. We find that option 1 tends to perform better than either of these options. positive_only [0] If only positively selected sites are of interest, set this to "1". Calculation will be slightly faster, but information about sites under purifying selection is lost. gencode [universal] Which genetic code to use when determining whether a given mutation is synonymous or nonsynonymous. Currently only "universal" and "mammalian" mitochondrial are supported. nucleof [0] Allow for empirical exchangabilities for nucleotide substitution. 0: No adjustment. All nucleotides treated the same, modulo transition / transversion. 1: The rate at which a substitution caused a mutation from nucleotide a to nucleotide b is adjust by a constant N_{ab}. This adjustment is in addition to other adjustments (e.g. transition / transversion or base frequencies). aminof [0] Incorporate amino acid similarity parameters into substitution matrix, adjusting omega for a change between amino acid i and amino acid j. A_{ij} is a symmetric matrix of constants representing amino acid similarities. 0: Constant omega for all amino acid changes 1: omega_{ij} = omega^{A_{ij}} 2: omega_{ij} = a_{ij} log(omega) / [ 1 - exp(-a_{ij} log(omega)) ] Option 1 has the same form as the original codon subsitution model proposed by Goldman and Yang (but with potentially different constants). Option 2 has a more population genetic derivtion, with omega being interpreted as the ratio of fixation probabilities. nucfile [nuc.dat] If nucleof is non-zero, read nucleotide substitution constants from nucfile. If this file does not exist, hard coded constants are used. aminofile [amino.dat] If aminof is non-zero, read amino acid similarity constants from aminofile. If this file does not exist, hard coded constants are used. timemem [0] Print summary of real time and CPU time used. Will eventually print summary of memory use as well. ldiff [3.841459] Twice log-likelihood difference used as a threshold for calculating support (confidence) intervals for sitewise omega estimates. This value should be the quantile from a chi-square distribution with one degree of freedom corresponding to the support required. E.g. qchisq(0.95,1) = 3.841459 0.4549364 = 50% support 1.323304 = 75% support 2.705543 = 90% support 3.841459 = 95% support 6.634897 = 99% support 7.879439 = 99.5% support 10.82757 = 99.9% support paramin [] If not blank, read in parameters from file given by the argument. paramout [] If not blank, write out parameter estimates to file given. skipsitewise [0] Skip sitewise estimation of omega. Depending on other options given, either calculate maximum likelihood or likelihood fixed at parameter values given. seed [0] Seed for random number generator. If seed is 0, then previously produced seed file (~/.rng64) is used. If this does not exist, the random number generator is initialised using the clock. saveseed [1] If non-zero, save finial seed in file (~/.rng64) to be used as initial seed in future runs of program. =head2 Results Format Results file (default: slr.res) ------------ Results are presented in nine columns Site Number of sites in alignment Neutral (minus) Log-probability of observing site given that it was evolving neutrally (omega=1) Optimal (minus) Log-probability of observing site given that it was evolving at the optimal value of omega. Omega The value of omega which maximizes the log-probability of observing LRT_Stat Log-likelihood ratio statistic for non-neutral selection (or positive selection if the positive_only option is set to 1). LRT_Stat = 2 * (Neutral-Optimal) Pval P-value for non-neutral (or positive) selection at a site, unadjusted for multiple comparisons. Adj. Pval P-value for non-neutral (or positive) selection at a site, after adjusting for multiple comparisons using the Hochberg procedure (see the file "MultipleComparisons.txt" in the doc directory). Result A simple visual guide to the result. Sites detected as having been under positive selection are marked with a '+', sites under purifying selection are marked with '-'. The number of symbols Number symbols Threshold 1 95% 2 99% 3 95% after adjustment 4 99% after adjustment Occasionally the result may also contain an exclamation mark. This indicates that the observation at a site is not significantly different from random (equivalent to infinitely strong positive selection). This may indicate that the alignment at that site is bad Note The following events are flagged: Synonymous All codons at a site code for the same amino acid. Single character Only one sequence at the site is ungapped, the result of a recent insertion for example. All gaps All sequences at a site contain a gap character. Sites marked "Single character" or "All gaps" are not counted towards the number of sites for the purposes of correcting for multiple comparisons since it is not possible to detect selection from none or one observation under the assumptions made by the sitewise likelihood ratio test. =cut #' keep my emacs happy BEGIN { $MINNAMELEN = 25; $PROGRAMNAME = 'Slr_Linux_static'; if ($^O =~ /darwin/i) { $PROGRAMNAME = 'Slr_osx'; } elsif ($^O =~ /mswin/i) { $PROGRAMNAME = 'Slr_windows.exe'; } if( defined $ENV{'SLRDIR'} ) { $PROGRAM = Bio::Root::IO->catfile($ENV{'SLRDIR'},$PROGRAMNAME). ($^O =~ /mswin/i ?'_windows.exe':'');; } # valid values for parameters, the default one is always # the first one in the array # example file provided with the package %VALIDVALUES = ( 'outfile' => 'slr.res', 'reoptimise' => [ 1,0], 'kappa' => '2.0', 'omega' => '0.1', 'codonf' => [ 0, 1,2], 'freqtype' => [ 0, 1,2,3], 'positive_only' => [ 0, 1], 'gencode' => [ "universal", "mammalian"], 'nucleof' => [ 0, 1], 'aminof' => [ 0, 1,2], 'nucfile' => '', 'aminofile' => '', 'timemem' => [ 0, 1], 'ldiff' => [ 3.841459, 0.4549364,1.323304,2.705543,6.634897,7.879439,10.82757], 'paramin' => '', 'paramout' => '', 'skipsitewise' => [ 0, 1], 'seed' => [0], 'saveseed' => [ 1, 0] ); } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : ->program_dir() Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SLRDIR}) if $ENV{SLRDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Phylo::SLR->new(); Function: Builds a new Bio::Tools::Run::Phylo::SLR object Returns : Bio::Tools::Run::Phylo::SLR Args : -alignment => the Bio::Align::AlignI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -tree => the Bio::Tree::TreeI object -params => a hashref of SLR parameters (all passed to set_parameter) -executable => where the SLR executable resides See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $aln && $self->alignment($aln); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 prepare Title : prepare Usage : my $rundir = $slr->prepare($aln); Function: prepare the SLR analysis using the default or updated parameters the alignment parameter must have been set Returns : value of rundir Args : L object, L object =cut sub prepare{ my ($self,$aln,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; $aln = $self->alignment unless $aln; if( ! $aln ) { $self->warn("must have supplied a valid alignment file in order to run SLR"); return 0; } if( ! $tree ) { $self->warn("must have supplied a valid tree file in order to run SLR"); return 0; } my ($tempdir) = $self->tempdir(); my ($tempseqFH,$tempseqfile); # Reorder the alignment according to the tree my $ct = 1; my %order; foreach my $node ($tree->get_leaf_nodes) { $order{$node->id_output} = $ct++; } my @seq; my @ids; foreach my $seq ( $aln->each_seq() ) { push @seq, $seq; push @ids, $seq->display_id; } # use the map-sort-map idiom: my @sorted = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq; my $sorted_aln = Bio::SimpleAlign->new(); foreach (@sorted) { $sorted_aln->add_seq($_); } # Rename the leaf nodes in the tree from 1 to n $ct = 1; foreach my $node ($tree->get_leaf_nodes) { $node->id($ct++); } ($tempseqFH,$tempseqfile) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $alnout = Bio::AlignIO->new('-format' => 'phylip', '-fh' => $tempseqFH, '-interleaved' => 0, '-idlinebreak' => 1, '-idlength' => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1); $alnout->write_aln($sorted_aln); $alnout->close(); undef $alnout; close($tempseqFH); my ($temptreeFH,$temptreefile); ($temptreeFH,$temptreefile) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); # We need to add a line with the num of leaves ($ct-1) and the # num of trees (1) $treeout->_print(sprintf("%d 1\n",($ct-1))); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); # now let's print the ctl file. # many of the these programs are finicky about what the filename is # and won't even run without the properly named file. my ($treevolume,$treedirectories,$treefile) = File::Spec->splitpath( $temptreefile ); my ($alnvolume,$alndirectories,$alnfile) = File::Spec->splitpath( $tempseqfile ); my $slr_ctl = "$tempdir/slr.ctl"; open(SLR, ">$slr_ctl") or $self->throw("cannot open $slr_ctl for writing"); print SLR "seqfile\: $alnfile\n"; print SLR "treefile\: $treefile\n"; my $outfile = $self->outfile_name; print SLR "outfile\: $outfile\n"; my %params = $self->get_parameters; while( my ($param,$val) = each %params ) { next if $param eq 'outfile'; print SLR "$param\: $val\n"; } close(SLR); return $tempdir; } =head2 run Title : run Usage : my ($rc,$parser) = $slr->run($aln,$tree); Function: run the SLR analysis using the default or updated parameters the alignment parameter must have been set Returns : Return code, L Args : L object, L object =cut sub run { my ($self) = shift;; my $outfile = $self->outfile_name; my $tmpdir = $self->prepare(@_); #my ($rc,$parser) = (1); my ($rc,$results) = (1); { my $cwd = cwd(); my $exit_status; chdir($tmpdir); my $slrexe = $self->executable(); $self->throw("unable to find or run executable for SLR") unless $slrexe && -e $slrexe && -x _; my $run; open($run, "$slrexe |") or $self->throw("Cannot open exe $slrexe"); my @output = <$run>; $exit_status = close($run); $self->error_string(join('',@output)); if( (grep { /\berr(or)?: /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { open RESULTS, "$tmpdir/$outfile" or die "couldnt open results file: $!\n"; my $okay = 0; my $sites; my $type = 'default'; while () { chomp $_; if ( /^\#/ ) {next;} if ( /\!/ ) {$type = 'random';} # random is last elsif ( /\+/ ) {$type = 'positive';} elsif ( /\-\s+/ ) {$type = 'negative';} elsif ( /Constant/ ) {$type = 'constant';} elsif ( /All gaps/ ) {$type = 'all_gaps';} elsif ( /Single character/ ) {$type = 'single_character';} elsif ( /Synonymous/ ) {$type = 'synonymous';} else {$type = 'default'} if ( /^\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/ ) { push @{$sites->{$type}}, [$1,$2,$3,$4,$5,$6,$7,$8,$9]; } else { $DB::single=1;1; } } $results = $sites; close RESULTS; # TODO: we could have a proper parser object # $parser = Bio::Tools::Phylo::SLR->new(-file => "$tmpdir/$outfile", # -dir => "$tmpdir"); }; if( $@ ) { $self->warn($self->error_string); } chdir($cwd); } # return ($rc,$parser); return ($rc,$results); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 alignment Title : alignment Usage : $slr->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment{ my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $slr->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_slrparams'} }; } =head2 set_parameter Title : set_parameter Usage : $slr->set_parameter($param,$val); Function: Sets a SLR parameter, will be validated against the valid values as set in the %VALIDVALUES class variable. The checks can be ignored if one turns off param checks like this: $slr->no_param_checks(1) Returns : boolean if set was success, if verbose is set to -1 then no warning will be reported Args : $param => name of the parameter $value => value to set the parameter to See also: L =cut sub set_parameter{ my ($self,$param,$value) = @_; unless (defined $self->{'no_param_checks'} && $self->{'no_param_checks'} == 1) { if ( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } if ( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep { $value eq $_ } @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } } $self->{'_slrparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $slr->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_slrparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_slrparams'}->{$param} = $val->[0]; } else { $self->{'_slrparams'}->{$param} = $val; } } } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut sub no_param_checks{ my ($self,$value) = @_; if( defined $value) { $self->{'no_param_checks'} = $value; } return $self->{'no_param_checks'}; } =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $slr->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name { my $self = shift; if( @_ ) { return $self->{'_slrparams'}->{'outfile'} = shift @_; } unless (defined $self->{'_slrparams'}->{'outfile'}) { $self->{'_slrparams'}->{'outfile'} = 'out.res'; } return $self->{'_slrparams'}->{'outfile'}; } =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $slr->cleanup(); Function: Will cleanup the tempdir directory after an SLR run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Phylo/Semphy.pm000077500000000000000000000245601302566030400234240ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Phylo::Semphy # # Please direct questions and support issues to # # Cared for by Sendu Bala # # Copyright Sendu Bala # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Phylo::Semphy - Wrapper for Semphy =head1 SYNOPSIS use Bio::Tools::Run::Phylo::Semphy; # Make a Semphy factory $factory = Bio::Tools::Run::Phylo::Semphy->new(); # Run Semphy with an alignment my $tree = $factory->run($alignfilename); # or with alignment object $tree = $factory->run($bio_simplalign); # you can supply an initial tree as well, which can be a newick tree file, # Bio::Tree::Tree object... $tree = $factory->run($bio_simplalign, $tree_obj); # ... or Bio::DB::Taxonomy object $tree = $factory->run($bio_simplalign, $bio_db_taxonomy); # (mixtures of all the above are possible) # $tree isa Bio::Tree::Tree =head1 DESCRIPTION This is a wrapper for running the Semphy application by N. Friedman et a.. You can get details here: http://compbio.cs.huji.ac.il/semphy/. Semphy is used for phylogenetic reconstruction (making a tree with branch lengths from an aligned set of input sequences). You can try supplying normal Semphy command-line arguments to new(), eg. new(-hky => 1) or calling arg-named methods (excluding the initial hyphen(s), eg. $factory->hky(1) to set the --hky switch to true). Note that Semphy args are case-sensitive. To distinguish between Bioperl's -verbose and the Semphy's --verbose, you must set Semphy's verbosity with -semphy_verbose or the semphy_verbose() method. You will need to enable this Semphy wrapper to find the Semphy program. This can be done in (at least) three ways: 1. Make sure the Semphy executable is in your path. 2. Define an environmental variable SEMPHYDIR which is a directory which contains the Semphy application: In bash: export SEMPHYDIR=/home/username/semphy/ In csh/tcsh: setenv SEMPHYDIR /home/username/semphy 3. Include a definition of an environmental variable SEMPHYDIR in every script that will use this Semphy wrapper module, e.g.: BEGIN { $ENV{SEMPHYDIR} = '/home/username/semphy/' } use Bio::Tools::Run::Phylo::Semphy; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Sendu Bala Email bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Phylo::Semphy; use strict; use File::Spec; use Bio::AlignIO; use Bio::TreeIO; use base qw(Bio::Tools::Run::Phylo::PhyloBase); our $PROGRAM_NAME = 'semphy'; our $PROGRAM_DIR = $ENV{'SEMPHYDIR'}; # methods for the semphy args we support our %PARAMS = (outputfile => 'o', treeoutputfile => 'T', constraint => 'c', gaps => 'g', seed => 'r', Logfile => 'l', alphabet => 'a', ratio => 'z', ACGprob => 'p', BPrepeats => 'BPrepeats', BPconsensus => 'BPconsensus', SEMPHY => 'S', modelfile => 'modelfile', alpha => 'A', categories => 'C', semphy_verbose => 'semphy_verbose'); our %SWITCHES = (homogeneousRatesDTME => 'homogeneousRatesDTME', NJ => 'J', pairwiseGammaDTME => 'pairwiseGammaDTME', commonAlphaDTME => 'commonAlphaDTME', rate4siteDTME => 'rate4siteDTME', posteriorDTME => 'posteriorDTME', BPonUserTree => 'BPonUserTree', nucjc => 'nucjc', aaJC => 'aaJC', k2p => 'k2p', hky => 'hky', day => 'day', jtt => 'jtt', rev => 'rev', wag => 'wag', cprev => 'cprev', homogeneous => 'H', optimizeAlpha => 'O', bbl => 'n', likelihood => 'L', PerPosLike => 'P', PerPosPosterior => 'B', rate => 'R'); # just to be explicit, args we don't support (yet) or we handle ourselves our @UNSUPPORTED = qw(h help full-help s sequence t tree); =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns : string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string Args : None =cut sub program_dir { return $PROGRAM_DIR; } =head2 new Title : new Usage : $factory = Bio::Tools::Run::Phylo::Semphy->new() Function: creates a new Semphy factory Returns : Bio::Tools::Run::Phylo::Semphy Args : Most options understood by Semphy can be supplied as key => value pairs, with a true value for switches. These options can NOT be used with this wrapper (they are handled internally or don't make sense in this context): -h | --help | --fill-help -s | --sequence -t | --tree To distinguish between Bioperl's -verbose and the Semphy's --verbose, you must set Semphy's verbosity with -semphy_verbose =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $PARAMS{$_} } keys %PARAMS), (map { $_ => $SWITCHES{$_} } keys %SWITCHES), quiet => 'quiet'}, -create => 1, -case_sensitive => 1); return $self; } =head2 run Title : run Usage : $result = $factory->run($fasta_align_file); -or- $result = $factory->run($align_object); -or- $result = $factory->run($fasta_align_file, $newick_tree_file); -or- $result = $factory->run($align_object, $tree_object); -or- $result = $factory->run($align_object, $db_taxonomy_object); Function: Runs Semphy on an alignment. Returns : Bio::Tree::Tree Args : The first argument represents an alignment, the second (optional) argument a species tree (to set an initial tree: normally the -t option to Semphy). The alignment can be provided as a multi-fasta format alignment filename, or a Bio::Align::AlignI compliant object (eg. a Bio::SimpleAlign). The species tree can be provided as a newick format tree filename or a Bio::Tree::TreeI compliant object. Alternatively a Bio::DB::Taxonomy object can be supplied, in which case the species tree will be generated by using the alignment sequence names as species names and looking for those in the supplied database. In all cases where an initial tree was supplied, the alignment sequence names must correspond to node ids in the species tree. =cut sub run { my ($self, $aln, $tree) = @_; $aln || $self->throw("alignment must be supplied"); $self->_alignment($aln); if ($tree) { $self->_tree($tree); # check node and seq names match $self->_check_names; } return $self->_run; } sub _run { my $self = shift; my $exe = $self->executable || return; my $aln_file = $self->_write_alignment; # generate a semphy-friendly tree file my $tree = $self->_tree; my $tree_file = ''; if ($tree) { $tree = $self->_write_tree; } unless ($self->T) { my ($tfh, $tempfile) = $self->io->tempfile(-dir => $self->tempdir); $self->T($tempfile); close($tfh); } my $command = $exe.$self->_setparams($aln_file, $tree_file); $self->debug("semphy command = $command\n"); open(my $pipe, "$command |") || $self->throw("semphy call ($command) failed to start: $? | $!"); my $error = ''; while (<$pipe>) { print unless $self->quiet; $error .= $_; } close($pipe) || ($error ? $self->warn("semphy call ($command) failed: $error") : $self->throw("semphy call ($command) crashed: $?")); my $result_file = $self->T(); my $tio = Bio::TreeIO->new(-format => 'newick', -file => $result_file); my $result_tree = $tio->next_tree; return $result_tree; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Creates a string of params to be used in the command string Returns : string of params Args : alignment and tree file names =cut sub _setparams { my ($self, $aln_file, $tree_file) = @_; my $param_string = ' -s '.$aln_file; $param_string .= ' -t '.$tree_file if $tree_file; my %methods = map { $_ => $_ } keys %PARAMS; $methods{'semphy_verbose'} = 'verbose'; $param_string .= $self->SUPER::_setparams(-params => \%methods, -switches => [keys %SWITCHES], -double_dash => 1); $param_string .= ' 2>&1'; my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " 1>$null" if $self->quiet; return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Primate.pm000066400000000000000000000272361302566030400224650ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Primate # # Please direct questions and support issues to # # Cared for by # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Wrapper for Primate, Guy Slater's near exact match finder for short sequence tags. =head1 SYNOPSIS use Bio::Tools::Run::Primate; use Bio::SeqIO; my $query = "primer.fa"; my $target = "contig.fa"; my @params = ("query" => $query,"target" => $target,"m"=>0); my $fact = Bio::Tools::Run::Primate->new(@params); my @feat = $fact->run; foreach my $feat(@feat) { print $feat->seqname."\t".$feat->primary_tag."\t".$feat->start. "\t".$feat->end."\t".$feat->strand."\t".$feat->seq->seq."\n"; } =head1 DESCRIPTION Primate is available under to ensembl-nci package at http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/ensembl-nci/?cvsroot=Ensembl =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Primate; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR @PRIMATE_PARAMS $PROGRAMNAME @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::SeqIO; use Bio::SeqFeature::Generic; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @PRIMATE_PARAMS = qw(V Q T M B QUERY TARGET OUTFILE PROGRAM EXECUTABLE); @OTHER_SWITCHES = qw(QUIET VERBOSE); # Authorize attribute fields foreach my $attr ( @PRIMATE_PARAMS,@OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'primate'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PRIMATEDIR}) if $ENV{PRIMATEDIR}; } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::Primate->new() Function: Builds a new Bio::Tools::Run::Primate objet Returns : Bio::Tools::Run::Primate Args : query => the L object or a file path target => the L object or a file path m => the number of mismatches allowed, default 1(integer) b => [TRUE|FALSE] find best match, default FALSE executable=>where the program sits =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if($attr =~/^q$/i){ $self->query($value); } if($attr =~/^t$/i){ $self->target($value); } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : $primate->version Function: Determine the version number of the program Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe = $self->executable(); return undef unless defined $exe; my $string = `$exe -v ` ; $string =~ /\(([\d.]+)\)/; return $1 || undef; } =head2 search Title : search Usage : DEPRECATED. Use $factory->run() instead Function: Perform a primate search Returns : Array of L Args : =cut sub search { return shift->run(@_); } =head2 run Title : run Usage : @feat = $factory->run(); Function: Perform a primate search Returns : Array of L Args : =cut sub run{ my ($self,$target) = @_; $target = $target ||$self->target; $target || $self->throw("Need a target sequence"); $self->query || $self->throw("Need a query sequence"); # Create input file pointer my ($query_file,$target_file)= $self->_setinput($self->query,$target); if (!($query_file && $target_file)) {$self->throw("Unable to create temp files for query and target !");} # Create parameter string to pass to primate program my $param_string = $self->_setparams(); # run primate my @feats= $self->_run($query_file,$target_file,$param_string); return @feats; } ################################################# #INTERNAL METHODS =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to dba program Returns : array of L Args : path to query and target file and parameter string =cut sub _run { my ($self,$query_file,$target_file,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); close($tfh); # this is to make sure we don't have # open filehandles undef $tfh; my $commandstring = $self->executable. " $param_string -q $query_file -t $target_file > $outfile"; $self->debug( "primate command = $commandstring"); my $status = system($commandstring); $self->throw( "primate call ($commandstring) crashed: $? \n") unless $status==0; #parse pff format and return a Bio::Search::HSP::GenericHSP array my @feats = $self->_parse_results($outfile); return @feats; } =head2 _parse_results Title : _parse_results Usage : Internal function, not to be called directly Function: Passes primate output Returns : array of L Args : the name of the output file =cut sub _parse_results { my ($self,$outfile) = @_; $outfile||$self->throw("No outfile specified"); my @feats; my %query = $self->_query_seq(); open(OUT,$outfile); while(my $entry = ){ chomp($entry); if($entry =~ /primate/ ) { my ($dummy,$tagname, $seqname, $strand,$seq_end,$mismatch) = split(" " , $entry ); #map primate coordinates to Seq coordinates my $seq_start = $seq_end- length($query{$tagname})+2; $seq_end++; my $feature = Bio::SeqFeature::Generic->new( -seq_id => $seqname, -strand => $strand, -score => $mismatch, -start => $seq_start, -end => $seq_end, -frame => 1, -source => 'primate', -primary => $tagname); $feature->attach_seq($self->_target_seq); push @feats,$feature; } } return @feats; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input files for primate Returns : name of file containing query and target Args : query and target (either a filename or a L =cut sub _setinput { my ($self, $query,$target) = @_; my ($query_file,$target_file,$tfh1,$tfh2); my @query = ref ($query) eq "ARRAY" ? @{$query} : ($query); foreach my $query(@query){ if(ref($query)&& $query->isa("Bio::PrimarySeqI")){ ($tfh1,$query_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my %query; $query{$query->primary_id} = $query->seq; $self->_query_seq(\%query); $out1->write_seq($query) || return 0; close ($tfh1); undef $tfh1; } elsif (-e $query){ my $in = Bio::SeqIO->new(-file => $query , '-format' => 'fasta'); ($tfh1,$query_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); my %query; while(my $seq1 = $in->next_seq()){ $out1->write_seq($seq1) || return 0; $query{$seq1->primary_id} = $seq1->seq; } close($tfh1); undef $tfh1; $self->_query_seq(\%query); } else { return 0; } } if(ref($target) && $target->isa("Bio::PrimarySeqI")){ ($tfh2,$target_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'Fasta'); $out1->write_seq($target)|| return 0; $self->_target_seq($target); close($tfh2); undef $tfh2; } elsif (-e $target){ my $in = Bio::SeqIO->new(-file => $target , '-format' => 'fasta'); ($tfh2,$target_file) = $self->io->tempfile(-dir=>$self->tempdir); my $out = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); my $seq1 = $in->next_seq() || return 0; $out->write_seq($seq1); close($tfh2); undef $tfh2; $self->_target_seq($seq1); } else { return 0; } return $query_file,$target_file; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for primate program Returns : parameter string to be passed to primate Args : the param array =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @PRIMATE_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; #put params in format expected by dba $attr_key = ' -'.$attr_key; if(($attr_key !~/QUERY/i) && ($attr_key !~/TARGET/i)){ $param_string .= $attr_key.' '.$value; } } if ($self->quiet() || $self->verbose() < 0) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $param_string .= " >$null "; } return $param_string; } =head2 _query_seq() Title : _query_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Returns : a hash of seq with key the query tag Args : optional =cut sub _query_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_seq'} = $seq; } return %{$self->{'_query_seq'}}; } =head2 _target_seq() Title : _target_seq Usage : Internal function, not to be called directly Function: get/set for the target sequence Returns : L Args : optional =cut sub _target_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_target_seq'} = $seq; } return $self->{'_target_seq'}; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Primer3.pm000066400000000000000000000701161302566030400224000ustar00rootroot00000000000000# # This is the original copyright statement. I have relied on Chad's module # extensively for this module. # # Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved. # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # Copyright Chad Matsalla # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code # # But I have modified lots of it, so I guess I should add: # # Copyright (c) 2003 bioperl, Rob Edwards. All Rights Reserved. # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # Copyright Rob Edwards # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Primer3 - Create input for and work with the output from the program primer3 =head1 SYNOPSIS Bio::Tools::Primer3 creates the input files needed to design primers using primer3 and provides mechanisms to access data in the primer3 output files. This module provides a bioperl interface to the program primer3. See http://frodo.wi.mit.edu/primer3/primer3_code.html for details and to download the software. This module only works for primer3 release 1 but is not guaranteed to work with earlier versions. # design some primers. # the output will be put into temp.out use Bio::Tools::Run::Primer3; use Bio::SeqIO; my $seqio = Bio::SeqIO->new(-file=>'data/dna1.fa'); my $seq = $seqio->next_seq; my $primer3 = Bio::Tools::Run::Primer3->new(-seq => $seq, -outfile => "temp.out", -path => "/usr/bin/primer3_core"); # or after the fact you can change the program_name $primer3->program_name('my_suprefast_primer3'); unless ($primer3->executable) { print STDERR "primer3 can not be found. Is it installed?\n"; exit(-1) } # what are the arguments, and what do they mean? my $args = $primer3->arguments; print "ARGUMENT\tMEANING\n"; foreach my $key (keys %{$args}) {print "$key\t", $$args{$key}, "\n"} # set the maximum and minimum Tm of the primer $primer3->add_targets('PRIMER_MIN_TM'=>56, 'PRIMER_MAX_TM'=>90); # design the primers. This runs primer3 and returns a # Bio::Tools::Run::Primer3 object with the results $results = $primer3->run; # see the Bio::Tools::Run::Primer3 pod for # things that you can get from this. For example: print "There were ", $results->number_of_results, " primers\n"; Bio::Tools::Run::Primer3 creates the input files needed to design primers using primer3 and provides mechanisms to access data in the primer3 output files. This module provides a bioperl interface to the program primer3. See http://www-genome.wi.mit.edu/genome_software/other/primer3.html for details and to download the software. This module is based on one written by Chad Matsalla (bioinformatics1@dieselwurks.com). I have ripped some of his code, and added a lot of my own. I hope he is not mad at me! =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://www.bioperl.org/MailList.html - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Rob Edwards redwards@utmem.edu Based heavily on work of Chad Matsalla bioinformatics1@dieselwurks.com =head1 CONTRIBUTORS Shawn Hoon shawnh-at-stanford.edu Jason Stajich jason-at-bioperl.org Brian Osborne osborne1-at-optonline.net =head1 SEE ALSO L =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Primer3; use vars qw(@ISA); use strict; use Bio::Root::Root; use Bio::Tools::Primer3; use Bio::Tools::Run::WrapperBase; use File::Spec; use vars qw($AUTOLOAD @ISA @PRIMER3_PARAMS $PROGRAMNAME %OK_FIELD); @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { $PROGRAMNAME = 'primer3'; @PRIMER3_PARAMS=qw( PROGRAM EXCLUDED_REGION INCLUDED_REGION PRIMER_COMMENT PRIMER_DNA_CONC PRIMER_EXPLAIN_FLAG PRIMER_FILE_FLAG PRIMER_FIRST_BASE_INDEX PRIMER_GC_CLAMP PRIMER_INTERNAL_OLIGO_DNA_CONC PRIMER_INTERNAL_OLIGO_EXCLUDED_REGION PRIMER_INTERNAL_OLIGO_INPUT PRIMER_INTERNAL_OLIGO_MAX_GC PRIMER_INTERNAL_OLIGO_MAX_MISHYB PRIMER_INTERNAL_OLIGO_MAX_POLY_X PRIMER_INTERNAL_OLIGO_MAX_SIZE PRIMER_INTERNAL_OLIGO_MAX_TM PRIMER_INTERNAL_OLIGO_MIN_GC PRIMER_INTERNAL_OLIGO_MIN_QUALITY PRIMER_INTERNAL_OLIGO_MIN_SIZE PRIMER_INTERNAL_OLIGO_MIN_TM PRIMER_INTERNAL_OLIGO_MISHYB_LIBRARY PRIMER_INTERNAL_OLIGO_OPT_GC_PERCENT PRIMER_INTERNAL_OLIGO_OPT_SIZE PRIMER_INTERNAL_OLIGO_OPT_TM PRIMER_INTERNAL_OLIGO_SALT_CONC PRIMER_INTERNAL_OLIGO_SELF_ANY PRIMER_INTERNAL_OLIGO_SELF_END PRIMER_IO_WT_COMPL_ANY PRIMER_IO_WT_COMPL_END PRIMER_IO_WT_END_QUAL PRIMER_IO_WT_GC_PERCENT_GT PRIMER_IO_WT_GC_PERCENT_LT PRIMER_IO_WT_NUM_NS PRIMER_IO_WT_REP_SIM PRIMER_IO_WT_SEQ_QUAL PRIMER_IO_WT_SIZE_GT PRIMER_IO_WT_SIZE_LT PRIMER_IO_WT_TM_GT PRIMER_IO_WT_TM_LT PRIMER_LEFT_INPUT PRIMER_LIBERAL_BASE PRIMER_MAX_DIFF_TM PRIMER_MAX_END_STABILITY PRIMER_MAX_GC PRIMER_MAX_MISPRIMING PRIMER_MAX_POLY_X PRIMER_MAX_SIZE PRIMER_MAX_TM PRIMER_MIN_END_QUALITY PRIMER_MIN_GC PRIMER_MIN_QUALITY PRIMER_MIN_SIZE PRIMER_MIN_TM PRIMER_MISPRIMING_LIBRARY PRIMER_NUM_NS_ACCEPTED PRIMER_NUM_RETURN PRIMER_OPT_GC_PERCENT PRIMER_OPT_SIZE PRIMER_OPT_TM PRIMER_PAIR_MAX_MISPRIMING PRIMER_PAIR_WT_COMPL_ANY PRIMER_PAIR_WT_COMPL_END PRIMER_PAIR_WT_DIFF_TM PRIMER_PAIR_WT_IO_PENALTY PRIMER_PAIR_WT_PRODUCT_SIZE_GT PRIMER_PAIR_WT_PRODUCT_SIZE_LT PRIMER_PAIR_WT_PRODUCT_TM_GT PRIMER_PAIR_WT_PRODUCT_TM_LT PRIMER_PAIR_WT_PR_PENALTY PRIMER_PAIR_WT_REP_SIM PRIMER_PICK_ANYWAY PRIMER_PICK_INTERNAL_OLIGO PRIMER_PRODUCT_MAX_TM PRIMER_PRODUCT_MIN_TM PRIMER_PRODUCT_OPT_SIZE PRIMER_PRODUCT_OPT_TM PRIMER_PRODUCT_SIZE_RANGE PRIMER_QUALITY_RANGE_MAX PRIMER_QUALITY_RANGE_MIN PRIMER_RIGHT_INPUT PRIMER_SALT_CONC PRIMER_SELF_ANY PRIMER_SELF_END PRIMER_SEQUENCE_ID PRIMER_SEQUENCE_QUALITY PRIMER_START_CODON_POSITION PRIMER_TASK PRIMER_WT_COMPL_ANY PRIMER_WT_COMPL_END PRIMER_WT_END_QUAL PRIMER_WT_END_STABILITY PRIMER_WT_GC_PERCENT_GT PRIMER_WT_GC_PERCENT_LT PRIMER_WT_NUM_NS PRIMER_WT_POS_PENALTY PRIMER_WT_REP_SIM PRIMER_WT_SEQ_QUAL PRIMER_WT_SIZE_GT PRIMER_WT_SIZE_LT PRIMER_WT_TM_GT PRIMER_WT_TM_LT SEQUENCE TARGET PRIMER_DEFAULT_PRODUCT PRIMER_DEFAULT_SIZE PRIMER_INSIDE_PENALTY PRIMER_INTERNAL_OLIGO_MAX_TEMPLATE_MISHYB PRIMER_OUTSIDE_PENALTY PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS PRIMER_MAX_TEMPLATE_MISPRIMING PRIMER_PAIR_MAX_TEMPLATE_MISPRIMING PRIMER_PAIR_WT_TEMPLATE_MISPRIMING PRIMER_WT_TEMPLATE_MISPRIMING ); foreach my $attr (@PRIMER3_PARAMS) {$OK_FIELD{$attr}++} } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new() Title : new() Usage : my $primer3 = Bio::Tools::Run::Primer3->new(-file=>$file) to read a primer3 output file. my $primer3 = Bio::Tools::Run::Primer3->new(-seq=>sequence object) design primers against sequence Function: Start primer3 working and adds a sequence. At the moment it will not clear out the old sequence, but I suppose it should. Returns : Does not return anything. If called with a filename will allow you to retrieve the results Args : -seq (optional) Bio::Seq object of sequence. This is required to run primer3 but can be added later with add_targets() -outfile file name to output results to (can also be added with $primer3->outfile_name -path path to primer3 executable, including program name, e.g. "/usr/bin/primer3_core". This can also be set with program_name and program_dir -verbose (optional) set verbose output Notes : =cut sub new { my($class,%args) = @_; my $self = $class->SUPER::new(%args); $self->io->_initialize_io(); $self->program_name($args{-program}) if defined $args{'-program'}; if ($args{'-verbose'}) {$self->{'verbose'}=1} if ($args{'-seq'}) { $self->{'seqobject'}=$args{'-seq'}; my @input; push (@input, ("PRIMER_SEQUENCE_ID=".$self->{'seqobject'}->id), ("SEQUENCE=".$self->{'seqobject'}->seq)); $self->{'primer3_input'}=\@input; } if ($args{'-outfile'}) {$self->{_outfilename}=$args{'-outfile'}} if ($args{'-path'}) { my (undef,$path,$prog) = File::Spec->splitpath($args{'-path'}); # For Windows system, $path better (Letter disk not truncated) if ( $^O =~ m{mswin}i ) { require File::Basename; $path = File::Basename::dirname( $args{'-path'} ); $prog = File::Basename::basename( $args{'-path'} ); } $self->program_dir($path); $self->program_name($prog); } return $self; } =head2 program_name Title : program_name Usage : $primer3->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my $self = shift; return $self->{'program_name'} = shift @_ if @_; return $self->{'program_name'} if $self->{'program_name'}; for (qw(primer3 primer3_core)) { if ($self->io->exists_exe($_)) { $PROGRAMNAME = $_; last; } } # don't set permanently, use global return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $primer3->program_dir($dir) Function: returns the program directory, which may also be obtained from ENV variable. Returns : string Args : =cut sub program_dir { my ($self, $dir) = @_; if ($dir) { $self->{'program_dir'}=$dir; } # we need to stop here if we know what the answer is, otherwise we can # never set it and then call it later return $self->{'program_dir'} if $self->{'program_dir'}; if ($ENV{PRIMER3}) { $self->{'program_dir'} = Bio::Root::IO->catfile($ENV{PRIMER3}); } else { $self->{'program_dir'} = Bio::Root::IO->catfile('usr','local','bin'); } return $self->{'program_dir'} } =head2 add_targets() Title : add_targets() Usage : $primer3->add_targets(key=>value) Function: Add any legal value to the input command line. Returns : Returns the number of arguments added. Args : Use $primer3->arguments to find a list of all the values that are allowed, or see the primer3 docs. Notes : This will only do limited error checking at the moment, but it should work. =cut sub add_targets { my ($self, %args)=@_; my $added_args; # a count of what we have added. my $inputarray = $self->{'primer3_input'}; foreach my $key (keys %args) { # we will allow them to add a sequence before checking for arguments if ((uc($key) eq "-SEQ") || (uc($key) eq "-SEQUENCE")) { # adding a new sequence. We need to separate them with an = $self->{'seqobject'}=$args{$key}; if (defined $$inputarray[0]) {push (@$inputarray, "=")} push (@$inputarray, ("PRIMER_SEQUENCE_ID=". $self->{'seqobject'}->id),("SEQUENCE=".$self->{'seqobject'}->seq)); next; } unless ($self->{'no_param_checks'}) { unless ($OK_FIELD{$key}) { $self->warn("Parameter $key is not a valid Primer3 parameter"); next} } if (uc($key) eq "INCLUDED_REGION") { # this must be a comma separated start, length. my $sequencelength; # we don't have a length, hence we need to add the length of the # sequence less the start. foreach my $input (@$inputarray) { if ($input =~ /SEQUENCE=(.*)/) {$sequencelength=length($1)} } if (!$args{$key}) {$args{$key}="0," . $sequencelength} elsif ($args{$key} !~ /\,/) { my $length_of_included = $sequencelength-$args{$key}; $args{$key} .= ",".$length_of_included; } } elsif (uc($key) eq "PRIMER_MIN_SIZE") { # minimum size must be less than MAX size and greater than zero if (exists $args{"PRIMER_MAX_SIZE"}) { unless ($args{"PRIMER_MAX_SIZE"} > $args{"PRIMER_MIN_SIZE"}) { $self->warn('Maximum primer size (PRIMER_MAX_SIZE) must be greater than minimum primer size (PRIMER_MIN_SIZE)'); } } if ($args{$key} < 0) { $self->warn('Minimum primer size (PRIMER_MIN_SIZE) must be greater than 0'); } } elsif ($key eq "PRIMER_MAX_SIZE") { if ($args{$key}>35) {$self->warn('Maximum primer size (PRIMER_MAX_SIZE) must be less than 35')} } elsif (uc($key) eq "SEQUENCE") { # Add seqobject if not present, since it is checked for by Bio::Tools::Primer3->next_primer() $self->{'seqobject'}=Bio::Seq->new(-seq=>$args{$key}) if not defined($self->{'seqobject'}); } # need a check to see whether this is already in the array # and finally add the argument to the list. my $toadd=uc($key)."=".$args{$key}; my $replaced; # don't add it if it is replacing something! my @new_array; foreach my $input (@$inputarray) { my ($array_key, $array_value) = split '=', $input; if (uc($array_key) eq uc($key)) {push @new_array, $toadd; $replaced=1} else {push @new_array, $input} } unless ($replaced) {push @new_array, $toadd} @$inputarray=@new_array; if ($self->{'verbose'}) {print STDERR "Updated ", uc($key), " to $args{$key}\n"} $added_args++; } $self->{'primer3_input'}=$inputarray; return $added_args; } =head2 run() Title : run() Usage : $primer3->run(); Function: Run the primer3 program with the arguments that you have supplied. Returns : A Bio::Tools::Primer3 object containing the results. Args : None. Note : See the Bio::Tools::Primer3 documentation for those functions. =cut sub run { my($self) = @_; my $executable = $self->executable; my $input = $self->{'primer3_input'}; unless ($executable && -e $executable) { $self->throw("Executable was not found. Do not know where primer3 is!") if !$executable; $self->throw("$executable was not found. Do not know where primer3 is!"); exit(-1); } # note that I write this to a temp file because we need both read # and write access to primer3, therefore, # we can't use a simple pipe. if ($self->{'verbose'}) {print STDERR "TRYING\n", join "\n", @{$self->{'primer3_input'}}, "=\n"} # make a temporary file and print the instructions to it. my ($temphandle, $tempfile) = $self->io->tempfile; print $temphandle join "\n", @{$self->{'primer3_input'}}, "=\n"; close($temphandle); my $executable_command = $executable; if ( $executable =~ m{^[^\'\"]+(.+)[^\'\"]+$} ) { $executable_command = "\"$executable\" < \"$tempfile\"|"; } open (RESULTS, $executable_command) || $self->throw("Can't open RESULTS"); if ($self->{'_outfilename'}) { # I can't figure out how to use either of these to write the results out. # neither work, what am I doing wrong or missing in the docs? # $self->{output}=$self->_initialize_io(-file=>$self->{'outfile'}); # $self->{output}=$self->io; # OK, for now, I will just do it myself, because I need this to # check the parser :) open (OUT, ">".$self->{'_outfilename'}) || $self->throw("Can't open ".$self->{'_outfilename'}." for writing"); } my @results; while () { if ($self->{'_outfilename'}) { # this should work, but isn't #$self->{output}->_print($_); print OUT $_; } chomp; next if( $_ eq '='); # skip over bolderio record terminator my ($return, $value) = split('=',$_); $self->{'results'}->{$return} = $value; } close RESULTS; # close the output file if ($self->{'_outfilename'}) { close OUT; } $self->cleanup; # convert the results to individual results $self->{results_obj} = Bio::Tools::Primer3->new; $self->{results_obj}->_set_variable('results', $self->{results}); $self->{results_obj}->_set_variable('seqobject', $self->{seqobject}); # Bio::Tools::Primer3::_separate needs a hash of the primer3 arguments, # with the arg as the key and the value as the value (surprise!). my %input_hash = map {split '='} @{$self->{'primer3_input'}}; $self->{results_obj}->_set_variable('input_options', \%input_hash); $self->{results_separated}= $self->{results_obj}->_separate(); return $self->{results_obj}; } =head2 arguments() Title : arguments() Usage : $hashref = $primer3->arguments(); Function: Describes the options that you can set through Bio::Tools::Run::Primer3, with a brief (one line) description of what they are and their default values Returns : A string (if an argument is supplied) or a reference to a hash. Args : If supplied with an argument will return a string of its description. If no arguments are supplied, will return all the arguments as a reference to a hash Notes : Much of this is taken from the primer3 README file, and you should read that file for a more detailed description. =cut sub arguments { my ($self, $required) = @_; unless ($self->{'input_options'}) {$self->_input_args} if ($required) {return ${$self->{'input_options'}}{'$required'}} else {return $self->{'input_options'}} } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return unless my $exe = $self->executable; if (!defined $self->{'_progversion'}) { my $string = `$exe -about 2>&1`; my $v; if ($string =~ m{primer3\s+release\s+([\d\.]+)}) { $self->{'_progversion'} = $1; } } return $self->{'_progversion'} || undef; } =head2 _input_args() Title : _input_args() Usage : an internal method to set the input arguments for Primer3 Function: Define a hash with keys for each of the input arguments and values as a short one line description Returns : A reference to a hash. Args : None. Notes : Much of this is taken from the primer3 README file, and you should read that file for a more detailed description. =cut sub _input_args { my($self) = shift; # just return functions that we can set and what they are my %hash=( 'PRIMER_SEQUENCE_ID'=>'(string, optional) an id. Optional. Note must be present if PRIMER_FILE_FLAG is set', 'SEQUENCE'=>'(nucleotide sequence, REQUIRED) The sequence itself. Cannot contain newlines', 'INCLUDED_REGION'=>'(interval, optional) Where to pick primers from. In form ,. Based on zero indexing!', 'TARGET'=>'(interval list, default empty) Regions that must be included in the product. The value should be a space-separated list of ,', 'EXCLUDED_REGION'=>'(interval list, default empty) Regions that must NOT be included in the product. The value should be a space-separated list of ,', 'PRIMER_COMMENT'=>'(string) This is ignored, so we will just save, and return it', 'PRIMER_SEQUENCE_QUALITY'=>'(quality list, default empty) A list of space separated integers with one per base. Could adapt a Phred object to this.', 'PRIMER_LEFT_INPUT'=>'(nucleotide sequence, default empty) If you know the left primer sequence, put it here', 'PRIMER_RIGHT_INPUT'=>'(nucleotide sequence, default empty) If you know the right primer sequence, put it here', 'PRIMER_START_CODON_POSITION'=>'(int, default -1000000) Location of known start codons for designing in frame primers.', 'PRIMER_PICK_ANYWAY'=>'boolean, default 0) Pick a primer, even if we have violated some constraints.', 'PRIMER_MISPRIMING_LIBRARY'=>'(string, optional) A file containing sequences to avoid amplifying. Should be fasta format, but see primer3 docs for constraints.', 'PRIMER_MAX_MISPRIMING'=>'(decimal,9999.99, default 12.00) Weighting for PRIMER_MISPRIMING_LIBRARY', 'PRIMER_PAIR_MAX_MISPRIMING'=>'(decimal,9999.99, default 24.00 Weighting for PRIMER_MISPRIMING_LIBRARY', 'PRIMER_PRODUCT_MAX_TM'=>'(float, default 1000000.0) The maximum allowed Tm of the product.', 'PRIMER_PRODUCT_MIN_TM'=>'(float, default -1000000.0) The minimum allowed Tm of the product', 'PRIMER_EXPLAIN_FLAG'=>'(boolean, default 0) If set it will print a bunch of information out.', 'PRIMER_PRODUCT_SIZE_RANGE'=>'(size range list, default 100-300) space separated list of product sizes eg - -', 'PRIMER_DEFAULT_PRODUCT' => '(size range list, default 100-300)', 'PRIMER_PICK_INTERNAL_OLIGO'=>'(boolean, default 0) if set, a hybridization probe will be selected', 'PRIMER_GC_CLAMP'=>'(int, default 0) Number of Gs and Cs at the 3 prime end.', 'PRIMER_OPT_SIZE'=>'(int, default 20) Optimal primer size. Primers will be close to this value in length', 'PRIMER_DEFAULT_SIZE' => '(int, default 20)', 'PRIMER_MIN_SIZE'=>'(int, default 18) Minimum size. Must be 0 < PRIMER_MIN_SIZE < PRIMER_MAX_SIZE ', 'PRIMER_MAX_SIZE'=>'(int, default 27) Maximum size. Must be < 35.', 'PRIMER_OPT_TM'=>'(float, default 60.0C) Optimum Tm of a primer.', 'PRIMER_MIN_TM'=>'(float, default 57.0C) Minimum Tm of a primer', 'PRIMER_MAX_TM'=>'(float, default 63.0C) Maximum Tm of a primer', 'PRIMER_MAX_DIFF_TM'=>'(float, default 100.0C) acceptable difference in Tms', 'PRIMER_MIN_GC'=>'(float, default 20.0%) Minimum allowable GCs', 'PRIMER_OPT_GC_PERCENT'=>'(float, default 50.0%) Optimal GCs', 'PRIMER_MAX_GC'=>'(float, default 80.0%) Maximum allowable GCs', 'PRIMER_SALT_CONC'=>'(float, default 50.0 mM) Salt concentration required for Tm calcs.', 'PRIMER_DNA_CONC'=>'(float, default 50.0 nM) DNA concentration required for Tm calcs. ', 'PRIMER_NUM_NS_ACCEPTED'=>'(int, default 0) Maximum number of unknown bases (N) allowable in any primer.', 'PRIMER_SELF_ANY'=>'(decimal,9999.99, default 8.00) Maximum aligment score for within and between primers when checking for hairpin loops', 'PRIMER_SELF_END'=>'(decimal 9999.99, default 3.00) Maximum aligment score for within and between primers when checking for primer dimers', 'PRIMER_FILE_FLAG'=>'(boolean, default 0) Output .for and .rev with all acceptable forward and reverse primers', 'PRIMER_MAX_POLY_X'=>'(int, default 5) The maximum allowable length of a mononucleotide repeat.', 'PRIMER_LIBERAL_BASE'=>'(boolean, default 0) Use IUPAC codes (well, just change them to N). Note must also set PRIMER_NUM_NS_ACCEPTED', 'PRIMER_NUM_RETURN'=>'(int, default 5) Number of primers to return', 'PRIMER_FIRST_BASE_INDEX'=>'(int, default 0) Index of the first base. Do not change this or allow it to be changed, as we will have to mess with subseqs and whatnot.', 'PRIMER_MIN_QUALITY'=>'(int, default 0) Minimum sequence quality calculated from PRIMER_SEQUENCE_QUALITY', 'PRIMER_MIN_END_QUALITY'=>'(int, default 0) Minimum sequence quality calculated from PRIMER_SEQUENCE_QUALITY at 5 prime 5 bases', 'PRIMER_QUALITY_RANGE_MIN'=>'(int, default 0) Minimum sequence quality calculated from PRIMER_SEQUENCE_QUALITY', 'PRIMER_QUALITY_RANGE_MAX'=>'(int, default 100) Maximum sequence quality calculated from PRIMER_SEQUENCE_QUALITY', 'PRIMER_MAX_END_STABILITY'=>'(float 999.9999, default 100.0) Maximum stability for the five 3 prime bases of a primer. Bigger numbers mean more stable 3 prime ends.', 'PRIMER_PRODUCT_OPT_TM'=>'(float, default 0.0) Optimum melting temperature for the PCR product. 0 means no optimum.', 'PRIMER_PRODUCT_OPT_SIZE'=>'(int, default 0) Optimum size for the PCR product. 0 means no optimum.', 'PRIMER_TASK'=>'(string, default pick_pcr_primers) Choose from pick_pcr_primers, pick_pcr_primers_and_hyb_probe, pick_left_only, pick_right_only, pick_hyb_probe_only', 'PRIMER_WT_TM_GT'=>'(float, default 1.0) Penalty weight for primers with Tm over PRIMER_OPT_TM.', 'PRIMER_WT_TM_LT'=>'(float, default 1.0) Penalty weight for primers with Tm under PRIMER_OPT_TM.', 'PRIMER_WT_SIZE_LT'=>'(float, default 1.0) Penalty weight for primers shorter than PRIMER_OPT_SIZE.', 'PRIMER_WT_SIZE_GT'=>'(float, default 1.0) Penalty weight for primers longer than PRIMER_OPT_SIZE.', 'PRIMER_WT_GC_PERCENT_LT'=>'(float, default 1.0) Penalty weight for primers with GC percent greater than PRIMER_OPT_GC_PERCENT.', 'PRIMER_WT_GC_PERCENT_GT'=>'(float, default 1.0) Penalty weight for primers with GC percent greater than PRIMER_OPT_GC_PERCENT.', 'PRIMER_WT_COMPL_ANY'=>'(float, default 0.0)', 'PRIMER_WT_COMPL_END'=>'(float, default 0.0)', 'PRIMER_WT_NUM_NS'=>'(float, default 0.0)', 'PRIMER_WT_REP_SIM'=>'(float, default 0.0)', 'PRIMER_WT_SEQ_QUAL'=>'(float, default 0.0)', 'PRIMER_WT_END_QUAL'=>'(float, default 0.0)', 'PRIMER_WT_POS_PENALTY'=>'(float, default 0.0)', 'PRIMER_WT_END_STABILITY'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PR_PENALTY'=>'(float, default 1.0)', 'PRIMER_PAIR_WT_IO_PENALTY'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_DIFF_TM'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_COMPL_ANY'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_COMPL_END'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_TM_LT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_TM_GT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_SIZE_GT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_PRODUCT_SIZE_LT'=>'(float, default 0.0)', 'PRIMER_PAIR_WT_REP_SIM'=>'(float, default 0.0)', 'PRIMER_INTERNAL_OLIGO_EXCLUDED_REGION'=>'(interval list, default empty) Internal oligos must ignore these regions', 'PRIMER_INTERNAL_OLIGO_INPUT'=>'(nucleotide sequence, default empty) Known sequence of an internal oligo', 'PRIMER_INTERNAL_OLIGO_OPT_SIZE'=>'(int, default 20)', 'PRIMER_INTERNAL_OLIGO_MIN_SIZE'=>'(int, default 18)', 'PRIMER_INTERNAL_OLIGO_MAX_SIZE'=>'(int, default 27)', 'PRIMER_INTERNAL_OLIGO_OPT_TM'=>'(float, default 60.0 degrees C)', 'PRIMER_INTERNAL_OLIGO_OPT_GC_PERCENT'=>'(float, default 50.0%)', 'PRIMER_INTERNAL_OLIGO_MIN_TM'=>'(float, default 57.0 degrees C)', 'PRIMER_INTERNAL_OLIGO_MAX_TM'=>'(float, default 63.0 degrees C)', 'PRIMER_INTERNAL_OLIGO_MIN_GC'=>'(float, default 20.0%)', 'PRIMER_INTERNAL_OLIGO_MAX_GC'=>'(float, default 80.0%)', 'PRIMER_INTERNAL_OLIGO_SALT_CONC'=>'(float, default 50.0 mM)', 'PRIMER_INTERNAL_OLIGO_DNA_CONC'=>'(float, default 50.0 nM)', 'PRIMER_INTERNAL_OLIGO_SELF_ANY'=>'(decimal 9999.99, default 12.00)', 'PRIMER_INTERNAL_OLIGO_MAX_POLY_X'=>'(int, default 5)', 'PRIMER_INTERNAL_OLIGO_SELF_END'=>'(decimal 9999.99, default 12.00)', 'PRIMER_INTERNAL_OLIGO_MISHYB_LIBRARY'=>'(string, optional)', 'PRIMER_INTERNAL_OLIGO_MAX_MISHYB'=>'(decimal,9999.99, default 12.00)', 'PRIMER_INTERNAL_OLIGO_MIN_QUALITY'=>'(int, default 0)', 'PRIMER_IO_WT_TM_GT'=>'(float, default 1.0)', 'PRIMER_IO_WT_TM_LT'=>'(float, default 1.0)', 'PRIMER_IO_WT_GC_PERCENT_GT'=>'(float, default 1.0)', 'PRIMER_IO_WT_GC_PERCENT_LT'=>'(float, default 1.0)', 'PRIMER_IO_WT_SIZE_LT'=>'(float, default 1.0)', 'PRIMER_IO_WT_SIZE_GT'=>'(float, default 1.0)', 'PRIMER_IO_WT_COMPL_ANY'=>'(float, default 0.0)', 'PRIMER_IO_WT_COMPL_END'=>'(float, default 0.0)', 'PRIMER_IO_WT_NUM_NS'=>'(float, default 0.0)', 'PRIMER_IO_WT_REP_SIM'=>'(float, default 0.0)', 'PRIMER_IO_WT_SEQ_QUAL'=>'(float, default 0.0)', 'PRIMER_IO_WT_END_QUAL'=>'(float, default 0.0)', 'PRIMER_INSIDE_PENALTY' => '(float, default -1.0)', 'PRIMER_INTERNAL_OLIGO_MAX_TEMPLATE_MISHYB' => '(decimal 9999.99, default 12.00)', 'PRIMER_OUTSIDE_PENALTY' => '(float, default 0.0)', 'PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS' => '(boolean, default 1)', 'PRIMER_MAX_TEMPLATE_MISPRIMING' => '(decimal,9999.99, default -1.00)', 'PRIMER_PAIR_MAX_TEMPLATE_MISPRIMING' => '(decimal,9999.99, default -1.00)', 'PRIMER_PAIR_WT_TEMPLATE_MISPRIMING' => '(float, default 0.0)', 'PRIMER_WT_TEMPLATE_MISPRIMING' => '(float, default 0.0)' ); $self->{'input_options'}=\%hash; return \%hash; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Prints.pm000077500000000000000000000152021302566030400223340ustar00rootroot00000000000000# Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Prints =head1 SYNOPSIS Build a Prints factory my @params = ('DB',$dbfile); my $factory = Bio::Tools::Run::Prints->new($params); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION FingerPRINTScan II is a PRINTS fingerprint identification algorithm. Copyright (C) 1998,1999 Phil Scordis =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Prints; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PRINTS_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::IO; use Bio::Root::Root; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Prints; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @PRINTS_PARAMS=qw(DB PROGRAM VERBOSE); foreach my $attr ( @PRINTS_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'FingerPRINTScan'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PRINTSDIR}) if $ENV{PRINTSDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $prints->new(@params) Function: creates a new Prints factory Returns: Bio::Tools::Run::Prints Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED. Use $obj->run($seqFile) instead. Function: Runs Prints and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs Prints Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI, or a Fasta file name =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ){# it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { #The clone object is not a seq object but a file. #Perhaps should check here or before if this file is fasta format...if not die #Here the file does not need to be created or deleted. Its already written and may be used by other runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; my $str =$self->executable." ".$self->DB." ".$self->_input." -fjR >".$outfile; my $status = system($str); $self->throw( "Prints call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (PRINTS, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*PRINTS; } else { $filehandle = $outfile; } my $prints_parser = Bio::Tools::Prints->new(-fh=>$filehandle); my @prints_feat; while(my $prints_feat = $prints_parser->next_result){ push @prints_feat, $prints_feat; } $self->cleanup(); unlink $outfile; return @prints_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); $in->close(); undef $in; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Profile.pm000077500000000000000000000154101302566030400224560ustar00rootroot00000000000000# BioPerl module for Profile # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Profile =head1 SYNOPSIS Build a Profile factory # $paramfile is the full path to the seg binary file my @params = ('DB',$dbfile,'PROGRAM',$paramfile); my $factory = Bio::Tools::Run::Profile->new($param); # Pass the factory a Bio::PrimarySeqI object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION Wrapper module for the pfscan program =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Balamurugan Kumarasamy Email: fugui@worf.fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Profile; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROFILE_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Profile; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @PROFILE_PARAMS=qw(DB PROGRAM VERBOSE); foreach my $attr ( @PROFILE_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'pfscan'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{PROFILEDIR}) if $ENV{PROFILEDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $factory= Bio::Tools::Run::Profile->new($param); Function: creates a new Profile factory Returns: Bio::Tools::Run::Profile Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features Usage : DEPRECATED. Use $factory->run($seq) instead. Function: Runs Profile and creates an array of featrues Returns : An array of L objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : my @feats = $factory->run($seq) Function: Runs Profile Returns : An array of L objects Args : A Bio::PrimarySeqI =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ) { if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $display_id = $seq->display_id; my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run($display_id); unlink $infile1; } else { #The clone object is not a seq object but a file. #Perhaps should check here or before if this file is fasta format...if not die #Here the file does not need to be created or deleted. Its already written and may be used by other runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : $factory->_input($seqFile) Function: get/set for input file Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $factory->_run() Function: Makes a system call and runs pfscan Returns : An array of L objects Args : =cut sub _run { my ($self,$display_id)= @_; my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; my $str =$self->executable.' -fz '.$self->_input." ".$self->DB." > ".$outfile; my $status = system($str); $self->throw( "Profile call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (PROFILE, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*PROFILE; } else { $filehandle = $outfile; } my $profile_parser = Bio::Tools::Profile->new(-fh=>$filehandle); my @profile_feat; while(my $profile_feat = $profile_parser->next_result){ $profile_feat->seq_id($display_id); push @profile_feat, $profile_feat; } $self->cleanup(); unlink $outfile; return @profile_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : $factory->_writeSeqFile($seq) Function: Creates a file from the given seq object Returns : A string(filename) Args : Bio::PrimarySeqI =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); undef $in; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Promoterwise.pm000066400000000000000000000245721302566030400235630ustar00rootroot00000000000000# # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Promoterwise - Wrapper for aligning two sequences using promoterwise =head1 SYNOPSIS # Build a Promoterwise alignment factory my @params = ('-s'=>1,'-query_start'=>10,'-dymem'=>1); my $factory = Bio::Tools::Run::Promoterwise->new(@params); my (@fp)= $factory->run($seq1,$seq2); # each feature pair is a group of hsps foreach my $fp(@fp){ print "Hit Length: ".$fp->feature1->length."\n"; print "Hit Start: ".$fp->feature1->start."\n"; print "Hit End: ".$fp->feature1->end."\n"; print "Hsps: \n"; my @first_hsp = $fp->feature1->sub_SeqFeature; my @second_hsp = $fp->feature2->sub_SeqFeature; for ($i..$#first_hsp){ print $first_hsp[$i]->seq." ".$second_hsp[$i]->seq."\n"; } } print "end: ". $fp->feature2->start."\t".$fp->feature2->end."\n"; #Available parameters include: #( S T U V QUERY_START QUERY_END TARGET_START #TARGET_END LHWINDOW LHSEED LHALN LHSCORE LHREJECT #LHREJECT LHMAX DYMEM KBYTE DYCACHE) #For an explanation of these parameters, please see documentation #from the Wise package. =head1 DESCRIPTION Promoterwise is an alignment algorithm that relaxes the constraint that local alignments have to be co-linear. Otherwise it provides a similar model to DBA, which is designed for promoter sequence alignments by Ewan Birney. It is part of the wise2 package available at: http://www.sanger.ac.uk/software/wise2. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email: shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Promoterwise; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @PROMOTERWISE_SWITCHES @PROMOTERWISE_PARAMS @OTHER_SWITCHES %OK_FIELD); use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Promoterwise; use strict; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); # Two ways to run the program ..... # 1. define an environmental variable WISEDIR # export WISEDIR =/usr/local/share/wise2.2.0 # where the wise2.2.20 package is installed # # 2. include a definition of an environmental variable WISEDIR in # every script that will use DBA.pm # $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; BEGIN { @PROMOTERWISE_PARAMS = qw( S T U V QUERY_START QUERY_END TARGET_START TARGET_END LHWINDOW LHSEED LHALN LHSCORE LHREJECT LHREJECT LHMAX DYMEM KBYTE DYCACHE); @OTHER_SWITCHES = qw(SILENT QUIET ERROROFFSTD); # Authorize attribute fields foreach my $attr ( @PROMOTERWISE_PARAMS, @PROMOTERWISE_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'promoterwise'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin/") if $ENV{WISEDIR}; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $prog = $self->executable; my $string = `$prog -version`; $string =~ /(Version *)/i; return $1 || undef; } =head2 run Title : run Usage : 2 sequence objects @fp = $factory->run($seq1, $seq2); Function: run Returns : An array of Args : Name of a file containing a set of 2 fasta sequences or else 2 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 2 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub run{ my ($self, $seq1, $seq2)=@_; my ($attr, $value, $switch); $self->io->_io_cleanup(); # Create input file pointer my ($infile1,$infile2)= $self->_setinput($seq1, $seq2); if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) ");} # run promoterwise my @fp = $self->_run($infile1,$infile2); return @fp; } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to a promoterwise program Example : Returns : L Args : Name of a files containing 2 sequences in the order of peptide and genomic =cut sub _run { my ($self,$infile1,$infile2) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); unless ( $self->executable ) { $self->throw("Cannot run Promoterwise unless the executable is found.". " Check your environment variables or make sure ". "promoterwise is in your path."); } my $paramstring = $self->_setparams; my $commandstring = $self->executable." $infile1 $infile2 $paramstring"; # this is to capture STDERR messages which leak out when you run programs # with open(FH, "... |"); if( ( $self->silent && $self->quiet) && ($^O !~ /os2|dos|amigaos/) ) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " -quiet -silent -erroroffstd 2> $null"; } $self->debug( "promoterwise command = $commandstring"); open(PW, "$commandstring |") || $self->throw( "Promoterwise call ($commandstring) crashed: $? \n"); my $pw_parser = Bio::Tools::Promoterwise->new(-fh=>\*PW, -query1_seq=>$self->_query1_seq, -query2_seq=>$self->_query2_seq); my @fp; while (my $fp = $pw_parser->next_result){ push @fp,$fp; } return @fp; } sub _setinput { my ($self, $seq1, $seq2) = @_; my ($tfh1,$tfh2,$outfile1,$outfile2); $self->throw("calling with not enough arguments") unless $seq1 && $seq2; # Not going to set _query_pep/_subject_dna_seq # if you pass in a filename unless( ref($seq1) ) { unless( -e $seq1 ) { $self->throw("Sequence1 is not a Bio::PrimarySeqI object nor file\n"); } $outfile1 = $seq1; } else { ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new('-fh' => $tfh1, '-format' => 'fasta'); $out1->write_seq($seq1); $self->_query1_seq($seq1); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh1); undef $tfh1; } unless( ref($seq2) ) { unless( -e $seq2 ) { $self->throw("Sequence2 is not a Bio::PrimarySeqI object nor file\n"); } $outfile2 = $seq2; } else { ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); my $out2 = Bio::SeqIO->new('-fh' => $tfh2, '-format' => 'fasta'); $out2->write_seq($seq2); $self->_query2_seq($seq2); # Make sure you close things - this is what creates # Out of filehandle errors close($tfh2); undef $tfh2; } return ($outfile1,$outfile2); } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self) = @_; my $param_string; foreach my $attr(@PROMOTERWISE_PARAMS){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .= $attr_key.' '.$value; } foreach my $attr(@PROMOTERWISE_SWITCHES){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key; } $param_string = $param_string." -hitoutput tab"; #specify the output option return $param_string; } =head2 _query_pep_seq Title : _query_pep_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query1_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query1_seq'} = $seq; } return $self->{'_query1_seq'}; } =head2 _subject_dna_seq Title : _subject_dna_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _query2_seq{ my ($self,$seq) = @_; if(defined $seq){ $self->{'_query2_seq'} = $seq; } return $self->{'_query2_seq'}; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Pseudowise.pm000066400000000000000000000262521302566030400232100ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::Pseudowise # # Please direct questions and support issues to # # Cared for by # # Copyright Kiran # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Pseudowise - Object for prediting pseudogenes in a given sequence given a protein and a cdna sequence =head1 SYNOPSIS # Build a pseudowise alignment factory my $factory = Bio::Tools::Run::Pseudowise->new(); # Pass the factory 3 Bio:SeqI objects (in the order of query # peptide and cdna and target_genomic) # @genes is an array of GenericSeqFeature objects my @genes = $factory->run($seq1, $seq2, $seq3); =head1 DESCRIPTION Pseudowise is a pseudogene predition program developed by Ewan Birney http://www.sanger.ac.uk/software/wise2. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Kiran Email kiran@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Pseudowise; use vars qw($AUTOLOAD @ISA $PROGRAM_NAME $PROGRAM_DIR @PSEUDOWISE_SWITCHES @PSEUDOWISE_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Pseudowise; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable pseudowise to find the pseudowise program. This # can be done in (at least) two ways: # # 1. define an environmental variable WISEDIR # export WISEDIR =/usr/local/share/wise2.2.0 # where the wise2.2.20 package is installed # # 2. include a definition of an environmental variable WISEDIR in # every script that will use DBA.pm # $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; BEGIN { $PROGRAM_NAME = 'pseudowise'; $PROGRAM_DIR = Bio::Root::IO->catfile($ENV{WISEDIR},"src","bin") if $ENV{WISEDIR}; @PSEUDOWISE_PARAMS = qw(SPLICE_MAX_COLLAR SPLICE_MIN_COLLAR SPLICE_SCORE_OFFSET GENESTATS NOMATCHN PARAMS KBYTE DYMEM DYDEBUG PALDEBUG ERRORLOG); @PSEUDOWISE_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD); # Authorize attribute fields foreach my $attr ( @PSEUDOWISE_PARAMS, @PSEUDOWISE_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAM_NAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return $PROGRAM_DIR; } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/'PROGRAM'/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; my $string = `pseudowise -- ` ; $string =~ /\(([\d.]+)\)/; return $1 || undef; } =head2 predict_genes Title : predict_genes Usage : DEPRECATED. Use $factory->run instead Function: Predict pseudogenes Returns : An array of Bio::Seqfeature::Generic objects Args : Name of a file containing a set of 3 fasta sequences in the order of peptide, cdna and genomic sequences or else 3 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 3 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub predict_genes { return shift->run(@_); } =head2 run Title : run Usage : my @feats = $factory->run($seq1, $seq2, $seq3); Function: Executes pseudogene binary Returns : An array of Bio::Seqfeature::Generic objects Args : Name of a file containing a set of 3 fasta sequences in the order of peptide, cdna and genomic sequences or else 3 Bio::Seq objects. Throws an exception if argument is not either a string (eg a filename) or 3 Bio::Seq objects. If arguments are strings, throws exception if file corresponding to string name can not be found. =cut sub run { my ($self,@args)=@_; my ($attr, $value, $switch); # Create input file pointer my @files = $self->_setinput(@args); if( @files !=3 || grep { !defined } @files ) { $self->throw("Bad input data (sequences need an id ) "); } my $prot_name = $args[0]->display_id; return $self->_run($prot_name, @files); } =head2 _run Title : _run Usage : Internal function, not to be called directly Function: makes actual system call to a pseudowise program Example : Returns : nothing; pseudowise output is written to a temporary file $TMPOUTFILE Args : Name of a files containing 3 sequences in the order of peptide, cdna and genomic =cut sub _run { my ($self,$prot_name, @files) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); my $paramstring = $self->_setparams; my $commandstring = sprintf("%s %s %s > %s", $self->executable, $paramstring, join(" ", @files), $outfile); if($self->silent || $self->quiet || ($self->verbose < 1)){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $commandstring .= " 2> $null"; } $self->debug( "pseudowise command = $commandstring\n"); # my $status = system($commandstring); `$commandstring`; # $self->throw( "Pseudowise call ($commandstring) crashed: $? \n") # unless $status == 0; #parse the outpur and return a Bio::Seqfeature array my $genes = $self->_parse_results($prot_name,$outfile); close($tfh1); undef $tfh1; if( $self->verbose > 0 ) { open($tfh1,$outfile) || die $!; while(<$tfh1>) { $self->debug ($_); } } return @{$genes}; } =head2 _parse_results Title : __parse_results Usage : Internal function, not to be called directly Function: Parses pseudowise output Example : Returns : an reference to an array of Seqfeatures Args : the name of the output file =cut sub _parse_results { my ($self,$prot_name,$outfile) = @_; $outfile||$self->throw("No outfile specified"); my $filehandle; if (ref ($outfile) !~ /GLOB/i ) { open ($filehandle, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); } else { $filehandle = $outfile; } my @genes; #The big parsing loop - parses exons and predicted peptides my $parser = Bio::Tools::Pseudowise->new(-verbose => $self->verbose, -fh => $filehandle); while( my $f = $parser->next_feature ) { push @genes, $f; } return \@genes; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: Create input files for pseudowise program Example : Returns : name of file containing dba data input Args : Seq objects in the order of query protein and cdna and target genomic sequence =cut sub _setinput { my ($self, $seq1, $seq2, $seq3) = @_; my ($tfh1,$tfh2,$tfh3,$outfile1,$outfile2,$outfile3); if(!($seq1->isa("Bio::PrimarySeqI") && $seq2->isa("Bio::PrimarySeqI") && $seq2->isa("Bio::PrimarySeqI"))) { $self->throw("One or more of the sequences are nor Bio::PrimarySeqI objects\n"); } my $tempdir = $self->tempdir(); ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$tempdir); ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$tempdir); ($tfh3,$outfile3) = $self->io->tempfile(-dir=>$tempdir); my $out1 = Bio::SeqIO->new(-fh => $tfh1 ,'-format' => 'Fasta'); my $out2 = Bio::SeqIO->new(-fh => $tfh2, '-format' => 'Fasta'); my $out3 = Bio::SeqIO->new(-fh => $tfh3, '-format' => 'Fasta'); $out1->write_seq($seq1); $out2->write_seq($seq2); $out3->write_seq($seq3); $self->_query_pep_seq($seq1); $self->_query_cdna_seq($seq2); $self->_subject_dna_seq($seq3); close($tfh1); close($tfh2); close($tfh3); undef ($tfh1); undef ($tfh2); undef ($tfh3); return ($outfile1,$outfile2,$outfile3); } sub _setparams { my ($self) = @_; my $param_string; foreach my $attr(@PSEUDOWISE_PARAMS){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key.' '.$value; } foreach my $attr(@PSEUDOWISE_SWITCHES){ my $value = $self->$attr(); next unless (defined $value); my $attr_key = ' -'.(lc $attr); $param_string .=$attr_key; } return $param_string; } =head2 _query_pep_seq() Title : _query_pep_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_pep_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_pep_seq'} = $seq; } return $self->{'_query_pep_seq'}; } =head2 _query_cdna_seq() Title : _query_cdna_seq Usage : Internal function, not to be called directly Function: get/set for the query sequence Example : Returns : Args : =cut sub _query_cdna_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_query_cdna_seq'} = $seq; } return $self->{'_query_cdna_seq'}; } =head2 _subject_dna_seq() Title : _subject_dna_seq Usage : Internal function, not to be called directly Function: get/set for the subject sequence Example : Returns : Args : =cut sub _subject_dna_seq { my ($self,$seq) = @_; if(defined $seq){ $self->{'_subject_dna_seq'} = $seq; } return $self->{'_subject_dna_seq'}; } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/RNAMotif.pm000066400000000000000000000335241302566030400225000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::RNAMotif # # Please direct questions and support issues to # # Cared for by Chris Fields # # Copyright Chris Fields # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::RNAMotif - Wrapper for local execution of rnamotif, rm2ct, rmfmt, rmprune =head1 SYNOPSIS #run rnamotif|rmfmt|rm2ct my @params = ( descr => 'pyrR.descr', fmt => 'gb', setvar => 'ctx_maxlen=20', context => 1, sh => 1, ); my $factory = Bio::Tools::Run::RNAMotif->new(-program =>'rnamotif', -prune => 1, @params); # Pass the factory a Bio::Seq object or a file name # Returns a Bio::SearchIO object #my $searchio = $factory->run("B_sub.gb"); my $searchio = $factory->run($seq); while (my $result = $searchio->next_result){ while(my $hit = $result->next_hit){ while (my $hsp = $hit->next_hsp){ print join("\t", ( $r->query_name, $hit->name, $hsp->hit->start, $hsp->hit->end, $hsp->meta, $hsp->score, )), "\n"; } } } # Pass a finished report through rmfmt (-a format only) # Returns Bio::AlignIO object my $aio = Bio::AlignIO->new(-file=>"rna.msf",-format=>'msf'); my $factory = Bio::Tools::Run::RNAMotif->new('program'=>'rmfmt', 'a' => 1); my $alnin = $factory->run('trna.rnamotif'); my $aln = $alnin->next_aln; $aio->write_aln($aln); =head1 DESCRIPTION Wrapper module for Tom Macke and David Cases's RNAMotif suite of programs. This allows running of rnamotif, rmprune, rm2ct, and rmfmt. Binaries are available at http://www.scripps.edu/mb/case/casegr-sh-3.5.html. This wrapper allows for one to save output to an optional named file or tempfile using the '-outfile_name' or '-tempfile' parameters; this is primarily for saving output from the rm2ct program, which currently does not have a parser available. If both a named output file and tempfile flag are set, the output file name is used. The default setting is piping output into a filehandle for parsing (or output to STDERR, for rm2ct which requires '-verbose' set to 1). WARNING: At this time, there is very little checking of parameter settings, so one could have an error if setting the worng parameter for a program. Future versions will likely add some error checking. =head1 NOTES ON PROGRAM PARAMETERS All program parameters are currently supported. Of note, the 'D' parameter, used for setting the value of a variable to a value, is changed to 'set_var' to avoid name collisions with 'd' (used for dumping internal data structures). =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Chris Fields Email: cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS cjfields-at-uiuc-dot-edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::RNAMotif; use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::SearchIO; use Bio::AlignIO; use Bio::Tools::Run::WrapperBase; use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # will move parameters to each program, use this for _set_params my %RNAMOTIF_PROGS =( rnamotif => [qw(c d h N O p s v context sh setvar I xdfname pre post descr xdescr fmt fmap )], rm2ct => [qw(t)], rmfmt => [qw(a l la smax td)], rmprune => [] # no params ); my %RNAMOTIF_SWITCHES = map {$_ => 1} qw(c d h p s v l a la context sh); # order is important here my @RNAMOTIF_PARAMS=qw(program prune c sh N d h p s v context setvar O I xdfname pre post descr xdescr fmt fmap l a la t); =head2 new Title : new Usage : my $wrapper = Bio::Tools::Run::RNAMotif->new(@params) Function: creates a new RNAMotif factory Returns: Bio::Tools::Run::RNAMotif Args : list of parameters -tempfile => set tempfile flag (default 0) -outfile_name => set file to send output to (default none) -prune => set rmprune postprocess flag (default 0) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($out, $tf) = $self->_rearrange([qw(OUTFILE_NAME TEMPFILE)], @args); $self->io->_initialize_io(); if ($tf && !$out) { my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); } else { $out ||= ''; $self->outfile_name($out); } $tf && $self->tempfile($tf); $self->_set_from_args(\@args, -methods => [@RNAMOTIF_PARAMS], -create => 1 ); return $self; } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { my ($self) = shift; return $self->program(@_); } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{RNAMOTIFDIR}) if $ENV{RNAMOTIFDIR}; } =head2 version Title : version Usage : $v = $prog->version(); Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return undef unless $self->executable; return $self->{'_progversion'} if $self->{'_progversion'}; my $string = `rnamotif -v 2>&1`; my $v; if ($string =~ m{([\d.]+)}) { $v = $1; } return $self->{'_progversion'} = $v || $string; } =head2 run Title : run Usage : $obj->run($seqFile) Function: Runs RNAMotif programs, returns Bio::SearchIO/Bio::AlignIO Returns : Depends on program: 'rnamotif' - returns Bio::SearchIO 'rmfmt -a' - returns Bio::AlignIO all others - sends output to outfile, tempfile, STDERR Use search() (for Bio::SearchIO stream) or get_AlignIO() (for Bio::AlignIO stream) for a uniform Bioperl object interface. Args : A Bio::PrimarySeqI or file name Note : This runs any RNAMotif program set via program() =cut sub run { my ($self,@seq) = @_; $self->throw ("Must pass a file name or a list of Bio::PrimarySeqI objects") if (!@seq); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head2 search Title : search Usage : $searchio = $obj->search($seqFile) Function: Runs 'rnamotif' on seqs, returns Bio::SearchIO Returns : A Bio::SearchIO Args : A Bio::PrimarySeqI or file name Note : Runs 'rnamotif' only, regardless of program setting; all other parameters loaded =cut sub search { my ($self,@seq) = @_; $self->throw ("Must pass a file name or a list of Bio::PrimarySeqI objects") if (!@seq); if (ref $seq[0] && $seq[0]->isa("Bio::PrimarySeqI") ){# it is an object my $infile1 = $self->_writeSeqFile(@seq); return $self->_run($infile1); } else { return $self->_run(@seq); } } =head2 get_AlignIO Title : get_AlignIO Usage : $aln = $obj->get_AlignIO($seqFile) Function: Runs 'rmfmt -a' on file, returns Bio::AlignIO Returns : A Bio::AlignIO Args : File name Note : Runs 'rmfmt -a' only, regardless of program setting; only file name and outfile (if any) are set =cut sub get_AlignIO { my ($self,@seq) = @_; $self->throw ("Must pass a file name") if (!@seq && ref($seq[0])); return $self->_run(@seq); } =head2 tempfile Title : tempfile Usage : $obj->tempfile(1) Function: Set tempfile flag. When set, writes output to a tempfile; this is overridden by outfile_name() if set Returns : Boolean setting (or undef if not set) Args : [OPTIONAL] Boolean =cut sub tempfile { my $self = shift; return $self->{'_tempfile'} = shift if @_; return $self->{'_tempfile'}; } =head2 prune Title : prune Usage : $obj->prune(1) Function: Set rmprune flag. When set, follows any searches with a call to rmprune (this deletes some redundant sequence hits) Returns : Boolean setting (or undef if not set) Args : [OPTIONAL] Boolean =cut sub prune { my $self = shift; return $self->{'_prune'} = shift if @_; return $self->{'_prune'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : Args : =cut sub _run { my ($self,$file,$prog)= @_; return unless $self->executable; $self->io->_io_cleanup(); my ($str, $progname, $outfile) = ($prog || $self->executable, $self->program_name, $self->outfile_name); my $param_str = $self->_setparams($file); my $descr = ($self->can('descr')) ? $self->descr : ($self->can('xdescr')) ? $self->xdescr : $self->throw("Must have a descriptor present!"); $str .= " $param_str"; $self->debug("RNAMotif command: $str\n"); # rnamotif => SearchIO object # rmfmt -a => AlignIO object # all others sent to outfile, tempfile, or STDERR (upon verbose = 1) my $obj = ($progname eq 'rnamotif' || $progname eq 'rmprune' ) ? Bio::SearchIO->new(-verbose => $self->verbose, -format => "rnamotif", -version => $self->version, -database => $file, -model => $descr) : ($progname eq 'rmfmt' && $self->can('a') && $self->a) ? Bio::AlignIO->new(-verbose => $self->verbose, -format =>'fasta') : undef; my @args; # file-based if ($outfile) { local $SIG{CHLD} = 'DEFAULT'; my $status = system($str); if($status || !-e $outfile ) { my $error = ($!) ? "$! Status: $status" : "Status: $status"; $self->throw( "RNAMotif call crashed: $error \n[command $str]\n"); return undef; } if ($obj && ref($obj)) { $obj->file($outfile); @args = (-file => $outfile); } # fh-based } else { open(my $fh,"$str |") || $self->throw("RNAMotif call ($str) crashed: $?\n"); if ($obj && ref($obj)) { $obj->fh($fh); @args = (-fh => $fh); } else { # dump to debugging my $io; while(<$fh>) {$io .= $_;} close($fh); $self->debug($io); return 1; } } # initialize SearchIO/AlignIO...um...IO # (since file/fh set post obj construction) $obj->_initialize_io(@args) if $obj && ref($obj); return $obj || 1; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: creates a string of params to be used in the command string Example : Returns : string of params Args : =cut sub _setparams { my ($self, $file) = @_; my $progname = $self->program_name; # small sanity check $self->throw("Unknown program: $progname") if (!exists $RNAMOTIF_PROGS{$progname} ); my $param_string; my $outfile = ($self->outfile_name) ? ' > '.$self->outfile_name : ''; my @params; foreach my $attr (@RNAMOTIF_PARAMS) { next if ($attr =~/PROGRAM|DB|PRUNE/i); my $value = $self->$attr(); next unless (defined $value); my $attr_key = '-'.$attr; if (exists $RNAMOTIF_SWITCHES{$attr}) { push @params, $attr_key; } else { if ($attr eq 'setvar') { push @params, '-D'.$value; } else { push @params, $attr_key.' '.$value; } } } $param_string = join ' ', @params; $param_string .= ' '.$file; if ($self->prune && $self->program_name eq 'rnamotif') { $param_string .= ' | rmprune'; } $param_string .= $outfile; return $param_string; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : writes passed Seq objects to tempfile, to be used as input for program Args : =cut sub _writeSeqFile { my ($self,@seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); foreach my $s(@seq){ $in->write_seq($s); } $in->close(); $in = undef; close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/RepeatMasker.pm000066400000000000000000000267161302566030400234510ustar00rootroot00000000000000# BioPerl module for RepeatMasker # # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::RepeatMasker - Wrapper for RepeatMasker Program =head1 SYNOPSIS use Bio::Tools::Run::RepeatMasker; my @params=("mam" => 1,"noint"=>1); my $factory = Bio::Tools::Run::RepeatMasker->new(@params); $in = Bio::SeqIO->new(-file => "contig1.fa", -format => 'fasta'); my $seq = $in->next_seq(); #return an array of Bio::SeqFeature::FeaturePair objects my @feats = $factory->run($seq); # or $factory->run($seq); my @feats = $factory->repeat_features; #return the masked sequence, a Bio::SeqI object my $masked_seq = $factory->run; =head1 DESCRIPTION To use this module, the RepeatMasker program (and probably database) must be installed. RepeatMasker is a program that screens DNA sequences for interspersed repeats known to exist in mammalian genomes as well as for low complexity DNA sequences. For more information, on the program and its usage, please refer to http://www.repeatmasker.org/. Having installed RepeatMasker, you must let Bioperl know where it is. This can be done in (at least) three ways: 1. Make sure the RepeatMasker executable is in your path. 2. Define an environmental variable REPEATMASKERDIR which is a directory which contains the RepeatMasker executable: In bash: export REPEATMASKERDIR=/home/username/RepeatMasker/ In csh/tcsh: setenv REPEATMASKERDIR /home/username/RepeatMasker/ 3. Include a definition of an environmental variable REPEATMASKERDIR in every script that will use this RepeatMasker wrapper module, e.g.: BEGIN { $ENV{REPEATMASKERDIR} = '/home/username/RepeatMasker/' } use Bio::Tools::Run::RepeatMasker; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut package Bio::Tools::Run::RepeatMasker; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @RM_SWITCHES @RM_PARAMS @OTHER_SWITCHES %OK_FIELD); use strict; use Bio::SeqFeature::Generic; use Bio::SeqFeature::FeaturePair; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::RepeatMasker; # Let the code begin... @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase ); BEGIN { @RM_PARAMS = qw(DIR DIV LIB CUTOFF PARALLEL GC FRAG SPECIES MAXSIZE ); @RM_SWITCHES = qw(NOLOW LOW L NOINT INT NORNA ALU M MUS ROD RODENT MAM MAMMAL COW AR ARABIDOPSIS DR DROSOPHILA EL ELEGANS IS_ONLY IS_CLIP NO_IS RODSPEC E EXCLN NO_ID FIXED XM U GFF ACE POLY X XSMALL SMALL INV A ALIGNMENTS PRIMSPEC W WUBLAST S Q QQ GCCALC NOCUT); @OTHER_SWITCHES = qw(NOISY QUIET SILENT); # Authorize attribute fields foreach my $attr ( @RM_PARAMS, @RM_SWITCHES, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'RepeatMasker'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{REPEATMASKERDIR}) if $ENV{REPEATMASKERDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new($seq) Function: creates a new wrapper Returns: Bio::Tools::Run::RepeatMasker Args : self =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); # Need to check that filehandle is not left open here... while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } # unless ($self->executable()) { # if( $self->verbose >= 0 ) { # warn "RepeatMasker program not found as ".($self->executable||''). # " or not executable. \n"; # } # } return $self; } =head2 version Title : version Usage : Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; return $self->{'_version'} if( defined $self->{'_version'} ); my $exe = $self->executable; return undef unless $exe; my $string = `$exe -- ` ; if( $string =~ /\(([\d.]+)\)/ || $string =~ /RepeatMasker\s+version\s+(\S+)/ ) { return $self->{'_version'} = $1; } else { return $self->{'_version'} = undef; } } =head2 run Title : run Usage : $rm->run($seq); Function: Run Repeatmasker on the sequence set as the argument Returns : an array of repeat features that are Bio::SeqFeature::FeaturePairs Args : Bio::PrimarySeqI compliant object =cut sub run { my ($self,$seq) = @_; my ($infile); $infile = $self->_setinput($seq); my $param_string = $self->_setparams(); my @repeat_feats = $self->_run($infile,$param_string); return @repeat_feats; } =head2 mask Title : mask Usage : $rm->mask($seq) Function: This method is deprecated. Call run() instead Example : Returns : an array of repeat features that are Bio::SeqFeature::FeaturePairs Args : Bio::PrimarySeqI compliant object =cut sub mask{ return shift->run(@_); } =head2 _run Title : _run Usage : $rm->_run ($filename,$param_string) Function: internal function that runs the repeat masker Example : Returns : an array of repeat features Args : the filename to the input sequence and the parameter string =cut sub _run { my ($self,$infile,$param_string) = @_; my $instring; $self->debug( "Program ".$self->executable."\n"); my $outfile = $infile.".out"; my $cmd_str = $self->executable." $param_string ". $infile; $self->debug("repeat masker command = $cmd_str"); if ($self->quiet || $self->verbose <=0){ my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd_str.=" 2> $null 1>$null"; } my $status = system($cmd_str); $self->throw("Repeat Masker Call($cmd_str) crashed: $?\n") unless $status == 0; unless (open (RM, $outfile)) { $self->throw("Cannot open RepeatMasker outfile for parsing"); } my $rpt_parser = Bio::Tools::RepeatMasker->new(-fh=>\*RM); my @rpt_feat; while(my $rpt_feat = $rpt_parser->next_result){ push @rpt_feat, $rpt_feat; } $self->repeat_features(\@rpt_feat); #get masked sequence my $masked = $infile.".masked"; my $seqio = Bio::SeqIO->new(-file=>$masked,-format=>'FASTA'); $self->masked_seq($seqio->next_seq); return @rpt_feat; } =head2 masked_seq Title : masked_seq Usage : $rm->masked_seq($seq) Function: get/set for masked sequence Example : Returns : the masked sequence Args : Bio::Seq object =cut sub masked_seq { my ($self,$seq) = @_; if($seq){ $self->{'_masked_seq'} = $seq; } return $self->{'_masked_seq'}; } =head2 repeat_features Title : repeat_features Usage : $rm->repeat_features(\@rf) Function: get/set for repeat features array Example : Returns : the array of repeat features Args : =cut sub repeat_features { my ($self,$rf) = @_; if($rf) { $self->{'_rf'} = $rf; } return @{$self->{'_rf'}}; } =head2 _setparams() Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for repeatmasker program Example : Returns : parameter string to be passed to repeatmasker Args : name of calling object =cut sub _setparams { my ($attr, $value, $self); $self = shift; my $param_string = ""; for $attr ( @RM_PARAMS ) { $value = $self->$attr(); next unless (defined $value); my $attr_key = lc $attr; #put params in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key.' '.$value; } for $attr ( @RM_SWITCHES) { $value = $self->$attr(); next unless ($value); my $attr_key = lc $attr; #put switches in format expected by dba $attr_key = ' -'.$attr_key; $param_string .= $attr_key ; } return $param_string; } =head2 _setinput() Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : a Bio::PrimarySeqI compliant object =cut sub _setinput { my ($self,$seq) = @_; $seq->isa("Bio::PrimarySeqI") || $self->throw("Need a Bio::PrimarySeq compliant object for RepeatMasker"); # my $in = Bio::SeqIO->new(-file => $infilename , '-format' => 'Fasta'); my ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); $out1->write_seq($seq); close($tfh1); undef $tfh1; return ($outfile1); } =head1 Bio::Tools::Run::Wrapper methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $codeml->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $codeml->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Samtools.pm000077500000000000000000000123421302566030400226600ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Samtools # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Samtools - a run wrapper for the samtools suite *BETA* =head1 SYNOPSIS # convert a sam to a bam $samt = Bio::Tools::Run::Samtools( -command => 'view', -sam_input => 1, -bam_output => 1 ); $samt->run( -bam => "mysam.sam", -out => "mysam.bam" ); # sort it $samt = Bio::Tools::Run::Samtools( -command => 'sort' ); $samt->run( -bam => "mysam.bam", -pfx => "mysam.srt" ); # now create an assembly $assy = Bio::IO::Assembly->new( -file => "mysam.srt.bam", -refdb => "myref.fas" ); =head1 DESCRIPTION This is a wrapper for running samtools, a suite of large-alignment reading and manipulation programs available at L. =head1 RUNNING COMMANDS To run a C command, construct a run factory, specifying the desired command using the C<-command> argument in the factory constructor, along with options specific to that command (see L): $samt = Bio::Tools::Run::Samtools->new( -command => 'view', -sam_input => 1, -bam_output => 1); To execute, use the C method. Input and output files are specified in the arguments of C (see L): $samt->run( -bam => "mysam.sam", -out => "mysam.bam" ); =head1 OPTIONS C is complex, with many subprograms (commands) and command-line options and file specs for each. This module attempts to provide commands and options comprehensively. You can browse the choices like so: $samt = Bio::Tools::Run::Samtools->new( -command => 'pileup' ); # all samtools commands @all_commands = $samt->available_parameters('commands'); @all_commands = $samt->available_commands; # alias # just for pileup @pup_params = $samt->available_parameters('params'); @pup_switches = $samt->available_parameters('switches'); @pup_all_options = $samt->available_parameters(); Reasonably mnemonic names have been assigned to the single-letter command line options. These are the names returned by C, and can be used in the factory constructor like typical BioPerl named parameters. See L for the gory details. =head1 FILES When a command requires filenames, these are provided to the C method, not the constructor (C). To see the set of files required by a command, use C or the alias C: $samt = Bio::Tools::Run::Samtools->new( -command => 'view' ); @filespec = $samt->filespec; This example returns the following array: bam >out This indicates that the bam/sam file (bam) and the output file (out) MUST be specified in the C argument list: $samt->run( -bam => 'mysam.sam', -out => 'mysam.cvt' ); If files are not specified per the filespec, text sent to STDOUT and STDERR is saved and is accessible with C<$bwafac->stdout()> and C<$bwafac->stderr()>. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Samtools; use strict; use warnings; use Bio::Root::Root; use Bio::Tools::Run::Samtools::Config; # currently an AssemblerBase object, but the methods we need from # there should really go in an updated WrapperBase.../maj use base qw(Bio::Tools::Run::WrapperBase Bio::Root::Root); use Bio::Tools::Run::WrapperBase::CommandExts; our $program_name = 'samtools'; our $use_dash = 1; our $join = ' '; =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::Samtools(); Function: Builds a new Bio::Tools::Run::Samtools object Returns : an instance of Bio::Tools::Run::Samtools Args : =cut sub new { my ($class, @args) = @_; $program_dir ||= $ENV{SAMTOOLSDIR}; my $self = $class->SUPER::new(@args); return $self; } sub run { shift->_run(@_) } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Samtools/000077500000000000000000000000001302566030400223155ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Samtools/Config.pm000077500000000000000000000116501302566030400240660ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Samtools::Config # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Samtools::Config - configurator for Bio::Tools::Run::Samtools =head1 SYNOPSIS Not used directly. =head1 DESCRIPTION Exports global configuration variables (as required by L) to Samtools.pm. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::Samtools::Config; use strict; use warnings; no warnings qw(qw); use Exporter; our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( $program_dir @program_commands %command_prefixes @program_params @program_switches %param_translation %command_files ); @EXPORT_OK = qw(); our $program_dir; our @program_commands = qw( view sort index merge faidx pileup fixmate rmdup fillmd ); # composite commands: pseudo-commands that run a # sequence of commands # composite command prefix => list of prefixes of commands this # composite command runs # # prefixes only for commands that take params/switches... our %command_prefixes = ( 'view' => 'view', 'sort' => 'srt', 'index' => 'idx', 'merge' => 'mrg', 'faidx' => 'fai', 'pileup' => 'pup', 'fillmd' => 'fmd' ); our @program_params = qw( command view|tab_delim view|out_file view|pass_flags view|filt_flags view|refseq view|qual_threshold view|library view|read_group srt|mem_hint mrg|headers_in pup|refseq pup|map_qcap pup|ref_list pup|site_list pup|theta pup|n_haplos pup|exp_hap_diff pup|indel_prob ); our @program_switches = qw( view|bam_output view|uncompressed view|add_header view|only_header view|sam_input srt|sort_by_names mrg|sort_by_names pup|qual_last_col pup|sam_input pup|indels_only pup|call_cons pup|genot_L fmd|match_with_eq ); our %param_translation = ( 'view|tab_delim' => 't', 'view|out_file' => 'o', 'view|pass_flags' => 'f', 'view|refseq' => 'T', 'view|filt_flags' => 'F', 'view|qual_threshold' => 'q', 'view|library' => 'l', 'view|read_group' => 'r', 'view|bam_output' => 'b', 'view|uncompressed' => 'u', 'view|add_header' => 'h', 'view|only_header' => 'H', 'view|sam_input' => 'S', 'srt|mem_hint' => 'm', 'srt|sort_by_names' => 'n', 'mrg|headers_in' => 'h', 'mrg|sort_by_names' => 'n', 'pup|refseq' => 'f', 'pup|map_qcap' => 'M', 'pup|ref_list' => 't', 'pup|site_list' => 'l', 'pup|theta' => 'T', 'pup|n_haplos' => 'N', 'pup|exp_hap_diff' => 'f', 'pup|indel_prob' => 'I', 'pup|qual_last_col' => 's', 'pup|sam_input' => 'S', 'pup|indels_only' => 'i', 'pup|call_cons' => 'c', 'pup|genot_L' => 'g', 'fmd|match_with_eq' => 'e' ); # # the order in the arrayrefs is the order required # on the command line # # the strings in the arrayrefs (less special chars) # become the keys for named parameters to run_maq # # special chars: # # '#' implies optional # '*' implies variable number of this type # <|> implies stdin/stdout redirect # our %command_files = ( 'view' => [qw( bam #*rgn >out )], 'sort' => [qw( bam >pfx )], 'index' => [qw( bam )], 'merge' => [qw( obm *ibm )], 'faidx' => [qw( fas #*rgn )], 'pileup' => [qw( bam >out )], 'fixmate' => [qw( ibm obm )], 'rmdup' => [qw( ibm obm )], 'fillmd' => [qw( bam fas )] ); 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Seg.pm000077500000000000000000000145411302566030400216000ustar00rootroot00000000000000# Copyright Balamurugan Kumarasamy # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Seg - Object for identifying low complexity regions in a given protein seequence. =head1 SYNOPSIS # Build a Seg factory # $paramfile is the full path to the seg binary file my @params = ('PROGRAM',$paramfile); my $factory = Bio::Tools::Run::Seg->new($param); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION Seg is a program which identifies low complexity regions in proteins. It was developed by Wootton and Federhen at NCBI. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Seg; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SEG_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Seg; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @SEG_PARAMS=qw(PROGRAM VERBOSE); foreach my $attr ( @SEG_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'seg'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns : string, or undef if $SEGDIR not in ENV Args : None =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SEGDIR}) if $ENV{SEGDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new(@params) Function: creates a new Seg factory Returns: Bio::Tools::Run::Seg Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED Use $obj->run($seq) instead Function: Runs Seg and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run Usage : $obj->run($seq) Function: Runs Seg and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run{ my ($self,$seq) = @_; my @feats; if (ref($seq) ) { # it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { #The seq object is not a seq object but a file. #Here the file does not need to be created. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal (not to be used directly) Returns : Args : =cut sub _input { my ($self,$infile1) = @_; if(defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal (not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : None =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str =$self->executable." ".$self->_input." -l > ".$outfile; my $status = system($str); $self->throw( "Seg call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (SEG, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*SEG; } else { $filehandle = $outfile; } my $seg_parser = Bio::Tools::Seg->new(-fh=>$filehandle); my @seg_feat; while(my $seg_feat = $seg_parser->next_result){ push @seg_feat, $seg_feat; } # free resources $self->cleanup(); unlink $outfile; close($tfh1); undef $tfh1; return @seg_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal (not to be used directly) Returns : string - Fasta filename to which $seq was written Args : Bio::Seq object =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); $in->close(); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Signalp.pm000077500000000000000000000160261302566030400224570ustar00rootroot00000000000000# Wrapper module for SignalP Bio::Tools::Run::Signalp # # Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp # originally written by Marc Sohrmann (ms2@sanger.ac.uk) # Written in BioPipe by Balamurugan Kumarasamy # Please direct questions and support issues to # # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) =head1 NAME Bio::Tools::Run::Signalp =head1 SYNOPSIS Build a Signalp factory my $factory = Bio::Tools::Run::Signalp->new(); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION wrapper module for Signalp program =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp originally written by Marc Sohrmann (ms2@sanger.ac.uk) Written in BioPipe by Balamurugan Kumarasamy Contributions by David Vilanova (david.vilanova@urbanet.ch) Shawn Hoon (shawnh@fugu-sg.org) # Please direct questions and support issues to # Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Signalp; use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME @SIGNALP_PARAMS %OK_FIELD); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Signalp; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { @SIGNALP_PARAMS=qw(PROGRAM VERBOSE); foreach my $attr ( @SIGNALP_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'signalp'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SIGNALPDIR}) if $ENV{SIGNALPDIR}; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; return $self->$attr if $self->$attr; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $factory= Bio::Tools::Run::Signalp->new(); Function: creates a new Signalp factory Returns: Bio::Tools::Run::Signalp Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/PROGRAM/i) { $self->executable($value); next; } $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED. Use $factory->run($seq) instead Function: Runs Signalp and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 run Title : run() Usage : my $feats = $factory->run($seq) Function: Runs Signalp Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$seq) = @_; my @feats; if (ref($seq) ) { if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { my $in = Bio::SeqIO->new(-file => $seq, '-format' =>'fasta'); my $infile1; while ( my $tmpseq = $in->next_seq() ) { $infile1 = $self->_writeSeqFile($tmpseq); } $self->_input($infile1); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : $factory->_input($seqFile) Function: get/set for input file Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; $self->{'input'} = $infile1 if(defined $infile1); return $self->{'input'}; } =head2 _run Title : _run Usage : $factory->_run() Function: Makes a system call and runs signalp Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str =$self->executable." -t euk -trunc 50 ".$self->{'input'}." > ".$outfile; my $status = system($str); $self->throw( "Signalp call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (SIGNALP, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*SIGNALP; } else { $filehandle = $outfile; } my $signalp_parser = Bio::Tools::Signalp->new(-fh=>$filehandle); my @signalp_feat; while(my $signalp_feat = $signalp_parser->next_result){ push @signalp_feat, $signalp_feat; } $self->cleanup(); close($tfh1); undef $tfh1; unlink $outfile; return @signalp_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : $factory->_writeSeqFile($seq) Function: Creates a file from the given seq object Returns : A string(filename) Args : Bio::PrimarySeqI =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'fasta'); $in->write_seq($seq); $in->close(); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Simprot.pm000066400000000000000000000351651302566030400225210ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::Simprot # # Please direct questions and support issues to # # Cared for by Albert Vilella # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Simprot - Wrapper around the Simprot program. Wrapper for the calculation of a multiple sequence alignment from a phylogenetic tree =head1 SYNOPSIS use Bio::Tools::Run::Simprot; use Bio::TreeIO; my $treeio = Bio::TreeIO->new( -format => 'nh', -file => 't/data/tree.nh'); my $tree = $treeio->next_tree; my $simprot = Bio::Tools::Run::Simprot->new(); $simprot->tree($tree); my ($rc,$aln,$seq) = $simprot->run(); =head1 DESCRIPTION This is a wrapper around the Simprot program by Andy Pang, Andrew D Smith, Paulo AS Nuin and Elisabeth RM Tillier. Simprot allows for several models of amino acid substitution (PAM, JTT and PMB), allows for gamma distributed sites rates according to Yang's model, and implements a parameterised Qian and Goldstein distribution model for insertion and deletion. See http://www.uhnres.utoronto.ca/labs/tillier/software.htm for more information. =head2 Helping the module find your executable You will need to enable SIMPROTDIR to find the simprot program. This can be done in (at least) three ways: 1. Make sure the simprot executable is in your path (i.e. 'which simprot' returns a valid program 2. define an environmental variable SIMPROTDIR which points to a directory containing the 'simprot' app: In bash export SIMPROTDIR=/home/progs/simprot or In csh/tcsh setenv SIMPROTDIR /home/progs/simprot 3. include a definition of an environmental variable SIMPROTDIR in every script that will BEGIN {$ENV{SIMPROTDIR} = '/home/progs/simprot'; } use Bio::Tools::Run::Simprot; =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Albert Vilella Email avilella-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Simprot; use vars qw(@ISA %VALIDVALUES $PROGRAMNAME $PROGRAM); use strict; use Bio::SimpleAlign; use Bio::AlignIO; use Bio::SeqIO; use Bio::TreeIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # valid values for parameters, the default one is always # the first one in the array BEGIN { %VALIDVALUES = ( 'branch' => '1', 'eFactor' => '3', 'indelFrequncy' => '0.03', 'maxIndel' => '2048', 'subModel' => [ 2,0,1], # 0:PAM, 1:JTT, 2:PMB 'rootLength' => '50', 'alpha' => '1', 'Benner' => '0', 'interleaved' => '1', 'variablegamma' => '0', 'bennerk' => '-2', ); } =head2 program_name Title : program_name Usage : $factory->program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return 'simprot'; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable. Returns: string Args : =cut sub program_dir { return Bio::Root::IO->catfile($ENV{SIMPROTDIR}) if $ENV{SIMPROTDIR}; } =head2 new Title : new Usage : my $simprot = Bio::Tools::Run::Simprot->new(); Function: Builds a new Bio::Tools::Run::Simprot Returns : Bio::Tools::Run::Simprot Args : -alignment => the Bio::Align::AlignI object -tree => the Bio::Tree::TreeI object -save_tempfiles => boolean to save the generated tempfiles and NOT cleanup after onesself (default FALSE) -executable => where the simprot executable resides -params => A reference to a hash where keys are parameter names and hash values are the associated parameter values See also: L, L =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($aln, $tree, $st, $params, $exe, $ubl) = $self->_rearrange([qw(TREE SAVE_TEMPFILES PARAMS EXECUTABLE)], @args); defined $tree && $self->tree($tree); defined $st && $self->save_tempfiles($st); defined $exe && $self->executable($exe); $self->set_default_parameters(); if( defined $params ) { if( ref($params) !~ /HASH/i ) { $self->warn("Must provide a valid hash ref for parameter -FLAGS"); } else { map { $self->set_parameter($_, $$params{$_}) } keys %$params; } } return $self; } =head2 set_parameters Title : set_parameters Usage : $codeml->set_parameters($parameter, $value); Function: (Re)set the SimProt parameters Returns : none Args : First argument is the parameter name Second argument is the parameter value =cut sub set_parameter{ my ($self,$param,$value) = @_; unless (defined $self->{'no_param_checks'} && $self->{'no_param_checks'} == 1) { if ( ! defined $VALIDVALUES{$param} ) { $self->warn("unknown parameter $param will not be set unless you force by setting no_param_checks to true"); return 0; } if ( ref( $VALIDVALUES{$param}) =~ /ARRAY/i && scalar @{$VALIDVALUES{$param}} > 0 ) { unless ( grep { $value eq $_ } @{ $VALIDVALUES{$param} } ) { $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value"); return 0; } } } $self->{'_simprotparams'}->{$param} = $value; return 1; } =head2 set_default_parameters Title : set_default_parameters Usage : $codeml->set_default_parameters(0); Function: (Re)set the default parameters from the defaults (the first value in each array in the %VALIDVALUES class variable) Returns : none Args : boolean: keep existing parameter values =cut sub set_default_parameters{ my ($self,$keepold) = @_; $keepold = 0 unless defined $keepold; while( my ($param,$val) = each %VALIDVALUES ) { # skip if we want to keep old values and it is already set next if( defined $self->{'_simprotparams'}->{$param} && $keepold); if(ref($val)=~/ARRAY/i ) { $self->{'_simprotparams'}->{$param} = $val->[0]; } else { $self->{'_simprotparams'}->{$param} = $val; } } } =head2 get_parameters Title : get_parameters Usage : my %params = $self->get_parameters(); Function: returns the list of parameters as a hash Returns : associative array keyed on parameter names Args : none =cut sub get_parameters{ my ($self) = @_; # we're returning a copy of this return %{ $self->{'_simprotparams'} }; } =head2 prepare Title : prepare Usage : my $rundir = $simprot->prepare(); Function: prepare the simprot analysis using the default or updated parameters the alignment parameter and species tree must have been set Returns : value of rundir Args : L object, L object [optional] =cut sub prepare { my ($self,$tree) = @_; unless ( $self->save_tempfiles ) { # brush so we don't get plaque buildup ;) $self->cleanup(); } $tree = $self->tree unless $tree; if( ! $tree ) { $self->warn("must have supplied a valid species tree file in order to run simprot"); return 0; } my ($tempdir) = $self->tempdir(); my ($temptreeFH); if( ! ref($tree) && -e $tree ) { $self->{_temptreefile} = $tree; } else { ($temptreeFH,$self->{_temptreefile}) = $self->io->tempfile ('-dir' => $tempdir, UNLINK => ($self->save_tempfiles ? 0 : 1)); my $treeout = Bio::TreeIO->new('-format' => 'newick', '-fh' => $temptreeFH); $treeout->write_tree($tree); $treeout->close(); close($temptreeFH); } $self->{_prepared} = 1; my %params = $self->get_parameters; while( my ($param,$val) = each %params ) { $self->{_simprot_params} .=" \-\-$param\=$val"; } return $tempdir; } =head2 run Title : run Usage : my $nhx_tree = $simprot->run(); Function: run the simprot analysis using the default or updated parameters the alignment parameter must have been set Returns : L object [optional] Args : L object L object =cut sub run { my ($self,$tree) = @_; $self->prepare($tree) unless (defined($self->{_prepared})); my ($rc,$aln,$seq) = (1); my ($tmpdir) = $self->tempdir(); my $outfile; { my $commandstring; my $exit_status; my $simprot_executable = $self->executable; $commandstring .= $simprot_executable; $commandstring .= $self->{_simprot_params}; $commandstring .= " --tree=". $self->{_temptreefile} . " "; my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $self->outfile_name($outfile); my $seqfile; ($tfh, $seqfile) = $self->io->tempfile(-dir=>$self->tempdir()); close($tfh); undef $tfh; $commandstring .= "--alignment=". $self->outfile_name . " "; $commandstring .= "--sequence=". $seqfile . " "; $self->throw("unable to find or run executable for 'simprot'") unless $simprot_executable && -e $simprot_executable && -x _; open(RUN, "$commandstring |") or $self->throw("Cannot run $commandstring"); my @output = ; $exit_status = close(RUN); $self->error_string(join('',@output)); if( (grep { /^\[ /io } @output) || !$exit_status) { $self->warn("There was an error - see error_string for the program output"); $rc = 0; } eval { $aln = Bio::AlignIO->new(-file => "$outfile",-format => 'fasta'); $seq = Bio::SeqIO->new(-file => "$seqfile", -format => 'fasta'); }; if( $@ ) { $self->warn($self->error_string); } } unless ( $self->save_tempfiles ) { $self->cleanup(); } return ($rc,$aln,$seq); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysus run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string { my ($self,$value) = @_; if( defined $value) { $self->{'error_string'} = $value; } return $self->{'error_string'}; } =head2 version Title : version Usage : exit if $prog->version() < 1.8 Function: Determine the version number of the program Example : Returns : float or undef Args : none =cut sub version { my ($self) = @_; my $exe; return undef unless $exe = $self->executable; my $string = `$exe 2>&1` ; $string =~ /Version\:\s+(\d+.\d+.\d+)/m; return $1 || undef; } =head2 alignment Title : alignment Usage : $simprot->align($aln); Function: Get/Set the L object Returns : L object Args : [optional] L Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub alignment { my ($self,$aln) = @_; if( defined $aln ) { if( -e $aln ) { $self->{'_alignment'} = $aln; } elsif( !ref($aln) || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must specify a valid Bio::Align::AlignI object to the alignment function not $aln"); return undef; } else { $self->{'_alignment'} = $aln; } } return $self->{'_alignment'}; } =head2 tree Title : tree Usage : $simprot->tree($tree, %params); Function: Get/Set the L object Returns : L Args : [optional] $tree => L, [optional] %parameters => hash of tree-specific parameters Comment : We could potentially add support for running directly on a file but we shall keep it simple See also: L =cut sub tree { my ($self, $tree, %params) = @_; if( defined $tree ) { if( ! ref($tree) || ! $tree->isa('Bio::Tree::TreeI') ) { $self->warn("Must specify a valid Bio::Tree::TreeI object to the alignment function"); } $self->{'_tree'} = $tree; } return $self->{'_tree'}; } =head1 Bio::Tools::Run::BaseWrapper methods =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $simprot->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $simprot->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a L object Returns : L Args : none =cut sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; # Needed to keep compiler happy bioperl-run-release-1-7-1/lib/Bio/Tools/Run/StandAloneBlast.pm000066400000000000000000000541051302566030400240750ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlast # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneBlast - Object for the local execution of the NCBI BLAST program suite (blastall, blastpgp, bl2seq). There is experimental support for WU-Blast and NCBI rpsblast. =head1 SYNOPSIS # Local-blast "factory object" creation and blast-parameter # initialization: @params = (-database => 'swissprot', -outfile => 'blast1.out'); $factory = Bio::Tools::Run::StandAloneBlast->new(@params); # Blast a sequence against a database: $str = Bio::SeqIO->new(-file=>'t/amino.fa', -format => 'Fasta'); $input = $str->next_seq(); $input2 = $str->next_seq(); $blast_report = $factory->blastall($input); # Run an iterated Blast (psiblast) of a sequence against a database: $factory->j(3); # 'j' is blast parameter for # of iterations $factory->outfile('psiblast1.out'); $factory = Bio::Tools::Run::StandAloneBlast->new(@params); $blast_report = $factory->blastpgp($input); # Use blast to align 2 sequences against each other: $factory = Bio::Tools::Run::StandAloneBlast->new(-outfile => 'bl2seq.out'); $factory->bl2seq($input, $input2); # Experimental support for WU-Blast 2.0 my $factory = Bio::Tools::Run::StandAloneBlast->new(-program =>"wublastp", -database =>"swissprot", -e => 1e-20); my $blast_report = $factory->wublast($seq); # Experimental support for NCBI rpsblast my $factory = Bio::Tools::Run::StandAloneBlast->new(-db => 'CDD/Cog', -expect => 0.001); $factory->F('T'); # turn on SEG filtering of query sequence my $blast_report = $factory->rpsblast($seq); # Use the experimental fast Blast parser, 'blast_pull' my $factory = Bio::Tools::Run::StandAloneBlast->new(-_READMETHOD =>'blast_pull', @other_params); # Various additional options and input formats are available, # see the DESCRIPTION section for details. =head1 DESCRIPTION This DESCRIPTION only documents Bio::Tools::Run::StandAloneBlast, a Bioperl object for running the NCBI standAlone BLAST package. Blast itself is a large & complex program - for more information regarding BLAST, please see the BLAST documentation which accompanies the BLAST distribution. BLAST is available from ftp://ncbi.nlm.nih.gov/blast/. A source of confusion in documenting a BLAST interface is that the term "program" is used in - at least - three different ways in the BLAST documentation. In this DESCRIPTION, "program" will refer to the BLAST routine set by the BLAST C<-p> parameter that can be set to blastn, blastp, tblastx etc. We will use the term Blast "executable" to refer to the various different executable files that may be called - ie. blastall, blastpgp or bl2seq. In addition, there are several BLAST capabilities, which are also referred to as "programs", and are implemented by using specific combinations of BLAST executables, programs and parameters. They will be referred by their specific names - eg PSIBLAST and PHIBLAST. Before running StandAloneBlast it is necessary: to install BLAST on your system, to edit set the environmental variable $BLASTDIR or your $PATH variable to point to the BLAST directory, and to ensure that users have execute privileges for the BLAST program. If the databases which will be searched by BLAST are located in the data subdirectory of the blast program directory (the default installation location), StandAloneBlast will find them; however, if the database files are located in any other location, environmental variable $BLASTDATADIR will need to be set to point to that directory. The use of the StandAloneBlast module is as follows: Initially, a local blast "factory object" is created. The constructor may be passed an optional array of (non-default) parameters to be used by the factory, eg: @params = (-program => 'blastn', -database => 'ecoli.nt'); $factory = Bio::Tools::Run::StandAloneBlast->new(@params); Any parameters not explicitly set will remain as the defaults of the BLAST executable. Note each BLAST executable has somewhat different parameters and options. See the BLAST Documentation for a description or run the BLAST executable from the command line followed solely with a "-" to see a list of options and default values for that executable; eg Eblastall -. BLAST parameters can be changed and/or examined at any time after the factory has been created. The program checks that any parameter/switch being set/read is valid. Except where specifically noted, StandAloneBlast uses the same single-letter, case-sensitive parameter names as the actual blast program. Currently no checks are included to verify that parameters are of the proper type (e.g. string or numeric) or that their values are within the proper range. As an example, to change the value of the Blast parameter 'e' ('e' is the parameter for expectation-value cutoff) $expectvalue = 0.01; $factory->e($expectvalue); Note that for improved script readibility one can modify the name of the (ncbi) BLAST parameters as desired as long as the initial letter (and case) of the parameter are preserved, e.g.: $factory->expectvalue($expectvalue); Unfortunately, some of the BLAST parameters are not the single letter one might expect (eg "iteration round" in blastpgp is 'j'). Again one can check by using, for example: > blastpgp - Wublast parameters need to be complete (ie. don't truncate them to their first letter), but are case-insensitive. Once the factory has been created and the appropriate parameters set, one can call one of the supported blast executables. The input sequence(s) to these executables may be fasta file(s) as described in the BLAST documentation. $inputfilename = 't/testquery.fa'; $blast_report = $factory->blastall($inputfilename); In addition, sequence input may be in the form of either a Bio::Seq object or (a reference to) an array of Bio::Seq objects, e.g.: $input = Bio::Seq->new(-id => "test query", -seq => "ACTACCCTTTAAATCAGTGGGGG"); $blast_report = $factory->blastall($input); NOTE: Use of the BPlite method has been deprecated and is no longer supported. For blastall and non-psiblast blastpgp runs, report object is a L object, selected by the user with the parameter _READMETHOD. The leading underscore is needed to distinguish this option from options which are passed to the BLAST executable. The default parser is Bio::SearchIO::blast. In any case, the "raw" blast report is also available. The filename is set by the 'outfile' parameter and has the default value of "blastreport.out". For psiblast execution in the BLAST "jumpstart" mode, the program must be passed (in addition to the query sequence itself) an alignment containing the query sequence (in the form of a SimpleAlign object) as well as a "mask" specifying at what residues position-specific scoring matrices (PSSMs) are to used and at what residues default scoring matrices (eg BLOSUM) are to be used. See psiblast documentation for more details. The mask itself is a string of 0's and 1's which is the same length as each sequence in the alignment and has a "1" at locations where (PSSMs) are to be used and a "0" at all other locations. So for example: $str = Bio::AlignIO->new(-file => "cysprot.msf", -format => 'msf'); $aln = $str->next_aln(); $len = $aln->length_aln(); $mask = '1' x $len; # simple case where PSSM's to be used at all residues $report = $factory->blastpgp("cysprot1.fa", $aln, $mask); For bl2seq execution, StandAloneBlast.pm can be combined with AlignIO.pm to directly produce a SimpleAlign object from the alignment of the two sequences produced by bl2seq as in: # Get 2 sequences $str = Bio::SeqIO->new(-file=>'t/amino.fa' , -format => 'Fasta'); my $seq3 = $str->next_seq(); my $seq4 = $str->next_seq(); # Run bl2seq on them $factory = Bio::Tools::Run::StandAloneBlast->new(-program => 'blastp', -outfile => 'bl2seq.out'); my $bl2seq_report = $factory->bl2seq($seq3, $seq4); # Use AlignIO.pm to create a SimpleAlign object from the bl2seq report $str = Bio::AlignIO->new(-file=> 'bl2seq.out',-format => 'bl2seq'); $aln = $str->next_aln(); For more examples of syntax and use of StandAloneBlast.pm, the user is encouraged to run the scripts standaloneblast.pl in the bioperl examples/tools directory and StandAloneBlast.t in the bioperl t/ directory. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Peter Schattner Email schattner at alum.mit.edu =head1 MAINTAINER - Torsten Seemann Email torsten at infotech.monash.edu.au =head1 CONTRIBUTORS Sendu Bala bix@sendu.me.uk (reimplementation) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::StandAloneBlast; use strict; use warnings; use Bio::Root::IO; use Bio::Seq; use Bio::SeqIO; use Bio::SearchIO; use File::Spec; use base qw(Bio::Tools::Run::WrapperBase Bio::Factory::ApplicationFactoryI); our $AUTOLOAD; our $DEFAULTBLASTTYPE = 'NCBI'; our $DEFAULTREADMETHOD = 'BLAST'; # If local BLAST databases are not stored in the standard # /data directory, the variable BLASTDATADIR will need to be # set explicitly our $DATADIR = $ENV{'BLASTDATADIR'} || $ENV{'BLASTDB'}; if (! defined $DATADIR && defined $ENV{'BLASTDIR'}) { my $dir = Bio::Root::IO->catfile($ENV{'BLASTDIR'}, 'data'); if (-d $dir) { $DATADIR = $dir; } elsif ($ENV{'BLASTDIR'} =~ /bin/) { $dir = $ENV{'BLASTDIR'}; $dir =~ s/bin/data/; $DATADIR = $dir if -d $dir; } } =head2 new Title : new Usage : my $obj = Bio::Tools::Run::StandAloneBlast->new(); Function: Builds a newBio::Tools::Run::StandAloneBlast object Returns : Bio::Tools::Run::StandAloneNCBIBlast or StandAloneWUBlast Args : -quiet => boolean # make program execution quiet -_READMETHOD => 'BLAST' (default, synonym 'SearchIO') || 'blast_pull' # the parsing method, case insensitive Essentially all BLAST parameters can be set via StandAloneBlast.pm. Some of the most commonly used parameters are listed below. All parameters have defaults and are optional except for -p in those programs that have it. For a complete listing of settable parameters, run the relevant executable BLAST program with the option "-" as in blastall - Note that the input parameters (-i, -j, -input) should not be set directly by you: this module sets them when you call one of the executable methods. Blastall -p Program Name [String] Input should be one of "blastp", "blastn", "blastx", "tblastn", or "tblastx". -d Database [String] default = nr The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "nr est" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm -S Query strands to search against database (for blast[nx], and tblastx). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 Blastpgp (including Psiblast) -j is the maximum number of rounds (default 1; i.e., regular BLAST) -h is the e-value threshold for including sequences in the score matrix model (default 0.001) -c is the "constant" used in the pseudocount formula specified in the paper (default 10) -B Multiple alignment file for PSI-BLAST "jump start mode" Optional -Q Output File for PSI-BLAST Matrix in ASCII [File Out] Optional rpsblast -d Database [String] default = (none - you must specify a database) The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "Cog Smart" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm Bl2seq -p Program name: blastp, blastn, blastx. For blastx 1st argument should be nucleotide [String] default = blastp -o alignment output file [File Out] default = stdout -e Expectation value (E) [Real] default = 10.0 -S Query strands to search against database (blastn only). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 WU-Blast -p Program Name [String] Input should be one of "wublastp", "wublastn", "wublastx", "wutblastn", or "wutblastx". -d Database [String] default = nr The database specified must first be formatted with xdformat. -E Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm =cut sub new { my ($caller, @args) = @_; my $class = ref($caller) || $caller; # Because of case-sensitivity issues, ncbi and wublast methods are # mutually exclusive. We can't load ncbi methods if we start with wublast # (and vice versa) since wublast e() and E() should be the same thing, # whilst they must be different things in ncbi blast. # # Solution: split StandAloneBlast out into two more modules for NCBI and WU if ($class =~ /NCBI|WU/) { return $class->SUPER::new(@args); } my %args = @args; my $blasttype = $DEFAULTBLASTTYPE; while (my ($attr, $value) = each %args) { if ($attr =~/^-?\s*program\s*$|^-?p$/) { if ($value =~ /^wu*/) { $blasttype = 'WU'; } } } my $module = "Bio::Tools::Run::StandAlone${blasttype}Blast"; Bio::Root::Root->_load_module($module); return $module->new(@args); } =head2 executable Title : executable Usage : my $exe = $blastfactory->executable('blastall'); Function: Finds the full path to the executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exename, $exe, $warn) = @_; $exename = 'blastall' unless (defined $exename || $self =~ /WUBlast/); $self->program_name($exename); if( defined $exe && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } unless( defined $self->{'_pathtoexe'}->{$exename} ) { my $f = $self->program_path($exename); $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); # This is how I meant to split up these conditionals --jason # if exe is null we will execute this (handle the case where # PROGRAMDIR pointed to something invalid) unless( $exe ) { # we didn't find it in that last conditional if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { $self->{'_pathtoexe'}->{$exename} = $exe; } else { $self->warn("Cannot find executable for $exename") if $warn; $self->{'_pathtoexe'}->{$exename} = undef; } } } return $self->{'_pathtoexe'}->{$exename}; } =head2 program_dir Title : program_dir Usage : my $dir = $factory->program_dir(); Function: Abstract get method for dir of program. Returns : string representing program directory Args : none =cut sub program_dir { my $self = shift; $self =~ /NCBIBlast/? $ENV{'BLASTDIR'}: $ENV{'WUBLASTDIR'}; } sub program_name { my $self = shift; if (@_) { $self->{program_name} = shift } return $self->{program_name} || ''; } sub program { my $self = shift; if( wantarray ) { return ($self->executable, $self->p()); } else { return $self->executable(@_); } } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: Create input file(s) for Blast executable Example : Returns : name of file containing Blast data input Args : Seq object reference or input file name =cut sub _setinput { my ($self, $executable, $input1, $input2) = @_; my ($seq, $temp, $infilename1, $infilename2,$fh ) ; # If $input1 is not a reference it better be the name of a file with # the sequence/ alignment data... $self->io->_io_cleanup(); SWITCH: { unless (ref $input1) { $infilename1 = (-e $input1) ? $input1 : 0 ; last SWITCH; } # $input may be an array of BioSeq objects... if (ref($input1) =~ /ARRAY/i ) { ($fh,$infilename1) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); $temp = Bio::SeqIO->new(-fh=> $fh, -format => 'fasta'); foreach $seq (@$input1) { unless ($seq->isa("Bio::PrimarySeqI")) {return 0;} $seq->display_id($seq->display_id); $temp->write_seq($seq); } close $fh; $fh = undef; last SWITCH; } # $input may be a single BioSeq object... elsif ($input1->isa("Bio::PrimarySeqI")) { ($fh,$infilename1) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); # just in case $input1 is taken from an alignment and has spaces (ie # deletions) indicated within it, we have to remove them - otherwise # the BLAST programs will be unhappy my $seq_string = $input1->seq(); $seq_string =~ s/\W+//g; # get rid of spaces in sequence $input1->seq($seq_string); $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'fasta'); $temp->write_seq($input1); close $fh; undef $fh; last SWITCH; } $infilename1 = 0; # Set error flag if you get here } unless ($input2) { return $infilename1; } SWITCH2: { unless (ref $input2) { $infilename2 = (-e $input2) ? $input2 : 0 ; last SWITCH2; } if ($input2->isa("Bio::PrimarySeqI") && $executable eq 'bl2seq' ) { ($fh,$infilename2) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta'); $temp->write_seq($input2); close $fh; undef $fh; last SWITCH2; } # Option for using psiblast's pre-alignment "jumpstart" feature elsif ($input2->isa("Bio::SimpleAlign") && $executable eq 'blastpgp' ) { # a bit of a lie since it won't be a fasta file ($fh,$infilename2) = $self->io->tempfile(TEMPLATE=>'blastquery-XXXXXX', SUFFIX=>'.fasta'); # first we retrieve the "mask" that determines which residues should # by scored according to their position and which should be scored # using the non-position-specific matrices my @mask = split("", shift ); # get mask # then we have to convert all the residues in every sequence to upper # case at the positions that we want psiblast to use position specific # scoring foreach $seq ( $input2->each_seq() ) { my @seqstringlist = split("",$seq->seq()); for (my $i = 0; $i < scalar(@mask); $i++) { unless ( $seqstringlist[$i] =~ /[a-zA-Z]/ ) {next} $seqstringlist[$i] = $mask[$i] ? uc $seqstringlist[$i]: lc $seqstringlist[$i] ; } my $newseqstring = join("", @seqstringlist); $seq->seq($newseqstring); } # Now we need to write out the alignment to a file # in the "psi format" which psiblast is expecting $input2->map_chars('\.','-'); $temp = Bio::AlignIO->new(-fh=> $fh, '-format' => 'psi'); $temp->write_aln($input2); close $fh; undef $fh; last SWITCH2; } $infilename2 = 0; # Set error flag if you get here } return ($infilename1, $infilename2); } =head1 Bio::Tools::Run::WrapperBase methods =cut =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Returns : value of save_tempfiles Args : newvalue (optional) =cut =head2 outfile_name Title : outfile_name Usage : my $outfile = $tcoffee->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut =head2 cleanup Title : cleanup Usage : $tcoffee->cleanup(); Function: Will cleanup the tempdir directory after a PAML run Returns : none Args : none =cut =head2 io Title : io Usage : $obj->io($newval) Function: Gets a Bio::Root::IO object Returns : Bio::Root::IO Args : none =cut 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/StandAloneBlastPlus.pm000077500000000000000000001154721302566030400247510ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlastPlus # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneBlastPlus - Compute with NCBI's blast+ suite *ALPHA* =head1 SYNOPSIS B: This module is related to the L system in name (and inspiration) only. You must use this module directly. # existing blastdb: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb' ); # create blastdb from fasta file and attach $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => 'myseqs.fas', -create => 1 ); # create blastdb from BioPerl sequence collection objects $alnio = Bio::AlignIO->new( -file => 'alignment.msf' ); $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => $alnio, -create => 1 ); @seqs = $alnio->next_aln->each_seq; $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => \@seqs, -create => 1 ); # create database with masks $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'my_masked_db', -db_data => 'myseqs.fas', -masker => 'dustmasker', -mask_data => 'maskseqs.fas', -create => 1 ); # create a mask datafile separately $mask_file = $fac->make_mask( -data => 'maskseqs.fas', -masker => 'dustmasker' ); # query database for metadata $info_hash = $fac->db_info; $num_seq = $fac->db_num_sequences; @mask_metadata = @{ $fac->db_filter_algorithms }; # perform blast methods $result = $fac->tblastn( -query => $seqio ); # see Bio::Tools::Run::StandAloneBlastPlus::BlastMethods # for many more details =head1 DESCRIPTION B This module requires BLAST+ v. 2.2.24+ and higher. Until the API stabilizes for BLAST+, consider this module highly experimental. This module along with L allows the user to perform BLAST functions using the external program suite C (available at L), using BioPerl objects and L facilities. This wrapper can prepare BLAST databases as well as run BLAST searches. It can also be used to run C programs independently. This module encapsulates object construction and production of databases and masks. Blast analysis methods (C, etc>) are contained in L. =head1 USAGE The basic mantra is to (1) create a BlastPlus factory using the C constructor, and (2) perform BLAST analyses by calling the desired BLAST program by name off the factory object. The blast database itself and any masking data are attached to the factory object (step 1). Query sequences and any parameters associated with particular programs are provided to the blast method call (step 2), and are run against the attached database. =head2 Factory construction/initialization The factory needs to be told where the blast+ programs live. The C environment variable will be checked for the default executable directory. The program directory can be set for individual factory instances with the C parameter. All the blast+ programs must be accessible from that directory (i.e., as executable files or symlinks). Either the database or BLAST subject data must be specified at object construction. Databases can be pre-existing formatted BLAST dbs, or can be built directly from fasta sequence files or BioPerl sequence object collections of several kinds. The key constructor parameters are C, C, C. To specify a pre-existing BLAST database, use C alone: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => 'mydb' ); The directory can be specified along with the basename, or separately with C: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => '~/home/blast/mydb' ); #same as $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => 'mydb', -DB_DIR => '~/home/blast' ); To create a BLAST database de novo, see L. If you wish to apply pre-existing mask data (i.e., the final ASN1 output from one of the blast+ masker programs), to the database before querying, specify it with C: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -DB_NAME => 'mydb', -MASK_FILE => 'mymaskdata.asn' ); =head2 Creating a BLAST database There are several options for creating the database de novo using attached data, both before and after factory construction. If a temporary database (one that can be deleted by the C method) is desired, leave out the C<-db_name> parameter. If C<-db_name> is specified, the database will be preserved with the basename specified. Use C<-create => 1> to create a new database (otherwise the factory will look for an existing database). Use C<-overwrite => 1> to create and overwrite an existing database. Note that the database is not created immediately on factory construction. It will be created if necessary on the first use of a factory BLAST method, or you can force database creation by executing $fac->make_db(); =over =item * Specify data during construction With a FASTA file: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => 'myseqs.fas', -create => 1 ); With another BioPerl object collection: $alnio = Bio::AlignIO->new( -file => 'alignment.msf' ); $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => $alnio, -create => 1 ); @seqs = $alnio->next_aln->each_seq; $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'mydb', -db_data => \@seqs, -create => 1 ); Other collections (e.g., L) are valid. If a certain type does not work, please submit an enhancement request. To create temporary databases, leave out the C<-db_name>, e.g. $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_data => 'myseqs.fas', -create => 1 ); To get the tempfile basename, do: $dbname = $fac->db; =item * Specify data post-construction Use the explicit attribute setters: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -create => 1 ); $fac->set_db_data('myseqs.fas'); $fac->make_db; =back =head2 Creating and using mask data The blast+ mask utilities C, C, and C are available. Masking can be rolled into database creation, or can be executed later. If your mask data is already created and in ASN1 format, set the C<-mask_file> attribute on construction (see L). To create a mask from raw data or an existing database and apply the mask upon database creation, construct the factory like so: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'my_masked_db', -db_data => 'myseqs.fas', -masker => 'dustmasker', -mask_data => 'maskseqs.fas', -create => 1 ); The masked database will be created during C. The C<-mask_data> parameter can be a FASTA filename or any BioPerl sequence object collection. If the datatype ('nucl' or 'prot') of the mask data is not compatible with the selected masker, an exception will be thrown with a message to that effect. To create a mask ASN1 file that can be used in the C<-mask_file> parameter separately from the attached database, use the C method directly: $mask_file = $fac->make_mask(-data => 'maskseqs.fas', -masker => 'dustmasker'); # segmasker can use a blastdb as input $mask_file = $fac->make_mask(-mask_db => 'mydb', -masker => 'segmasker') $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'my_masked_db', -db_data => 'myseqs.fas', -mask_file => $mask_file -create => 1 ); =head2 Getting database information To get a hash containing useful metadata on an existing database (obtained by running C, use C: # get info on the attached database.. $info = $fac->db_info; # get info on another database $info = $fac->db_info('~/home/blastdbs/another'); To get a particular info element for the attached database, just call the element name off the factory: $num_seqs = $fac->db_num_sequences; # info on all the masks applied to the db, if any: @masking_info = @{ $fac->db_filter_algorithms }; =head2 Accessing the L factory The blast+ programs are actually executed by a L wrapper instance. This instance is available for peeking and poking in the L C attribute. For convenience, C methods can be run from the C object, and are delegated to the C attribute. For example, to get the blast+ program to be executed, examine either $fac->factory->command or $fac->command Similarly, the current parameters for the C factory are @parameters = $fac->get_parameters =head2 Cleaning up temp files Temporary analysis files produced under a single factory instances can be unlinked by running $fac->cleanup; Tempfiles are generally not removed unless this method is explicitly called. C only unlinks "registered" files and databases. All temporary files are automatically registered; in particular, "anonymous" databases (such as $fac->Bio::Tools::Run::StandAloneBlastPlus->new( -db_data => 'myseqs.fas', -create => 1 ); without a C<-db_name> specification) are registered for cleanup. Any file or database can be registered with an internal method: $fac->_register_temp_for_cleanup('testdb'); =head2 Other Goodies =over =item You can check whether a given basename points to a properly formatted BLAST database by doing $is_good = $fac->check_db('putative_db'); =item User parameters can be passed to the underlying blast+ programs (if you know what you're doing) with C and C: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'customdb', -db_data => 'myseqs.fas', -db_make_args => [ '-taxid_map' => 'seq_to_taxa.txt' ], -masker => 'windowmasker', -mask_data => 'myseqs.fas', -mask_make_args => [ '-dust' => 'T' ], -create => 1 ); =item You can prevent exceptions from being thrown by failed blast+ program executions by setting C. Examine the error with C: $fac->no_throw_on_crash(1); $fac->make_db; if ($fac->stderr =~ /Error:/) { #handle error ... } =back =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us =head1 CONTRIBUTORS =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::StandAloneBlastPlus; use strict; our $AUTOLOAD; # Object preamble - inherits from Bio::Root::Root use lib '../../..'; use Bio::Root::Root; use Bio::SeqIO; use Bio::Tools::GuessSeqFormat; use Bio::Tools::Run::StandAloneBlastPlus::BlastMethods; use File::Temp 0.22; use IO::String; use base qw(Bio::Root::Root); unless ( eval "require Bio::Tools::Run::BlastPlus" ) { Bio::Root::Root->throw("This module requires 'Bio::Tools::Run::BlastPlus'"); } my %AVAILABLE_MASKERS = ( 'windowmasker' => 'nucl', 'dustmasker' => 'nucl', 'segmasker' => 'prot' ); # NOTE: After testing all possible output formats, only 'maskinfo_asn1_text' # is currently working correctly as input for makeblastdb '-mask_data' argument, # the others return an 'Unknown encoding for mask data' error my %MASKER_ENCODING = ( 'windowmasker' => 'maskinfo_asn1_text', 'dustmasker' => 'maskinfo_asn1_text', 'segmasker' => 'maskinfo_asn1_text' ); my $bp_class = 'Bio::Tools::Run::BlastPlus'; # what's the desire here? # # * factory object (created by new()) # - points to some blast db entity, so all functions run off the # the factory (except bl2seq?) use the associated db # # * create a blast formatted database: # - specify a file, or an AlignI object # - store for later, or store in a tempfile to throw away # - object should store its own database pointer # - provide masking options based on the maskers provided # # * perform database actions via db-oriented blast+ commands # via the object # # * perform blast searches against the database # - blastx, blastp, blastn, tblastx, tblastn # - specify Bio::Seq objects or files as queries # - output the results as a file or as a Bio::Search::Result::BlastResult # * perform 'special' (i.e., ones I don't know) searches # - psiblast, megablast, rpsblast, rpstblastn # some of these are "tasks" under particular programs # check out psiblast, why special (special 'iteration' handling in # ...::BlastResult) # check out rpsblast, megablast # # * perform bl2seq # - return the alignment directly as a convenience, using Bio::Search # functions # lazy db formatting: makeblastdb only on first blast request... # ParameterBaseI delegation : use AUTOLOAD # # =head2 new Title : new Usage : my $obj = new Bio::Tools::Run::StandAloneBlastPlus(); Function: Builds a new Bio::Tools::Run::StandAloneBlastPlus object Returns : an instance of Bio::Tools::Run::StandAloneBlastPlus Args : named argument (key => value) pairs: -db : blastdb name =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($db_name, $db_data, $db_dir, $db_make_args, $mask_file, $mask_data, $mask_make_args, $masker, $create, $overwrite, $is_remote, $prog_dir, $program_dir) = $self->_rearrange([qw( DB_NAME DB_DATA DB_DIR DB_MAKE_ARGS MASK_FILE MASK_DATA MASK_MAKE_ARGS MASKER CREATE OVERWRITE REMOTE PROG_DIR PROGRAM_DIR )], @args); # parm taint checks if ($db_name) { $self->throw("DB name contains invalid characters") unless $db_name =~ m{^[a-z0-9_/:.+-]+$}i; } if ( $db_dir ) { $self->throw("DB directory (DB_DIR) not found") unless (-d $db_dir); $self->{'_db_dir'} = $db_dir; } else { $self->{'_db_dir'} = '.'; } $program_dir ||= $prog_dir; # alias # now handle these systematically (bug #3003) # allow db_name to include path info # let db_dir act as root if present and db_name is a relative path # db property contains the pathless name only if ($db_name) { my ($v,$d,$f) = File::Spec->splitpath($db_name); $self->throw("No DB name at the end of path '$db_name'") unless $f; $f =~ s/\..*$//; # tolerant of extensions, but ignore them $self->{_db} = $f; # now establish db_path property as the internal authority on # db location... if ( File::Spec->file_name_is_absolute($db_name) ) { $self->throw("Path specified in DB name ('$d') does not exist") unless !$d || (-d $d); $self->{_db_path} = File::Spec->catfile($d,$f); $self->{_db_dir} = $d; # ignore $db_dir, give heads-up $self->warn("DB name is an absolute path; setting db_dir to '".$self->db_dir."'") if $db_dir; } else { $d = File::Spec->catdir($self->db_dir, $d); $self->throw("Path specified by DB_DIR+DB_NAME ('$d') does not exist") unless !$d || (-d $d); $self->{_db_path} = File::Spec->catfile($d,$f); } } if ($masker) { $self->throw("Masker '$masker' not available") unless grep /^$masker$/, keys %AVAILABLE_MASKERS; $self->{_masker} = $masker; } if ($program_dir) { $self->throw("Can't find program directory '$program_dir'") unless -d $program_dir; $self->{_program_dir} = $program_dir; } elsif ($ENV{BLASTPLUSDIR}) { $self->{_program_dir} = $ENV{BLASTPLUSDIR}; } $Bio::Tools::Run::BlastPlus::program_dir = $self->{_program_dir} || $Bio::Tools::Run::BlastPlus::program_dir; $self->set_db_make_args( $db_make_args) if ( $db_make_args ); $self->set_mask_make_args( $mask_make_args) if ($mask_make_args); $self->{'_create'} = $create; $self->{'_overwrite'} = $overwrite; $self->{'_is_remote'} = $is_remote; $self->{'_db_data'} = $db_data; $self->{'_mask_file'} = $mask_file; $self->{'_mask_data'} = $mask_data; # check db if (defined $self->check_db and $self->check_db == 0 and !$self->is_remote) { $self->throw("DB '".$self->db."' can't be found. To create, set -create => 1.") unless ($create || $overwrite); } if (!$self->db) { # allow this to pass; catch lazily at make_db... if (!$self->db_data) { $self->debug('No database or db data specified. '. 'To create a new database, provide '. '-db_data => [fasta|\@seqs|$seqio_object]') } # no db specified; create temp db $self->{_create} = 1; if ($self->db_dir) { my $fh = File::Temp->new(TEMPLATE => 'DBXXXXX', DIR => $self->db_dir, UNLINK => 1); my ($v,$d,$f) = File::Spec->splitpath($fh->filename); $self->{_db} = $f; $self->{_db_path} = $fh->filename; $self->_register_temp_for_cleanup($self->db_path); $fh->close; } else { $self->{_db_dir} = File::Temp->newdir('DBDXXXXX'); $self->{_db} = 'DBTEMP'; $self->{_db_path} = File::Spec->catfile($self->db_dir, $self->db); } } return $self; } =head2 db() Title : db Usage : $obj->db($newval) Function: contains the basename of the local blast database Example : Returns : value of db (a scalar string) Args : readonly =cut sub db { shift->{_db} } sub db_name { shift->{_db} } sub set_db_name { shift->{_db} = shift } sub db_dir { shift->{_db_dir} } sub set_db_dir { shift->{_db_dir} = shift } sub db_path { shift->{_db_path} } sub db_data { shift->{_db_data} } sub set_db_data { shift->{_db_data} = shift } sub db_type { shift->{_db_type} } sub masker { shift->{_masker} } sub set_masker { shift->{_masker} = shift } sub mask_file { shift->{_mask_file} } sub set_mask_file { shift->{_mask_file} = shift } sub mask_data { shift->{_mask_data} } sub set_mask_data { shift->{_mask_data} = shift } =head2 factory() Title : factory Usage : $obj->factory($newval) Function: attribute containing the Bio::Tools::Run::BlastPlus factory Example : Returns : value of factory (Bio::Tools::Run::BlastPlus object) Args : readonly =cut sub factory { shift->{_factory} } sub create { shift->{_create} } sub overwrite { shift->{_overwrite} } sub is_remote { shift->{_is_remote} } =head2 program_version() Title : program_version Usage : $version = $bedtools_fac->program_version() Function: Returns the program version (if available) Returns : string representing location and version of the program Note : this works around the WrapperBase::version() method conflicting with the -version parameter for SABlast (good argument for not having getter/setters for these) =cut =head2 package_version() Title : package_version Usage : $version = $bedtools_fac->package_version() Function: Returns the BLAST+ package version (if available) Returns : string representing BLAST+ package version (may differ from version()) =cut sub program_version { my $self = shift; my $fac = $self->factory; $fac->program_version(@_) if $fac; } sub package_version { my $self = shift; my $fac = $self->factory; $fac->package_version(@_) if $fac; } =head1 DB methods =head2 make_db() Title : make_db Usage : Function: create the blast database (if necessary), imposing masking if specified Returns : true on success Args : =cut # should also provide facility for creating subdatabases from # existing databases (i.e., another format for $data: the name of an # existing blastdb...) sub make_db { my $self = shift; my @args = @_; return 1 if ( $self->check_db && !$self->overwrite ); # already there or force make $self->throw('No database or db data specified. '. 'To create a new database, provide '. '-db_data => [fasta|\@seqs|$seqio_object]') unless $self->db_data; # db_data can be: fasta file, array of seqs, Bio::SeqIO object my $data = $self->db_data; $data = $self->_fastize($data); my $testio = Bio::SeqIO->new(-file=>$data, -format=>'fasta'); $self->{_db_type} = ($testio->next_seq->alphabet =~ /.na/) ? 'nucl' : 'prot'; $testio->close; my ($v,$d,$name) = File::Spec->splitpath($data); $name =~ s/\.fas$//; $self->{_db} ||= $name; $self->{_db_path} = File::Spec->catfile($self->db_dir,$self->db); # <#######[ # deal with creating masks here, # and provide correct parameters to the # makeblastdb ... # accomodate $self->db_make_args here -- allow them # to override defaults, or allow only those args # that are not specified here? my $usr_db_args ||= $self->db_make_args; my %usr_args = @$usr_db_args if $usr_db_args; my %db_args = ( -in => $data, -dbtype => $self->db_type, -out => $self->db_path, -title => $self->db, -parse_seqids => 1 # necessary for masking ); # usr arg override if (%usr_args) { $db_args{$_} = $usr_args{$_} for keys %usr_args; } # do masking if requested # if the (masker and mask_data) OR mask_file attributes of this # object are set, assume that masking is desired # if ($self->mask_file) { # the actual masking data is provided $db_args{'-mask_data'} = $self->mask_file; } elsif ($self->masker && $self->mask_data) { # build the mask $db_args{'-mask_data'} = $self->make_mask(-data => $self->mask_data); $self->throw("Masker error: message is '".$self->stderr."'") unless $db_args{'-mask_data'}; $self->{_mask_data} = $db_args{'-mask_data'}; } $self->{_factory} = $bp_class->new( -command => 'makeblastdb', %db_args ); $self->factory->no_throw_on_crash($self->no_throw_on_crash); return $self->factory->_run; } =head2 make_mask() Title : make_mask Usage : Function: create masking data based on specified parameters Returns : mask data filename (scalar string) Args : =cut # mask program usage (based on blast+ manual) # # program dbtype opn # windowmasker nucl mask overrep data, low-complexity (optional) # dustmasker nucl mask low-complexity # segmasker prot sub make_mask { my $self = shift; my @args = @_; my ($data, $mask_db, $make_args, $masker) = $self->_rearrange([qw( DATA MASK_DB MAKE_ARGS MASKER)], @args); my (%mask_args,%usr_args,$db_type); my $infmt = 'fasta'; $self->throw("make_mask requires -data argument") unless $data; $masker ||= $self->masker; $self->throw("no masker specified and no masker default set in object") unless $masker; my $usr_make_args ||= $self->mask_make_args; %usr_args = @$usr_make_args if $usr_make_args; unless (grep /^$masker$/, keys %AVAILABLE_MASKERS) { $self->throw("Masker '$masker' not available"); } if ($self->check_db($data)) { unless ($masker eq 'segmasker') { $self->throw("Masker '$masker' can't use a blastdb as primary input"); } unless ($self->db_info($data)->{_db_type} eq $AVAILABLE_MASKERS{$masker}) { $self->throw("Masker '$masker' is incompatible with input db sequence type"); } $infmt = 'blastdb'; } else { $data = $self->_fastize($data); my $sio = Bio::SeqIO->new(-file=>$data); my $s = $sio->next_seq; my $type; if ($s->alphabet =~ /.na/) { $type = 'nucl'; } elsif ($s->alphabet =~ /protein/) { $type = 'prot'; } else { $type = 'UNK'; } unless ($type eq $AVAILABLE_MASKERS{$masker}) { $self->throw("Masker '$masker' is incompatible with sequence type '$type'"); } } # check that sequence type and masker program match: # now, need to provide reasonable default masker arg settings, # and override these with $usr_make_args as necessary and appropriate my $mh = File::Temp->new(TEMPLATE=>'MSKXXXXX', UNLINK => 0, DIR => $self->db_dir); my $mask_outfile = $mh->filename; $mh->close; $self->_register_temp_for_cleanup(File::Spec->catfile($self->db_dir,$mask_outfile)); # NOTE: '-outfmt' argument must not be included in the default args because # it conflicts with windowmasker '-mk_counts' argument %mask_args = ( -in => $data, -parse_seqids => 1, ); # usr arg override if (%usr_args) { $mask_args{$_} = $usr_args{$_} for keys %usr_args; } # masker-specific pipelines my $status; for ($masker) { m/dustmasker/ && do { $mask_args{'-out'} = $mask_outfile; $mask_args{'-outfmt'} = $MASKER_ENCODING{$masker}; $self->{_factory} = $bp_class->new(-command => $masker, %mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last; }; m/windowmasker/ && do { # check mask_db if present if ($mask_db) { unless ($self->check_db($mask_db)) { $self->throw("Mask database '$mask_db' is not present or valid"); } } my $cth = File::Temp->new(TEMPLATE=>'MCTXXXXX', DIR => $self->db_dir); my $ct_file = $cth->filename; $cth->close; $mask_args{'-out'} = $ct_file; $mask_args{'-mk_counts'} = 'true'; $self->{_factory} = $bp_class->new(-command => $masker, %mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last unless $status; delete $mask_args{'-mk_counts'}; $mask_args{'-ustat'} = $ct_file; $mask_args{'-out'} = $mask_outfile; $mask_args{'-outfmt'} = $MASKER_ENCODING{$masker}; if ($mask_db) { $mask_args{'-in'} = $mask_db; $mask_args{'-infmt'} = 'blastdb'; } $self->factory->reset_parameters(%mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last; }; m/segmasker/ && do { $mask_args{'-infmt'} = $infmt; $mask_args{'-out'} = $mask_outfile; $mask_args{'-outfmt'} = $MASKER_ENCODING{$masker}; $self->{_factory} = $bp_class->new(-command => $masker, %mask_args); $self->factory->no_throw_on_crash($self->no_throw_on_crash); $status = $self->factory->_run; last; }; do { $self->throw("Masker program '$masker' not recognized"); }; } return $status ? $mask_outfile : $status; } =head2 db_info() Title : db_info Usage : Function: get info for database (via blastdbcmd -info); add factory attributes Returns : hash of database attributes Args : [optional] db name (scalar string) (default: currently attached db) =cut sub db_info { my $self = shift; my $db = shift; $db ||= $self->db_path; unless ($db) { $self->warn("db_info: db not specified and no db attached"); return; } if ($self->is_remote) { $self->warn("db_info: sorry, can't get info for remote database (complain to NCBI)"); return; } if ($db eq $self->db and $self->{_db_info}) { return $self->{_db_info}; # memoized } my $db_info_text; $self->{_factory} = $bp_class->new( -command => 'blastdbcmd', -info => 1, -db => $db ); $self->factory->no_throw_on_crash(1); $self->factory->_run(); $self->factory->no_throw_on_crash(0); if ($self->factory->stderr =~ /No alias or index file found/) { $self->warn("db_info: Couldn't find database ".$self->db."; make with make_db()"); return; } $db_info_text = $self->factory->stdout; # parse info into attributes my $infh = IO::String->new($db_info_text); my %attr; while (<$infh>) { /Database: (.*)/ && do { $attr{db_info_name} = $1; next; }; /([0-9,]+) sequences; ([0-9,]+) total/ && do { $attr{db_num_sequences} = $1; $attr{db_total_bases} = $2; $attr{db_num_sequences} =~ s/,//g; $attr{db_total_bases} =~ s/,//g; next; }; /Date: (.*?)\s+Longest sequence: ([0-9,]+)/ && do { $attr{db_date} = $1; # convert to more usable date object $attr{db_longest_sequence} = $2; $attr{db_longest_sequence} =~ s/,//g; next; }; /Algorithm ID/ && do { my $alg = $attr{db_filter_algorithms} = []; while (<$infh>) { if (/\s*([0-9]+)\s+([a-z0-9_]+)\s+(.*)/i) { my ($alg_id, $alg_name, $alg_opts) = ($1, $2, $3); $alg_opts =~ s/\s+$//; push @$alg, { algorithm_id => $alg_id, algorithm_name => $alg_name, algorithm_opts => $alg_opts }; } else { last; } } next; }; } # get db type if ( -e $db.'.psq' ) { $attr{_db_type} = 'prot'; } elsif (-e $db.'.nsq') { $attr{_db_type} = 'nucl'; } else { $attr{_db_type} = 'UNK'; # bork } if ($db eq $self->db) { $self->{_db_type} = $attr{_db_type}; $self->{_db_info_text} = $db_info_text; $self->{_db_info} = \%attr; } return \%attr; } =head2 set_db_make_args() Title : set_db_make_args Usage : Function: set the DB make arguments attribute with checking Returns : true on success Args : arrayref or hashref of named arguments =cut sub set_db_make_args { my $self = shift; my $args = shift; $self->throw("Arrayref or hashref required at DB_MAKE_ARGS") unless ref($args) =~ /^ARRAY|HASH$/; if (ref($args) eq 'HASH') { my @a = %$args; $args = \@a; } $self->throw("Named args required for DB_MAKE_ARGS") unless !(@$args % 2); $self->{'_db_make_args'} = $args; return 1; } sub db_make_args { shift->{_db_make_args} } =head2 set_mask_make_args() Title : set_mask_make_args Usage : Function: set the masker make arguments attribute with checking Returns : true on success Args : arrayref or hasref of named arguments =cut sub set_mask_make_args { my $self = shift; my $args = shift; $self->throw("Arrayref or hashref required at MASK_MAKE_ARGS") unless ref($args) =~ /^ARRAY|HASH$/; if (ref($args) eq 'HASH') { my @a = %$args; $args = \@a; } $self->throw("Named args required at MASK_MAKE_ARGS") unless !(@$args % 2); $self->{'_mask_make_args'} = $args; return 1; } sub mask_make_args { shift->{_mask_make_args} } =head2 check_db() Title : check_db Usage : Function: determine if database with registered name and dir exists Returns : 1 if db present, 0 if not present, undef if name/dir not set Args : [optional] db name (default is 'registered' name in $self->db) [optional] db directory (default is 'registered' dir in $self->db_dir) =cut sub check_db { my $self = shift; my ($db) = @_; my $db_path; if ($db) { my ($v,$d,$f) = File::Spec->splitpath($db); $f =~ s/\..*$//; # ignore extensions $db_path = File::Spec->catfile($d||'.',$f); } else { $db_path = $self->db_path; } if ( $db_path ) { $self->{_factory} = $bp_class->new( -command => 'blastdbcmd', -info => 1, -db => $db_path ); # $DB::single=1; $self->factory->no_throw_on_crash(1); $self->factory->_run(); $self->factory->no_throw_on_crash(0); return 0 if ($self->factory->stderr =~ /No alias or index file found/); return 1; } return; } =head2 no_throw_on_crash() Title : no_throw_on_crash Usage : $fac->no_throw_on_crash($newval) Function: set to prevent an exeception throw on a failed blast program execution Example : Returns : value of no_throw_on_crash (boolean) Args : on set, new value (boolean) =cut sub no_throw_on_crash { my $self = shift; return $self->{'no_throw_on_crash'} = shift if @_; return $self->{'no_throw_on_crash'}; } =head1 Internals =head2 _fastize() Title : _fastize Usage : Function: convert a sequence collection to a temporary fasta file (sans gaps) Returns : fasta filename (scalar string) Args : sequence collection =cut sub _fastize { my $self = shift; my $data = shift; for ($data) { !ref && do { # suppose a fasta file name $self->throw('Sequence file not found') unless -e $data; my $guesser = Bio::Tools::GuessSeqFormat->new(-file => $data); $self->throw('Sequence file not in FASTA format') unless $guesser->guess eq 'fasta'; last; }; (ref eq 'ARRAY') && (ref $$data[0]) && ($$data[0]->isa('Bio::Seq') || $$data[0]->isa('Bio::PrimarySeq')) && do { my $fh = File::Temp->new(TEMPLATE => 'DBDXXXXX', UNLINK => 0, DIR => $self->db_dir, SUFFIX => '.fas'); my $fname = $fh->filename; $fh->close; $self->_register_temp_for_cleanup($fname); my $fasio = Bio::SeqIO->new(-file=>">$fname", -format=>"fasta") or $self->throw("Can't create temp fasta file"); for (@$data) { my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_); } $fasio->close; $data = $fname; last; }; ref && do { # some kind of object my ($fmt) = ref($data) =~ /.*::(.*)/; if ($fmt eq 'fasta') { $data = $data->file; # use the fasta file directly } else { # convert my $fh = File::Temp->new(TEMPLATE => 'DBDXXXXX', UNLINK => 0, DIR => $self->db_dir, SUFFIX => '.fas'); my $fname = $fh->filename; $fh->close; $self->_register_temp_for_cleanup($fname); my $fasio = Bio::SeqIO->new(-file=>">$fname", -format=>"fasta") or $self->throw("Can't create temp fasta file"); require Bio::PrimarySeq; if ($data->isa('Bio::AlignIO')) { my $aln = $data->next_aln; for ($aln->each_seq) { # must de-gap my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_) } } elsif ($data->isa('Bio::SeqIO')) { while (local $_ = $data->next_seq) { my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_); } } elsif ($data->isa('Bio::Align::AlignI')) { for( $data->each_seq) { my $s = $_->seq; my $a = $_->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $_->seq( $s ); $_->alphabet($a); $fasio->write_seq($_) } } elsif ($data->isa('Bio::Seq') || $data->isa('Bio::PrimarySeq')) { my $s = $data->seq; my $a = $data->alphabet; $s =~ s/[$Bio::PrimarySeq::GAP_SYMBOLS]//g; $data->seq($s); $data->alphabet($a); $fasio->write_seq($data); } else { $self->throw("Can't handle sequence container object ". "of type '".ref($data)."'"); } $fasio->close; $data = $fname; } last; }; } return $data; } =head2 _register_temp_for_cleanup() Title : _register_temp_for_cleanup Usage : Function: register a file for cleanup with cleanup() method Returns : true on success Args : a file name or a blastdb basename (scalar string) =cut sub _register_temp_for_cleanup { my $self = shift; my @files = @_; for (@files) { my ($v, $d, $n) = File::Spec->splitpath($_); $_ = File::Spec->catfile($self->db_dir, $n) unless length($d); push @{$self->{_cleanup_list}}, File::Spec->rel2abs($_); } return 1; } =head2 cleanup() Title : cleanup Usage : Function: unlink files registered for cleanup Returns : true on success Args : =cut sub cleanup { my $self = shift; return unless $self->{_cleanup_list}; my $self_file = ''; if (exists $self->{_results}->{_file}) { $self_file = $self->{_results}->{_file}; } for (@{$self->{_cleanup_list}}) { # Close $self_file filehandle if it appears on the cleanup list, # to avoid 'permission denied' errors when unlinking if ($self_file ne '' and $_ =~ m/$self_file$/) { close $self->{_results}->_fh; } m/(\.[a-z0-9_]+)+$/i && do { unlink $_; next; }; do { # catch all index files if ( -e $_.".psq" ) { unlink glob($_.".p*"); unlink glob($_.".??.p*"); } elsif ( -e $_.".nsq" ) { unlink glob($_.".n*"); unlink glob($_.".??.n*"); } else { unlink $_; } next; }; } return 1; } =head2 AUTOLOAD In this module, C delegates L and L methods (including those of L) to the C attribute: $fac->stderr gives you $fac->factory->stderr If $AUTOLOAD isn't pointing to a WrapperBase method, then AUTOLOAD attempts to return a C attribute: e.g. $fac->db_num_sequences works by looking in the $fac->db_info() hash. Finally, if $AUTOLOAD is pointing to a blast query method, AUTOLOAD runs C with the C<-method> parameter appropriately set. =cut sub AUTOLOAD { my $self = shift; my @args = @_; my $method = $AUTOLOAD; $method =~ s/.*:://; my @ret; if (grep /^$method$/, @Bio::Tools::Run::StandAloneBlastPlus::BlastMethods) { push @args, ('-method_args' => ['-remote' => 1] ) if ($self->is_remote); return $self->run( -method => $method, @args ); } if ($self->factory and $self->factory->can($method)) { # factory method return $self->factory->$method(@args); } if ($self->db_info and grep /^$method$/, keys %{$self->db_info}) { return $self->db_info->{$method}; } # else, fail $self->throw("Can't locate method '$method' in class ".ref($self)); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/StandAloneBlastPlus/000077500000000000000000000000001302566030400243765ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/StandAloneBlastPlus/BlastMethods.pm000077500000000000000000000270071302566030400273360ustar00rootroot00000000000000# $Id$ # # BioPerl module for Bio::Tools::Run::StandAloneBlastPlus::BlastMethods # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneBlastPlus::BlastMethods - Provides BLAST methods to StandAloneBlastPlus =head1 SYNOPSIS # create a factory: $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'testdb' ); # get your results $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -method_args => [ '-num_alignments' => 10 ] ); $result = $fac->tblastx( -query => $an_alignment_object, -outfile => 'query.bls', -outformat => 7 ); # do a bl2seq $fac->bl2seq( -method => 'blastp', -query => $seq_object_1, -subject => $seq_object_2 ); =head1 DESCRIPTION This module provides the BLAST methods (blastn, blastp, psiblast, etc.) to the L object. =head1 USAGE This POD describes the use of BLAST methods against a L factory object. The object itself has extensive facilities for creating, formatting, and masking BLAST databases; please refer to L POD for these details. Given a C factory, such as $fac = Bio::Tools::Run::StandAloneBlastPlus->new( -db_name => 'testdb' ); you can run the desired BLAST method directly from the factory object, against the database currently attached to the factory (in the example, C). C<-query> is a required argument: $result = $fac->blastn( -query => 'query_seqs.fas' ); Here, C<$result> is a L object. Other details: =over =item * The blast output file can be named explicitly: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls' ); =item * The output format can be specified: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -outformat => 7 ); #tabular =item * Additional arguments to the method can be specified: $result = $fac->blastn( -query => 'query_seqs.fas', -outfile => 'query.bls', -method_args => [ '-num_alignments' => 10 , '-evalue' => 100 ]); =item * To get the name of the blast output file, do $file = $fac->blast_out; =item * To clean up the temp files (you must do this explicitly): $fac->cleanup; =back =head2 bl2seq() Running C is similar, but both C<-query> and C<-subject> are required, and the attached database is ignored. The blast method must be specified explicitly with the C<-method> parameter: $fac->bl2seq( -method => 'blastp', -query => $seq_object_1, -subject => $seq_object_2 ); Other parameters ( C<-method_args>, C<-outfile>, and C<-outformat> ) are valid. =head2 Return values The return value is always a L object on success, undef on failure. =head1 SEE ALSO L, L =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us Describe contact details here =head1 CONTRIBUTORS Additional contributors names and emails here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... # note: providing methods directly to the namespace... package Bio::Tools::Run::StandAloneBlastPlus; use strict; use warnings; use Bio::SearchIO; use lib '../../../..'; use Bio::Tools::Run::BlastPlus; use File::Temp; use File::Copy; use File::Spec; our @BlastMethods = qw( blastp blastn blastx tblastn tblastx psiblast rpsblast rpstblastn ); =head2 run() Title : run Usage : Function: Query the attached database using a specified blast method Returns : Bio::Search::Result::BlastResult object Args : key => value: -method => $method [blastp|blastn|blastx|tblastx|tblastn| rpsblast|psiblast|rpstblastn] -query => $query_sequences (a fasta file name or BioPerl sequence object or sequence collection object) -outfile => $blast_report_file (optional: default creates a tempfile) -outformat => $format_code (integer in [0..10], see blast+ docs) -method_args => [ -key1 => $value1, ... ] (additional arguments for the given method) =cut sub run { my $self = shift; my @args = @_; my ($method, $query, $outfile, $outformat, $method_args) = $self->_rearrange( [qw( METHOD QUERY OUTFILE OUTFORMAT METHOD_ARGS )], @args); my $ret; my (%blast_args, %usr_args); unless ($method) { $self->throw("Blast run: method not specified, use -method"); } unless ($query) { $self->throw("Blast run: query data required, use -query"); } unless ($outfile) { # create a tempfile name my $fh = File::Temp->new(TEMPLATE => 'BLOXXXXX', DIR => $self->db_dir, UNLINK => 0); $outfile = $fh->filename; $fh->close; $self->_register_temp_for_cleanup($outfile); } if ($outformat) { unless ($outformat =~ /^"?[0-9]{1,2}/) { $self->throw("Blast run: output format code should be integer 0-10"); } $blast_args{'-outfmt'} = $outformat; } if ($method_args) { $self->throw("Blast run: method arguments must be name => value pairs") unless !(@$method_args % 2); %usr_args = @$method_args; } # make db if necessary $self->make_db unless $self->check_db or $self->is_remote or $usr_args{'-subject'} or $usr_args{'-SUBJECT'}; # no db nec if this is bl2seq... $self->{_factory} = Bio::Tools::Run::BlastPlus->new( -command => $method ); if (%usr_args) { my @avail_parms = $self->factory->available_parameters('all'); while ( my( $key, $value ) = each %usr_args ) { $key =~ s/^-//; unless (grep /^$key$/, @avail_parms) { $self->throw("Blast run: parameter '$key' is not available for method '$method'"); } } } # remove a leading ./ on remote databases. Something adds that in the # factory, easier to remove here. my $db = $self->db_path; if ($self->is_remote) { $db =~ s#^\./##; } $blast_args{-db} = $db; $blast_args{-query} = $self->_fastize($query); $blast_args{-out} = $outfile; # user arg override if (%usr_args) { $blast_args{$_} = $usr_args{$_} for keys %usr_args; } # override for bl2seq; if ($blast_args{'-db'} && $blast_args{'-subject'}) { delete $blast_args{'-db'}; } $self->factory->set_parameters( %blast_args ); $self->factory->no_throw_on_crash( $self->no_throw_on_crash ); my $status = $self->_run; return $status unless $status; # kludge to demodernize the bl2seq output if ($blast_args{'-subject'}) { unless (_demodernize($outfile)) { $self->throw("Ack! demodernization failed!"); } } # if here, success for ($method) { m/^(t|psi|rps|rpst)?blast[npx]?/ && do { $ret = Bio::SearchIO->new(-file => $outfile); $self->{_blastout} = $outfile; $self->{_results} = $ret; $ret = $ret->next_result; last; }; do { 1; # huh? }; } return $ret; } =head2 bl2seq() Title : bl2seq Usage : Function: emulate bl2seq using blast+ programs Returns : Bio::Search::Result::BlastResult object Args : key => value -method => $blast_method [blastn|blastp|blastx| tblastn|tblastx] -query => $query (fasta file or BioPerl sequence object -subject => $subject (fasta file or BioPerl sequence object) -outfile => $blast_report_file -method_args => [ $key1 => $value1, ... ] (additional method parameters) =cut sub bl2seq { my $self = shift; my @args = @_; my ($method, $query, $subject, $outfile, $outformat, $method_args) = $self->_rearrange( [qw( METHOD QUERY SUBJECT OUTFILE OUTFORMAT METHOD_ARGS )], @args); unless ($method) { $self->throw("bl2seq: blast method not specified, use -method"); } unless ($query) { $self->throw("bl2seq: query data required, use -query"); } unless ($subject) { $self->throw("bl2seq: subject data required, use -subject"); } $subject = $self->_fastize($subject); my @run_args; if ($method_args) { @run_args = @$method_args; } return $self->run( -method => $method, -query => $query, -outfile => $outfile, -outformat => $outformat, -method_args => [ @run_args, '-subject' => $subject ] ); } =head2 next_result() Title : next_result Usage : $result = $fac->next_result; Function: get the next BLAST result Returns : Bio::Search::Result::BlastResult object Args : none =cut sub next_result() { my $self = shift; return unless $self->{_results}; return $self->{_results}->next_result; } =head2 rewind_results() Title : rewind_results Usage : $fac->rewind_results; Function: rewind BLAST results Returns : true on success Args : =cut sub rewind_results { my $self = shift; return unless $self->blast_out; $self->{_results} = Bio::SearchIO->new( -file => $self->blast_out ); return 1; } =head2 blast_out() Title : blast_out Usage : $file = $fac->blast_out Function: get the filename of the blast report file Returns : scalar string Args : none =cut sub blast_out { shift->{_blastout} } # =head2 _demodernize() # Title : _demodernize # Usage : # Function: Ha! Wouldn't you like to know! # Returns : # Args : # =cut sub _demodernize { my $file = shift; my $tf = File::Temp->new(); open (my $f, $file); while (<$f>) { s/^Subject=\s+/>/; print $tf $_; } $tf->close; copy($tf->filename, $file); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/StandAloneNCBIBlast.pm000066400000000000000000000467111302566030400245350ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlast # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneNCBIBlast - Object for the local execution of the NCBI BLAST program suite (blastall, blastpgp, bl2seq). With experimental support for NCBI rpsblast. =head1 SYNOPSIS # Do not use directly; see Bio::Tools::Run::StandAloneBlast =head1 DESCRIPTION See Bio::Tools::Run::StandAloneBlast =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Peter Schattner Email schattner at alum.mit.edu =head1 MAINTAINER - Torsten Seemann Email torsten at infotech.monash.edu.au =head1 CONTRIBUTORS Sendu Bala bix@sendu.me.uk (reimplementation) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::StandAloneNCBIBlast; use strict; use warnings; use base qw(Bio::Tools::Run::StandAloneBlast); our $AUTOLOAD; our $DEFAULTREADMETHOD = 'BLAST'; # If local BLAST databases are not stored in the standard # /data directory, the variable BLASTDATADIR will need to be # set explicitly our $DATADIR = $Bio::Tools::Run::StandAloneBlast::DATADIR; our %GENERAL_PARAMS = (i => 'input', o => 'outfile', p => 'program', d => 'database'); our @BLASTALL_PARAMS = qw(A B C D E F G K L M O P Q R S W X Y Z a b e f l m q r t v w y z n); our @BLASTALL_SWITCH = qw(I g J T U n V s); our @BLASTPGP_PARAMS = qw(A B C E F G H I J K L M N O P Q R S T U W X Y Z a b c e f h j k l m q s t u v y z); our @RPSBLAST_PARAMS = qw(F I J L N O P T U V X Y Z a b e l m v y z); our @BL2SEQ_PARAMS = qw(A D E F G I J M S T U V W X Y a e g j m q r t); our @OTHER_PARAMS = qw(_READMETHOD); =head2 new Title : new Usage : my $obj = Bio::Tools::Run::StandAloneBlast->new(); Function: Builds a newBio::Tools::Run::StandAloneBlast object Returns : Bio::Tools::Run::StandAloneBlast Args : -quiet => boolean # make program execution quiet -_READMETHOD => 'BLAST' (default, synonym 'SearchIO') || 'blast_pull' # the parsing method, case insensitive Essentially all BLAST parameters can be set via StandAloneBlast.pm. Some of the most commonly used parameters are listed below. All parameters have defaults and are optional except for -p in those programs that have it. For a complete listing of settable parameters, run the relevant executable BLAST program with the option "-" as in blastall - Note that the input parameters (-i, -j, -input) should not be set directly by you: this module sets them when you call one of the executable methods. Blastall -p Program Name [String] Input should be one of "blastp", "blastn", "blastx", "tblastn", or "tblastx". -d Database [String] default = nr The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "nr est" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm -S Query strands to search against database (for blast[nx], and tblastx). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 Blastpgp (including Psiblast) -j is the maximum number of rounds (default 1; i.e., regular BLAST) -h is the e-value threshold for including sequences in the score matrix model (default 0.001) -c is the "constant" used in the pseudocount formula specified in the paper (default 10) -B Multiple alignment file for PSI-BLAST "jump start mode" Optional -Q Output File for PSI-BLAST Matrix in ASCII [File Out] Optional rpsblast -d Database [String] default = (none - you must specify a database) The database specified must first be formatted with formatdb. Multiple database names (bracketed by quotations) will be accepted. An example would be -d "Cog Smart" -e Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm Bl2seq -p Program name: blastp, blastn, blastx. For blastx 1st argument should be nucleotide [String] default = blastp -o alignment output file [File Out] default = stdout -e Expectation value (E) [Real] default = 10.0 -S Query strands to search against database (blastn only). 3 is both, 1 is top, 2 is bottom [Integer] default = 3 =cut sub new { my ($caller, @args) = @_; my $self = $caller->SUPER::new(@args); # StandAloneBlast is special in that "one can modify the name of # the (ncbi) BLAST parameters as desired as long as the initial letter (and # case) of the parameter are preserved". We handle this by truncating input # args to their first char my %args = @args; @args = (); while (my ($attr, $value) = each %args) { $attr =~ s/^-//; $attr = substr($attr, 0, 1) unless $attr =~ /^_/; push(@args, $attr, $value); } $self->_set_from_args(\@args, -methods => {(map { $_ => $GENERAL_PARAMS{$_} } keys %GENERAL_PARAMS), (map { $_ => $_ } (@OTHER_PARAMS, @BLASTALL_PARAMS, @BLASTALL_SWITCH, @BLASTPGP_PARAMS, @RPSBLAST_PARAMS, @BL2SEQ_PARAMS))}, -code => { map { $_ => 'my $self = shift; if (@_) { my $value = shift; if ($value && $value ne \'F\') { $value = \'T\'; } else { $value = \'F\'; } $self->{\'_\'.$method} = $value; } return $self->{\'_\'.$method} || return;' } @BLASTALL_SWITCH }, # these methods can take boolean or 'T' and 'F' -create => 1, -force => 1, -case_sensitive => 1); my ($tfh, $tempfile) = $self->io->tempfile(); my $outfile = $self->o || $self->outfile || $tempfile; $self->o($outfile); close($tfh); $self->_READMETHOD($DEFAULTREADMETHOD) unless $self->_READMETHOD; return $self; } # StandAloneBlast is special in that "one can modify the name of # the (ncbi) BLAST parameters as desired as long as the initial letter (and # case) of the parameter are preserved". We handle this with AUTOLOAD # redirecting to the automatically created methods from _set_from_args() ! sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; my $orig = $attr; $attr = substr($attr, 0, 1); $self->can($attr) || $self->throw("Unallowed parameter: $orig !"); return $self->$attr(@_); } =head2 blastall Title : blastall Usage : $blast_report = $factory->blastall('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"ACTACCCTTTAAATCAGTGGGGG"); $blast_report = $factory->blastall($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory->blastall($seq_array_ref); Returns : Reference to a Blast object containing the blast report. Args : Name of a file or Bio::Seq object or an array of Bio::Seq object containing the query sequence(s). Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. =cut sub blastall { my ($self, $input1) = @_; $self->io->_io_cleanup(); my $executable = 'blastall'; # Create input file pointer my $infilename1 = $self->_setinput($executable, $input1) || $self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!"); $self->i($infilename1); my $blast_report = $self->_generic_local_blast($executable); } =head2 blastpgp Title : blastpgp Usage : $blast_report = $factory-> blastpgp('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"ACTADDEEQQPPTCADEEQQQVVGG"); $blast_report = $factory->blastpgp ($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory-> blastpgp(\@seq_array); Returns : Reference to a Bio::SearchIO object containing the blast report Args : Name of a file or Bio::Seq object. In psiblast jumpstart mode two additional arguments are required: a SimpleAlign object one of whose elements is the query and a "mask" to determine how BLAST should select scoring matrices see DESCRIPTION above for more details. Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. Returns : Reference to Bio::SearchIO object containing the blast report. =cut sub blastpgp { my $self = shift; my $executable = 'blastpgp'; my $input1 = shift; my $input2 = shift; # used by blastpgp's -B option to specify which # residues are position aligned my $mask = shift; my ($infilename1, $infilename2 ) = $self->_setinput($executable, $input1, $input2, $mask); if (!$infilename1) {$self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!");} $self->i($infilename1); # set file name of sequence to be blasted to inputfilename1 (-i param of blastpgp) if ($input2) { unless ($infilename2) {$self->throw("$input2 not SimpleAlign Object in pre-aligned psiblast\n");} $self->B($infilename2); # set file name of partial alignment to inputfilename2 (-B param of blastpgp) } my $blast_report = $self->_generic_local_blast($executable); } =head2 rpsblast Title : rpsblast Usage : $blast_report = $factory->rpsblast('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"MVVLCRADDEEQQPPTCADEEQQQVVGG"); $blast_report = $factory->rpsblast($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory->rpsblast(\@seq_array); Args : Name of a file or Bio::Seq object or an array of Bio::Seq object containing the query sequence(s). Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. Returns : Reference to a Bio::SearchIO object containing the blast report =cut sub rpsblast { my ($self, $input1) = @_; $self->io->_io_cleanup(); my $executable = 'rpsblast'; # Create input file pointer my $infilename1 = $self->_setinput($executable, $input1) || $self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!"); $self->i($infilename1); my $blast_report = $self->_generic_local_blast($executable); } =head2 bl2seq Title : bl2seq Usage : $factory-> bl2seq('t/seq1.fa', 't/seq2.fa'); or $input1 = Bio::Seq->new(-id=>"test query1", -seq=>"ACTADDEEQQPPTCADEEQQQVVGG"); $input2 = Bio::Seq->new(-id=>"test query2", -seq=>"ACTADDEMMMMMMMDEEQQQVVGG"); $blast_report = $factory->bl2seq ($input1, $input2); Returns : Reference to a BPbl2seq object containing the blast report. Args : Names of 2 files or 2 Bio::Seq objects containing the sequences to be aligned by bl2seq. Throws an exception if argument is not either a pair of strings (eg filenames) or references to Bio::Seq objects. If arguments are strings, throws exception if files corresponding to string names can not be found. =cut sub bl2seq { my $self = shift; my $executable = 'bl2seq'; my $input1 = shift; my $input2 = shift; # Create input file pointer my ($infilename1, $infilename2 ) = $self->_setinput($executable, $input1, $input2); if (!$infilename1){$self->throw(" $input1 not Seq Object or file name!");} if (!$infilename2){$self->throw("$input2 not Seq Object or file name!");} $self->i($infilename1); # set file name of first sequence to # be aligned to inputfilename1 # (-i param of bl2seq) $self->j($infilename2); # set file name of first sequence to # be aligned to inputfilename2 # (-j param of bl2seq) my $blast_report = $self->_generic_local_blast($executable); } =head2 _generic_local_blast Title : _generic_local_blast Usage : internal function not called directly Returns : Bio::SearchIO Args : Reference to calling object and name of BLAST executable =cut sub _generic_local_blast { my $self = shift; my $executable = shift; # Create parameter string to pass to Blast program my $param_string = $self->_setparams($executable); # run Blast my $blast_report = $self->_runblast($executable, $param_string); } =head2 _runblast Title : _runblast Usage : Internal function, not to be called directly Function: makes actual system call to Blast program Example : Returns : Report Bio::SearchIO object in the appropriate format Args : Reference to calling object, name of BLAST executable, and parameter string for executable =cut sub _runblast { my ($self, $executable, $param_string) = @_; my ($blast_obj, $exe); if (! ($exe = $self->executable($executable)) ) { $self->warn("cannot find path to $executable"); return; } # Use double quotes if executable path have empty spaces if ($exe =~ m/ /) { $exe = "\"$exe\""; } my $commandstring = $exe.$param_string; $self->debug("$commandstring\n"); system($commandstring) && $self->throw("$executable call crashed: $? | $! | $commandstring\n"); # set significance cutoff to set expectation value or default value # (may want to make this value vary for different executables) my $signif = $self->e() || 1e-5; # get outputfilename my $outfile = $self->o(); # this should allow any blast SearchIO parser (not just 'blast_pull' or 'blast', # but 'blastxml' and 'blasttable'). Fall back to 'blast' if not stipulated. my $method = $self->_READMETHOD; if ($method =~ /^(?:blast|SearchIO)/i ) { $method = 'blast' if $method =~ m{SearchIO}i; $blast_obj = Bio::SearchIO->new(-file => $outfile, -format => $method); } # should these be here? They have been deprecated... elsif ($method =~ /BPlite/i ) { if ($executable =~ /bl2seq/i) { # Added program info so BPbl2seq can compute strand info $self->throw("Use of Bio::Tools::BPbl2seq is deprecated; use Bio::SearchIO modules instead"); } elsif ($executable =~ /blastpgp/i && defined $self->j() && $self->j() > 1) { $self->throw("Use of Bio::Tools::BPpsilite is deprecated; use Bio::SearchIO modules instead"); } elsif ($executable =~ /blastall|rpsblast/i) { $self->throw("Use of Bio::Tools::BPlite is deprecated; use Bio::SearchIO modules instead"); } else { $self->warn("Unrecognized executable $executable"); } } else { $self->warn("Unrecognized readmethod $method"); } return $blast_obj; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Blast program Example : Returns : parameter string to be passed to Blast Args : Reference to calling object and name of BLAST executable =cut sub _setparams { my ($self, $executable) = @_; my ($attr, $value, @execparams); if ($executable eq 'blastall') { @execparams = (@BLASTALL_PARAMS, @BLASTALL_SWITCH); } elsif ($executable eq 'blastpgp') { @execparams = @BLASTPGP_PARAMS; } elsif ($executable eq 'rpsblast') { @execparams = @RPSBLAST_PARAMS; } elsif ($executable eq 'bl2seq' ) { @execparams = @BL2SEQ_PARAMS; } # we also have all the general params push(@execparams, keys %GENERAL_PARAMS); my $database = $self->d; if ($database && $executable ne 'bl2seq') { # Need to prepend datadirectory to database name my @dbs = split(/ /, $database); for my $i (0..$#dbs) { # (works with multiple databases) if (! (-e $dbs[$i].".nin" || -e $dbs[$i].".pin") && ! (-e $dbs[$i].".nal" || -e $dbs[$i].".pal") ) { $dbs[$i] = File::Spec->catdir($DATADIR, $dbs[$i]); } } $self->d('"'.join(" ", @dbs).'"'); } # workaround for problems with shell metacharacters [bug 2707] # simply quoting does not always work! my $tmp = $self->o; $self->o(quotemeta($tmp)) if ($tmp && $^O !~ /^MSWin/); my $param_string = $self->SUPER::_setparams(-params => [@execparams], -dash => 1); $self->o($tmp) if ($tmp && $^O !~ /^MSWin/); $self->d($database) if $database; if ($self->quiet()) { $param_string .= ' 2> '.File::Spec->devnull; } return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/StandAloneWUBlast.pm000066400000000000000000000221661302566030400243530ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::StandAloneBlast # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::StandAloneWUBlast - Object for the local execution of WU-Blast. =head1 SYNOPSIS # Do not use directly; use Bio::Tools::Run::StandAloneBlast =head1 DESCRIPTION See Bio::Tools::Run::StandAloneBlast =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Peter Schattner Email schattner at alum.mit.edu =head1 MAINTAINER - Torsten Seemann Email torsten at infotech.monash.edu.au =head1 CONTRIBUTORS Sendu Bala bix@sendu.me.uk (reimplementation) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::StandAloneWUBlast; use strict; use base qw(Bio::Tools::Run::StandAloneBlast); our $AUTOLOAD; our $DEFAULTREADMETHOD = 'BLAST'; # If local BLAST databases are not stored in the standard # /data directory, the variable BLASTDATADIR will need to be # set explicitly our $DATADIR = $Bio::Tools::Run::StandAloneBlast::DATADIR; our %GENERAL_PARAMS = (i => 'input', o => 'outfile', p => 'program', d => 'database'); our @WUBLAST_PARAMS = qw(e s e2 s2 w t x m y z l k h v b q r matrix filter wordmask filter maskextra hitdist wink ctxfactor gape gaps gape2 gaps2 gapw gapx olf golf olmax golmax gapdecayrate topcombon topcomboe sumstatsmethod hspsepqmax hspsepsmax gapsepqmax gapsepsmax altscore hspmax gspmax qoffset nwstart nwlen qrecmin qrecmax dbrecmin dbrecmax vdbdescmax dbchunks sort_by_pvalue cpus putenv getenv progress); our @WUBLAST_SWITCH = qw(kap sump poissonp lcfilter lcmask echofilter stats nogap gapall pingpong nosegs postsw span2 span1 span prune consistency links ucdb gi noseqs qtype qres sort_by_pvalue sort_by_count sort_by_highscore sort_by_totalscore sort_by_subjectlength mmio nonnegok novalidctxok shortqueryok notes warnings errors endputenv getenv endgetenv abortonerror abortonfatal); our @OTHER_PARAMS = qw(_READMETHOD); =head2 new Title : new Usage : my $obj = Bio::Tools::Run::StandAloneBlast->new(); Function: Builds a newBio::Tools::Run::StandAloneBlast object Returns : Bio::Tools::Run::StandAloneBlast Args : -quiet => boolean # make program execution quiet -_READMETHOD => 'BLAST' (default, synonym 'SearchIO') || 'blast_pull' # the parsing method, case insensitive Essentially all BLAST parameters can be set via StandAloneBlast.pm. Some of the most commonly used parameters are listed below. All parameters have defaults and are optional except for -p. -p Program Name [String] Input should be one of "wublastp", "wublastn", "wublastx", "wutblastn", or "wutblastx". -d Database [String] default = nr The database specified must first be formatted with xdformat. -E Expectation value (E) [Real] default = 10.0 -o BLAST report Output File [File Out] Optional, default = ./blastreport.out ; set by StandAloneBlast.pm =cut sub new { my ($caller, @args) = @_; my $self = $caller->SUPER::new(@args); $self->_set_from_args(\@args, -methods => {(map { $_ => $GENERAL_PARAMS{$_} } keys %GENERAL_PARAMS), (map { $_ => $_ } (@OTHER_PARAMS, @WUBLAST_PARAMS, @WUBLAST_SWITCH))}, -create => 1, -force => 1); my ($tfh, $tempfile) = $self->io->tempfile(); my $outfile = $self->o || $self->outfile || $tempfile; $self->o($outfile); close($tfh); $self->_READMETHOD($DEFAULTREADMETHOD) unless $self->_READMETHOD; return $self; } # We let get/setter method names be case-insensitve sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; my $orig = $attr; $attr = lc($attr); $self->can($attr) || $self->throw("Unallowed parameter: $orig !"); return $self->$attr(@_); } =head2 wublast Title : wublast Usage : $blast_report = $factory->wublast('t/testquery.fa'); or $input = Bio::Seq->new(-id=>"test query", -seq=>"ACTACCCTTTAAATCAGTGGGGG"); $blast_report = $factory->wublast($input); or $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects $blast_report = $factory->wublast(\@seq_array); Returns : Reference to a Blast object Args : Name of a file or Bio::Seq object or an array of Bio::Seq object containing the query sequence(s). Throws an exception if argument is not either a string (eg a filename) or a reference to a Bio::Seq object (or to an array of Seq objects). If argument is string, throws exception if file corresponding to string name can not be found. =cut sub wublast { my ($self, $input1) = @_; $self->io->_io_cleanup(); my $executable = 'wublast'; # Create input file pointer my $infilename1 = $self->_setinput($executable, $input1) || $self->throw("$input1 not Bio::Seq object or array of Bio::Seq objects or file name!"); $self->i($infilename1); my $blast_report = $self->_generic_local_wublast($executable); } =head2 _generic_local_wublast Title : _generic_local_wublast Usage : internal function not called directly Returns : Blast object Args : Reference to calling object and name of BLAST executable =cut sub _generic_local_wublast { my $self = shift; my $executable = shift; # Create parameter string to pass to Blast program my $param_string = $self->_setparams($executable); $param_string = " ".$self->database." ".$self->input." ".$param_string; # run Blast my $blast_report = $self->_runwublast($executable, $param_string); } =head2 _runwublast Title : _runwublast Usage : Internal function, not to be called directly Function: makes actual system call to WU-Blast program Example : Returns : Report Blast object Args : Reference to calling object, name of BLAST executable, and parameter string for executable =cut sub _runwublast { my ($self, $executable, $param_string) = @_; my ($blast_obj, $exe); if (! ($exe = $self->executable($self->p))){ $self->warn("cannot find path to $executable"); return; } # Use double quotes if executable path have empty spaces if ($exe =~ m/ /) { $exe = "\"$exe\""; } my $commandstring = $exe.$param_string; $self->debug("$commandstring\n"); system($commandstring) && $self->throw("$executable call crashed: $? | $! | $commandstring\n"); # get outputfilename my $outfile = $self->o(); $blast_obj = Bio::SearchIO->new(-file => $outfile, -format => 'blast'); return $blast_obj; } =head2 _setparams Title : _setparams Usage : Internal function, not to be called directly Function: Create parameter inputs for Blast program Example : Returns : parameter string to be passed to Blast Args : Reference to calling object and name of BLAST executable =cut sub _setparams { my ($self, $executable) = @_; my ($attr, $value, @execparams); @execparams = @WUBLAST_PARAMS; # of the general params, wublast only takes outfile at # this stage (we add in program, input and database manually elsewhere) push(@execparams, 'o'); # workaround for problems with shell metacharacters [bug 2707] # simply quoting does not always work! # Fixed so Windows files are not quotemeta'd my $tmp = $self->o; $self->o(quotemeta($tmp)) if ($tmp && $^O !~ /^MSWin/); my $param_string = $self->SUPER::_setparams(-params => [@execparams], -switches => \@WUBLAST_SWITCH, -dash => 1); $self->o($tmp) if ($tmp && $^O !~ /^MSWin/); if ($self->quiet()) { $param_string .= ' 2> '.File::Spec->devnull; } return $param_string; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/TigrAssembler.pm000066400000000000000000000300311302566030400236120ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::TigrAssembler # # Copyright Florent E Angly # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::TigrAssembler - Wrapper for local execution of TIGR Assembler v2 =head1 SYNOPSIS use Bio::Tools::Run::TigrAssembler; # Run TIGR Assembler using an input FASTA file my $factory = Bio::Tools::Run::TigrAssembler->new( -minimum_overlap_length => 35 ); my $asm_obj = $factory->run($fasta_file, $qual_file); # An assembly object is returned by default for my $contig ($assembly->all_contigs) { ... do something ... } # Read some sequences use Bio::SeqIO; my $sio = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta'); my @seqs; while (my $seq = $sio->next_seq()) { push @seqs,$seq; } # Run TIGR Assembler with input sequence objects and return an assembly file my $asm_file = 'results.tigr'; $factory->out_type($asm_file); $factory->run(\@seqs); # Use LIGR Assembler instead my $ligr = Bio::Tools::Run::TigrAssembler->new( -program_name => 'LIGR_Assembler', -trimmed_seq => 1 ); $ligr->run(\@seqs); =head1 DESCRIPTION Wrapper module for the local execution of the DNA assembly program TIGR Assembler v2.0. TIGR Assembler is open source software under The Artistic License and available at: http://www.tigr.org/software/assembler/ This module runs TIGR Assembler by feeding it a FASTA file or sequence objects and returning an assembly file or assembly and IO objects. When the input is Bioperl object, sequences less than 39 bp long are filtered out since they are not supported by TIGR Assembler. If provided in the following way, TIGR Assembler will use additional information present in the sequence descriptions for assembly: >seq_name minimum_clone_length maximum_clone_length median_clone_length clear_end5 clear_end3 or >db|seq_name minimum_clone_length maximum_clone_length median_clone_length clear_end5 clear_end3 e.g. >GHIBF57F 500 3000 1750 33 587 This module also supports LIGR Assembler, a variant of TIGR Assembler: http://sourceforge.net/projects/ligr-assembler/ =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Florent E Angly Email: florent-dot-angly-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::TigrAssembler; use strict; use IPC::Run; use base qw( Bio::Root::Root Bio::Tools::Run::AssemblerBase ); our $program_name = 'TIGR_Assembler'; # name of the executable our @program_params = (qw( minimum_percent minimum_length max_err_32 quality_file maximum_end resort_after )); our @program_switches = (qw( include_singlets consider_low_scores safe_merging_stop ignore_tandem_32mers use_tandem_32mers not_random incl_bad_seq trimmed_seq )); our %param_translation = ( 'quality_file' => 'q', 'minimum_percent' => 'p', 'minimum_length' => 'l', 'include_singlets' => 's', 'max_err_32' => 'g', 'consider_low_scores' => 'L', 'maximum_end' => 'e', 'ignore_tandem_32mers' => 't', 'use_tandem_32mers' => 'u', 'safe_merging_stop' => 'X', 'not_random' => 'N', 'resort_after' => 'r', 'incl_bad_seq' => 'b', 'trimmed_seq' => 'i' ); our $qual_param = 'quality_file'; our $use_dash = 1; our $join = ' '; our $asm_format = 'tigr'; our $min_len = 39; =head2 new Title : new Usage : $factory->new( -minimum_percent => 95, -minimum_length => 50, -include_singlets => 1 ); Function: Create a TIGR Assembler factory Returns : A Bio::Tools::Run::TigrAssembler object Args : TIGR Assembler options available in this module: minimum_percent / minimum_overlap_similarity: the minimum percent identity that two DNA fragments must achieve over their entire region of overlap in order to be considered as a possible assembly. Adjustments are made by the program to take into account that the ends of sequences are lower quality and doubled base calls are the most frequent sequencing error. minimum_length / minimum_overlap_length: the minimum length two DNA fragments must overlap to be considered as a possible assembly (warning: this option is not strictly respected by TIGR Assembler...) include_singlets: a flag which indicates that singletons (assemblies made up of a single DNA fragment) should be included in the lassie output_file - the default is to not include singletons. max_err_32: the maximum number + 1 of alignment errors (mismatches or gaps) allowed within any contiguous 32 base pairs in the overlap region between two DNA fragments in the same assembly. This is meant to split apart splice variants which have short splice differences and would not be disqualified by the -p minimum_percent parameter. consider_low_scores: a flag which causes even very LOW pairwise scores to be considered - caution using this flag may cause longer run time and a worse assembly. maximum_end: the maximum length at the end of a DNA fragment that does not match another overlapping DNA fragment (sometimes referred to as overhang) that will not disqualify a DNA fragment from becoming part of an assembly. ignore_tandem_32mers: a flag which causes tandem 32mers (a tandem 32mer is a 32mer which occurs more than once in at least one sequence read) to be ignored (this is now the default behavior and this flag is for backward compatibility) use_tandem_32mers: a flag which causes tandem 32mers to be used for pairwise comparison opposite of the -t flag which is now the default). safe_merging_stop: a flag which causes merging to stop when only sequences which appear to be repeats are left and these cannot be merged based on clone length constraints. not_random: a flag which indicates that the DNA fragments in the input_file should not be treated as random genomic fragments for the purpose of determining repeat regions. resort_after: specifies how many sequences should be merged before resorting the possible merges based on clone constraints. LIGR Assembler has the same options as TIGR Assembler, and the following: incl_bad_seq: keep all sequences including potential chimeras and splice variants trimmed_seq: indicates that the sequences are trimmed. High quality scores will be given on the whole sequence length instead of just in the middle) =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->_set_program_options(\@args, \@program_params, \@program_switches, \%param_translation, $qual_param, $use_dash, $join); *minimum_overlap_length = \&minimum_length; *minimum_overlap_similarity = \&minimum_percent; $self->program_name($program_name) if not defined $self->program_name(); $self->_assembly_format($asm_format); return $self; } =head2 out_type Title : out_type Usage : $factory->out_type('Bio::Assembly::ScaffoldI') Function: Get/set the desired type of output Returns : The type of results to return Args : Desired type of results to return (optional): 'Bio::Assembly::IO' object 'Bio::Assembly::ScaffoldI' object (default) The name of a file to save the results in =cut =head2 run Title : run Usage : $factory->run($fasta_file); Function: Run TIGR Assembler Returns : - a Bio::Assembly::ScaffoldI object, a Bio::Assembly::IO object, a filename, or undef if all sequences were too small to be usable Returns : Assembly results (file, IO object or assembly object) Args : - sequence input (FASTA file or sequence object arrayref) - optional quality score input (QUAL file or quality score object arrayref) =cut =head2 _run Title : _run Usage : $assembler->_run() Function: Make a system call and run Newbler Returns : An assembly file Args : - FASTA file, SFF file and MID, or analysis dir and MID - optional QUAL file =cut sub _run { my ($self, $fasta_file, $qual_file) = @_; # Setup needed files and filehandles first my ($output_fh, $output_file ) = $self->_prepare_output_file( ); my ($scratch_fh, $scratch_file) = $self->io->tempfile( -dir => $self->tempdir() ); my ($stderr_fh, $stderr_file ) = $self->io->tempfile( -dir => $self->tempdir() ); # Get program executable my $exe = $self->executable; # Get command-line options my $options = $self->_translate_params(); # Usage: TIGR_Assembler [options] scratch_file < input_file > output_file my @program_args = ( $exe, @$options, $scratch_file ); my $stdin = $fasta_file; my $stdout = $output_file; my $stderr = $stderr_file; my @ipc_args = ( \@program_args, '<', $fasta_file, '>', $output_file, '2>', $stderr_file ); # Print command for debugging if ($self->verbose() >= 0) { my $cmd = ''; $cmd .= join ( ' ', @program_args ); for ( my $i = 1 ; $i < scalar @ipc_args ; $i++ ) { my $element = $ipc_args[$i]; my $ref = ref($element); my $value; if ( $ref && $ref eq 'SCALAR') { $value = $$element; } else { $value = $element; } $cmd .= " $value"; } $self->debug( "$exe command = $cmd\n" ); } # Execute command eval { IPC::Run::run(@ipc_args) || die("There was a problem running $exe: $!"); }; if ($@) { $self->throw("$exe call crashed: $@"); } $self->debug(join("\n", "$exe STDERR", $stderr_file)) if $stderr_file; # TIGR Assembler's stderr reports a lot more than just errors # Close filehandles close($scratch_fh); close($output_fh); close($stderr_fh); # Import assembly return $output_file; } =head2 _remove_small_sequences Title : _remove_small_sequences Usage : $assembler->_remove_small_sequences(\@seqs, \@quals) Function: Remove sequences below a threshold length Returns : a new sequence object array reference a new quality score object array reference Args : sequence object array reference quality score object array reference (optional) =cut # Aliasing function _prepare_input_sequences to _remove_small_sequences *_prepare_input_sequences = \&_remove_small_sequences; sub _remove_small_sequences { my ($self, $seqs, $quals) = @_; # The threshold length, $min_len, has been registered as a global variable my @new_seqs; my @new_quals; if (ref($seqs) =~ m/ARRAY/i) { my @removed; my $nof_seqs = scalar @$seqs; for my $i (1 .. $nof_seqs) { my $seq = $$seqs[$i-1]; if ($seq->length >= $min_len) { push @new_seqs, $seq; if ($quals) { my $qual = $$quals[$i-1]; push @new_quals, $qual; } } else { push @removed, $seq->id; } } if (scalar @removed > 0) { $self->warn("The following sequences were removed because they are smaller". " than $min_len bp: @removed\n"); } } return \@new_seqs, \@new_quals; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Tmhmm.pm000077500000000000000000000173611302566030400221470ustar00rootroot00000000000000# # # Copyright Balamurugan Kumarasamy # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code # =head1 NAME Bio::Tools::Run::Tmhmm - Object for identifying transmembrane helixes in a given protein seequence. =head1 SYNOPSIS # Build a Tmhmm factory # $paramfile is the full path to the seg binary file my @params = ('PROGRAM',$paramfile); my $factory = Bio::Tools::Run::Tmhmm->new($param); # Pass the factory a Bio::Seq object # @feats is an array of Bio::SeqFeature::Generic objects my @feats = $factory->run($seq); =head1 DESCRIPTION Tmhmm is a program for identifying transmembrane helices in proteins. You must have the environmental variable TMHMMDIR set to the base directory where I and it's associated data/option files reside (NOT the bin directory where the actual executable resides) =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Bala Email savikalpa@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Tmhmm; use vars qw($AUTOLOAD @ISA $PROGRAMNAME @TMHMM_PARAMS %OK_FIELD); use strict; use Cwd; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Tools::Tmhmm; use Bio::Tools::Run::WrapperBase; @ISA = qw(Bio::Tools::Run::WrapperBase); BEGIN { $PROGRAMNAME = 'tmhmm' . ($^O =~ /mswin/i ?'.exe':''); @TMHMM_PARAMS=qw(PROGRAM VERBOSE NOPLOT); foreach my $attr ( @TMHMM_PARAMS) { $OK_FIELD{$attr}++; } } =head2 program_name Title : program_name Usage : $factory>program_name() Function: holds the program name Returns: string Args : None =cut sub program_name { return $PROGRAMNAME; } =head2 program_dir Title : program_dir Usage : $factory->program_dir(@params) Function: returns the program directory, obtained from ENV variable, in this case it is the tmhmm installation directory, not the location of the executable. Returns: string Args : =cut sub program_dir { return $ENV{TMHMMDIR} || ''; } =head2 program_path Title : program_path Usage : my $path = $factory->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Args : none =cut sub program_path { my ($self) = @_; my @path; if ($self->program_dir) { my $program_dir = $self->program_dir; $program_dir =~ s/\/bin//; push @path, $program_dir; } push @path, 'bin'; push @path, $self->program_name.($^O =~ /mswin/i ?'.exe':''); return File::Spec->catfile(@path); } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : $rm->new(@params) Function: creates a new Tmhmm factory Returns: Bio::Tools::Run::Tmhmm Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 predict_protein_features Title : predict_protein_features() Usage : DEPRECATED Use $obj->run($seq) instead Function: Runs Tmhmm and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub predict_protein_features{ return shift->run(@_); } =head2 executable Title : executable Usage : my $exe = $tmhmm->executable('tmhmm'); Function: Finds the full path to the 'tmhmm' executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my $self = shift; my $exe = $self->SUPER::executable(@_) || return; # even if its executable, we still need the environment variable to have # been set if (! $ENV{TMHMMDIR}) { $self->warn("Environment variable TMHMMDIR must be set, even if the tmhmm executable is in your path"); return undef; } return $exe; } =head2 run Title : run() Usage : $obj->run($seq) Function: Runs Tmhmm and creates an array of featrues Returns : An array of Bio::SeqFeature::Generic objects Args : A Bio::PrimarySeqI =cut sub run { my ($self,$seq) = @_; my @feats; if (ref($seq) ) { # it is an object if (ref($seq) =~ /GLOB/) { $self->throw("cannot use filehandle"); } my $infile1 = $self->_writeSeqFile($seq); $self->_input($infile1); @feats = $self->_run(); unlink $infile1; } else { # The clone object is not a seq object but a file. Perhaps # should check here or before if this file is fasta format...if # not die Here the file does not need to be created or # deleted. Its already written and may be used by other # runnables. $self->_input($seq); @feats = $self->_run(); } return @feats; } =head2 _input Title : _input Usage : obj->_input($seqFile) Function: Internal(not to be used directly) Returns : Args : =cut sub _input() { my ($self,$infile1) = @_; if (defined $infile1){ $self->{'input'}=$infile1; } return $self->{'input'}; } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An array of Bio::SeqFeature::Generic objects Args : =cut sub _run { my ($self)= @_; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $str = $self->executable || return; if( $self->NOPLOT ) { $str .= " --noplot"; } $str .= " -basedir=".$self->program_dir." -workdir=".$self->tempdir()." ".$self->_input." > ".$outfile; my $status = system($str); $self->throw( "Tmhmm call ($str) crashed: $? \n") unless $status==0; my $filehandle; if (ref ($outfile) !~ /GLOB/) { open (TMHMM, "<".$outfile) or $self->throw ("Couldn't open file ".$outfile.": $!\n"); $filehandle = \*TMHMM; } else { $filehandle = $outfile; } my $tmhmm_parser = Bio::Tools::Tmhmm->new(-fh=>$filehandle); my @tmhmm_feat; while(my $tmhmm_feat = $tmhmm_parser->next_result){ push @tmhmm_feat, $tmhmm_feat; } # free resources $self->cleanup(); unlink $outfile; close($tfh1); undef $tfh1; return @tmhmm_feat; } =head2 _writeSeqFile Title : _writeSeqFile Usage : obj->_writeSeqFile($seq) Function: Internal(not to be used directly) Returns : Args : =cut sub _writeSeqFile{ my ($self,$seq) = @_; my ($tfh,$inputfile) = $self->io->tempfile(-dir=>$self->tempdir()); my $in = Bio::SeqIO->new(-fh => $tfh , '-format' => 'Fasta'); $in->write_seq($seq); close($tfh); undef $tfh; return $inputfile; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/TribeMCL.pm000066400000000000000000000701061302566030400224570ustar00rootroot00000000000000# BioPerl module for TribeMCL # # Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::TribeMCL =head1 SYNOPSIS use Bio::Tools::Run::TribeMCL; use Bio::SearchIO; # 3 methods to input the blast results # straight forward raw blast output (NCBI or WU-BLAST) my @params = ('inputtype'=>'blastfile'); # OR # markov program format # protein_id1 protein_id2 evalue_magnitude evalue_factor # for example: # proteins ENSP00000257547 and ENSP00000261659 # with a blast score evalue of 1e-50 # and proteins O42187 and ENSP00000257547 # with a blast score evalue of 1e-119 # entry would be my @array = [[qw(ENSP00000257547 ENSP00000261659 1 50)], [qw(O42187 ENSP00000257547 1 119)]]; my @params = ('pairs'=>\@array,I=>'2.0'); # OR # pass in a searchio object # slowest of the 3 methods as it does more rigourous parsing # than required for us here my $sio = Bio::SearchIO->new(-format=>'blast', -file=>'blast.out'); my @params=('inputtype'=>'searchio',I=>'2.0'); # you can specify the path to the executable manually in the following way my @params=('inputtype'=>'blastfile',I=>'2.0', 'mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl', 'matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); my $fact = Bio::Tools::Run::TribeMCL->new(@params); # OR $fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); $fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl'); # to run my $fact = Bio::Tools::Run::TribeMCL->new(@params); # Run the program # returns an array reference to clusters where members are the ids # for example :2 clusters with 3 members per cluster: # $fam = [ [mem1 mem2 mem3],[mem1 mem2 mem3]] # pass in either the blastfile path/searchio obj/the array ref to scores my $fam = $fact->run($sio); # print out your clusters for (my $i = 0; $i [$i]})." members\n"; foreach my $member (@{$fam->[$i]}){ print "\t$member\n"; } } =head1 DESCRIPTION TribeMCL is a method for clustering proteins into related groups, which are termed 'protein families'. This clustering is achieved by analysing similarity patterns between proteins in a given dataset, and using these patterns to assign proteins into related groups. In many cases, proteins in the same protein family will have similar functional properties. TribeMCL uses a novel clustering method (Markov Clustering or MCL) which solves problems which normally hinder protein sequence clustering. Reference: Enright A.J., Van Dongen S., Ouzounis C.A; Nucleic Acids Res. 30(7):1575-1584 (2002) You will need tribe-matrix (the program used to generate the matrix for input into mcl) and mcl (the clustering software) available at: http://www.ebi.ac.uk/research/cgg/tribe/ or http://micans.org/mcl/. Future Work in this module: Port the tribe-matrix program into perl so that we can enable have a SearchIO kinda module for reading and writing mcl data format =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a "_". =cut # Let the code begin... package Bio::Tools::Run::TribeMCL; use vars qw($AUTOLOAD @ISA $PROGRAMDIR @TRIBEMCL_PARAMS @MATRIXPROGRAM_PARAMS @MCLPROGRAM_PARAMS @OTHER_SWITCHES %OK_FIELD $MATRIXPROGRAM_NAME $MCLPROGRAM_NAME $MCLPROGRAM $MATRIXPROGRAM ); use strict; use Bio::SeqIO; use Bio::Root::Root; use Bio::Root::IO; use Bio::Cluster::SequenceFamily; use Bio::Factory::ApplicationFactoryI; use Bio::Tools::Run::WrapperBase; use Bio::Annotation::DBLink; use Bio::Seq; use Bio::Species; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); # You will need to enable mcl and tribe-matrix to use this wrapper. This # can be done in (at least) two ways: # # 1. define an environmental variable TRIBEDIR # export TRIBEDIR =/usr/local/share/mclbin/ # where the tribe-matrix and mcl programs are located. #you probably need to copy them individually to the same directory #if the installation puts them in different directories. or simply put them in #your standard /usr/local/bin # # 2. include a definition of an environmental variable TRIBEDIR in # every script that will use TRIBEMCL.pm # $ENV{TRIBEDIR} = '/usr/local/share/mclbin/; # #3. Manually set the path to the executabes in your code: # #my @params=('inputtype'=>'blastfile',I=>'2.0',' # mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl',' # matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); #my $fact = Bio::Tools::Run::TribeMCL->new(@params); #or #$fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); #$fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl'); BEGIN { $MATRIXPROGRAM_NAME = 'tribe-matrix'; $MCLPROGRAM_NAME = 'mcl'; if (defined $ENV{TRIBEDIR}) { $PROGRAMDIR = $ENV{TRIBEDIR} || ''; $MCLPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MCLPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':'')); $MATRIXPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MATRIXPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':'')); } @TRIBEMCL_PARAMS = qw(inputtype hsp hit scorefile blastfile description_file searchio pairs mcl matrix weight description family_tag use_db); @MATRIXPROGRAM_PARAMS = qw(ind out chunk); @MCLPROGRAM_PARAMS = qw(I t P R pct o); @OTHER_SWITCHES = qw(verbose quiet); # Authorize attribute fields foreach my $attr (@TRIBEMCL_PARAMS, @MATRIXPROGRAM_PARAMS, @MCLPROGRAM_PARAMS, @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } } sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); my ($attr, $value); while (@args) { $attr = shift @args; $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters if ($attr =~/MCL/i) { $self->mcl_executable($value); next; } if ($attr =~ /MATRIX/i){ $self->matrix_executable($value); next; } $self->$attr($value); } defined($self->weight) || $self->weight(200); return $self; } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 mcl_executable Title : mcl_executable Usage : $self->mcl_executable() Function: get set for path to mcl executable Returns : String or undef if not installed Args : [optional] string of path to executable [optional] boolean to warn on missing executable status =cut sub mcl_executable{ my ($self,$exe,$warn) = @_; if( defined $exe ) { $self->{'_mcl_exe'} = $exe; } unless( defined $self->{'_mcl_exe'} ) { # this would be the name set in the BEGIN block if( $MCLPROGRAM && -e $MCLPROGRAM && -x $MCLPROGRAM ) { $self->{'_mcl_exe'} = $MCLPROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($MCLPROGRAM_NAME) ) && -x $exe ) { $self->{'_mcl_exe'} = $exe; } else { $self->warn("Cannot find executable for $MCLPROGRAM_NAME") if $warn; $self->{'_mcl_exe'} = undef; } } } $self->{'_mcl_exe'}; } =head2 matrix_executable Title : matrix_executable Usage : $self->matrix_executable() Function: get set for path to tribe-matrix executable Returns : String or undef if not installed Args : [optional] string of path to executable [optional] boolean to warn on missing executable status =cut sub matrix_executable{ my ($self,$exe,$warn) = @_; if( defined $exe ) { $self->{'_matrix_exe'} = $exe; } unless( defined $self->{'_matrix_exe'} ) { # this would be the name set in the BEGIN block if( $MATRIXPROGRAM && -e $MATRIXPROGRAM && -x $MATRIXPROGRAM ) { $self->{'_matrix_exe'} = $MATRIXPROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($MATRIXPROGRAM_NAME) ) && -x $exe ) { $self->{'_matrix_exe'} = $exe; } else { $self->warn("Cannot find executable for $MATRIXPROGRAM_NAME") if $warn; $self->{'_matrix_exe'} = undef; } } } $self->{'_matrix_exe'}; } =head2 run Title : run Usage : $self->run() Function: runs the clustering Returns : Array Ref of clustered Ids Args : =cut sub run { my ($self,$input) = @_; if($self->description_file){ $self->_setup_description($self->description_file); } my $file = $self->_setup_input($input); defined($file) || $self->throw("Error setting up input "); #run tribe-matrix to generate matrix for mcl my ($index_file, $mcl_infile) = $self->_run_matrix($file); $self->throw("tribe-matrix not run properly as index file is missing") unless (-e $index_file); $self->throw("tribe-matrix not run properly as matrix file is missing") unless (-e $mcl_infile); #run mcl my $clusters = $self->_run_mcl($index_file,$mcl_infile); my $families; if($self->description){ my %consensus = $self->_consensifier($clusters); $families = $self->_generate_families($clusters,\%consensus); } else { $families = $self->_generate_families($clusters); } return @{$families}; } sub _generate_families { my ($self,$clusters,$consensus) = @_; my $family_tag = $self->family_tag || "TribeFamily"; my @fam; if($consensus){ my %description = %{$self->description}; my %consensus = %{$consensus}; for(my $i = 0; $i < scalar(@{$clusters}); $i++){ my @mem; foreach my $member (@{$clusters->[$i]}){ my $mem = Bio::Seq->new(-id=>$member, -alphabet=>"protein", -desc=>$description{$member}->[1]); my $annot = Bio::Annotation::DBLink->new(-database=>$description{$member}->[0], -primary_id=>$member); $mem->annotation->add_Annotation('dblink',$annot); #store species information my $taxon_str = $description{$member}->[2]; #parse taxon info into hash my %taxon; $taxon_str=~s/=;/=undef;/g if $taxon_str; %taxon = map{split '=',$_}split';',$taxon_str if $taxon_str; my $name = $taxon{'taxon_common_name'}; my @classification = $taxon{'taxon_classification'} ? split(':',$taxon{'taxon_classification'}) : (); my $tax_id = $taxon{'taxon_id'}; my $sub_species = $taxon{'taxon_sub_species'}; my $species = Bio::Species->new(); $species->common_name($name) if $name; #*** should this actually be scientific_name() ? $species->sub_species($sub_species) if $sub_species; $species->ncbi_taxid($tax_id) if $tax_id; $species->classification(@classification) if @classification; $mem->species($species); push @mem, $mem; } my $id = $family_tag."_".$i; my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id, -description=>$consensus{$i}{desc}, -annotation_score=>$consensus{$i}{conf}, -members=>\@mem); push @fam, $fam; } return \@fam; } else { for(my $i = 0; $i < scalar(@{$clusters}); $i++){ my @mem; foreach my $member (@{$clusters->[$i]}){ my $mem = Bio::Seq->new(-id=>$member, -alphabet=>"protein"); push @mem, $mem; } my $id = $family_tag."_".$i; my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id, -members=>\@mem); push @fam, $fam; } return \@fam; } } sub _consensifier { my ($self,$clusters) = @_; eval { require "Algorithm/Diff.pm"; }; if($@){ $self->warn("Algorith::Diff is needed to run TribeMCL"); return undef; } my %description = %{$self->description}; my %consensus; my $best_annotation; my %use_db; if($self->use_db){ foreach my $key(split(',',$self->use_db)){ $use_db{$key}++; } } CLUSTER: for(my $i = 0; $i < scalar(@{$clusters}); $i++){ my @desc; my @orig_desc; my $total_members = scalar(@{$clusters->[$i]}); foreach my $member(@{$clusters->[$i]}){ #if specify which dbs to use for consensifying if($self->use_db){ if($use_db{$description{$member}->[0]}){ push @desc, $description{$member}->[1] if $description{$member}->[1]; push @orig_desc, $description{$member}->[1] if $description{$member}->[1]; } } else { push @desc, $description{$member}->[1] if $description{$member}->[1]; push @orig_desc, $description{$member}->[1] if $description{$member}->[1]; } } if($#desc < 0){ #truly unknown $consensus{$i}{desc} = "UNKNOWN"; $consensus{$i}{conf} = 0; next CLUSTER; } if($#desc == 0){#only a single description $consensus{$i}{desc} = grep(/S+/,@desc); $consensus{$i}{desc} = $consensus{$i}{desc} || "UNKNOWN"; if ($consensus{$i}{desc} eq "UNKNOWN") { $consensus{$i}{conf} = 0; } else { $consensus{$i}{conf} = 100 * int(1/$total_members); } next CLUSTER; } #all the same desc my %desc = (); foreach my $desc (@desc) { $desc{$desc}++; } if ( (keys %desc) == 1 ) { my ($best_annotation,) = keys %desc; my $n = grep($_ eq $best_annotation, @desc); my $perc= int( 100*($n/$total_members) ); $consensus{$i}{desc} = $best_annotation; $consensus{$i}{conf} = $perc; next CLUSTER; } my %lcshash = (); my %lcnext = (); while (@desc) { # do an all-against-all LCS (longest commong substring) of the # descriptions of all members; take the resulting strings, and # again do an all-against-all LCS on them, until we have nothing # left. The LCS's found along the way are in lcshash. # # Incidentally, longest common substring is a misnomer, since it # is not guaranteed to occur in either of the original strings. It # is more like the common parts of a Unix diff ... for (my $i=0;$i<@desc;$i++) { for (my $j=$i+1;$j<@desc;$j++){ my @list1=split(" ",$desc[$i]); my @list2=split(" ",$desc[$j]); my @lcs=Algorithm::Diff::LCS(\@list1,\@list2); my $lcs=join(" ",@lcs); $lcshash{$lcs}=1; $lcnext{$lcs}=1; } } @desc=keys(%lcnext); undef %lcnext; } my ($best_score, $best_perc)=(0, 0); my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ; foreach my $candidate_consensus (@all_cands) { my @temp=split(" ",$candidate_consensus); my $length=@temp; # num of words in annotation # see how many members of cluster contain this LCS: my ($lcs_count)=0; foreach my $orig_desc (@orig_desc) { my @list1=split(" ",$candidate_consensus); my @list2=split(" ",$orig_desc); my @lcs=Algorithm::Diff::LCS(\@list1,\@list2); my $lcs=join(" ",@lcs); if ($lcs eq $candidate_consensus || index($orig_desc,$candidate_consensus) != -1 # addition; # many good (single word) annotations fall out otherwise ) { $lcs_count++; # Following is occurs frequently, as LCS is _not_ the longest # common substring ... so we can't use the shortcut either # if ( index($orig_desc,$candidate_consensus) == -1 ) { # warn "lcs:'$lcs' eq cons:'$candidate_consensus' and # orig:'$orig_desc', but index == -1\n" # } } } my $perc_with_desc=(($lcs_count/$total_members))*100; my $perc=($lcs_count/$total_members)*100; my $score=$perc + ($length*14); # take length into account as well $score = 0 if $length==0; if (($perc_with_desc >= 40) && ($length >= 1)) { if ($score > $best_score) { $best_score=$score; $best_perc=$perc; $best_annotation=$candidate_consensus; } } } if ($best_perc==0 || $best_perc >= 100 ) { $best_perc='NA'; } if ($best_annotation eq '') { $best_annotation = 'AMBIGUOUS'; } $consensus{$i}{desc} = $best_annotation; $consensus{$i}{conf} = $best_perc; } return %consensus; } sub _setup_description { my ($self,$file) = @_; my $goners='().-'; my $spaces= ' ' x length($goners); my $filter = "tr '$goners' '$spaces' < $file"; open (FILE,"$filter | ") || die "$filter: $!"; my %description; while(){ chomp; my ($db,$acc,$description,$taxon_str) = split("\t",$_); $description || $self->throw("Wrongly formated description file"); $description = $self->_apply_edits($description); if($description{$acc}){ $self->warn("Duplicated entry $acc in description file, overwriting.."); } $description{$acc} = [$db,$description,$taxon_str]; } $self->description(\%description); } sub as_words { #add ^ and $ to regexp my (@words); my @newwords=(); foreach my $word (@words) { push @newwords, "^$word\$" }; } sub _apply_edits { my ($self,$desc) = @_; my @deletes = ( 'FOR\$', 'SIMILAR TO\$', 'SIMILAR TO PROTEIN\$', 'RIKEN.*FULL.*LENGTH.*ENRICHED.*LIBRARY', '\w*\d{4,}','HYPOTHETICAL PROTEIN' ); my @newwords = &as_words(qw(NOVEL PUTATIVE PREDICTED UNNAMED UNNMAED ORF CLONE MRNA CDNA EST RIKEN FIS KIAA\d+ \S+RIK IMAGE HSPC\d+ FOR HYPOTETICAL HYPOTHETICAL)); push @deletes, @newwords; foreach my $re ( @deletes ) { $desc=~s/$re//g; } #Apply some fixes to the annotation: $desc=~s/EC (\d+) (\d+) (\d+) (\d+)/EC $1.$2.$3.$4/; $desc=~s/EC (\d+) (\d+) (\d+)/EC $1.$2.$3.-/; $desc=~s/EC (\d+) (\d+)/EC $1\.$2.-.-/; $desc=~s/(\d+) (\d+) KDA/$1.$2 KDA/; return $desc; } =head2 _run_mcl Title : _run_mcl Usage : $self->_run_mcl() Function: internal function for running the mcl program Returns : Array Ref of clustered Ids Args : Index_file name, matrix input file name =cut sub _run_mcl { my ($self,$ind_file,$infile) = @_; my $exe = $self->mcl_executable || $self->throw("mcl not found."); my $cmd = $exe . " $infile"; unless (defined $self->o) { my ($fh,$o) = $self->io->tempfile(-dir=>$self->tempdir); $self->o($o); # file handle not use later so deleted close($fh); undef $fh; } unless (defined $self->I) { $self->I(3.0); } foreach my $param (@MCLPROGRAM_PARAMS) { if (defined $self->$param) { $cmd .= " -$param ".$self->$param; } } if($self->quiet || ($self->verbose < 0)){ $cmd .= " -V all"; if( $^O !~ /Mac/) { my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd .= " 2> $null"; } } my $status = system($cmd); $self->throw( "mcl call ($cmd) crashed: $? \n") unless $status==0; my $families = $self->_parse_mcl($ind_file,$self->o); return $families; } =head2 _run_matrix Title : _run_matrix Usage : $self->_run_matrix() Function: internal function for running the tribe-matrix program Returns : index filepath and matrix file path Args : filepath of parsed ids and scores =cut sub _run_matrix { my ($self,$parse_file) = @_; my $exe = $self->matrix_executable || $self->throw("tribe-matrix not found."); my $cmd = $exe . " $parse_file"; unless (defined $self->ind) { my ($fh,$indexfile) = $self->io->tempfile(-dir=>$self->tempdir); $self->ind($indexfile); # file handle not use later so deleted close($fh); undef $fh; } unless (defined $self->out) { my ($fh,$matrixfile) = $self->io->tempfile(-dir=>$self->tempdir); $self->out($matrixfile); # file handle not use later so deleted close($fh); undef $fh; } foreach my $param (@MATRIXPROGRAM_PARAMS) { if (defined $self->$param) { $cmd .= " -$param ".$self->$param; } } my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; $cmd .= " > $null"; my $status = system($cmd); $self->throw( "tribe-matrix call ($cmd) crashed: $? \n") unless $status==0; return ($self->ind,$self->out); } =head2 _setup_input Title : _setup_input Usage : $self->_setup_input() Function: internal function for running setting up the inputs needed for running mcl Returns : filepath of parsed ids and scores Args : =cut sub _setup_input { my ($self,$input) = @_; my ($type,$rc); my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); $type = $self->inputtype(); if($type=~/scorefile/i){ -e $self->scorefile || $self->throw("Scorefile doesn't seem to exist or accessible"); return $self->scorefile; } if($type =~/blastfile/i){ $self->blastfile($input); $rc = $self->_parse_blastfile($self->blastfile,$tfh); } elsif($type=~/searchio/i){ $self->searchio($input); $rc = $self->_get_from_searchio($self->searchio,$tfh); } elsif($type=~/pairs/i) { $self->pairs($input); for my $line (@{ $self->pairs }){ print $tfh join("\t",@{$line}), "\n"; $rc++; } } elsif($type =~/hsp/i) { $self->hsp($input); $rc = $self->_get_from_hsp($self->hsp,$tfh); } elsif($type=~/hit/i){ $self->hit($input); $rc = $self->_get_from_hit($self->hit,$tfh); } else { $self->throw("Must set inputtype to either blastfile,searchio or ". "paris using \$fact->blastfile |\$fact->searchio| \$fact->pairs"); } unless ( $rc ) { $self->throw("Need inputs for running tribe mcl, nothing provided"); } close($tfh); $tfh= undef; return $outfile; } =head2 _get_from_hsp Title : _get_from_hsp Usage : $self->_get_from_hsp() Function: internal function for getting blast scores from hsp Returns : array ref to ids and score [protein1 protein2 magnitude factor] Args : L =cut sub _get_from_hsp { my ($self,$hsp,$tfh) = @_; my @array; my $count; foreach my $pair (@{$hsp}){ my $sig = $pair->score; $sig =~ s/^e-/1e-/g; my $expect=sprintf("%e",$sig); if ($expect==0){ my $wt = $self->weight; $expect=sprintf("%e","1e-$wt"); } my $first=(split("e-",$expect))[0]; my $second=(split("e-",$expect))[1]; print $tfh join("\t", $pair->feature1->seq_id, $pair->feature2->seq_id,int($first), int($second) ), "\n"; $count++; } return $count; } sub _get_from_hit { my ($self,$hit,$tfh) = @_; my $count; foreach my $pair(@{$hit}){ my $sig = $pair->raw_score; $sig =~s/^e-/1e-/g; my $expect = sprintf("%e",$sig); if ($expect==0){ my $wt = $self->weight; $expect=sprintf("%e","1e-$wt"); } my $first=(split("e-",$expect))[0]; my $second=(split("e-",$expect))[1]; print $tfh join("\t",$pair->name,$pair->description,int($first),int($second)),"\n"; $count++; } return $count; } =head2 _get_from_searchio Title : _get_from_searchio Usage : $self->_get_from_searchio() Function: internal function for parsing blast scores from searchio object Returns : array ref to ids and score [protein1 protein2 magnitude factor] Args : L =cut sub _get_from_searchio { my ($self,$sio,$tfh) = @_; my @array; my $count; while( my $result = $sio->next_result ) { while( my $hit = $result->next_hit ) { while( my $hsp = $hit->next_hsp ) { my $sig = $hsp->evalue; $sig =~ s/^e-/1e-/g; my $expect=sprintf("%e",$sig); if ($expect==0){ my $wt = $self->weight; $expect=sprintf("%e","1e-$wt"); } my $first=(split("e-",$expect))[0]; my $second=(split("e-",$expect))[1]; print $tfh join("\t", $hsp->feature1->seq_id, $hsp->feature2->seq_id, int($first), int($second) ), "\n"; $count++; } } } return $count; } =head2 _parse_blastfile Title : _parse_blastfile Usage : $self->_parse_blastfile() Function: internal function for quickly parsing blast evalue scores from raw blast output file Returns : array ref to ids and score [protein1 protein2 magnitude factor] Args : file path =cut sub _parse_blastfile { my ($self, $file,$tfh) = @_; open(FILE,$file) || $self->throw("Cannot open Blast Output File"); my ($query,$reference,$first,$second); my @array; my $count; my $weight = $self->weight; while(){ if(/Query=\s+(\S+)/){ $query = $1; } if(/^>(\S+)/){ $reference = $1; } if (/Expect = (\S+)/){ my $raw=$1; $raw=~s/^e-/1e-/g; my $expect=sprintf("%e",$raw); if ($expect==0){ $expect=sprintf("%e","1e-$weight"); } $first=(split("e-",$expect))[0]; $second=(split("e-",$expect))[1]; print $tfh join("\t", $query, $reference, int($first), int($second)), "\n"; $count++; } } return $count; } =head2 _parse_mcl Title : _parse_mcl Usage : $self->_parse_mcl() Function: internal function for quickly parsing mcl output and generating the array of clusters Returns : Array Ref of clustered Ids Args : index file path, mcl output file path =cut sub _parse_mcl { my ($self,$ind,$mcl) = @_; open (MCL,$mcl) || $self->throw("Error, cannot open $mcl for parsing"); my $i =-1; my $start; my (@cluster,@out); while() { if ($start) { chomp($_); $cluster[$i] = join(" ",$cluster[$i],"$_"); } if(/^\d+/){ $start = 1; $i++; $cluster[$i] = join(" ",$cluster[$i] || '',"$_"); } if (/\$$/){ $start = 0; } last if /^\(mclruninfo/; } open (IND,$ind) || $self->throw("Cannot open $ind for parsing"); my %hash; while(){ /^(\S+)\s+(\S+)/; $hash{$1}=$2; } for (my $j=0;$j<$i+1;$j++) { my @array=split(" ",$cluster[$j]); for (my $p=1;$p<$#array;$p++){ push @{$out[$array[0]]}, $hash{$array[$p]}; } } return \@out; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/Vista.pm000066400000000000000000000517631302566030400221540ustar00rootroot00000000000000# Please direct questions and support issues to # # Cared for by Shawn Hoon # # Copyright Shawn Hoon # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::Vista Wrapper for Vista =head1 SYNOPSIS use Bio::Tools::Run::Vista; use Bio::Tools::Run::Alignment::Lagan; use Bio::AlignIO; my $sio = Bio::SeqIO->new(-file=>$ARGV[0],-format=>'genbank'); my @seq; my $reference = $sio->next_seq; push @seq, $reference; while(my $seq = $sio->next_seq){ push @seq,$seq; } my @features = grep{$_->primary_tag eq 'CDS'} $reference->get_SeqFeatures; my $lagan = Bio::Tools::Run::Alignment::Lagan->new; my $aln = $lagan->mlagan(\@seq,'(fugu (mouse human))'); my $vis = Bio::Tools::Run::Vista->new('outfile'=>"outfile.pdf", 'title' => "My Vista Plot", 'annotation'=>\@features, 'annotation_format'=>'GFF', 'min_perc_id'=>75, 'min_length'=>100, 'plotmin' => 50, 'tickdist' => 2000, 'window'=>40, 'numwindows'=>4, 'start'=>50, 'end'=>1500, 'tickdist'=>100, 'bases'=>1000, 'java_param'=>"-Xmx128m", 'num_pages'=>1, 'color'=> {'EXON'=>'100 0 0', 'CNS'=>'0 0 100'}, 'quiet'=>1); my $referenceid= 'human'; $vis->run($aln,$referenceid); #alternative one can choose pairwise alignments to plot #where the second id in each pair is the reference sequence $vis->run($aln,([mouse,human],[fugu,human],[mouse,fugu])); =head1 DESCRIPTION Pls see Vista documentation for plotfile options Wrapper for Vista : C. Mayor, M. Brudno, J. R. Schwartz, A. Poliakov, E. M. Rubin, K. A. Frazer, L. S. Pachter, I. Dubchak. VISTA: Visualizing global DNA sequence alignments of arbitrary length. Bioinformatics, 2000 Nov;16(11):1046-1047. Get it here: http://www-gsd.lbl.gov/vista/VISTAdownload2.html On the command line, it is assumed that this can be executed: java Vista plotfile Some of the code was adapted from MLAGAN toolkit M. Brudno, C.B. Do, G. Cooper, M.F. Kim, E. Davydov, NISC Sequencing Consortium, E.D. Green, A. Sidow and S. Batzoglou LAGAN and Multi-LAGAN: Efficient Tools for Large-Scale Multiple Alignment of Genomic DNA, Genome Research, in press get lagan here: http://lagan.stanford.edu/ =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Shawn Hoon Email shawnh@fugu-sg.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::Vista; use vars qw($AUTOLOAD @ISA %DEFAULT_VALUES %EPONINE_PARAMS @VISTA_PARAMS $EPOJAR $JAVA $PROGRAMDIR $PROGRAMNAME $PROGRAM %OK_FIELD); use strict; use Bio::Root::Root; use Bio::Seq; use Bio::Root::IO; use Bio::Tools::Run::WrapperBase; use File::Copy; @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); BEGIN { $PROGRAMNAME = 'java'; if( ! defined $PROGRAMDIR ) { $PROGRAMDIR = $ENV{'JAVA_HOME'} || $ENV{'JAVA_DIR'}; } if (defined $PROGRAMDIR) { foreach my $progname ( [qw(java)],[qw(bin java)] ) { my $f = Bio::Root::IO->catfile($PROGRAMDIR, @$progname); if( -e $f && -x $f ) { $PROGRAM = $f; last; } } } %DEFAULT_VALUES= ('java' => 'java', 'min_perc_id' => 75, 'min_length' => 100, 'plotmin' => 50, 'bases' => 10000, 'tickdist' => 2000, 'resolution'=> 25, 'window' => 40, 'title' => 'VISTA PLOT', 'numwindows'=>4); @VISTA_PARAMS=qw(JAVA JAVA_PARAM OUTFILE MIN_PERC_ID QUIET VERBOSE ANNOTATION_FORMAT REGION_FILE REGION_FILE_DIR SCORE_FILE SCORE_FILE_DIR ALIGNMENT_FILE_DIR ALIGNMENT_FILE CONTIGS_FILE DIFFS PLOTFILE MIN_LENGTH PLOTMIN ANNOTATION BASES TICKDIST RESOLUTION TITLE PAPER WINDOW NUMWINDOWS START END NUM_PLOT_LINES LEGEND FILENAME NUM_PAGES AXIS_LABEL TICKS_FILE COLOR USE_ORDER GAPS SNPS_FILE REPEATS_FILE FILTER_REPEATS); foreach my $attr ( @VISTA_PARAMS) { $OK_FIELD{$attr}++; } } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $self->debug( "************ attr: $attr\n"); $attr =~ s/.*:://; $attr = uc $attr; $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; $self->{$attr} = shift if @_; return $self->{$attr}; } =head2 new Title : new Usage : my $vis = Bio::Tools::Run::Vista->new('outfile'=>$out, 'title' => "My Vista Plot", 'annotation'=>\@features, 'annotation_format'=>'GFF', 'min_perc_id'=>75, 'min_length'=>100, 'plotmin' => 50, 'tickdist' => 2000, 'window'=>40, 'numwindows'=>4, 'start'=>50, 'end'=>1500, 'tickdist'=>100, 'bases'=>1000, 'color'=> {'EXON'=>'100 0 0', 'CNS'=>'0 0 100'}, 'quiet'=>1); Function: Construtor for Vista wrapper Args : outfile - location of the pdf generated annotation - either a file or and array ref of Bio::SeqFeatureI indicating the exons regmin -region min =cut sub new { my ($caller, @args) = @_; # chained new my $self = $caller->SUPER::new(@args); # so that tempfiles are cleaned up foreach my $key(keys %DEFAULT_VALUES){ $self->$key($DEFAULT_VALUES{$key}); } while (@args) { my $attr = shift @args; my $value = shift @args; next if( $attr =~ /^-/ ); # don't want named parameters $self->$attr($value); } return $self; } =head2 java Title : java Usage : $obj->java('/usr/opt/java130/bin/java'); Function: Get/set method for the location of java VM Args : File path (optional) =cut sub executable { shift->java(@_); } sub java { my ($self, $exe,$warn) = @_; if( defined $exe ) { $self->{'_pathtojava'} = $exe; } unless( defined $self->{'_pathtojava'} ) { if( $PROGRAM && -e $PROGRAM && -x $PROGRAM ) { $self->{'_pathtojava'} = $PROGRAM; } else { my $exe; if( ( $exe = $self->io->exists_exe($PROGRAMNAME) ) && -x $exe ) { $self->{'_pathtojava'} = $exe; } else { $self->warn("Cannot find executable for $PROGRAMNAME") if $warn; $self->{'_pathtojava'} = undef; } } } $self->{'_pathtojava'}; } =head2 run Title : run Usage : my @genes = $self->run($seq) Function: runs Vista Returns : A boolean 1 if no errors Args : Argument 1: Bio::Align::Align required Argument 2: a string or number, which is the sequence id of the reference sequence or the rank of the sequence in the alignment =cut sub run{ my ($self,$align,$ref) = @_; $ref ||=1; my $infile = $self->_setinput($align,$ref); return $self->_run_Vista($infile); } =head2 _setinput Title : _setinput Usage : Internal function, not to be called directly Function: writes input sequence to file and return the file name Example : Returns : string Args : =cut sub _setinput { my ($self,$sim_aln,$ref) = @_; my($pairs,$files) = $self->_mf2bin($sim_aln,$ref); my $plotfile = $self->_make_plotfile($sim_aln,$pairs,$files); return $plotfile; } sub _parse_multi_fasta { my ($self,$file) = @_; my %seq; open(FASTA, $file) || $self->throw("Couldn't open $file"); my $last; my $count = 0; while (my $line = ) { chomp $line; next if $line=~/^$/; if (substr($line, 0, 1) eq ">") { $_ = substr($line, 1); /\w+/g; $seq{$&} = ""; $last = $&; } else { $seq{$last}.=$line; } print STDERR $count."\n"; $count++; } my @seq; foreach my $key(keys %seq){ my $seq = Bio::Seq->new(-id=>$key,-seq=>$seq{$key}); push @seq,$seq; } return @seq; } #adapted from mlagan utils mf2bin.pl sub _mf2bin { my ($self,$sim,$ref)= @_; my @seq; if(!ref $sim){ @seq = $self->_parse_multi_fasta($sim); } else { ($sim && $sim->isa("Bio::Align::AlignI")) || $self->throw("Expecting a Bio::Align::AlignI"); @seq = $sim->each_seq; } my $reference; my @files; my @pairs; if(ref($ref) eq 'ARRAY'){ my @ref; foreach my $set(@$ref){ my ($reference) = grep{$_->id eq $set->[1]}@seq; my ($other) = grep{$_->id eq $set->[0]}@seq; my ($pair,$file) = $self->_pack_bin($reference,$other); push @pairs, @$pair; push @files, @$file; push @ref,$set->[1]; } $self->_coordinate(\@ref); return \@pairs,\@files; } #figure out the reference sequence elsif($ref =~/^\d+$/){ #its a rank index $reference = $seq[$ref-1]; my $tmp = $ref; $ref = $reference->id; splice @seq,($tmp-1),1; } else { #its an id foreach my $i(0..$#seq){ if($seq[$i]->id =~/$ref/){ $reference = $seq[$i]; splice @seq,($i),1; last; } } } $self->_coordinate([$ref]); # pack bin # format from Alex Poliakov's glass2bin.pl script my %base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5, 'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5); my @ref= (split ('',$reference->seq)); foreach my $seq2(@seq){ my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); my @seq2= (split('', $seq2->seq)); foreach my $index(0..$#ref){ unless($ref[$index] eq '-' && $seq2[$index] eq '-'){ print $tfh1 pack("H2",$base_code{$ref[$index]}.$base_code{$seq2[$index]}); } } close ($tfh1); undef ($tfh1); push @files, $outfile; push @pairs,[$reference->id,$seq2->id]; } return \@pairs,\@files; } sub _pack_bin { my ($self,$first,$sec) = @_; my @first = (split('',$first->seq)); my @sec = (split('',$sec->seq)); # pack bin # format from Alex Poliakov's glass2bin.pl script my %base_code = ('-' => 0, 'A' => 1, 'C' => 2, 'T' => 3, 'G' => 4, 'N' => 5, 'a' => 1, 'c' => 2, 't' => 3, 'g' => 4, 'n' => 5); my @files; my @pairs; my ($tfh1,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); foreach my $index(0..$#first){ unless($first[$index] eq '-' && $sec[$index] eq '-'){ print $tfh1 pack("H2",$base_code{$first[$index]}.$base_code{$sec[$index]}); } } close ($tfh1); undef ($tfh1); push @files, $outfile; push @pairs,[$first->id,$sec->id]; return \@pairs,\@files; } sub _make_plotfile { my ($self,$sim_aln,$pairs,$files) = @_; my ($tfh1,$plotfile) = $self->io->tempfile(-dir=>$self->tempdir); my @ids = map{$_->id}$sim_aln->each_seq; print $tfh1 "TITLE ".$self->title."\n\n"; print $tfh1 "OUTPUT ".$self->outfile."\n\n" ; print $tfh1 "SEQUENCES "; print $tfh1 join(" ",@ids)."\n\n"; foreach my $index(0..$#$pairs){ print $tfh1 "ALIGN ".$files->[$index]." BINARY\n"; print $tfh1 " SEQUENCES ".$pairs->[$index]->[0]." ".$pairs->[$index]->[1]."\n"; print $tfh1 " REGIONS ".$self->min_perc_id." ".$self->min_length."\n"; print $tfh1 " MIN ".$self->plotmin."\n"; print $tfh1 " DIFFS ". $self->diffs ."\n\n" if $self->diffs; if($self->region_file||$self->region_file_dir){ my $file = " REGION_FILE "; $file.=$self->region_file_dir."/" if $self->region_file_dir; $file.=$pairs->[$index]->[0]."_".$pairs->[$index]->[1].".region\n\n"; print $tfh1 $file; } if($self->score_file || $self->score_file_dir){ my $file = " SCORE_FILE "; $file.=$self->score_file_dir."/" if $self->score_file_dir; $file.=$pairs->[$index]->[0]."_".$pairs->[$index]->[1].".score\n\n"; print $tfh1 $file; } if($self->alignment_file || $self->alignment_file_dir){ my $file = " ALIGNMENT_FILE "; $file.=$self->alignment_file_dir."/" if $self->alignment_file_dir; $file.=$pairs->[$index]->[0]."_".$pairs->[$index]->[1].".alignment\n\n"; print $tfh1 $file; } print $tfh1 " CONTIGS_FILE ". $self->contigs_file ."\n\n" if $self->contigs_file; print $tfh1 " USE_ORDER ". $self->use_order."\n\n" if $self->use_order; print $tfh1 "END \n\n"; } my $annotation_file; if((ref $self->annotation eq 'ARRAY')&& $self->annotation->[0]->isa("Bio::SeqFeatureI")){ $annotation_file = $self->_dump2gff($self->annotation); $self->annotation_format('GFF'); } elsif($self->annotation){ $annotation_file = $self->annotation; } $annotation_file .= " GFF" if $self->annotation_format=~/GFF/i; print $tfh1 "GENES ".$annotation_file." \n\n" if $annotation_file; print $tfh1 "LEGEND on\n\n"; print $tfh1 "COORDINATE ".join(" ",@{$self->_coordinate})."\n\n"; print $tfh1 "PAPER letter\n\n"; print $tfh1 "BASES ".$self->bases."\n\n"; print $tfh1 "TICK_DIST ".$self->tickdist."\n\n"; print $tfh1 "RESOLUTION ".$self->resolution."\n\n"; print $tfh1 "WINDOW ".$self->window."\n\n"; print $tfh1 "NUM_WINDOWS ".$self->numwindows."\n\n"; print $tfh1 "AXIS_LABEL ".$self->axis_label ."\n\n" if $self->axis_label; print $tfh1 "TICKS_FILE ".$self->ticks_file ."\n\n" if $self->ticks_file; print $tfh1 "SNPS_FILE"." ".$self->snps_file."\n\n" if $self->snps_file; print $tfh1 "GAPS ".$self->gaps ."\n\n"if $self->gaps; print $tfh1 "REPEATS_FILE ".$self->repeats_file ."\n\n" if $self->repeats_file; print $tfh1 "FILTER_REPEATS ".$self->filter_repeats ."\n\n" if $self->filter_repeats; print $tfh1 "NUM_PAGES ".$self->num_pages ."\n\n" if $self->num_pages; print $tfh1 "START ".$self->start ."\n\n" if $self->start; print $tfh1 "END ".$self->end ."\n\n" if $self->end; my $color = $self->color; if(ref $color eq 'HASH'){ foreach my $region_type (keys %$color){ print $tfh1 "COLOR ".$region_type." ".$color->{$region_type}."\n\n"; } } close ($tfh1); undef $tfh1; if($self->plotfile) {#saving plotfile copy($plotfile,$self->plotfile); } else { $self->plotfile($plotfile); } return $self->plotfile; } sub _dump2gff { my ($self,$feat) = @_; my ($tfh1,$file) = $self->io->tempfile(-dir=>$self->tempdir); foreach my $f(@$feat){ print $tfh1 $f->gff_string."\n"; } close ($tfh1); undef $tfh1; return $file; } sub _run_Vista { my ($self,$infile) = @_; #run Vista $self->debug( "Running Vista\n"); my $java = $self->java; my $param = $self->java_param || ''; my $cmd = $java." ".$param.' Vista '; $cmd .= " -q " if $self->quiet || $self->verbose < 0; $cmd .= " -d " if $self->debug; $cmd .= $infile; $self->debug($cmd); my $status = system ($cmd); $self->throw("Problem running Vista: $? \n") if $status != 0; return 1; } sub _coordinate { my ($self,$val) = @_; if($val){ $self->{'_coordinate'} = $val; } return $self->{'_coordinate'}; } =head2 outfile Title : outfile Usage : $obj->outfile Function : Get/Set method outfile Args : =cut =head2 min_perc_id Title : min_perc_id Usage : $obj->min_perc_id Function : Get/Set method min_perc_id Args : =cut =head2 quiet Title : quiet Usage : $obj->quiet Function : Get/Set method quiet Args : =cut =head2 verbose Title : verbose Usage : $obj->verbose Function : Get/Set method verbose Args : =cut =head2 annotation_format Title : annotation_format Usage : $obj->annotation_format Function : Get/Set method annotation_format Args : =cut =head2 region_file Title : region_file Usage : $obj->region_file Function : Get/Set method region_file Args : =cut =head2 score_file Title : score_file Usage : $obj->score_file Function : Get/Set method score_file Args : =cut =head2 alignment_file Title : alignment_file Usage : $obj->alignment_file Function : Get/Set method alignment_file Args : =cut =head2 contigs_file Title : contigs_file Usage : $obj->contigs_file Function : Get/Set method contigs_file Args : =cut =head2 diffs Title : diffs Usage : $obj->diffs Function : Get/Set method diffs Args : =cut =head2 plotfile Title : plotfile Usage : $obj->plotfile Function : Get/Set method plotfile Args : =cut =head2 min_length Title : min_length Usage : $obj->min_length Function : Get/Set method min_length Args : =cut =head2 plotmin Title : plotmin Usage : $obj->plotmin Function : Get/Set method plotmin Args : =cut =head2 annotation Title : annotation Usage : $obj->annotation Function : Get/Set method annotation Args : =cut =head2 bases Title : bases Usage : $obj->bases Function : Get/Set method bases Args : =cut =head2 tickdist Title : tickdist Usage : $obj->tickdist Function : Get/Set method tickdist Args : =cut =head2 resolution Title : resolution Usage : $obj->resolution Function : Get/Set method resolution Args : =cut =head2 title Title : title Usage : $obj->title Function : Get/Set method title Args : =cut =head2 window Title : window Usage : $obj->window Function : Get/Set method window Args : =cut =head2 numwindows Title : numwindows Usage : $obj->numwindows Function : Get/Set method numwindows Args : =cut =head2 start Title : start Usage : $obj->start Function : Get/Set method start Args : =cut =head2 end Title : end Usage : $obj->end Function : Get/Set method end Args : =cut =head2 num_plot_lines Title : num_plot_lines Usage : $obj->num_plot_lines Function : Get/Set method num_plot_lines Args : =cut =head2 legend Title : legend Usage : $obj->legend Function : Get/Set method legend Args : =cut =head2 filename Title : filename Usage : $obj->filename Function : Get/Set method filename Args : =cut =head2 axis_label Title : axis_label Usage : $obj->axis_label Function : Get/Set method axis_label Args : =cut =head2 ticks_file Title : ticks_file Usage : $obj->ticks_file Function : Get/Set method ticks_file Args : =cut =head2 color Title : color Usage : $obj->color Function : Get/Set method color Args : =cut =head2 use_order Title : use_order Usage : $obj->use_order Function : Get/Set method use_order Args : =cut =head2 gaps Title : gaps Usage : $obj->gaps Function : Get/Set method gaps Args : =cut =head2 snps_file Title : snps_file Usage : $obj->snps_file Function : Get/Set method snps_file Args : =cut =head2 repeats_file Title : repeats_file Usage : $obj->repeats_file Function : Get/Set method repeats_file Args : =cut =head2 filter_repeats Title : filter_repeats Usage : $obj->filter_repeats Function : Get/Set method filter_repeats Args : =cut 1; __END__ bioperl-run-release-1-7-1/lib/Bio/Tools/Run/WrapperBase.pm000066400000000000000000000334561302566030400233000ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::WrapperBase # # Please direct questions and support issues to # # Cared for by Jason Stajich # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::WrapperBase - A Base object for wrappers around executables =head1 SYNOPSIS # do not use this object directly, it provides the following methods # for its subclasses my $errstr = $obj->error_string(); my $exe = $obj->executable(); $obj->save_tempfiles($booleanflag) my $outfile= $obj->outfile_name(); my $tempdir= $obj->tempdir(); # get a temporary dir for executing my $io = $obj->io; # Bio::Root::IO object my $cleanup= $obj->cleanup(); # remove tempfiles $obj->run({-arg1 => $value}); =head1 DESCRIPTION This is a basic module from which to build executable wrapper modules. It has some basic methods to help when implementing new modules. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Jason Stajich Email jason-at-bioperl.org =head1 CONTRIBUTORS Sendu Bala, bix@sendu.me.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::WrapperBase; use strict; # Object preamble - inherits from Bio::Root::Root use base qw(Bio::Root::Root); use File::Spec; use File::Path qw(); # don't import anything =head2 run Title : run Usage : $wrapper->run({ARGS HERE}); Function: Support generic running with args passed in as a hashref Returns : Depends on the implementation, status OR data Args : hashref of named arguments =cut sub run { my ($self,@args) = @_; $self->throw_not_implemented(); } =head2 error_string Title : error_string Usage : $obj->error_string($newval) Function: Where the output from the last analysis run is stored. Returns : value of error_string Args : newvalue (optional) =cut sub error_string{ my ($self,$value) = @_; if( defined $value) { $self->{'_error_string'} = $value; } return $self->{'_error_string'} || ''; } =head2 arguments Title : arguments Usage : $obj->arguments($newval) Function: Commandline parameters Returns : value of arguments Args : newvalue (optional) =cut sub arguments { my ($self,$value) = @_; if(defined $value) { $self->{'_arguments'} = $value; } return $self->{'_arguments'} || ''; } =head2 no_param_checks Title : no_param_checks Usage : $obj->no_param_checks($newval) Function: Boolean flag as to whether or not we should trust the sanity checks for parameter values Returns : value of no_param_checks Args : newvalue (optional) =cut sub no_param_checks{ my ($self,$value) = @_; if( defined $value || ! defined $self->{'no_param_checks'} ) { $value = 0 unless defined $value; $self->{'no_param_checks'} = $value; } return $self->{'no_param_checks'}; } =head2 save_tempfiles Title : save_tempfiles Usage : $obj->save_tempfiles($newval) Function: Get/set the choice of if tempfiles in the temp dir (see tempdir()) are kept or cleaned up. Default is '0', ie. delete temp files. NB: This must be set to the desired value PRIOR to first creating a temp dir with tempdir(). Any attempt to set this after tempdir creation will get a warning. Returns : boolean Args : none to get, boolean to set =cut sub save_tempfiles{ my $self = shift; my @args = @_; if (($args[0]) && (exists ($self->{'_tmpdir'}))) { $self->warn ("Tempdir already created; setting save_tempfiles will not affect cleanup behavior."); } return $self->io->save_tempfiles(@_); } =head2 outfile_name Title : outfile_name Usage : my $outfile = $wrapper->outfile_name(); Function: Get/Set the name of the output file for this run (if you wanted to do something special) Returns : string Args : [optional] string to set value to =cut sub outfile_name{ my ($self,$nm) = @_; if( defined $nm || ! defined $self->{'_outfilename'} ) { $nm = 'mlc' unless defined $nm; $self->{'_outfilename'} = $nm; } return $self->{'_outfilename'}; } =head2 tempdir Title : tempdir Usage : my $tmpdir = $self->tempdir(); Function: Retrieve a temporary directory name (which is created) Returns : string which is the name of the temporary directory Args : none =cut sub tempdir{ my ($self) = shift; $self->{'_tmpdir'} = shift if @_; unless( $self->{'_tmpdir'} ) { $self->{'_tmpdir'} = $self->io->tempdir(CLEANUP => ! $self->save_tempfiles ); } unless( -d $self->{'_tmpdir'} ) { mkdir($self->{'_tmpdir'},0777); } return $self->{'_tmpdir'}; } =head2 cleanup Title : cleanup Usage : $wrapper->cleanup(); Function: Will cleanup the tempdir directory Returns : none Args : none =cut sub cleanup{ my ($self) = @_; $self->io->_io_cleanup(); if( defined $self->{'_tmpdir'} && -d $self->{'_tmpdir'} ) { my $verbose = ($self->verbose >= 1) ? 1 : 0; File::Path::rmtree( $self->{'_tmpdir'}, $verbose); } } =head2 io Title : io Usage : $obj->io($newval) Function: Gets a Bio::Root::IO object Returns : Bio::Root::IO object Args : none =cut sub io{ my ($self) = @_; unless( defined $self->{'io'} ) { $self->{'io'} = Bio::Root::IO->new(-verbose => $self->verbose); } return $self->{'io'}; } =head2 version Title : version Usage : $version = $wrapper->version() Function: Returns the program version (if available) Returns : string representing version of the program Args : [Optional] value to (re)set version string =cut sub version{ my ($self,@args) = @_; return; } =head2 executable Title : executable Usage : my $exe = $factory->executable(); Function: Finds the full path to the executable Returns : string representing the full path to the exe Args : [optional] name of executable to set path to [optional] boolean flag whether or not warn when exe is not found =cut sub executable { my ($self, $exe, $warn) = @_; if (defined $exe) { $self->{'_pathtoexe'} = $exe; } unless( defined $self->{'_pathtoexe'} ) { my $prog_path = $self->program_path; if ($prog_path) { if (-f $prog_path && -x $prog_path) { $self->{'_pathtoexe'} = $prog_path; } elsif ($self->program_dir) { $self->warn("executable not found in $prog_path, trying system path...") if $warn; } } unless ($self->{'_pathtoexe'}) { my $exe; if ( $exe = $self->io->exists_exe($self->program_name) ) { $self->{'_pathtoexe'} = $exe; } else { $self->warn("Cannot find executable for ".$self->program_name) if $warn; $self->{'_pathtoexe'} = undef; } } } # bail if we never found the executable unless ( defined $self->{'_pathtoexe'}) { $self->throw("Cannot find executable for ".$self->program_name . ". path=\"".$self->program_path."\""); } return $self->{'_pathtoexe'}; } =head2 program_path Title : program_path Usage : my $path = $factory->program_path(); Function: Builds path for executable Returns : string representing the full path to the exe Args : none =cut sub program_path { my ($self) = @_; my @path; push @path, $self->program_dir if $self->program_dir; push @path, $self->program_name.($^O =~ /mswin/i ? '.exe' : '') if $self->program_name; return File::Spec->catfile(@path); } =head2 program_dir Title : program_dir Usage : my $dir = $factory->program_dir(); Function: Abstract get method for dir of program. To be implemented by wrapper. Returns : string representing program directory Args : none =cut sub program_dir { my ($self) = @_; $self->throw_not_implemented(); } =head2 program_name Title : program_name Usage : my $name = $factory->program_name(); Function: Abstract get method for name of program. To be implemented by wrapper. Returns : string representing program name Args : none =cut sub program_name { my ($self) = @_; $self->throw_not_implemented(); } =head2 quiet Title : quiet Usage : $factory->quiet(1); if ($factory->quiet()) { ... } Function: Get/set the quiet state. Can be used by wrappers to control if program output is printed to the console or not. Returns : boolean Args : none to get, boolean to set =cut sub quiet { my $self = shift; if (@_) { $self->{quiet} = shift } return $self->{quiet} || 0; } =head2 _setparams() Title : _setparams Usage : $params = $self->_setparams(-params => [qw(window evalue_cutoff)]) Function: For internal use by wrapper modules to build parameter strings suitable for sending to the program being wrapped. For each method name supplied, calls the method and adds the method name (as modified by optional things) along with its value (unless a switch) to the parameter string Example : $params = $self->_setparams(-params => [qw(window evalue_cutoff)], -switches => [qw(simple large all)], -double_dash => 1, -underscore_to_dash => 1); If window() and simple() had not been previously called, but evalue_cutoff(0.5), large(1) and all(0) had been called, $params would be ' --evalue-cutoff 0.5 --large' Returns : parameter string Args : -params => [] or {} # array ref of method names to call, or hash ref where keys are method names and values are how those names should be output in the params string -switches => [] or {}# as for -params, but no value is printed for these methods -join => string # define how parameters and their values are joined, default ' '. (eg. could be '=' for param=value) -lc => boolean # lc() method names prior to output in string -dash => boolean # prefix all method names with a single dash -double_dash => bool # prefix all method names with a double dash -mixed_dash => bool # prefix single-character method names with a # single dash, and multi-character method names # with a double-dash -underscore_to_dash => boolean # convert all underscores in method names to dashes =cut sub _setparams { my ($self, @args) = @_; my ($params, $switches, $join, $lc, $d, $dd, $md, $utd) = $self->_rearrange([qw(PARAMS SWITCHES JOIN LC DASH DOUBLE_DASH MIXED_DASH UNDERSCORE_TO_DASH)], @args); $self->throw('at least one of -params or -switches is required') unless ($params || $switches); $self->throw("-dash, -double_dash and -mixed_dash are mutually exclusive") if (defined($d) + defined($dd) + defined($md) > 1); $join ||= ' '; my %params = ref($params) eq 'HASH' ? %{$params} : map { $_ => $_ } @{$params}; my %switches = ref($switches) eq 'HASH' ? %{$switches} : map { $_ => $_ } @{$switches}; my $param_string = ''; for my $hash_ref (\%params, \%switches) { while (my ($method, $method_out) = each %{$hash_ref}) { my $value = $self->$method(); next unless (defined $value); next if (exists $switches{$method} && ! $value); $method_out = lc($method_out) if $lc; my $method_length = length($method_out) if $md; $method_out = '-'.$method_out if ($d || ($md && ($method_length == 1))); $method_out = '--'.$method_out if ($dd || ($md && ($method_length > 1))); $method_out =~ s/_/-/g if $utd; if ( exists $params{$method} ) { # if value are quoted with " or ', re-quote it if ( $value =~ m{^[\'\"]+(.+)[\'\"]+$} ) { $value = '"'. $1 . '"'; } # quote values that contain spaces elsif ( $value =~ m{\s+} ) { $value = '"'. $value . '"'; } } $param_string .= ' '.$method_out.(exists $switches{$method} ? '' : $join.$value); } } return $param_string; } sub DESTROY { my $self= shift; unless ( $self->save_tempfiles ) { $self->cleanup(); } $self->SUPER::DESTROY(); } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/WrapperBase/000077500000000000000000000000001302566030400227275ustar00rootroot00000000000000bioperl-run-release-1-7-1/lib/Bio/Tools/Run/WrapperBase/CommandExts.pm000066400000000000000000001201551302566030400255130ustar00rootroot00000000000000# # BioPerl module for Bio::Tools::Run::WrapperBase::CommandExts # # Please direct questions and support issues to # # Cared for by Mark A. Jensen # # Copyright Mark A. Jensen # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::WrapperBase::CommandExts - Extensions to WrapperBase for handling programs with commands *ALPHA* =head1 SYNOPSIS Devs, see L. Users, see L. =head1 DESCRIPTION This is a developer-focused experimental module. The main idea is to extend L to make it relatively easy to create run wrappers around I of related programs, like C or C. Some definitions: =over =item * program The program is the command-line frontend application. C, for example, is run from the command line as follows: $ samtools view -bS in.bam > out.sam $ samtools faidx =item * command The command is the specific component of a suite run by executing the program. In the example above, C and C are commands. =item * command prefix The command prefix is an abbreviation of the command name used internally by C method, and sometimes by the user of the factory for specifying command line parameters to subcommands of composite commands. =item * composite command A composite command is a pipeline or script representing a series of separate executions of different commands. Composite commands can be specified by configuring C appropriately; the composite command can be run by the user from a factory in the same way as ordinary commands. =item * options, parameters, switches and filespecs An option is any command-line option; i.e., a specification set off by a command-line by a specifier (like C<-v> or C<--outfile>). Parameters are command-line options that accept a value (C<-title mydb>); switches are boolean flags (C<--no-filter>). Filespecs are barewords at the end of the command line that usually indicate input or output files. In this module, this includes files that capture STDIN, STDOUT, or STDERR via redirection. =item * pseudo-program A "pseudo-program" is a way to refer to a collection of related applications that are run independently from the command line, rather than via a frontend program. The C suite of programs is an example: C, C, etc. C can be configured to create a single factory for a suite of related, independent programs that treats each independent program as a "pseudo-program" command. =back This module essentially adds the non-assembler-specific wrapper machinery of fangly's L to the L namespace, adding the general command-handling capability of L. It creates run factories that are automatically Bio::ParameterBaseI compliant, meaning that C, C, C, C, and C are available. =head1 DEVELOPER INTERFACE C is currently set up to read particular package globals which define the program, the commands available, command-line options for those commands, and human-readable aliases for those options. The easiest way to use C is probably to create two modules: Bio::Tools::Run::YourRunPkg Bio::Tools::Run::YourRunPkg::Config The package globals should be defined in the C module, and the run package itself should begin with the following mantra: use YourRunPkg::Config; use Bio::Tools::Run::WrapperBase; use Bio::Tools::Run::WrapperBase::CommandExts; sub new { my $class = shift; my @args = @_; my $self = $class->SUPER::new(@args); ... return $self; } The following globals can/should be defined in the C module: $program_name $program_dir $use_dash $join @program_commands %command_prefixes @program_params @program_switches %param_translation %composite_commands %command_files See L for detailed descriptions. The work of creating a run wrapper with C lies mainly in setting up the globals. The key methods for the developer interface are: =over =item * program_dir($path_to_programs) Set this to point the factory to the executables. =item * _run(@file_args) Runs an instantiated factory with the given file args. Use in the C method override. =item * _create_factory_set() Returns a hash of instantiated factories for each true command from a composite command factory. The hash keys are the true command names, so you could do $cmds = $composite_fac->_create_factory_set; for (@true_commands) { $cmds->{$_}->_run(@file_args); } =item * executables($cmd,[$fullpath]) For pseudo-programs, this gets/sets the full path to the executable of the true program corresponding to the command C<$cmd>. =back =head2 Implementing Composite Commands =head2 Implementing Pseudo-programs To indicate that a package wraps disparate programs under a single pseudo program, use an asterisk before the program name: package Bio::Tools::Run::YourPkg::Config; ... our $program_name = '*blast+'; and C<_run> will know what to do. Specify the rest of the globals as if the desired programs were commands. Use the basename of the programs for the command names. If all the programs can be found in a single directory, just specify that directory in C. If not, use C to set the paths to each program explicitly: foreach (keys %cmdpaths) { $self->executables($_, $cmdpaths{$_}); } =head2 Config Globals Here is an example config file. Further details in prose are below. package Dummy::Config; use strict; use warnings; no warnings qw(qw); use Exporter; our (@ISA, @EXPORT, @EXPORT_OK); push @ISA, 'Exporter'; @EXPORT = qw( $program_name $program_dir $use_dash $join @program_commands %command_prefixes @program_params @program_switches %param_translation %command_files %composite_commands ); our $program_name = '*flurb'; our $program_dir = 'C:\cygwin\usr\local\bin'; our $use_dash = 'mixed'; our $join = ' '; our @program_commands = qw( rpsblast find goob blorb multiglob ); our %command_prefixes = ( blastp => 'blp', tblastn => 'tbn', goob => 'g', blorb => 'b', multiglob => 'm' ); our @program_params = qw( command g|narf g|schlurb b|scroob b|frelb m|trud ); our @program_switches = qw( g|freen b|klep ); our %param_translation = ( 'g|narf' => 'n', 'g|schlurb' => 'schlurb', 'g|freen' => 'f', 'b|scroob' => 's', 'b|frelb' => 'frelb' ); our %command_files = ( 'goob' => [qw( fas faq )], ); our %composite_commands = ( 'multiglob' => [qw( blorb goob )] ); 1; C<$use_dash> can be one of C, C, or C. See L. There is a syntax for the C<%command_files> specification. The token matching C<[a-zA-Z0-9_]+> in each element of each arrayref becomes the named filespec parameter for the C<_run()> method in the wrapper class. Additional symbols surrounding this token indicate how this argument should be handled. Some examples: >out : stdout is redirected into the file specified by (..., -out => $file,... ) $file,... ) 2>log : stderr is redirected into the file specified by (..., -log => $file,... ) #opt : this filespec argument is optional (no throw if -opt => $option is missing) 2>#log: if -log is not specified in the arguments, the stderr() method will capture stderr *lst : this filespec can take multiple arguments, specify using an arrayref (..., -lst => [$file1, $file2], ...) *#lst : an optional list The tokens above are examples; they can be anything matching the above regexp. =head1 USER INTERFACE Using a wrapper created with C: =over =item * Getting a list of available commands, parameters, and filespecs: To get a list of commands, simply: @commands = Bio::Tools::Run::ThePkg->available_commands; The wrapper will generally have human-readable aliases for each of the command-line options for the wrapped program and commands. To obtain a list of the parameters and switches available for a particular command, do $factory = Bio::Tools::Run::ThePkg->new( -command => 'glurb' ); @params = $factory->available_parameters('params'); @switches = $factory->available_parameters('switches'); @filespec = $factory->available_parameters('filespec'); @filespec = $factory->filespec; # alias =item * Create factories The factory is a handle on the program and command you wish to run. Create a factory using C to set command-line parameters: $factory = Bio::Tools::Run::ThePkg->new( -command => 'glurb', -freen => 1, -furschlugginer => 'vreeble' ); A shorthand for this is: $factory = Bio::Tools::Run::ThePkg->new_glurb( -freen => 1, -furschlugginer => 'vreeble' ); =item * Running programs To run the program, use the C method, providing filespecs as arguments $factory = Bio::Tools::Run::ThePkg->new_assemble( -min_qual => 63 ); $factory->run( -faq1 => 'read1.fq', -faq2 => 'read2.fq', -ref => 'refseq.fas', -out => 'new.sam' ); # do another $factory->run( -faq1 => 'read-old1.fq', -faq2 => 'read-old2.fq', -ref => 'refseq.fas', -out => 'old.sam' ); Messages on STDOUT and STDERR are dumped into their respective attributes: $stdout = $factory->stdout; $stderr = $factory->stderr; unless STDOUT and/or STDERR are part of the named files in the filespec. =item * Setting/getting/resetting/polling parameters. A C-based factory is always L compliant. That means that you may set, get, and reset parameters using C, C, and C. You can ask whether parameters have changed since they were last accessed by using the predicate C. See L for more details. Once set, parameters become attributes of the factory. Thus, you can get their values as follows: if ($factory->freen) { $furs = $factory->furshlugginer; #... } =back =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: L rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: https://github.com/bioperl/bioperl-live/issues =head1 AUTHOR - Mark A. Jensen Email maj -at- fortinbras -dot- us Describe contact details here =head1 CONTRIBUTORS Dan Kortschak ( dan -dot- kortschak -at- adelaide -dot- edu -dot- au ) =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tools::Run::WrapperBase; # need these methods in WrapperBase/maj use strict; use warnings; no warnings qw(redefine); use Bio::Root::Root; use File::Spec; use IPC::Run; use base qw(Bio::Root::Root Bio::ParameterBaseI); our $AUTOLOAD; =head2 new() Title : new Usage : Function: constructor for WrapperBase::CommandExts ; correctly binds configuration variables to the WrapperBase object Returns : Bio::Tools::Run::WrapperBase object with command extensions Args : Note : this method subsumes the old _register_program_commands and _set_program_options, leaving out the assembler-specific parms ($qual_param and out_type()) =cut sub new { my ($class, @args) = @_; my $self = bless ({}, $class); # pull in *copies* of the Config variables from the caller namespace: my ($pkg, @goob) = caller(); my ($commands, $prefixes, $params, $switches, $translation, $use_dash, $join, $name, $dir, $composite_commands, $files); for (qw( @program_commands %command_prefixes @program_params @program_switches %param_translation $use_dash $join $program_name $program_dir %composite_commands %command_files ) ) { my ($sigil, $var) = m/(.)(.*)/; my $qualvar = "${sigil}${pkg}::${var}"; for ($sigil) { /\@/ && do { $qualvar = "\[$qualvar\]" }; /\%/ && do { $qualvar = "\{$qualvar\}" }; } my $locvar = "\$${var}"; $locvar =~ s/program_|command_|param_//g; eval "$locvar = $qualvar"; } # set up the info registry hash my %registry; if ($composite_commands) { $self->_register_composite_commands($composite_commands, $params, $switches, $prefixes); } @registry{qw( _commands _prefixes _files _params _switches _translation _composite_commands )} = ($commands, $prefixes, $files, $params, $switches, $translation, $composite_commands); $self->{_options} = \%registry; if (not defined $use_dash) { $self->{'_options'}->{'_dash'} = 1; } else { $self->{'_options'}->{'_dash'} = $use_dash; } if (not defined $join) { $self->{'_options'}->{'_join'} = ' '; } else { $self->{'_options'}->{'_join'} = $join; } if ($name =~ /^\*/) { $self->is_pseudo(1); $name =~ s/^\*//; } $self->program_name($name) if not defined $self->program_name(); $self->program_dir($dir) if not defined $self->program_dir(); $self->set_parameters(@args); $self->parameters_changed(1); # set on instantiation, per Bio::ParameterBaseI return $self; } =head2 program_name Title : program_name Usage : $factory->program_name($name) Function: get/set the executable name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->{'_program_name'} = $val if $val; return $self->{'_program_name'}; } =head2 program_dir Title : program_dir Usage : $factory->program_dir($dir) Function: get/set the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 _register_program_commands() Title : _register_program_commands Usage : $factory->_register_program_commands( \@commands, \%prefixes ) Function: Register the commands a program accepts (for programs that act as frontends for a set of commands, each command having its own set of params/switches) Returns : true on success Args : arrayref to a list of commands (scalar strings), hashref to a translation table of the form { $prefix1 => $command1, ... } [optional] Note : To implement a program with this kind of calling structure, include a parameter called 'command' in the @program_params global Note : The translation table is used to associate parameters and switches specified in _set_program_options with the correct program command. In the globals @program_params and @program_switches, specify elements as 'prefix1|param' and 'prefix1|switch', etc. =cut =head2 _set_program_options Title : _set_program_options Usage : $factory->_set_program_options( \@ args ); Function: Register the parameters and flags that an assembler takes. Returns : 1 for success Args : - arguments passed by the user - parameters that the program accepts, optional (default: none) - switches that the program accepts, optional (default: none) - parameter translation, optional (default: no translation occurs) - dash option for the program parameters, [1|single|double|mixed], optional (default: yes, use single dashes only) - join, optional (default: ' ') =cut =head2 _translate_params Title : _translate_params Usage : @options = @{$assembler->_translate_params( )}; Function: Translate the Bioperl arguments into the arguments to pass to the program on the command line Returns : Arrayref of arguments Args : none =cut sub _translate_params { my ($self) = @_; # Get option string my ($params, $switches, $join, $dash, $translat) = @{$self->{_options}}{qw(_params _switches _join _dash _translation)}; # access the multiple dash choices of _setparams... my @dash_args; $dash ||= 1; # default as advertised for ($dash) { $_ eq '1' && do { @dash_args = ( -dash => 1 ); last; }; /^s/ && do { #single dash only @dash_args = ( -dash => 1); last; }; /^d/ && do { # double dash only @dash_args = ( -double_dash => 1); last; }; /^m/ && do { # mixed dash: one-letter opts get -, # long opts get -- @dash_args = ( -mixed_dash => 1); last; }; do { $self->warn( "Dash spec '$dash' not recognized; using 'single'" ); @dash_args = ( -dash => 1 ); }; } my $options = $self->_setparams( -params => $params, -switches => $switches, -join => $join, @dash_args ); # Translate options # parse more carefully - bioperl-run issue #12 $options =~ s/^\s+//; $options =~ s/\s+$//; my @options; my $in_quotes; for (split(/(\s|$join)/, $options)) { if (/^-/) { push @options, $_; } elsif (s/^"//) { $in_quotes=1 unless (s/["']$//); push @options, $_; } elsif (s/"$//) { $options[-1] .= $_; $in_quotes=0; } else { $in_quotes ? $options[-1] .= $_ : push(@options, $_); } } $self->throw("Unmatched quote in option value") if $in_quotes; for (my $i = 0; $i < scalar @options; $i++) { my ($prefix, $name) = ( $options[$i] =~ m/^(-{0,2})(.+)$/ ); if (defined $name) { if ($name =~ /command/i) { $name = $options[$i+2]; # get the command splice @options, $i, 4; $i--; # don't add the command if this is a pseudo-program unshift @options, $name unless ($self->is_pseudo); # put command first } elsif (defined $$translat{$name}) { $options[$i] = $prefix.$$translat{$name}; } } else { splice @options, $i, 1; $i--; } } @options = grep (!/^\s*$/,@options); # this is a kludge for mixed options: the reason mixed doesn't # work right on the pass through _setparams is that the # *aliases* and not the actual params are passed to it. # here we just rejigger the dashes if ($dash =~ /^m/) { s/--([a-z0-9](?:\s|$))/-$1/gi for @options; } # Now arrayify the options return \@options; } =head2 executable() Title : executable Usage : Function: find the full path to the main executable, or to the command executable for pseudo-programs Returns : full path, if found Args : [optional] explicit path to the executable (will set the appropriate command exec if applicable) [optional] boolean flag whether or not to warn when exe no found Note : overrides WrapperBase.pm =cut sub executable { my $self = shift; my ($exe, $warn) = @_; if ($self->is_pseudo) { return $self->{_pathtoexe} = $self->executables($self->command,$exe); } # otherwise # setter if (defined $exe) { $self->throw("binary '$exe' does not exist") unless -e $exe; $self->throw("'$exe' is not executable") unless -x $exe; return $self->{_pathtoexe} = $exe; } # getter return $self->{_pathtoexe} if defined $self->{_pathstoexe}; # finder return $self->{_pathtoexe} = $self->_find_executable($exe, $warn); } =head2 executables() Title : executables Usage : Function: find the full path to a command's executable Returns : full path (scalar string) Args : command (scalar string), [optional] explicit path to this command exe [optional] boolean flag whether or not to warn when exe no found =cut sub executables { my $self = shift; my ($cmd, $exe, $warn) = @_; # for now, barf if this is not a pseudo program $self->throw("This wrapper represents a single program with commands, not multiple programs; can't use executables()") unless $self->is_pseudo; $self->throw("Command name required at arg 1") unless defined $cmd; $self->throw("The desired executable '$cmd' is not registered as a command") unless grep /^$cmd$/, @{$self->{_options}->{_commands}}; # setter if (defined $exe) { $self->throw("binary '$exe' does not exist") unless -e $exe; $self->throw("'$exe' is not executable") unless -x $exe; $self->{_pathstoexe} = {} unless defined $self->{_pathstoexe}; return $self->{_pathstoexe}->{$cmd} = $exe; } # getter return $self->{_pathstoexe}->{$cmd} if defined $self->{_pathstoexe}->{$cmd}; $exe ||= $cmd; # finder return $self->{_pathstoexe}->{$cmd} = $self->_find_executable($exe, $warn); } =head2 _find_executable() Title : _find_executable Usage : my $exe_path = $fac->_find_executable($exe, $warn); Function: find the full path to a named executable, Returns : full path, if found Args : name of executable to find [optional] boolean flag whether or not to warn when exe no found Note : differs from executable and executables in not setting any object attributes =cut sub _find_executable { my $self = shift; my ($exe, $warn) = @_; if ($self->is_pseudo && !$exe) { if (!$self->command) { # this throw probably appropriate # the rest are now warns if $warn.../maj $self->throw( "The ".__PACKAGE__." wrapper represents several different programs;". "arg1 to _find_executable must be specified explicitly,". "or the command() attribute set"); } else { $exe = $self->command; } } $exe ||= $self->program_path; my $path; if ($self->program_dir) { $path = File::Spec->catfile($self->program_dir, $exe); } else { $path = $exe; $self->warn('Program directory not specified; use program_dir($path).') if $warn; } # use provided info - we are allowed to follow symlinks, but refuse directories map { return $path.$_ if ( -x $path.$_ && !(-d $path.$_) ) } ('', '.exe') if defined $path; # couldn't get path to executable from provided info, so use system path $path = $path ? " in $path" : undef; $self->warn("Executable $exe not found$path, trying system path...") if $warn; if ($path = $self->io->exists_exe($exe)) { return $path; } else { $self->warn("Cannot find executable for program '".($self->is_pseudo ? $self->command : $self->program_name)."'") if $warn; return; } } =head2 _register_composite_commands() Title : _register_composite_commands Usage : Function: adds subcomand params and switches for composite commands Returns : true on success Args : \%composite_commands, \@program_params, \@program_switches =cut sub _register_composite_commands { my $self = shift; my ($composite_commands, $program_params, $program_switches, $command_prefixes) = @_; my @sub_params; my @sub_switches; foreach my $cmd (keys %$composite_commands) { my $pfx = $command_prefixes->{$cmd} || $cmd; foreach my $subcmd ( @{$$composite_commands{$cmd}} ) { my $spfx = $command_prefixes->{$subcmd} || $subcmd; my @sub_program_params = grep /^$spfx\|/, @$program_params; my @sub_program_switches = grep /^$spfx\|/, @$program_switches; for (@sub_program_params) { m/^$spfx\|(.*)/; push @sub_params, "$pfx\|${spfx}_".$1; } for (@sub_program_switches) { m/^$spfx\|(.*)/; push @sub_switches, "$pfx\|${spfx}_".$1; } } } push @$program_params, @sub_params; push @$program_switches, @sub_switches; # translations for subcmd params/switches not necessary return 1; } =head2 _create_factory_set() Title : _create_factory_set Usage : @facs = $self->_create_factory_set Function: instantiate a set of individual command factories for a given composite command Factories will have the correct parameter fields set for their own subcommand Returns : hash of factories: ( $subcmd_prefix => $subcmd_factory, ... ) Args : none =cut sub _create_factory_set { my $self = shift; $self->throw('command not set') unless $self->command; my $cmd = $self->command; $self->throw('_create_factory_set only works on composite commands') unless grep /^$cmd$/, keys %{$self->{_options}->{_composite_commands}}; my %ret; my $class = ref $self; my $subargs_hash = $self->_collate_subcmd_args($cmd); for (keys %$subargs_hash) { $ret{$_} = $class->new( -command => $_, @{$$subargs_hash{$_}} ); } return %ret; } =head2 _collate_subcmd_args() Title : _collate_subcmd_args Usage : $args_hash = $self->_collate_subcmd_args Function: collate parameters and switches into command-specific arg lists for passing to new() Returns : hash of named argument lists Args : [optional] composite cmd prefix (scalar string) [default is 'run'] =cut sub _collate_subcmd_args { my $self = shift; my $cmd = shift; my %ret; # default command is 'run' $cmd ||= 'run'; return unless $self->{'_options'}->{'_composite_commands'}; return unless $self->{'_options'}->{'_composite_commands'}->{$cmd}; my @subcmds = @{$self->{'_options'}->{'_composite_commands'}->{$cmd}}; my $cur_options = $self->{'_options'}; # collate foreach my $subcmd (@subcmds) { # find the composite cmd form of the argument in # the current params and switches # e.g., map_max_mismatches my $pfx = $self->{_options}->{_prefixes}->{$subcmd} || $subcmd; my @params = grep /^${pfx}_/, @{$$cur_options{'_params'}}; my @switches = grep /^${pfx}_/, @{$$cur_options{'_switches'}}; $ret{$subcmd} = []; # create an argument list suitable for passing to new() of # the subcommand factory... foreach my $opt (@params, @switches) { my $subopt = $opt; $subopt =~ s/^${pfx}_//; push(@{$ret{$subcmd}}, '-'.$subopt => $self->$opt) if defined $self->$opt; } } return \%ret; } =head2 _run Title : _run Usage : $fac->_run( @file_args ) Function: Run a command as specified during object contruction Returns : true on success Args : a specification of the files to operate on according to the filespec =cut sub _run { my ($self, @args) = @_; # _translate_params will provide an array of command/parameters/switches # -- these are set at object construction # to set up the run, need to add the files to the call # -- provide these as arguments to this function my $cmd = $self->command if $self->can('command'); my $opts = $self->{_options}; my %args; $self->throw("No command specified for the object") unless $cmd; # setup files necessary for this command my $filespec = $opts->{'_files'}->{$cmd}; my @switches; my ($in, $out, $err); # some applications rely completely on switches if (defined $filespec && @$filespec) { # parse args based on filespec # require named args $self->throw("Named args are required") unless !(@args % 2); s/^-// for @args; %args = @args; # validate my @req = map { my $s = $_; $s =~ s/^-.*\|//; $s =~ s/^[012]?[<>]//; $s =~ s/[^a-zA-Z0-9_]//g; $s } grep !/[#]/, @$filespec; !defined($args{$_}) && $self->throw("Required filearg '$_' not specified") for @req; # set up redirects and file switches for (@$filespec) { m/^1?>#?(.*)/ && do { defined($args{$1}) && ( open $out, '>', $args{$1} or $self->throw("Could not write file '$args{$1}': $!") ); next; }; m/^2>#?(.*)/ && do { defined($args{$1}) && ( open $err, '>', $args{$1} or $self->throw("Could not write file '$args{$1}': $!") ); next; }; m/^<#?(.*)/ && do { defined($args{$1}) && ( open $in, '<', $args{$1} or $self->throw("Could not read file '$args{$1}': $!") ); next; }; if (m/^-(.*)\|/) { push @switches, $self->_dash_switch($1); } else { push @switches, undef; } } } my $dum; $in || ($in = \$dum); $out || ($out = \$self->{'stdout'}); $err || ($err = \$self->{'stderr'}); # Get program executable my $exe = $self->executable; $self->throw("Can't find executable for '".($self->is_pseudo ? $self->command : $self->program_name)."'; can't continue") unless $exe; # Get command-line options my $options = $self->_translate_params(); # Get file specs sans redirects in correct order my @specs = map { my $s = $_; $s =~ s/^-.*\|//; $s =~ s/[^a-zA-Z0-9_]//g; $s } grep !/[<>]/, @$filespec; my @files = @args{@specs}; # expand arrayrefs my $l = $#files; # Note: below code block may be brittle, see link on this: # http://lists.open-bio.org/pipermail/bioperl-l/2010-June/033439.html for (0..$l) { if (ref($files[$_]) eq 'ARRAY') { splice(@switches, $_, 1, ($switches[$_]) x @{$files[$_]}); splice(@files, $_, 1, @{$files[$_]}); } } @files = map { my $s = shift @switches; defined $_ ? ($s, $_): () } @files; @files = map { defined $_ ? $_ : () } @files; # squish undefs my @ipc_args = ( $exe, @$options, @files ); $self->{_last_execution} = join( $self->{'_options'}->{'_join'}, @ipc_args ); eval { IPC::Run::run(\@ipc_args, $in, $out, $err) or die ("There was a problem running $exe : ".$$err); }; if ($@) { $self->throw("$exe call crashed: $@") unless $self->no_throw_on_crash; return 0; } return 1; } =head2 no_throw_on_crash() Title : no_throw_on_crash Usage : Function: prevent throw on execution error Returns : Args : [optional] boolean =cut sub no_throw_on_crash { my $self = shift; return $self->{'_no_throw'} = shift if @_; return $self->{'_no_throw'}; } =head2 last_execution() Title : last_execution Usage : Function: return the last executed command with options Returns : string of command line sent to IPC::Run Args : =cut sub last_execution { my $self = shift; return $self->{'_last_execution'}; } =head2 _dash_switch() Title : _dash_switch Usage : $version = $fac->_dash_switch( $switch ) Function: Returns an appropriately dashed switch for the executable Args : A string containing a switch without dashes Returns : string containing an appropriately dashed switch for the current executable =cut sub _dash_switch { my ($self, $switch) = @_; my $dash = $self->{'_options'}->{'_dash'}; for ($dash) { $_ eq '1' && do { $switch = '-'.$switch; last; }; /^s/ && do { #single dash only $switch = '-'.$switch; last; }; /^d/ && do { # double dash only $switch = '--'.$switch; last; }; /^m/ && do { # mixed dash: one-letter opts get -, $switch = '-'.$switch; $switch =~ s/^(-[a-z0-9](?:\w+))$/-$1/i; last; }; do { $self->warn( "Dash spec '$dash' not recognized; using 'single'" ); $switch = '-'.$switch; }; } return $switch; } =head2 stdout() Title : stdout Usage : $fac->stdout() Function: store the output from STDOUT for the run, if no file specified in _run arguments Example : Returns : scalar string Args : on set, new value (a scalar or undef, optional) =cut sub stdout { my $self = shift; return $self->{'stdout'} = shift if @_; return $self->{'stdout'}; } =head2 stderr() Title : stderr Usage : $fac->stderr() Function: store the output from STDERR for the run, if no file is specified in _run arguments Example : Returns : scalar string Args : on set, new value (a scalar or undef, optional) =cut sub stderr { my $self = shift; return $self->{'stderr'} = shift if @_; return $self->{'stderr'}; } =head2 is_pseudo() Title : is_pseudo Usage : $obj->is_pseudo($newval) Function: returns true if this factory represents a pseudo-program Example : Returns : value of is_pseudo (boolean) Args : on set, new value (a scalar or undef, optional) =cut sub is_pseudo { my $self = shift; return $self->{'is_pseudo'} = shift if @_; return $self->{'is_pseudo'}; } =head2 AUTOLOAD AUTOLOAD permits $class->new_yourcommand(@args); as an alias for $class->new( -command => 'yourcommand', @args ); =cut sub AUTOLOAD { my $class = shift; my $tok = $AUTOLOAD; my @args = @_; $tok =~ s/.*:://; unless ($tok =~ /^new_/) { $class->throw("Can't locate object method '$tok' via package '".ref($class)?ref($class):$class); } my ($cmd) = $tok =~ m/new_(.*)/; return $class->new( -command => $cmd, @args ); } =head1 Bio:ParameterBaseI compliance =head2 set_parameters() Title : set_parameters Usage : $pobj->set_parameters(%params); Function: sets the parameters listed in the hash or array Returns : true on success Args : [optional] hash or array of parameter/values. =cut sub set_parameters { my ($self, @args) = @_; # currently stored stuff my $opts = $self->{'_options'}; my $params = $opts->{'_params'}; my $switches = $opts->{'_switches'}; my $translation = $opts->{'_translation'}; my $use_dash = $opts->{'_dash'}; my $join = $opts->{'_join'}; unless (($self->can('command') && $self->command) || (grep /command/, @args)) { push @args, '-command', 'run'; } my %args = @args; my $cmd = $args{'-command'} || $args{'command'} || ($self->can('command') && $self->command); if ($cmd) { my (@p,@s, %x); $self->warn('Command present, but no commands registered') unless $self->{'_options'}->{'_commands'}; $self->throw("Command '$cmd' not registered") unless grep /^$cmd$/, @{$self->{'_options'}->{'_commands'}}; $cmd = $self->{_options}->{_prefixes}->{$cmd} || $cmd; @p = (grep(!/^.*?\|/, @$params), grep(/^${cmd}\|/, @$params)); @s = (grep(!/^.*?\|/, @$switches), grep(/^${cmd}\|/, @$switches)); s/.*?\|// for @p; s/.*?\|// for @s; @x{@p, @s} = @{$translation}{ grep( !/^.*?\|/, @$params, @$switches), grep(/^${cmd}\|/, @$params, @$switches) }; $opts->{_translation} = $translation = \%x; $opts->{_params} = $params = \@p; $opts->{_switches} = $switches = \@s; } $self->_set_from_args( \@args, -methods => [ @$params, @$switches, 'program_name', 'program_dir', 'out_type' ], -create => 1, # when our parms are accessed, signal parameters are unchanged for # future reads (until set_parameters is called) -code => ' my $self = shift; $self->parameters_changed(0); return $self->{\'_\'.$method} = shift if @_; return $self->{\'_\'.$method};' ); # the question is, are previously-set parameters left alone when # not specified in @args? $self->parameters_changed(1); return 1; } =head2 reset_parameters() Title : reset_parameters Usage : resets values Function: resets parameters to either undef or value in passed hash Returns : none Args : [optional] hash of parameter-value pairs =cut sub reset_parameters { my ($self, @args) = @_; my @reset_args; # currently stored stuff my $opts = $self->{'_options'}; my $params = $opts->{'_params'}; my $switches = $opts->{'_switches'}; my $translation = $opts->{'_translation'}; my $qual_param = $opts->{'_qual_param'}; my $use_dash = $opts->{'_dash'}; my $join = $opts->{'_join'}; # handle command name my %args = @args; my $cmd = $args{'-command'} || $args{'command'} || $self->command; $args{'command'} = $cmd; delete $args{'-command'}; @args = %args; # don't like this, b/c _set_program_args will create a bunch of # accessors with undef values, but oh well for now /maj for my $p (@$params) { push(@reset_args, $p => undef) unless grep /^[-]?$p$/, @args; } for my $s (@$switches) { push(@reset_args, $s => undef) unless grep /^[-]?$s$/, @args; } push @args, @reset_args; $self->set_parameters(@args); $self->parameters_changed(1); } =head2 parameters_changed() Title : parameters_changed Usage : if ($pobj->parameters_changed) {...} Function: Returns boolean true (1) if parameters have changed Returns : Boolean (0 or 1) Args : [optional] Boolean =cut sub parameters_changed { my $self = shift; return $self->{'_parameters_changed'} = shift if @_; return $self->{'_parameters_changed'}; } =head2 available_parameters() Title : available_parameters Usage : @params = $pobj->available_parameters() Function: Returns a list of the available parameters Returns : Array of parameters Args : 'params' for settable program parameters 'switches' for boolean program switches default: all =cut sub available_parameters { my $self = shift; my $subset = shift; my $opts = $self->{'_options'}; my @ret; for ($subset) { (!defined || /^a/) && do { @ret = (@{$opts->{'_params'}}, @{$opts->{'_switches'}}); last; }; m/^p/i && do { @ret = @{$opts->{'_params'}}; last; }; m/^s/i && do { @ret = @{$opts->{'_switches'}}; last; }; m/^c/i && do { @ret = @{$opts->{'_commands'}}; last; }; m/^f/i && do { # get file spec return @{$opts->{'_files'}->{$self->command}}; }; do { #fail $self->throw("available_parameters: unrecognized subset"); }; } return @ret; } sub available_commands { shift->available_parameters('commands') } sub filespec { shift->available_parameters('filespec') } =head2 get_parameters() Title : get_parameters Usage : %params = $pobj->get_parameters; Function: Returns list of key-value pairs of parameter => value Returns : List of key-value pairs Args : [optional] A string is allowed if subsets are wanted or (if a parameter subset is default) 'all' to return all parameters =cut sub get_parameters { my $self = shift; my $subset = shift; $subset ||= 'all'; my @ret; my $opts = $self->{'_options'}; for ($subset) { m/^p/i && do { #params only for (@{$opts->{'_params'}}) { push(@ret, $_, $self->$_) if $self->can($_) && defined $self->$_; } last; }; m/^s/i && do { #switches only for (@{$opts->{'_switches'}}) { push(@ret, $_, $self->$_) if $self->can($_) && defined $self->$_; } last; }; m/^a/i && do { # all for ((@{$opts->{'_params'}},@{$opts->{'_switches'}})) { push(@ret, $_, $self->$_) if $self->can($_) && defined $self->$_; } last; }; do { $self->throw("get_parameters: unrecognized subset"); }; } return @ret; } 1; bioperl-run-release-1-7-1/lib/Bio/Tools/Run/tRNAscanSE.pm000066400000000000000000000156741302566030400227700ustar00rootroot00000000000000# BioPerl module for Bio::Tools::Run::tRNAscanSE # # Please direct questions and support issues to # # Cared for by Bioperl # # Copyright Bioperl, Mark Johnson # # Special thanks to Chris Fields, Sendu Bala # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::Tools::Run::tRNAscanSE - Wrapper for local execution of tRNAscan-SE =head1 SYNOPSIS my $factory = Bio::Tools::Run::tRNAscanSE->new('-program' => 'tRNAscan-SE'); # Pass the factory Bio::Seq objects # returns a Bio::Tools::tRNAscanSE object my $factory = $factory->run($seq); or my $factory = $factory->run(@seq); =head1 DESCRIPTION Wrapper module for tRNAscan-SE. tRNAscan-SE is open source and available at L. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to one of the Bioperl mailing lists. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR - Mark Johnson Email: johnsonm-at-gmail-dot-com =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::Tools::Run::tRNAscanSE; use strict; use warnings; use Bio::SeqIO; use Bio::Root::Root; use Bio::Tools::Run::WrapperBase; use Bio::Tools::tRNAscanSE; use English; use IPC::Run; # Should be okay on WIN32 (See IPC::Run Docs) use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); our @params = (qw(program)); our @tRNAscanSE_switches = (qw(A B C G O P)); =head2 program_name Title : program_name Usage : $factory>program_name() Function: gets/sets the program name Returns: string Args : string =cut sub program_name { my ($self, $val) = @_; $self->program($val) if $val; return $self->program(); } =head2 program_dir Title : program_dir Usage : $factory->program_dir() Function: gets/sets the program dir Returns: string Args : string =cut sub program_dir { my ($self, $val) = @_; $self->{'_program_dir'} = $val if $val; return $self->{'_program_dir'}; } =head2 new Title : new Usage : $tRNAscanSE->new(@params) Function: creates a new tRNAscanSE factory Returns: Bio::Tools::Run::tRNAscanSE Args : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->io->_initialize_io(); $self->_set_from_args( \@args, -methods => [ @params, @tRNAscanSE_switches, ], -create => 1, ); unless (defined($self->program())) { $self->throw('Must specify program'); } return $self; } =head2 run Title : run Usage : $obj->run($seq_file) Function: Runs tRNAscan-SE Returns : A Bio::Tools::tRNAscanSE object Args : An array of Bio::PrimarySeqI objects =cut sub run{ my ($self, @seq) = @_; unless (@seq) { $self->throw("Must supply at least one Bio::PrimarySeqI"); } foreach my $seq (@seq) { unless ($seq->isa('Bio::PrimarySeqI')) { $self->throw("Object does not implement Bio::PrimarySeqI"); } } my $program_name = $self->program_name(); my $file_name = $self->_write_seq_file(@seq); return $self->_run($file_name); } =head2 _run Title : _run Usage : $obj->_run() Function: Internal(not to be used directly) Returns : An instance of Bio::Tools::tRNAscanSE Args : file name =cut sub _run { my ($self, $seq_file_name) = @_; my @cmd = ( $self->executable(), split(/\s+/, $self->_setparams()), $seq_file_name, ); my $cmd = join(' ', @cmd); $self->debug("tRNAscan-SE Command = $cmd"); my $program_name = $self->program_name(); my ($program_stderr); my ($output_fh, $output_file_name) = $self->io->tempfile(-dir=> $self->tempdir()); my @ipc_args = (\@cmd, \undef, '>', $output_file_name, '2>', \$program_stderr); # Run the program via IPC::Run so: # 1) The console doesn't get cluttered up with the program's STDERR/STDOUT # 2) We don't have to embed STDERR/STDOUT redirection in $cmd # 3) We don't have to deal with signal handling (IPC::Run should take care # of everything automagically. eval { IPC::Run::run(@ipc_args) || die $CHILD_ERROR;; }; if ($EVAL_ERROR) { $self->throw("tRNAscan-SE call crashed: $EVAL_ERROR"); } $self->debug(join("\n", 'tRNAscanSE STDERR:', $program_stderr)) if $program_stderr; return Bio::Tools::tRNAscanSE->new(-file => $output_file_name); } sub _setparams { my ($self) = @_; my $param_string = $self->SUPER::_setparams( -params => [ ], -switches => [ @tRNAscanSE_switches, ], -dash => 1 ); # Kill leading and trailing whitespace $param_string =~ s/^\s+//g; $param_string =~ s/\s+$//g; return $param_string; } =head2 _write_seq_file Title : _write_seq_file Usage : obj->_write_seq_file($seq) or obj->_write_seq_file(@seq) Function: Internal(not to be used directly) Returns : Name of a temp file containing program output Args : One or more Bio::PrimarySeqI objects =cut sub _write_seq_file { my ($self, @seq) = @_; my ($fh, $file_name) = $self->io->tempfile(-dir=>$self->tempdir()); my $out = Bio::SeqIO->new(-fh => $fh , '-format' => 'Fasta'); foreach my $seq (@seq){ $out->write_seq($seq); } close($fh); $out->close(); return $file_name; } 1; bioperl-run-release-1-7-1/scripts/000077500000000000000000000000001302566030400170605ustar00rootroot00000000000000bioperl-run-release-1-7-1/scripts/bioperl_application_installer.PLS000066400000000000000000000202271302566030400255370ustar00rootroot00000000000000#!/usr/bin/perl # BioPerl script for Bio::Installer # # Cared for by Albert Vilella # # based on the CPAN::FirstTime module # # Copyright Albert Vilella # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME bioperl_application_installer - downloads and installs runnables =head1 SYNOPSIS bioperl_application_installer =head1 DESCRIPTION This script will ask the user which programs wants to install and where. It will download the packages, decompress them (if necessary) and compile/install them in the specified directory. =head1 AUTHOR Albert Vilella, avilella-AT-gmail-DOT-com =head1 TODO Check if the programs are already installed in the computer and prompt it so that the user is at least aware of it. Check for the available installers, instead of hard-coding $INSTALLERLIST in this script. =cut use strict; use ExtUtils::MakeMaker; use Data::Dumper; use Bio::Factory::ObjectFactory; use vars qw($DEFAULT $CONFIG $INSTALLERLIST); BEGIN { $DEFAULT = 'bioperl-runnables'; $INSTALLERLIST = [ 'Clustalw', 'EMBOSS', 'TCoffee', 'PAML', 'Hyphy', 'SLR', 'Probcons', ]; } my $dir = shift @ARGV || $DEFAULT; init($dir); 1; ################################################################################ sub init { my($configpm) = @_; use Config; local($/) = "\n"; local($\) = ""; local($|) = 1; my($ans,$default); # # Files, directories # print qq[ This script will install the runnable programs associated with bioperl-run. Bioperl-run contain modules that provides a PERL interface to various bioinformatics applications. This allows various applications to be used with common bioperl objects. If you do not want to enter a dialog now, you can answer 'no' to this question. ]; my $manual_conf = ExtUtils::MakeMaker::prompt("Are you ready for manual configuration?", "yes"); my $fastread; { local $^W; if ($manual_conf =~ /^\s*y/i) { $fastread = 0; *prompt = \&ExtUtils::MakeMaker::prompt; } else { print "Done.\n\n"; exit; } } print qq{ The following questions are intended to help you with the installation. }; $default = File::Spec->catdir( $ENV{'HOME'}, $configpm); while ($ans = prompt("Where do you want to install the runnables in your computer?",$default)) { unless (File::Spec->file_name_is_absolute($ans)) { require Cwd; my $cwd = Cwd::cwd(); my $absans = File::Spec->catdir($cwd,$ans); warn "The path '$ans' is not an absolute path. Please specify an absolute path\n"; $default = $absans; next; } eval { File::Path::mkpath($ans); }; # dies if it can't if ($@) { warn "Couldn't create directory $ans. Please retry.\n"; next; } if (-d $ans && -w _) { print qq{ Directory $ans successfully created. }; last; } else { warn "Couldn't find directory $ans or directory is not writable. Please retry.\n"; } } print qq{ The script will need a few external programs to work properly. Please correct me, if I guess the wrong path for a program. Don\'t panic if you do not have some of them, just press ENTER for those. }; my $old_warn = $^W; local $^W if $^O eq 'MacOS'; my(@path) = split /$Config{'path_sep'}/, $ENV{'PATH'}; local $^W = $old_warn; my $progname; for $progname (qw/gzip tar unzip make links wget ncftpget ncftp ftp gpg/) { if ($^O eq 'MacOS') { $CONFIG->{$progname} = 'not_here'; next; } my $progcall = $progname; # we don't need ncftp if we have ncftpget next if $progname eq "ncftp" && $CONFIG->{ncftpget} gt " "; my $path = $CONFIG->{$progname} || $Config::Config{$progname} || ""; if (File::Spec->file_name_is_absolute($path)) { # testing existence is not good enough, some have these exe # extensions # warn "Warning: configured $path does not exist\n" unless -e $path; # $path = ""; } else { $path = ''; } unless ($path) { # e.g. make -> nmake $progcall = $Config::Config{$progname} if $Config::Config{$progname}; } $path ||= find_exe($progcall,[@path]); warn "Warning: $progcall not found in PATH\n" unless $path; # not -e $path, because find_exe already checked that $ans = prompt("Where is your $progname program?",$path) || $path; $CONFIG->{$progname} = $ans; } my $path = $CONFIG->{'pager'} || $ENV{PAGER} || find_exe("less",[@path]) || find_exe("more",[@path]) || ($^O eq 'MacOS' ? $ENV{EDITOR} : 0 ) || "more"; $ans = prompt("What is your favorite pager program?",$path); $CONFIG->{'pager'} = $ans; $path = $CONFIG->{'shell'}; if (File::Spec->file_name_is_absolute($path)) { warn "Warning: configured $path does not exist\n" unless -e $path; $path = ""; } $path ||= $ENV{SHELL}; if ($^O eq 'MacOS') { $CONFIG->{'shell'} = 'not_here'; } else { $path =~ s,\\,/,g if $^O eq 'os2'; # Cosmetic only $ans = prompt("What is your favorite shell?",$path); $CONFIG->{'shell'} = $ans; } print qq{ Which programs would you like to install? }; my @selected_programs; my $prompt = "Select the programs you would like to install (by number), put them on one line, separated by blanks, e.g. '1 2 4'"; push (@selected_programs, @$INSTALLERLIST); @selected_programs = picklist (\@selected_programs, $prompt); push @{$CONFIG->{selected_programs_list}}, @selected_programs; print qq{ The selected programs will now be installed }; # TODO: Check for the available installers, instead of hard-coding # $INSTALLERLIST in this script. # my @l; # for my $i (@INC) { # next unless (-e $i."/My/Stuff/"); # opendir(X,$i."/My/Stuff")|| warn "$!"; # push @l,readdir(X); # } # find sub { push(@l, $File::Find::name) if -f && /\.pm$/ }, # map { "$_/My/Module" } @INC; foreach my $program (@selected_programs) { my $type = 'Bio::Installer::' . $program; my $factory = new Bio::Factory::ObjectFactory(-type => $type); my $instance = $factory->create_object(); $instance->destination_install_dir($default); $instance->download(); $instance->install(); } } sub picklist { my($items,$prompt,$default,$require_nonempty,$empty_warning)=@_; $default ||= ''; my $pos = 0; my @nums; while (1) { # display, at most, 15 items at a time my $limit = $#{ $items } - $pos; $limit = 15 if $limit > 15; # show the next $limit items, get the new position $pos = display_some($items, $limit, $pos); $pos = 0 if $pos >= @$items; my $num = prompt($prompt,$default); @nums = split (' ', $num); my $i = scalar @$items; (warn "invalid items entered, try again\n"), next if grep (/\D/ || $_ < 1 || $_ > $i, @nums); if ($require_nonempty) { (warn "$empty_warning\n"); } print "\n"; # a blank line continues... next unless @nums; last; } for (@nums) { $_-- } @{$items}[@nums]; } sub display_some { my ($items, $limit, $pos) = @_; $pos ||= 0; my @displayable = @$items[$pos .. ($pos + $limit)]; for my $item (@displayable) { printf "(%d) %s\n", ++$pos, $item; } printf("%d more items, hit SPACE RETURN to show them\n", (@$items - $pos) ) if $pos < @$items; return $pos; } sub find_exe { my($exe,$path) = @_; my($dir); #warn "in find_exe exe[$exe] path[@$path]"; for $dir (@$path) { my $abs = File::Spec->catfile($dir,$exe); if (($abs = MM->maybe_command($abs))) { return $abs; } } } bioperl-run-release-1-7-1/scripts/multi_hmmsearch.PLS000066400000000000000000000067251302566030400226330ustar00rootroot00000000000000#!/usr/bin/perl -w # $Id: multi_hmmsearch.PLS,v 1.3 2006-07-04 22:23:36 mauricio Exp $ use strict; =head1 NAME multi_hmmsearch - perform a hmmsearch into multiple FASTA files using an INDEX file =head1 SYNOPSIS multi_hmmsearch -p hmm_file [-i] -f index_file =head1 DESCRIPTION Not technically a Bio::Tools::Run script as this doesn't use any Bioperl or Bioperl-run components but it's useful. =head2 Mandatory Options: -p HMM profile to use for the search. -f INDEX file that contains a list of FASTA files for the multiple search. =head2 Special Options: -i Create a new index file with the resulting hmms files. This is useful if you want to pass this list as input arguments into another programs. -h Show this documentation. =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/wiki/Mailing_lists - About the mailing lists =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via the web: http://redmine.open-bio.org/projects/bioperl/ =head1 AUTHOR Mauricio Herrera Cuadra =cut # Modules, pragmas and variables to use use Getopt::Long; use vars qw($opt_p $opt_i $opt_f $opt_h $index_file); # Gets options from the command line GetOptions qw(-p=s -i -f=s -h); # Print documentation if help switch was given exec('perldoc', $0) and exit() if $opt_h; # If no mandatory options are given prints an error and exits if (!$opt_p) { print "ERROR: No HMM profile has been specified.\n Use '-h' switch for documentation.\n" and exit(); } elsif (!$opt_f) { print "ERROR: No INDEX file has been specified.\n Use '-h' switch for documentation.\n" and exit(); } # Locates hmmsearch in the filesystem my $hmmsearch = `which hmmsearch`; chomp $hmmsearch; # Creates a directory for writing the resulting files mkdir("multi", 0755) unless -e "multi" and -d "multi"; # Creates a new INDEX file if the option was given if ($opt_i) { my $prefix = $opt_f; $prefix =~ s/\.INDEX$//; $index_file = "$prefix.hmms.INDEX"; open(HMMSINDEX, ">", $index_file) or die("Unable to create file: $index_file ($!)"); } # Opens the INDEX file sent as input open(FH, "<", $opt_f) or die("Unable to open INDEX file: $opt_f ($!)"); print "==> Opening INDEX file:\t\t\t\t$opt_f\n"; print "==> HMM profile file is:\t\t\t$opt_p\n"; # Cycle that extracts one line for every loop until finding the end of # file while (my $line = ) { # Deletes the new line characters from the line chomp $line; # Gets the name for the result file my $out = $line; $out =~ s/^split\///; $out =~ s/\.faa$//; # Performs the hmmsearch for the FASTA file in turn print "--> Performing hmmsearch in file:\t\t$line\n"; system("$hmmsearch $opt_p $line > multi/$out.hmms"); print "==> hmmsearch results stored in file:\t\tmulti/$out.hmms\n"; # Prints the result file name into the new INDEX file if the # option was given print HMMSINDEX "multi/$out.hmms\n" if $opt_i; } # Closes INDEX files close(FH); if ($opt_i) { print "==> New INDEX stored in file:\t\t\t$index_file\n"; close(HMMSINDEX); } # Exits the program exit(); bioperl-run-release-1-7-1/scripts/panalysis.PLS000066400000000000000000000562451302566030400214570ustar00rootroot00000000000000#!/usr/bin/perl -w # # A client showing how to use Bio::Tools::Run::Analysis module, # a module for executing and controlling local or remote analysis tools. # It also calls methods from Bio::Tools::Run::AnalysisFactory module. # # It has many options in order to cover as many methods as # possible. Because of that, it can be also used as a fully # functional command-line client for accessing various analysis # tools. # # Usage: ./panalysis.PLS -h # or: perldoc panalysis.PLS # # martin.senger@gmail.com # July 2002 # # $Id: panalysis.PLS,v 1.10 2006-07-04 22:23:36 mauricio Exp $ #----------------------------------------------------------------------------- use strict; sub get_usage { return <<"END_OF_USAGE"; Usage: panalysis.PLS [options] [input-data] where 'options' are: -A access method (default 'soap') -l where are the analyses -n name of an analysis -j ID of a previously created job -L list all available analyses -c list all available categories -C show all analyses in given category -i, -I show specification of data inputs -o, -O show specification of results -a show specification of the analysis -d show analysis metadata (XML) -b create job from [input-data] (default: create a job also without -b option if there is no -j option and if there are some 'input-data' on the command-line) -x create job from [input-data] and run it -w create job from [input-data], run it and wait for it -x -j run a previously created job -w -j run a previously created job and wait for it -k -j kill a previously created job -s show job status -t show all job times -T show some job times (all, created, begun, finished, elapsed) -e show job last event (XML) -r retrieve all results -R retrieve named results; comma-separated list, each item: = =@[filename-template] =?[filename-template] where 'filename-template' can contain: * ... will be replaced by a unique number \$ANALYSIS ... will be replaced by an analysis name \$RESULT ... will be replaced by a result name any other characters (suitable for filenames) -z at the end remove job and all its results -h this help -v, -V show version(s) -q be less verbose where 'input-data' are: =... =@... Environment variables: HTTPPROXY HTTP proxy server HTTPTIMEOUT HTTP timeout (0 means no timeout at all) RESULT_FILENAME_TEMPLATE template for inventing filenames for results For more details type: perldoc panalysis.PLS END_OF_USAGE } BEGIN { # add path to the directory with this script my $mylib; ($mylib = $0) =~ s|/[^/]+$||; unshift @INC, $mylib; # be prepare for command-line options/arguments use Getopt::Std; # general options use vars qw/ $opt_h $opt_v $opt_V $opt_q /; # specialized options use vars qw/ $opt_A $opt_l $opt_n $opt_j /; # service use vars qw/ $opt_L $opt_c $opt_C /; # factory use vars qw/ $opt_d $opt_i $opt_I $opt_o $opt_O $opt_a /; # metadata use vars qw/ $opt_x $opt_w $opt_k $opt_s $opt_e $opt_t $opt_T $opt_b /; # job use vars qw/ $opt_r $opt_R /; # results use vars qw/ $opt_z /; # cleaning my $switches = 'ACjlnRT'; # switches taking an argument (a value) getopt ($switches); use vars qw($VERSION $Revision); # set the version for version checking $VERSION = do { my @r = (q[$Revision: 1.10 $] =~ /\d+/g); sprintf "%d.%-02d", @r }; $Revision = q[$Id: panalysis.PLS,v 1.10 2006-07-04 22:23:36 mauricio Exp $]; # help wanted? if ($opt_h) { print get_usage; exit 0; } # print version of this script and exit if ($opt_v) { print "$0 $VERSION\n"; exit 0; } } use Bio::Tools::Run::Analysis; # to access analysis tools directly use Bio::Tools::Run::AnalysisFactory; # to access list/factory of analysis tools # --- create a factory object; # the new() method understands the following parameters: # -location (taken from '-l' option if given) # -access (taken from '-A' option, default is 'soap') # # Additionally, it uses env. variable HTTPPROXY to create parameter # '-httpproxy', and env. variable HTTPTIMEOUT to set max HTTP timeout. # my @access = ('-access', $opt_A) if defined $opt_A; my @location = ('-location', $opt_l) if defined $opt_l; my @httpproxy = ('-httpproxy', $ENV{'HTTPPROXY'}) if defined $ENV{'HTTPPROXY'}; my @timeout = ('-timeout', $ENV{'HTTPTIMEOUT'}) if defined $ENV{'HTTPTIMEOUT'}; my $factory = new Bio::Tools::Run::AnalysisFactory (@location, @httpproxy, @timeout); # --- create an analysis (service) object; # the new() method understands the following parameters: # -location (taken from '-l' option if given) # -access (taken from '-A' option, default is 'soap') # -name (taken from '-n' option; mandatory!, no default value) # -destroy_on_exit (set to true if '-z' option given) # -httpproxy (taken from an env.variable) # -timeout (taken from an env.variable) # my @name = ('-name', $opt_n) if defined $opt_n; my @destroy = ('-destroy_on_exit', 0) unless $opt_z; my $service = new Bio::Tools::Run::Analysis (@name, @location, @httpproxy, @timeout, @destroy); die "Stopped. No success in accessing analysis factory.\n" unless $factory; die "Stopped. No success in accessing analysis tools.\n" unless $service; # --- print class and version of "real-workers" and exit if ($opt_V) { print ref $factory, " ", $factory->VERSION . "\n"; print ref $service, " ", $service->VERSION . "\n"; exit 0; } # # --- here are methods of the "directory service" (factory) # # what categories are available? if ($opt_c) { my $msg = "Available categories"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); print join ("\n", sort @{ $factory->available_categories }), "\n"; } # what analyses are available? if ($opt_L) { my $msg = "Available analyses"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); print join ("\n", sort @{ $factory->available_analyses }), "\n"; } # what analyses are available in a particular category? if ($opt_C) { my $msg = "Available analyses in category '$opt_C':"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); print join ("\n", sort @{ $factory->available_analyses ($opt_C) }), "\n"; } # # --- here are methods describing one analysis # # print full analysis metadata in XML # ('$service->describe' returns an XML string) print $service->describe . "\n" if $opt_d; # print major characteristics of an analysis # ('$service->analysis_spec' returns a hash reference) if ($opt_a) { my $rh_spec = $service->analysis_spec; my $msg = "Specification of analysis"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); my ($key, $value); print "Analysis '$opt_n':\n"; while (($key, $value) = each %{ $rh_spec }) { print "\t$key => $value\n"; } } # print input specification (either full, or just input data names) # ('$service->input_spec' returns a reference to an array of hashes) if ($opt_i or $opt_I) { my $ra_spec = $service->input_spec; my $msg = "Specification of inputs"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); my ($key, $value); foreach (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) { print $$_{'name'},"\n"; if ($opt_I) { while (($key, $value) = each %{ $_ }) { unless ($key eq 'name') { if (ref $value eq 'ARRAY') { # for 'allowed values' print "\t$key => " . join (", ", @$value) . "\n"; } else { print "\t$key => $value\n"; } } } } } } # print result specification (either full, or just names of results) # ('$service->result_spec' returns a reference to an array of hashes) if ($opt_o or $opt_O) { my $ra_spec = $service->result_spec; my $msg = "Specification of results"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); my ($key, $value); foreach (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) { print $$_{'name'},"\n"; if ($opt_O) { while (($key, $value) = each %{ $_ }) { print "\t$key => $value\n" unless ($key eq 'name'); } } } } # # --- let's create a job # my $job; if ($opt_j) { # ... either by re-creating a previous job $job = $service->create_job ($opt_j); if ($opt_x) { $job->run; } elsif ($opt_w) { $job->wait_for; } elsif ($opt_k) { $job->terminate; } } else { # ... or creating a new job using given input data if ($opt_x) { $job = $service->run (\@ARGV); } elsif ($opt_w) { $job = $service->wait_for (\@ARGV); } elsif ($opt_b or @ARGV > 0) { $job = $service->create_job (\@ARGV); } # often you need to know the JOB's ID to be able to come back # later and ask for results, status, events etc. - so I print it # here even in quiet mode (option -q) - but to STDERR in order not # to intervene with redirected real results print STDERR "JOB ID: " , $job->id . "\n" if $job; } # # --- having a job, ask it for something # if ($job) { print "JOB STATUS: " . $job->status . "\n" if $opt_s; print "LAST EVENT: " . $job->last_event . "\n" if $opt_e; # ...get job times (all of them in one go, formatted) if ($opt_t) { my $rh_times = $job->times (1); # '1' means 'formatted' print "TIMES:\n"; print "\tCreated: " . $$rh_times{'created'} . "\n" if $$rh_times{'created'}; print "\tStarted: " . $$rh_times{'started'} . "\n" if $$rh_times{'started'}; print "\tEnded: " . $$rh_times{'ended'} . "\n" if $$rh_times{'ended'}; print "\tElapsed: " . $$rh_times{'elapsed'} . "\n" if defined $$rh_times{'elapsed'}; } # ...get individual job characteristics (both formatted and raw) if ($opt_T) { print "CREATED: " . $job->created (1) . " (" . $job->created . ")\n" if $opt_T =~ /a|c/; print "STARTED: " . $job->started (1) . " (" . $job->started . ")\n" if $opt_T =~ /a|b/; print "ENDED: " . $job->ended (1) . " (" . $job->ended . ")\n" if $opt_T =~ /a|f/; print "ELAPSED: " . $job->elapsed . "\n" if $opt_T =~ /a|e/; } # retrieve results my $rh_results; if ($opt_R) { $rh_results = $job->results (split /\s*,\s*/, $opt_R); } elsif ($opt_r) { $rh_results = $job->results ('?'); } if ($rh_results) { foreach my $name (sort keys %$rh_results) { my $msg = "RESULT: $name"; &msg ("$msg\n" . '-' x length ($msg) . "\n"); if (ref $$rh_results{$name}) { # ... this is probably what you do not want (binary on terminal); # unless you wisely used: -R result_name=filename print join ("\n", @{ $$rh_results{$name} }) . "\n"; } else { print $$rh_results{$name} . "\n"; } } } } sub msg { print shift unless $opt_q; } __END__ =head1 NAME panalysis.PLS - An example/tutorial script how to access analysis tools =head1 SYNOPSIS # run an analysis with your sequence in a local file ./panalysis.PLS -n 'edit.seqret'-w -r \ sequence_direct_data=@/home/testdata/my.seq See more examples in the text below. =head1 DESCRIPTION A client showing how to use C module, a module for executing and controlling local or remote analysis tools. It also calls methods from the C module, a module providing lists of available analyses. Primarily, this client is meant as an example how to use analysis modules, and also to test them. However, because it has a lot of options in order to cover as many methods as possible, it can be also used as a fully functional command-line client for accessing various analysis tools. =head2 Defining location and access method C is independent on the access method to the remote analyses (the analyses running on a different machines). The method used to communicate with the analyses is defined by the C<-A> option, with the default value I. The other possible values (not yet supported, but coming soon) are I and I. Each access method may have different meaning for parameter C<-l> defining a location of services giving access to the analysis tools. For example, the I access expects a URL of a Web Service in the C<-l> option, while the I access may find here a stringified Interoperable Object Reference (IOR). A default location for the I access is C which represents services running at European Bioinformatics Institute on top of over hundred EMBOSS analyses (and on top of few others). =head2 Available analyses C can show a list of available analyses (from the given location using given access method). The C<-L> option shows all analyses, the C<-c> option lists all available categories (a category is a group of analyses with similar functionality or processing similar type of data), and finally the C<-C> option shows only analyses available within the given category. Note, that all these functions are provided by module C (respectively, by one of its access-dependent sub-classes). The module has also a I method C which is not used by this script. =head2 Service A C is a higher level of abstraction of an analysis tool. It understands a well defined interface (module C, a fact which allows this script to be independent on the access protocol to various services. The service name must be given by the C<-n> option. This option can be omitted only if you invoked just the C methods (described above). Each service (representing an analysis tool, a program, or an application) has its description, available by using options C<-a> (analysis name, type, etc.), C<-i>, C<-I> (specification of analysis input data, most important are their names), and C<-o>, C<-O> (result names and their types). The option C<-d> gives the most detailed description in the XML format. The service description is nice but the most important is to use the service for invoking an underlying analysis tool. For each invocation, the service creates a C and feeds it with input data. There are three stages: (a) create a job, (b) run the job, and (c) wait for its completion. Correspondingly. there are three options: the C<-b> which just creates (builds) a job, the C<-x> which creates a job and executes it, and finally C<-w> which creates a job, runs it and blocks the client until the job is finished. Always only one of these options is used (so it does not make sense to use more of them, the C priorities them in the order C<-x>, C<-w>, and C<-b>). All of these options take input data from the command-line (see next section about it) and all of them return (internally) an object representing a job. There are many methods (options) dealing with the job objects (see one after next section about them). Last note in this section: the C<-b> option is actually optional - a job is created even without this option when there are some input data found on the command-line. You I to use it, however, if you do not pass any data to an analysis tool (an example would be the famous C service). =head2 Input data Input data are given as name/value pairs, put on the command-line with equal sign between name and value. If the I part starts with an un-escaped character C<@>, it is used as a local file name and the C reads the file and uses its contents instead. Examples: panalysis.PLS -n edit.seqret -w -r sequence_direct_data='tatatctcccc' osformat=embl panalysis.PLS ... sequence_direct_data=@/my/data/my.seq The names of input data come from the C that can be shown by the C<-i> or C<-I> options. The input specification (when using option C<-I>) shows also - for some inputs - a list of allowed values. The specification, however, does not tell what input data are mutually exclusive, or what other constrains apply. If there is a conflict, an error message is produced later (before the job starts). Input data are used when any of the options C<-b>, C<-x>, or C<-w> is present, but option C<-j> is not present (see next section about this job option). =head2 Job Each service (defined by a name given in the C<-n> option) can be executed one or more times, with the same, but usually with different input data. Each execution creates a I. Actually, the job is created even before execution (remember that option C<-b> builds a job but does not execute it yet). Any job, executed or not, is persistent and can be used again later from another invocation of the C script. Unless you explicitly destroy the job using option C<-z>. A job created by options C<-b>, C<-x> and C<-w> (and by input data) can be accessed in the same C invocation using various job-related options, the most important are C<-r> and C<-R> for retrieving results from the finished job. However, you can also re-create a job created by a previous invocation. Assuming that you know the job ID (the C prints it always on the standard error when a new job is created), use option C<-j> to re-create the job. Example: ./panalysis.PLS -n 'edit.seqret' sequence_direct_data=@/home/testdata/my.seq It prints: JOB ID: edit.seqret/bb494b:ef55e47c99:-8000 Next invocation (asking to run the job, to wait for its completion and to show job status) can be: ./panalysis.PLS -n 'edit.seqret' -j edit.seqret/bb494b:ef55e47c99:-800 -w -s And again later another invocation can ask for results: ./panalysis.PLS -n 'edit.seqret' -j edit.seqret/bb494b:ef55e47c99:-800 -r Here is a list of all job options (except for results, they are in the next section): =over 4 =item Job execution and termination There are the same options C<-x> and C<-w> for executing a job and for executing it and waiting for its completion, as they were described above. But now, the options act on a job given by the C<-j> option, now they do not use any input data from the command-line (the input data had to be used when the job was created). Additionally, there is a C<-k> option to kill a running job. =item Job characteristics Other options tell about the job status (C<-s>, about the job execution times (C<-t> and C<-T>, and about the last available event what happened with the job (C<-e>). Note that the event notification is not yet fully implemented, so this option will change in the future to reflect more notification capabilities. =back =head2 Results Of course, the most important on the analysis tools are their results. The results are named (in the similar way as the input data) and they can be retrieved all in one go using option C<-r> (so you do not need to know their names actually), or by specifying (all or some) result names using the C<-R> option. If a result does not exist (either not yet, or the name is wrong) an undef value is returned (no error message produced). Some results are better to save directly into files instead to show them in the terminal window (this applies to the I results, mostly containing images). The C helps to deal with binary results by saving them automatically to local files (actually it is the module C and its submodules who do help with the binary data). So why not to use a traditional shell re-direction to a file? There are two reasons. First, a job can produce more than one result, so they would be mixed together. But mainly, because each result can consist of several parts whose number is not known in advance and which cannot be mixed together in one file. Again, this is typical for the binary data returning images - an invocation can produce many images. The C<-r> option retrieves all available results and treat them as described by the C<'?'> format below. The C<-R> option has a comma-separated list of result names, each of the names can be either a simple name (as specified by the C obtainable using the C<-o> or C<-O> options), or a equal-sign-separated name/format construct suggesting what to do with the result. The possibilities are: =over 4 =item result-name It prints given result on the standard output. =item result-name=filename It saves the given result into given file. =item result-name=@ It saves the given result into a file whose name is automatically invented, and it guarantees that the same name will not be used in the next invocation. =item result=name=@template It saves the given result into a file whose name is given by the C