#! /usr/bin/env perl
use strict;
use warnings;
use Pod::Usage;
use File::Basename qw(basename);
use File::Path;
use Cwd;
use Smash::Core;
use Smash::Global qw($SMASH_SCRIPT_NAME);
use Smash::CommandLineParser qw(parse_options check_required_options);
use Smash::Utils::BLAST;

##############
# Set up command line parsing
##############

# Overview of the script body: parse and validate the command line, build a
# Smash::Utils::BLAST object from the options, derive the output file name,
# run the BLAST command with its standard output redirected to that file,
# and exit with a status that reflects whether the command succeeded.

my @allowed  = qw(blast=s flavor=s query=s database=s label=s subjects=n evalue=f extra_args=s cpus=n makedb tabular! help); # arguments I expect
my @required = qw(blast flavor query database);                       # arguments I require

##############
# Parse command line options
##############

my $status;
my $missing;
my %options;

($status, %options) = parse_options(\@allowed);

# --help wins over everything else, including a failed parse.
if ($options{help}) {
	pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($status != 1) {
	pod2usage(-message => "", -exitstatus => 2, -verbose => 1);
}
#print_options(%options);
($status, $missing) = check_required_options(\@required, %options);
if ($status != 1) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n", -exitstatus => 2, -verbose => 1);
}

##############
# Handle command line options
# (except checking for presence of required args, which has already
# been done by check_required_options)
##############

# Fill in a default only where the user did not supply a value, so that an
# explicit 0 or --notabular is respected.
my %defaults = (evalue => 0.1, subjects => 10000, cpus => 1, tabular => 1, extra_args => "");
foreach my $key (keys %defaults) {
	$options{$key} = $defaults{$key} unless defined($options{$key});
}

##############
# Make the blast object
##############

# The BLAST constructor is called with upper-cased option names as keys.
my %blast_params = map { ( uc($_) => $options{$_} ) } keys %options;
my $blast_obj    = Smash::Utils::BLAST->new(%blast_params);

my $opt_blastdb = $options{database};
my $opt_query   = $options{query};

##############
# Get the output
##############

# If it is not a file, blast_obj tells you where to place the output.
# If it is a file, just keep it in cwd.

my $output_dir = $blast_obj->output_dir || Cwd::cwd();
my $output;
if (defined($options{label}) && length($options{label})) {
	# Test definedness/length rather than truth so that a label of "0" works.
	$output = join(".", $options{label}, $blast_obj->blast);
} else {
	# A database may be given as a path prefix (e.g. /data/db/protdb). Only
	# its last component goes into the file name; embedding the directory
	# part would point the redirect into a directory that does not exist.
	# The query component is left exactly as given so that existing
	# invocations keep writing to the same place.
	$output = join(".", $opt_query, basename($opt_blastdb), $blast_obj->blast);
}
$output = "$output_dir/$output";

##############
# Make the command
##############

# Return $text in a form that is safe to use as a single word in a shell
# command. Text made only of unremarkable file-name characters is returned
# unchanged; anything else is wrapped in single quotes, with embedded single
# quotes escaped.
sub _shell_quote {
	my ($text) = @_;
	return $text if $text =~ m{\A[\w./+:,@%=-]+\z};
	$text =~ s/'/'\\''/g;
	return "'$text'";
}

my $command_line = $blast_obj->get_command_line();
$command_line .= " > " . _shell_quote($output);
print STDERR "Running: $command_line\n";
$status = $blast_obj->smash->execute($command_line);

# A status of 0 is treated as success; anything else is reported on STDERR
# and reflected in the exit code so that callers can detect the failure.
if ($status == 0) {
	print "<output>success</output>\n";
	exit(0);
}
print STDERR "$SMASH_SCRIPT_NAME: command failed with status $status\n";
exit(1);

=head1 NAME

	runBlast.pl - A wrapper to run BLAST.

=head1 SYNOPSIS

	runBlast.pl [options]

=head1 OPTIONS

=over 4

=item C<--flavor=name>

Flavor of BLAST to run: WU-BLAST (WU), NCBI-BLAST (NCBI) or NCBI-BLAST+ (NCBI+). This option is required.

=item C<--blast=program>

BLAST program (blastn|blastp|blastx|tblastn|tblastx). This option is required.

=item C<--database=name>

BLAST database (an actual blast database or one of the databases known to SMASH). This option is required.

=item C<--query=string>

BLAST query (a file, gene prediction id, assembly id or metagenome id). This option is required.

=item C<--label=name>

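Prefix for the output file - the final file will be <name>.<program>. Without
this option the file is named <query>.<database>.<program>. When the query is a
file, the output is written to the current working directory; otherwise SMASH
chooses the output directory.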

=item C<--evalue=num>

E-value threshold for results to be included, passed to blast (default: 0.1)

=item C<--subjects=num>

Maximum number of subjects per query to report, passed to blast (default: 10000)

=item C<--(no)tabular>

Whether to use tabular output of BLAST (default: true)

=item C<--extra_args=string>

Additional parameters passed to blast exactly as given. Quote the whole string if it contains more than one parameter.

=item C<--cpus=num>

Number of cpus to use (default: 1)

=back

=head1 DESCRIPTION

=head2 BLAST Query

If you specify a file using C<--query>, then that will be used as the query. If you specify
a gene prediction id, then the protein sequences from that gene prediction will be used. If
you specify an assembly id, then the contig sequences from that assembly will be used. If you
specify a metagenome id, then the read sequences from that metagenome will be used.

=head2 BLAST database

If you specify the prefix of an existing blast database using C<--database>, then that will 
be used as the database. 

=over 4

=over 4

=item Note 

You should not specify the file extension. For example,
for an NCBI blast formatted database named B<protdb>, there will be at least three files 
called F<protdb.phr>, F<protdb.pin> and F<protdb.psq>. You should only mention B<protdb>
and not B<protdb.phr> or such.

=back

=back

If you want to use a database that SMASH knows about, such as the eggNOG protein database or the
reference genomes database, which are located in the subdirectory called F<external> in 
the F<data> directory, you can just specify the name. Right now, SMASH knows about B<eggnog2>
and B<reference_genomes.20100704>.
