#! /usr/bin/env perl
use strict;
use warnings;
use Pod::Usage;
use File::Path;
use Cwd;
use Smash::Core;
use Smash::Global qw($SMASH_SCRIPT_NAME);
use Smash::CommandLineParser qw(parse_options check_required_options);
use Smash::Utils::ClusterBLAST;

##############
# Command line definition
##############

# Option specifications handed to parse_options(): the option name plus an
# optional type suffix, or "!" for a flag that can be negated (--notabular).
my @allowed = qw(
	blast=s flavor=s query=s database=s
	pieces=n splitsize=n outdir=s label=s
	subjects=n evalue=f extra_args=s stage=s cpus=n
	bitscore=f wordsize=i
	preprocessor=s postprocessor=s
	makedb tabular! help
);

# Options that check_required_options() must find among the parsed ones.
my @required = qw(blast flavor query database);

##############
# Parse and validate the command line
##############

my ($status, %options) = parse_options(\@allowed);

# --help is honoured first, even if parsing reported a problem:
# print the full manual and exit successfully.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

# Any status other than 1 counts as a parse failure: show usage and exit 2.
pod2usage(-message => "", -exitstatus => 2, -verbose => 1) unless $status == 1;

#print_options(%options);

# On failure, $missing carries the name of the absent required option.
($status, my $missing) = check_required_options(\@required, %options);
unless ($status == 1) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n", -exitstatus => 2, -verbose => 1);
}

# The user has to say how the query is split: by piece count or by size.
unless (defined($options{pieces}) || defined($options{splitsize})) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: --splitsize or --pieces required\n", -exitstatus => 2, -verbose => 1);
}

# NOTE(review): a supplied --splitsize always resets pieces to 2, even when
# --pieces was passed as well. Presumably 2 only signals "more than one
# piece" downstream -- confirm against Smash::Utils::ClusterBLAST.
$options{pieces} = 2 if defined($options{splitsize});

##############
# Handle command line options
# (the presence of the required arguments has already been verified
# by check_required_options)
##############

##############
# Get the label
##############

# A query that is not an existing plain file is handed to
# Smash::Core::parse_concat_id() and split into its id components.
unless (-f $options{query}) {
	my $smash = Smash::Core->new()->init();
	my (undef, $metagenome, $assembly, $genepred) = $smash->parse_concat_id($options{query});

	# A label given on the command line is kept; otherwise use the first
	# true value among $genepred, $assembly and $metagenome.
	$options{label} ||= $genepred || $assembly || $metagenome;
}

# Fallback values for options that were not given on the command line.
my %defaults = (
	evalue     => 0.1,
	subjects   => 1000,
	cpus       => 1,
	tabular    => 1,
	extra_args => "",
	stage      => "tmp",
);

# Only undefined entries are filled in, so an explicitly supplied value that
# happens to be false (0 or "") is left untouched.
for my $name (grep { !defined($options{$_}) } keys %defaults) {
	$options{$name} = $defaults{$name};
}

# Choose the per-node staging directory for the BLAST database.
# If you want it ultrafast, put the database in shared memory
# (--stage=memory, under /dev/shm); every other value stages under /tmp.
# The chosen directory is stored in $options{localdisk}.

my $node_location;
my $user = getpwuid($<);

# getpwuid() returns undef in scalar context when the real uid has no passwd
# entry (e.g. inside a container). Fall back to the numeric uid so that the
# staging path stays per-user and no undef is interpolated into it.
$user = $< unless defined($user) && length($user);

if ($options{stage} eq "memory") {
	$node_location = "/dev/shm/$user/in";
} else {
	# Unknown stage names are reported and treated like "tmp";
	# $options{stage} itself is left as the user gave it.
	if ($options{stage} ne "tmp") {
		warn "Stage option $options{stage} not recognized. Using 'tmp' instead!\n";
	}
	$node_location = "/tmp/$user/in";
}
$options{localdisk} = $node_location;

# Hand every option over to the cluster runner as a flat list of
# NAME => value pairs, with the option names upper-cased.
my @cluster_args;
for my $name (keys %options) {
	push @cluster_args, uc($name), $options{$name};
}
Smash::Utils::ClusterBLAST->run_blast_cluster(@cluster_args);

exit(0);

=head1 NAME

	runParallelBlast.pl - Run BLAST on clusters.

Splits a query file into small files, submits to the cluster and combines the results.

=head1 SYNOPSIS

	runParallelBlast.pl [options]

=head1 OPTIONS

=head2 Required options

=over 4

=item B<C<--flavor=name>>

WU-BLAST (WU), NCBI-BLAST (NCBI) or NCBI-BLAST+ (NCBI+) flavors of BLAST

=item B<C<--blast=program>>

BLAST program (blastn|blastp|blastx|tblastn|tblastx)

=item B<C<--database=prefix>>

name of the blast database file

=item B<C<--query=file>>

query file

=item B<C<--label=name>>

Output file will be stored as <out_dir>/<label>.<program>

=item B<C<--outdir=dir>>

directory to store the output file

=item B<C<--splitsize=num>>

(approximate) number of basepairs in each smaller query file.
One of C<--splitsize> or C<--pieces> must be given.

=item B<C<--outfmt=format>>

NCBI tabular output format string (default: "7 std")

=item B<C<--help>>

print this help

=back

=head2 BLAST options

=over 4

=item B<C<--wordsize=num>>

Word size for BLAST seeding step (default: BLAST defaults)

=item B<C<--evalue=num>>

E-value threshold for results to be included, passed to blast (default: 0.1)

=item B<C<--subjects=num>>

maximum number of subjects per query to report, passed to blast (default: 1000)

=item B<C<--notabular>>

disable tabular output of BLAST (default: enabled)
C<--notabular> only works when --pieces=1

=item B<C<--extra_args=string>>

additional arguments passed to blast as they are (use quotes if you use multiple parameters)

=item B<C<--cpus=num>>

number of cpus to use for every subjob (default: 1)

=item B<C<--stage=mode>>

local staging location (memory or tmp) in the node to copy the blast database (default: tmp)

=back

=head2 Post processing Options:

=over 4

=item B<C<--bitscore=num>>

bit score threshold for results to be included (default: none)

=item B<C<--preprocessor=command>>

pre processing command. For example, if you want blast to be run by another script
that takes blast commands as argument (such as MPblast),
then you should run it as:

	runParallelBlast.pl [...] --notabular --preprocessor="mpblast.pl -s -w -b"

This will make a final command such as:

	mpblast.pl -s -w -b blastn database query option=value > output.blastn

=item B<C<--postprocessor=command>>

post processing command. For example, if you want to pipe blast output through a script
that performs conversion to cigar format, then you should run it as:

	runParallelBlast.pl [...] --notabular --postprocessor="| blast2cigar"

This will make a final command such as:

	blastn database query option=value | blast2cigar > output.blastn

=back
