#! /usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use Smash::Global qw($SMASH_SCRIPT_NAME);
use Smash::CommandLineParser qw(parse_options check_required_options);
use Smash::Analyses::GenePredictorWrapper qw(run_predictor);

##############
# Command line option specification
##############

# Option specifications handed to parse_options(); one entry per line so
# that additions and removals show up cleanly in diffs.
my @allowed = qw(
	assembly=s
	predictor=s
	genepred=s
	version=s
	pkg_dir=s
	organism=s
	translation_table=s
	fasta_file=s
	output_dir=s
	self_train
	parallelize
	cluster=s
	label=s
	merge
	pieces=i
	help
);

# No option is unconditionally required: which ones are needed depends on
# the combination given (--merge, --assembly or --genepred) and is
# validated after parsing.
my @required;

##############
# Parse command line options
##############

# parse_options() hands back a status followed by the option name/value
# pairs; a status other than 1 is treated as a parse failure below.
my ($parse_status, %options) = parse_options(\@allowed);

# --help is honoured before the status is examined, so the full manual is
# shown even if the rest of the command line did not parse cleanly.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

pod2usage(-message => "", -exitstatus => 2, -verbose => 1) if $parse_status != 1;

#print_options(%options);

# check_required_options() returns a status and, when the status is not 1,
# the value that is reported to the user as the missing argument.
my ($required_status, $missing) = check_required_options(\@required, %options);
if ($required_status != 1) {
	pod2usage(
		-message    => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n",
		-exitstatus => 2,
		-verbose    => 1,
	);
}

##############
# Validate option combinations
# (the unconditionally required options were already checked above via
# check_required_options; the rules here depend on which options were given)
##############

# Every failure below is reported the same way: the message, followed by
# the usage text, and exit status 2.
my @usage_error = (-exitstatus => 2, -verbose => 1);

if ($options{merge}) {
	# --merge needs both --pieces and --genepred.
	if (!$options{pieces} || !$options{genepred}) {
		pod2usage(-message => "$SMASH_SCRIPT_NAME: --merge requires --genepred and --pieces to be specified\n", @usage_error);
	}
} else {
	# Without --merge a predictor must always be named.
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --predictor\n", @usage_error)
		if !$options{predictor};

	# Reject the command line when both or neither of --assembly and
	# --genepred were supplied (tested with defined(), not truthiness).
	if (!(defined($options{assembly}) xor defined($options{genepred}))) {
		pod2usage(-message => "$SMASH_SCRIPT_NAME: One (and exactly one) of --assembly or --genepred must be specified\n", @usage_error);
	}

	# --genepred must be accompanied by all three companion options.
	if ($options{genepred}) {
		my @unset = grep { !$options{$_} } qw(fasta_file label output_dir);
		pod2usage(-message => "$SMASH_SCRIPT_NAME: --genepred requires --fasta_file, --label and --output_dir to be specified\n", @usage_error)
			if @unset;
	}

	# --parallelize is only accepted together with --assembly.
	if ($options{parallelize} && !$options{assembly}) {
		pod2usage(-message => "$SMASH_SCRIPT_NAME: --parallelize requires --assembly to be specified\n", @usage_error);
	}
}

####
# All option checks passed: hand the complete option set to run_predictor
# (imported from Smash::Analyses::GenePredictorWrapper).
# NOTE(review): an earlier comment here said a predictor instance is made
# directly when a fasta file is given and a wrapper instance otherwise --
# presumably that choice now happens inside run_predictor; confirm.
####

run_predictor(%options);

exit 0;

=head1 NAME

doGenePrediction.pl - Make gene predictions on a sequence assembly in Smash

=head1 SYNOPSIS

	doGenePrediction.pl [options]

=head1 OPTIONS

=over 4

=item B<C<--predictor>>

name of the gene predictor (GeneMark|MetaGene) (required unless --merge is specified)

=item B<C<--version>>

run the specified version of the program, if available

=item B<C<--assembly>>

assembly id (either --assembly or --genepred must be specified)

=item B<C<--genepred>>

gene prediction id (either --assembly or --genepred must be specified)

=item B<C<--fasta_file>>

fasta file 

=item B<C<--output_dir>>

directory where the output files should be stored

=item B<C<--label>>

label for trained parameters (used with --self_train; required with --genepred)

=item B<C<--self_train>>

train parameters using sequences (default: false)

=item B<C<--parallelize>>

parallelize gene prediction by breaking the input into smaller files

=item B<C<--cluster>>

cluster to run the parallel jobs for prediction

=item B<C<--pkg_dir>>

location where the gene predictor <program> is installed

=item B<C<--help>>

Prints this manual.

=back

Exactly one of --assembly or --genepred must be specified, unless --merge is given, in which case --genepred and --pieces are required.

=head1 DESCRIPTION

B<doGenePrediction.pl> is a wrapper script to run gene prediction on a given metagenome assembly.

A normal execution of this script would be:

	doGenePrediction.pl --assembly=MC20.MG1.AS1 --predictor=GeneMark \
	    --version=2.6r --self_train

When you parallelize this run using B<C<--parallelize>>, it will generate two shell scripts that
should be run separately -- the predictor script and the loader script. First you run the predictor
script, potentially on a cluster where each line could go to a different host and they can all be
run simultaneously. When they are all done, then you run the loader script.

For example, if you ran:

	doGenePrediction.pl --assembly=MC20.MG1.AS1 --predictor=GeneMark \
	    --version=2.6r --self_train --parallelize

it could generate two shell script files: F<MC20.MG1.AS1.GP1.pred.sh> and F<MC20.MG1.AS1.GP1.load.sh>.
If you have a script C<qsub_lines> that submits each line in a file as a job to C<qsub>,
then you would run:

	qsub_lines MC20.MG1.AS1.GP1.pred.sh

and when all the jobs finish, you would run:

	qsub_lines MC20.MG1.AS1.GP1.load.sh

If you want Smash to manage it completely, you could specify the name of a cluster where these jobs
should be sent to. For example, assuming you have an SGE grid where you can submit jobs to,

	doGenePrediction.pl --assembly=MC20.MG1.AS1 --predictor=GeneMark \
	    --version=2.6r --self_train --parallelize --cluster=SGE

will submit the jobs to the default SGE queue for the execution host. Two jobs will be submitted: one
for gene prediction, and one for loading the gene predictions. The loader job will only start after 
the prediction jobs finish.

=cut
