#! /usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use Smash::Core;
use Smash::Global qw($SMASH_SCRIPT_NAME $SMASH_SCRIPT_LOCATION);
use Smash::CommandLineParser qw(parse_options check_required_options);
use File::Temp;

##############
# Set up command line parsing
##############

# Option specifications handed to parse_options(); entries ending in "=s"
# carry a value, "help" is a plain switch.
my @allowed = (
	'genepred=s',
	'external=s',
	'kegg=s',
	'flavor=s',
	'help',
);

# Options that check_required_options() insists on.
my @required = ('kegg', 'flavor');

##############
# Parse command line options
##############

# parse_options() hands back a status flag followed by the option key/value
# pairs. Both $status and %options are file-scoped because later sections
# of the script read (and, for $status, overwrite) them.
my ($status, %options) = parse_options(\@allowed);

# --help is honoured before the parse status is inspected: print the whole
# manual and leave with exit status 0.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

# Any status other than 1 counts as a failed parse.
pod2usage(-message => "", -exitstatus => 2, -verbose => 1) if $status != 1;

#print_options(%options);

# Verify the mandatory options; on failure the second return value is the
# name of the option that was missing.
my $missing;
($status, $missing) = check_required_options(\@required, %options);
if ($status != 1) {
	pod2usage(
		-message    => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n",
		-exitstatus => 2,
		-verbose    => 1,
	);
}

##############
# Check the options
##############

# Pull the option values into named variables, normalising case so that the
# comparisons below can be exact.
my $genepred = $options{genepred};
my $kegg     = lc($options{kegg});
my $flavor   = uc($options{flavor});

# "WU-BLAST" is accepted as another spelling of "WU".
if ($flavor eq "WU-BLAST") {
	$flavor = "WU";
}

# Only these two flavors are understood further down.
my %known_flavor = map { $_ => 1 } qw(WU NCBI);
if (!$known_flavor{$flavor}) {
	pod2usage(
		-message    => "$SMASH_SCRIPT_NAME: flavor must be WU or NCBI\n",
		-exitstatus => 2,
		-verbose    => 1,
	);
}

# An input source is mandatory: either a gene prediction or an external file.
my $has_input = $options{genepred} || $options{external};
if (!$has_input) {
	pod2usage(
		-message    => "$SMASH_SCRIPT_NAME: --genepred or --external must be specified\n",
		-exitstatus => 2,
		-verbose    => 1,
	);
}

##############
# do the mapping
##############

# File-scoped state filled in below and used when the mapping command is run:
#   $smash       - Smash::Core handle
#   $blast_file  - BLAST report that is fed to the filter
#   $og_map_file - where the mapping (stdout of the command) is written
#   $error_file  - where stderr of the command is written
my $smash;
my $blast_file;
my $og_map_file;
my $error_file;

# Get the right input files

if ($options{external}) {

	# External mode (takes precedence when both options are given): the
	# BLAST report is the user-supplied file and both output files are
	# created next to it, using its name as prefix.
	$smash       = Smash::Core->new();
	$smash->init();
	$blast_file  = $options{external};
	$og_map_file = "$blast_file.keggmapping.txt";
	$error_file  = "$blast_file.keggmapping.err";
} else {

	# Gene prediction mode: the BLAST report and the mapping file live in
	# the gene prediction directory reported by Smash.
	$smash = Smash::Core->new(GENEPRED => $genepred);
	$smash->init();

	# The id itself is not used afterwards; the lookup is kept because it
	# presumably validates that the gene prediction exists -- TODO confirm.
	my $genepred_id  = $smash->get_id_by_name("gene_prediction", $genepred);
	my $genepred_dir = $smash->genepred_dir($genepred);

	$blast_file  = "$genepred_dir/$genepred.$kegg.blastp";
	$og_map_file = "$genepred_dir/$genepred.keggmapping.txt";

	# NOTE(review): unlike the mapping file, the error file has no directory
	# prefix and therefore ends up in the current working directory --
	# confirm this is intended.
	$error_file  = "$genepred.keggmapping.err";
}

# SMASH files used for og mapping

# NOTE(review): $external_dir is not referenced anywhere else in this script.
my $data_dir     = $smash->get_smash_conf_value("data_dir");
my $external_dir = "$data_dir/external";

# run og mapping

# Columns that cut(1) keeps from the filtered report, per BLAST flavor.
# Presumably these are the query/subject names plus the alignment
# coordinates in each tabular layout -- TODO confirm against
# filterBlastReport.pl.
my %cut_fields_for = (
	WU   => "1,2,18,19,21,22",
	NCBI => "1,2,7,8,9,10",
);
my $cut_fields = $cut_fields_for{uc($flavor)};

# Filter the BLAST report, then trim it down to the selected columns.
# NOTE(review): the file names are interpolated into a shell command without
# quoting, so paths containing spaces or shell metacharacters will break it.
my $filter_script = "$SMASH_SCRIPT_LOCATION/filterBlastReport.pl";
my $pipeline      = "$filter_script --flavor=$flavor --bits=60 --sbjct=1 $blast_file | cut -f$cut_fields";
print STDERR "#$pipeline\n";

# The command line itself is echoed as the first line of the mapping file;
# stdout goes to the mapping file and stderr to the error file.
$status = $smash->execute("(echo '## $pipeline ##'; $pipeline) 2>$error_file 1>$og_map_file");

# done!

$smash->finish();

# A status of 0 from the mapping command is treated as success and is the
# only case in which the success marker is printed on stdout.
if ($status == 0) {
	print "<output>success</output>\n";
	exit(0);
}

# Previously the script exited with 0 even when the mapping command failed,
# so callers could not detect the failure from the exit code. The status is
# collapsed to 1 rather than passed through, because a system()-style value
# such as 256 would wrap around to 0 when handed to exit().
print STDERR "$SMASH_SCRIPT_NAME: KEGG mapping command failed (status $status); see $error_file\n";
exit(1);

=head1 doKeggMapping.pl

Script to map proteins to KEGG orthologous groups (KOs)
using BLASTP results against KEGG protein database.

=head1 Synopsis

	doKeggMapping.pl [options]

=head1 Options

=over 4

=item B<C<--genepred>>

Name of the gene prediction whose proteins should be mapped. Either this
option or C<--external> must be specified.

=item B<C<--external>>

External file containing BLAST output against KEGG database.

=item B<C<--kegg>>

Version of the KEGG database to be used, e.g. C<kegg57>. Required.

=item B<C<--flavor>>

BLAST flavor used to generate the alignments: C<NCBI> or C<WU>
(C<WU-BLAST> is accepted as a synonym for C<WU>). The value is
case-insensitive. Required.

=item B<C<--help>>

Prints this manual.

=back

=head1 Description

B<doKeggMapping.pl> is a wrapper script that maps predicted proteins 
in a metagenome to KEGG orthologous groups. It is run as:

	doKeggMapping.pl --genepred=MC20.MG10.AS2.GP1 --kegg=kegg57 --flavor=NCBI

It can also provide orthologous group mappings to proteins outside of Smash
if you have BLAST outputs after searching against the KEGG protein
database. If your BLAST output file is in F<mysample.kegg.blastp>, then
you can run

	doKeggMapping.pl --external=mysample.kegg.blastp --kegg=kegg57 --flavor=NCBI

This will create a file called F<mysample.kegg.blastp.keggmapping.txt> 
that contains the KEGG orthologous group mapping information per protein
sequence in your dataset.

=head1 Required files

B<doKeggMapping.pl> requires the following 
files:

=over 4

=item 1. BLAST results

File containing results from BLASTing the predicted proteins against 
KEGG proteins. 

B<Note:>
Currently SMASH supports tabular BLAST outputs from WU-BLAST 
(run using C<"-mformat=2">) and NCBI BLAST (run using C<"-m 8">).

This file is expected to be in the gene prediction directory. Once you have
the results of BLAST, please move it to that directory. To see the
location of this directory for a given gene prediction (e.g. B<MC10.MG23.AS1.GP2>), run:

	perl showLocations.pl --item=MC10.MG23.AS1.GP2

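You should place the BLAST results in that directory, in a file named after
the gene prediction and the lower-cased value of C<--kegg>. For example, with
C<--kegg=kegg57> the file must be called F<MC10.MG23.AS1.GP2.kegg57.blastp>.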

=back

=cut
