#! /usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use Smash::Core;
use Smash::Global qw($SMASH_SCRIPT_NAME $SMASH_SCRIPT_LOCATION);
use Smash::CommandLineParser qw(parse_options check_required_options);
use File::Temp;

##############
# Command line definition
##############

# Options this script accepts, and the subset that must always be supplied.
my @allowed  = ('genepred=s', 'external=s', 'eggnog=s', 'flavor=s', 'help');
my @required = ('eggnog', 'flavor');

##############
# Command line evaluation
##############

# These stay file-scoped: the rest of the script reads %options and reuses $status.
my $status;
my $missing;
my %options;

# parse_options() hands back a status followed by the option hash;
# any status other than 1 is treated as a usage error below.
($status, %options) = parse_options(\@allowed);

# --help wins over parse errors: show the full manual and exit cleanly.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

pod2usage(-message => "", -exitstatus => 2, -verbose => 1) if $status != 1;

# Make sure every mandatory option was given; $missing names the first absent one.
($status, $missing) = check_required_options(\@required, %options);
pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n", -exitstatus => 2, -verbose => 1) if $status != 1;

##############
# Validate and normalise the option values
##############

# Flavor is compared in upper case, the eggNOG version is used in lower case.
my $genepred     = $options{genepred};
my $eggnog       = lc $options{eggnog};
my $flavor       = uc $options{flavor};

# "WU-BLAST" is accepted as a synonym for "WU".
if ($flavor eq "WU-BLAST") {
	$flavor = "WU";
}

# Only these two flavors are supported.
my %known_flavor = (WU => 1, NCBI => 1);
if (!$known_flavor{$flavor}) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: flavor must be WU or NCBI\n", -exitstatus => 2, -verbose => 1);
}

# At least one input source is needed: a gene prediction or an external BLAST file.
if (!($options{genepred} || $options{external})) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: --genepred or --external must be specified\n", -exitstatus => 2, -verbose => 1);
}

##############
# do the mapping
##############

# Filled in by whichever input mode is selected below and used by the
# mapping steps that follow.
my $smash;
my $blast_file;
my $og_map_file;
my $error_file;

# Get the right input files

if ($options{external}) {
	# External mode: the BLAST output is a user-supplied file and the
	# mapping/error files are written next to it. --external takes
	# precedence when both --external and --genepred are given.
	$smash        = Smash::Core->new();
	$smash->init();
	$blast_file   = $options{external};
	$og_map_file  = "$blast_file.eggnogmapping.txt";
	$error_file   = "$blast_file.eggnogmapping.err";
} else {
	# Gene prediction mode: the BLAST output is expected inside the gene
	# prediction directory as <genepred>.<eggnog>.blastp.
	$smash        = Smash::Core->new(GENEPRED => $genepred);
	$smash->init();

	# NOTE(review): the id is not used below; the lookup is kept in case it
	# validates the gene prediction name -- confirm before removing.
	my $genepred_id  = $smash->get_id_by_name("gene_prediction", $genepred);
	my $genepred_dir = $smash->genepred_dir($genepred);

	$blast_file   = "$genepred_dir/$genepred.$eggnog.blastp";
	$og_map_file  = "$genepred_dir/$genepred.eggnogmapping.txt";

	# NOTE(review): unlike the mapping file, the error log has no directory
	# prefix, so it lands in the current working directory -- confirm intended.
	$error_file   = "$genepred.eggnogmapping.err";
}

# SMASH files used for og mapping

my $external_dir = $smash->get_smash_conf_value("data_dir")."/external";
my $orthgrp_file = "$external_dir/${eggnog}_final_orthgroups.txt";
my $plength_file = "$external_dir/${eggnog}_protein_lengths.txt";

# Quote a single word for use in a shell command line. Words made up only
# of characters that need no quoting are returned untouched, so the commands
# built for ordinary paths are exactly what they were before; anything else
# (spaces, globs, ';', quotes, ...) is wrapped in single quotes with embedded
# single quotes escaped.
my $sh_quote = sub {
	my ($word) = @_;
	return $word if $word =~ m{\A[A-Za-z0-9_./+=:,\@%~-]+\z};
	my $escaped = $word;
	$escaped =~ s/'/'\\''/g;
	return "'$escaped'";
};

my $q_find_best_hit = $sh_quote->("$SMASH_SCRIPT_LOCATION/find_best_hit.pl");
my $q_og_mapping    = $sh_quote->("$SMASH_SCRIPT_LOCATION/og_mapping.py");
my $q_blast         = $sh_quote->($blast_file);
my $q_besthit       = $sh_quote->("$blast_file.besthit");
my $q_orthgrp       = $sh_quote->($orthgrp_file);
my $q_plength       = $sh_quote->($plength_file);
my $q_error         = $sh_quote->($error_file);
my $q_preliminary   = $sh_quote->("$og_map_file.preliminary");
my $q_og_map        = $sh_quote->($og_map_file);

# make best hit file
# ($flavor needs no quoting: it was validated to be WU or NCBI.)

my $besthit_status = $smash->execute("$q_find_best_hit -i $q_blast -f $flavor > $q_besthit");

# run og mapping
# The command line is echoed to STDERR and also written as the first line of
# the error file, so the log records what was run.

my $command = "$q_og_mapping -b $q_blast -t $q_besthit -g $q_orthgrp -p $q_plength -f $flavor";
print STDERR "#$command\n";
my $banner = $sh_quote->("## $command ##");
$status = $smash->execute("(echo $banner; $command) 2>$q_error 1>$q_preliminary");

# clean up
# Drop lines containing ' NONE ' from the preliminary mapping. The pattern is
# a fixed string, so plain grep is enough (egrep is obsolescent). The status
# is deliberately ignored: grep exits non-zero when no line is selected.

$smash->execute("grep -v ' NONE ' $q_preliminary > $q_og_map");

# done!

$smash->finish();

# Report success only when both the best-hit step and the mapping step
# succeeded; execute() is assumed to return 0 on success, as the original
# check on the mapping step did.
if ($besthit_status == 0 && $status == 0) {
	print "<output>success</output>\n";
}

# NOTE(review): the exit status is 0 even on failure; callers appear to rely
# on the <output>success</output> marker instead -- confirm before changing.
exit(0);

=head1 doOrthologMapping.pl

Script to map proteins to eggNOG orthologous groups (OGs)
using BLASTP results against eggNOG.

=head1 Synopsis

	doOrthologMapping.pl [options]

=head1 Options

=over 4

=item B<C<--genepred>>

Name of the gene prediction whose proteins should be mapped.
Either B<--genepred> or B<--external> must be specified.

=item B<C<--external>>

External file containing BLAST output against the eggNOG database.
Either B<--genepred> or B<--external> must be specified.

=item B<C<--eggnog>>

Version of the eggNOG database to be used (e.g. C<eggnog2>). Required.

=item B<C<--flavor>>

BLAST flavor used to generate the alignments: C<NCBI>, C<WU> or C<WU-BLAST>
(case-insensitive). Required.

=item B<C<--help>>

Prints this manual.

=back

=head1 Description

B<doOrthologMapping.pl> is a wrapper script that maps predicted proteins 
in a metagenome to eggNOG orthologous groups. If your BLASTP results were
generated by WU-BLAST, it can be run as:

	doOrthologMapping.pl --flavor=WU-BLAST --genepred=MC20.MG10.AS2.GP1 --eggnog=eggnog2

It can also provide orthologous group mappings to proteins outside of Smash
if you have BLAST outputs after searching against the eggNOG protein
database. If your NCBI BLASTP output file is in F<mysample.eggnog.blastp>, then
you can run

	doOrthologMapping.pl --flavor=NCBI --external=mysample.eggnog.blastp --eggnog=eggnog2

This will create a file called F<mysample.eggnog.blastp.eggnogmapping.txt>
that contains the eggNOG orthologous group mapping information per protein
sequence in your dataset.

=head1 Required files

B<doOrthologMapping.pl> requires the following 
files (assuming eggnog version 2; replace 2 with the correct version):

=over 4

=item 1. BLAST results

File containing results from BLASTing the predicted proteins against 
eggNOG proteins. 

B<Note:>
Currently SMASH supports tabular BLAST outputs from WU-BLAST 
(run using C<"-mformat=2">) and NCBI BLAST (run using C<"-m 8">).

This file is expected to be in the gene prediction directory. Once you have
the BLAST results file, please move it to that directory. To see the
location of this directory for a given gene prediction (e.g. B<MC10.MG23.AS1.GP2>), run:

	perl showLocations.pl --item=MC10.MG23.AS1.GP2

You should place F<MC10.MG23.AS1.GP2.eggnog2.blastp> in that directory.

=item 2. eggNOG orthologous group information and protein length files

These files are automatically downloaded from the SMASH website when you install SMASH. They
should be installed in your repository under the data directory. To see where they are,
run:

	perl showLocations.pl --external

You should find the following files there: F<eggnog2_final_orthgroups.txt>, 
F<eggnog2_protein_lengths.txt>

=back

=cut
