#! /usr/bin/env perl

use strict;
use warnings;
use Fcntl;
use File::Temp;
use File::Path;
use File::Copy;
use File::Glob;
use Smash::Core;
use Smash::Global qw($SMASH_SCRIPT_NAME $SMASH_SCRIPT_LOCATION);
use Smash::CommandLineParser qw(parse_options check_required_options print_options);
use Smash::Utils::WUBLAST;
use Smash::Utils::GFFlite;
use Smash::Analyses::GenePredictor;
use Pod::Usage;

##############
# ??DEBUG?? 
##############

# Set $DEBUG to a true value to print the temporary file names and to keep
# the temporary files on disk; $UNLINK is always the opposite of $DEBUG.
my $DEBUG  = 0;
my $UNLINK = $DEBUG ? 0 : 1;


##############
# Set up command line parsing
##############

# Option specifications handed to parse_options().
my @allowed = qw(
	metagenome=s
	memory=n
	db=s
	strict!
	prefilter=s
	fasta=s
	cpus=n
	help
);

##############
# Parse command line options
##############

my $status;
my $missing;
my %options;

($status, %options) = parse_options(\@allowed);

# --help is honoured first, before the parse status is looked at.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

# Anything other than status 1 from parse_options() ends in the usage text.
pod2usage(-message => "", -exitstatus => 2, -verbose => 1) unless $status == 1;

#print_options(%options);

# At least one input source is mandatory.
unless ($options{metagenome} || $options{fasta}) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: --metagenome or --fasta must be specified!\n", -exitstatus => 2, -verbose => 1);
}

# --prefilter accepts one of two keywords (case-insensitive) or the name of
# an existing file.
if ($options{prefilter}) {
	my $keyword    = lc($options{prefilter});
	my $is_keyword = ($keyword eq "smash" || $keyword eq "rrna_hmm_only");
	if (!$is_keyword && !-f $options{prefilter}) {
		pod2usage(-message => "$SMASH_SCRIPT_NAME: 16S database file ${options{prefilter}} does not exist!\n", -exitstatus => 2, -verbose => 1);
	}
}

############################################
# fix options
############################################

# Defaults: 4000 MB for the JVM, strict BLAST parameters unless --nostrict.
$options{memory} ||= 4000;
$options{strict}   = 1 if !defined($options{strict});

############################################
# Begin Execution
############################################

# Look up the install locations of the external tools. init() is called
# without a metagenome for this first session.
my $smash        = Smash::Core->new();
$smash->init();
my ($rdp_dir, $rdp_version) = $smash->software_dir("rdp_classifier", "current");
my ($rna_hmm_dir) = $smash->software_dir("meta_rna", "current");
my ($hmmer3_dir)  = $smash->software_dir("hmmer", "current");
$smash->finish();

my $prefix;         # common path prefix of the output files
my $input_fasta;    # sequences to search for 16S (only set with --prefilter)
my $class_file;     # output file of the RDP classifier
my $ssu_fasta;      # sequences handed to the classifier
my $combined_fasta = File::Temp->new(UNLINK => $UNLINK);
my $searched_ssu   = File::Temp->new(UNLINK => $UNLINK);
my $fasta_file     = $options{fasta};
my $fasta_db_16S   = $options{prefilter};
my $cpus           = $options{cpus} || 2;

# --prefilter is optional: only inspect it when it was given, since lc() on
# undef raises an "uninitialized value" warning.
if (defined($fasta_db_16S) && lc($fasta_db_16S) eq "smash") {
	# NOTE(review): this is called after $smash->finish(); presumably the
	# configuration is still readable at that point -- confirm.
	$fasta_db_16S = $smash->get_smash_conf_value("data_dir")."/external/full16S.v4.fa";
}

if ($options{metagenome}) {
	$smash->init($options{metagenome});
	my $metagenome   = $smash->metagenome;
	my $analyses_dir = $smash->analyses_dir($metagenome);
	mkpath "$analyses_dir";

	# Concatenate all fasta files of the metagenome into one temporary file.
	# With an empty file list "cat" would block reading STDIN, so stop early.
	my @metagenome_fastas = $smash->fasta_files($metagenome);
	if (!@metagenome_fastas) {
		$smash->finish();
		die "$SMASH_SCRIPT_NAME: no fasta files found for metagenome $metagenome\n";
	}

	# The temp file name is passed as a sprintf argument rather than being
	# interpolated into the format, so a '%' in the path cannot be misread.
	$smash->execute(sprintf("cat %s > %s", join(' ', @metagenome_fastas), $combined_fasta));
	$smash->finish();

	# if skipping search, write all reads directly to input for classifier

	if ($options{prefilter}) { 
		$input_fasta = $combined_fasta->filename;
	} else {
		$ssu_fasta   = $combined_fasta->filename;
	}

	$prefix   = "$analyses_dir/$metagenome";
} elsif ($options{fasta}) {

	# if skipping search, write all reads directly to input for classifier

	if ($options{prefilter}) { 
		$input_fasta = $fasta_file;
	} else {
		$ssu_fasta   = $fasta_file;
	}

	$prefix = $fasta_file;
}

# Remove any classification output left at this path before writing a new one.
$class_file = "$prefix.16S.RDP.class";
unlink $class_file;

# As of now:
# If !prefilter:
#    1. $ssu_fasta is already populated with the name of a file
#	that contains the reads/sequences that need to just be 
#	classified.
# Else:
#    2. $input_fasta is populated with the name of a file that
#	contains the reads/sequences that need to be searched
#	and then classified
#    3. $ssu_fasta will be populated with the name of a file
#	after searching $input_fasta for SSU molecules and 
#	writing their sequences in $ssu_fasta

# first search for 16S, then make input for classifier

if ($options{prefilter}) {
	my $gff_file   = "$prefix.16S.gff";
	my $tmp_gff    = File::Temp->new(UNLINK => $UNLINK);
	my $blastfasta = File::Temp->new(UNLINK => $UNLINK);
	my $predictor  = Smash::Analyses::GenePredictor->new(TRANS_TABLE => 11); # doesnt matter anyway, since this is rRNA

	# Run an external command through execute() and report a non-zero
	# status on STDERR. This is a warning only: the steps below still run,
	# so a failing tool becomes visible without changing the control flow.
	my $run = sub {
		my ($tool, $command) = @_;
		my $rc = execute($command);
		warn "$SMASH_SCRIPT_NAME: $tool returned non-zero status $rc\n" if $rc != 0;
		return $rc;
	};

	# blast options: loose/strict

	my $blast_options = $options{strict} ? "M=5 N=-11 Q=22 R=11 E=1e-50" : "E=1e-10";

	# The keyword is accepted case-insensitively when the options are
	# validated, so it has to be compared case-insensitively here as well;
	# otherwise e.g. "RRNA_HMM_ONLY" would be used as a BLAST database file.
	if (lc($options{prefilter}) ne "rrna_hmm_only") {
		####
		# blast filter as step 1
		####

		# blast all 16S full length against input
		# SBJCT: RDP full length
		# QUERY: input

		my $blastdb    = File::Temp->new(UNLINK => $UNLINK);
		my $blastout   = File::Temp->new(UNLINK => $UNLINK);
		my $blastgff   = File::Temp->new(UNLINK => $UNLINK);

		if ($DEBUG) {
			print "#BLASTOUT = $blastout\n";
			print "#BLASTGFF = $blastgff\n";
			print "#BLASTFA  = $blastfasta\n";
			print "#RDPGFF   = $tmp_gff\n";
			print "#FIXGFF   = $gff_file\n";
			print "#SSUFA    = $searched_ssu\n";
		}

		$run->("xdformat", "xdformat -n -o $blastdb $fasta_db_16S");

		# should not use -span1, since that will suppress multiple copies of 16S gene from being found
		# 15 spout and topComboN, since that's the maximum rrn copy number I know.

		$run->("blastn", "blastn $blastdb $input_fasta $blast_options B=1 V=1 hspsepQmax=5000 spoutmax=15 topComboN=15 mformat=2 cpus=$cpus -warnings -notes > $blastout");

		# The shell wrote the report into the temp file; rewind our own
		# handle before giving it to the parser.
		seek $blastout, 0, SEEK_SET;
		my $parser = Smash::Utils::WUBLAST->new($blastout);
		while (my $report = $parser->nextReport) {
			my $query = $report->queryName;
			while (my $sbjct = $report->nextSbjct) {
				while (my $group = $sbjct->nextGroup) {
					my $hsps = $group->{HSPS};
					foreach my $hsp (@$hsps) {
						# Coordinates are written in ascending order, whatever the strand.
						my ($qb, $qe) = sort {$a <=> $b} ($hsp->qb, $hsp->qe);
						#$qb -= 2000;
						#$qb = 1 if ($qb < 1);
						#$qe += 2000;
						my $locus = $group->groupId;
						# One tab-separated GFF line per HSP. join() is used instead of
						# setting the global $, so no other print is affected.
						print $blastgff join("\t", $query, "BLASTN", "rRNA", $qb, $qe, ".", $hsp->strand, ".", "gene_id \"$query.rrn16S.$locus\"; molecule \"16S_rRNA\";\n");
					}
				}
			}
		}

		# seek() also flushes the buffered GFF lines to disk before the
		# file is read back below.
		seek $blastgff, 0, SEEK_SET;
		get_sequence_for_feature("rRNA", $blastgff, $input_fasta, $blastfasta);
		# NOTE(review): this call is given the same GENE target ($blastfasta) that
		# get_sequence_for_feature() just wrote -- confirm both calls are needed.
		$predictor->make_genes_and_proteins(GFF => $blastgff, FASTA => $input_fasta, GENE => $blastfasta, PROTEIN => "/dev/null");
		seek $blastfasta, 0, SEEK_SET;

		# From here on only the BLAST-selected regions are scanned.
		$input_fasta = $blastfasta;

		# Remove the database files created next to the temp file ("$blastdb.x*").
		unlink glob("$blastdb.x*") if $UNLINK;
	}

	# scan with rrna_hmm 

	my $command = "PATH=\$PATH:$hmmer3_dir && $rna_hmm_dir/rna_hmm3.py -i $input_fasta -L $rna_hmm_dir/HMM3 -k arc,bac -m ssu -o $tmp_gff -p $cpus";
	$run->("rna_hmm3.py", $command);
	seek $tmp_gff, 0, SEEK_SET;

	# Copy the features to $gff_file, adding a gene_id of the form
	# "<first word of seqname>:<start>-<end>" and the molecule type.
	open(my $gffout, ">", $gff_file) || die "Cannot open $gff_file: $!";
	my $gff = Smash::Utils::GFFlite->new($tmp_gff);
	while (my $f = $gff->nextFeature) {
		my $gene_id  =  $f->seqname;
		   $gene_id  =~ s/\s.*//;
		   $gene_id .=  (":".$f->start."-".$f->end);
		$f->{ATTRIBUTES}->{gene_id}  = $gene_id;
		$f->{ATTRIBUTES}->{molecule} = "16S_rRNA";
		$f->print_feature_gff($gffout);
	}
	# Buffered write errors only surface at close time.
	close($gffout) || die "Cannot close $gff_file: $!";
	$predictor->make_genes_and_proteins(GFF => $gff_file, FASTA => $input_fasta, GENE => $searched_ssu->filename, PROTEIN => "/dev/null");

	$ssu_fasta = $searched_ssu->filename;
}

# Hand the sequences in $ssu_fasta to the RDP classifier; the jar and the
# training properties both live under the rdp_classifier software directory.
my $rdp_jar        = "$rdp_dir/rdp_classifier-$rdp_version.jar";
my $rdp_properties = "$rdp_dir/training_data/classifier/rRNAClassifier.properties";
rdp_classify($options{memory}, $rdp_jar, $ssu_fasta, $class_file, $rdp_properties);

# Completion marker on STDOUT.
print "<output>success</output>\n";

exit(0);

############################################
# End Execution
############################################

# Echo a shell command to STDOUT, prefixed with "#", then run it with
# system(). Returns whatever system() returns.
sub execute {
	my ($cmdline) = @_;
	print "#$cmdline\n";
	return system($cmdline);
}

# Run the RDP classifier on a fasta file.
#
# Arguments:
#   $memory     - JVM heap size in MB (used for -Xmx)
#   $jar_file   - path to the rdp_classifier jar
#   $fasta_file - sequences to classify
#   $class_file - file the classification is written to
#   $prop_file  - training properties file for the classifier
#
# Returns the status of the java command (0) on success. If the command
# fails, the (possibly partial) output file is removed, an error is printed
# to STDERR and the script exits with status 1, so that a failed run is not
# reported as a success by the caller.
sub rdp_classify {
	my ($memory, $jar_file, $fasta_file, $class_file, $prop_file) = @_;

	# Keep the status in a local variable instead of overwriting the
	# file-level $status used for option parsing.
	my $exit_status = execute("java -Xmx${memory}m -jar $jar_file --format allrank --queryFile $fasta_file --outputFile $class_file -train_propfile $prop_file");

	# if something went wrong,
	if ($exit_status != 0) {
		unlink $class_file;
		print STDERR "Error finding 16S sequences. Aborting!\n";
		exit(1);
	}

	return $exit_status;
}

# Extract the sequence of every feature of a given type from a fasta file.
#
# Arguments:
#   $feature   - feature type to extract (third GFF column), e.g. "rRNA"
#   $gff_file  - GFF input: a GLOB reference is read as an open filehandle,
#                anything else is stringified and opened as a file name
#   $fasta_in  - name of the fasta file the features refer to
#   $fasta_out - name of the fasta file to write
#
# Each extracted region is written under the header "<seqname>.<gene_id>"
# (or just "<seqname>" when the feature has no gene_id). Regions shorter
# than 50 bases are skipped, and minus-strand regions are reverse
# complemented. Dies if a file cannot be opened or the output cannot be
# closed.
sub get_sequence_for_feature {
	use Smash::Utils::GFFlite;
	use FAlite;
	my ($feature, $gff_file, $fasta_in, $fasta_out) = @_;
	my %Begin;
	my %End;
	my %Seq;
	my %RevComp;
	my $GFF;

	# filename or filehandle?

	my $is_filename = (ref($gff_file) !~ /GLOB/);
	if ($is_filename) {
		open($GFF, "<", "$gff_file") || die "Cannot open $gff_file: $!";
	} else {
		$GFF = $gff_file;
	}

	# Pass 1: remember the coordinates of every matching feature. When two
	# features produce the same name, the later one replaces the earlier.
	my $gff = Smash::Utils::GFFlite->new($GFF);
	while (my $f = $gff->nextFeature) {
		if ($f->feature eq "$feature") {
			my $gene        = $f->seqname;
			my $id          = $f->get_attribute("gene_id");
			   $gene        = "$gene.$id" if $id;
			$Seq{$gene}     = $f->seqname;
			# NOTE(review): other code in this file reads the start coordinate
			# with ->start; presumably ->begin is an alias -- confirm.
			$Begin{$gene}   = $f->begin;
			$End{$gene}     = $f->end;
			$RevComp{$gene} = ($f->strand eq "-")?1:0;
		}
	}
	if ($is_filename) {
		close($GFF);
	}

	# Index the genes by sequence name so each fasta entry is matched with
	# one hash lookup instead of a scan over all genes. Sorting makes the
	# output order of genes on the same sequence reproducible.
	my %GenesOf;
	foreach my $gene (sort keys %Seq) {
		push @{$GenesOf{$Seq{$gene}}}, $gene;
	}

	# Pass 2: walk the fasta file and cut out the remembered regions.
	open(my $out, ">", "$fasta_out") || die "Cannot open $fasta_out: $!";
	open(my $in,  "<", "$fasta_in")  || die "Cannot open $fasta_in: $!";
	my $fasta = FAlite->new($in);
	while (my $entry = $fasta->nextEntry) {
		# Reduce the definition line to the bare identifier.
		my $def = $entry->def;
		$def =~ s/^>//;
		$def =~ s/\s+.*//;

		my $genes = $GenesOf{$def};
		next unless $genes;

		my $length = length($entry->seq);
		foreach my $gene (@$genes) {
			my $begin  = $Begin{$gene};
			my $end    = $End{$gene};
			my $len16S = $end - $begin + 1;
			next if $len16S < 50;

			# A start outside the sequence would make substr() return undef
			# (or count from the end for a start below 1), so skip it.
			if ($begin < 1 || $begin > $length) {
				warn "Skipping $gene: start $begin lies outside $def (length $length)\n";
				next;
			}

			my $seq = uc(substr($entry->seq, $begin-1, $len16S));
			if ($RevComp{$gene}) {
				# Complement including the IUPAC ambiguity codes
				# (W, S and N are their own complements), then reverse.
				$seq =~ tr/ACGTMRYKVHDB/TGCAKYRMBDHV/;
				$seq =  reverse($seq);
			}
			print $out ">$gene\n";
			print $out Smash::Core->pretty_fasta($seq);
		}
	}
	close($in);
	# Buffered write errors only surface at close time.
	close($out) || die "Cannot close $fasta_out: $!";
}

# Extract the value of a quoted attribute (name="value" or name='value')
# from a string.
#
# Arguments:
#   $string    - text to search
#   $attribute - attribute name; matched literally and case-insensitively
#
# Returns the text between the quotes of the first match, or undef when the
# attribute is not present (or either argument is undef). The value may
# contain the other kind of quote, and may span lines.
sub get_attribute {
	my ($string, $attribute) = @_;
	my $value;

	# \Q...\E keeps regex metacharacters in the attribute name literal.
	# (.*?)\1 stops at the first quote equal to the opening one; the former
	# [^\1] was not a backreference inside a character class.
	if (defined($string) && defined($attribute)
	    && $string =~ m{\Q$attribute\E=(["'])(.*?)\1}is) {
		$value = $2;
	}
	return $value;
}

__END__

=head1 Name

find16S.pl - Smash wrapper for finding 16S sequences in a metagenome

=head1 Synopsis

	find16S.pl {--metagenome=<name>|--fasta=<file>} [--memory=<number>] [--db=<db>]

=head1 Options

=over 4

=item B<C<--metagenome>>

name of the metagenome

=item B<C<--fasta>>

input fasta file with sequences to be classified

=item B<C<--memory>>

memory (MB) allocated for JVM (default: 4000)

=item B<C<--prefilter>>

Name of a fasta file containing (preferably) full-length 16S rDNA sequences to use as a prefilter
before running the expensive HMMER search for the 16S profiles. This will make it much faster.
If you do not have your own full-length 16S rDNA sequence database, you can use the one included
in SMASH by specifying C<--prefilter=smash> or just scan for 16S molecules using rrna_hmm by
specifying C<--prefilter=rrna_hmm_only>. C<--prefilter=smash> or C<--prefilter=file> requires
WU-BLAST to be installed. (default: none)

=item B<C<--strict|--nostrict>>

Used in conjunction with C<--prefilter>. When looking for a hit against the 16S sequence
database, by default C<find16S.pl> uses strict BLASTN parameters that look for matches
over 85% or so. By specifying C<--nostrict>, you can relax this and switch to default
BLASTN parameters. (default: C<--strict>)

=item B<C<--db>>

16S databases. Does not have an effect currently since we only use C<RDP>.

=item B<C<--cpus>>

Number of cpus to use in the BLAST and HMMER searches. (default: 2)

=item B<C<--help>>

Prints this manual.

=back

=head1 Description

C<find16S.pl> can identify 16S rDNA sequences from a metagenome sample and classify them using
the RDP Classifier. Since only a small fraction of metagenome sequences contain 16S rDNA, it is
advisable to use the C<--prefilter> option to speed up your search by BLASTing against a set of
full-length 16S rDNA sequences. SMASH comes with such a set, and this can be used by specifying
C<--prefilter=smash>.

For classifying sequences in a 16S rDNA library, you do not have to use C<--prefilter> since each
read is expected to be a 16S rDNA sequence anyway.

=cut

