#! /usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use File::Basename;
use File::Temp;
use FAlite;
use Smash::Global qw($SMASH_SCRIPT_NAME);
use Smash::CommandLineParser qw(parse_options check_required_options);
use Smash::Analyses::Assembler;
use Smash::Analyses::GenePredictor;
use Smash::Utils::GFFlite;
use Smash::Utils::ACE;
use Smash::Databases::MetaGenomeDB::Loader qw(load_smash);

# Progress messages are written to STDERR through this handle.
my $PROGRESS = \*STDERR;
{
	# $| applies to the currently selected handle, so select STDERR just long
	# enough to turn on autoflush, then make STDOUT the default again.
	select(STDERR);
	$| = 1;
	select(STDOUT);
}

##############
# Set up command line parsing
##############

# Option specifications handed to parse_options
# (they look like Getopt::Long specs -- NOTE(review): confirm)
my @allowed = qw(
	metagenome=s assembler=s version=s parameters=s
	contig_fasta=s contig_gff=s contig_ace=s
	scaffold_fasta=s scaffold_gff=s scaffold_agp=s
	gene_gff=s genepred=s genepred_version=s genepred_parameters=s
	help
);

# Options that must be present on every invocation
my @required = qw(metagenome assembler version);

##############
# Parse command line options
##############

my ($status, %options) = parse_options(\@allowed);

# --help takes precedence: print the full manual and exit successfully
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

# Any status other than 1 is treated as a parsing failure
unless ($status == 1) {
	pod2usage(-message => "", -exitstatus => 2, -verbose => 1);
}
#print_options(%options);

# check_required_options reports the name of the missing option, if any
($status, my $missing) = check_required_options(\@required, %options);
unless ($status == 1) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n", -exitstatus => 2, -verbose => 1);
}

##############
# Handle command line options
# (except checking for presence of required args, which has already
# been done by check_required_options above)
##############

# Pull the individual settings out of %options with a single hash slice
my ($metagenome, $contig_fasta, $contig_gff, $contig_ace,
    $scaffold_fasta, $scaffold_gff, $scaffold_agp, $gene_gff)
	= @options{qw(metagenome contig_fasta contig_gff contig_ace
	              scaffold_fasta scaffold_gff scaffold_agp gene_gff)};

# With neither a scaffold GFF nor an AGP file, every contig gets its own
# "fake" scaffold when the contigs are processed.
my $fake_scaf = (!defined($scaffold_gff) && !defined($scaffold_agp));

# Contigs must come either from an ACE file or from a FASTA + GFF pair
my $have_contig_input = $contig_ace || ($contig_fasta && $contig_gff);
unless ($have_contig_input) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Contig information missing: please use --contig_ace or (--contig_fasta and --contig_gff)\n", -exitstatus => 2, -verbose => 1);
}

############################################
# Instantiate an external assembler
############################################

# The assembler object receives the --parameters value under the OPTIONS key
$options{OPTIONS} = $options{parameters};

# Every option is passed to the constructor with its name upper-cased
my @assembler_args = map { (uc($_), $options{$_}) } keys %options;
my $assembler = Smash::Analyses::Assembler::external->new(@assembler_args);
$assembler->init();
$assembler->prepare(); # Get a new name, init the directory structure etc.

############################################
# Get the required files and locations and
# other parameters
############################################

my $assembly     = $assembler->name;
my $assembly_dir = $assembler->assembly_dir;

# All four output files share the prefix <assembly_dir>/<assembly>
my $output_prefix = join("/", $assembly_dir, $assembly);
my ($contig2read, $scaf2contig, $contig_fa, $scaf_fa)
	= map { "$output_prefix.$_" } qw(contig2read.gff scaf2contig.gff contigs.fa scaffolds.fa);

# Contig name as used by the external assembler => contig id assigned by this script
my %External2Smash;

############################################
# Open the output files of this assembly:
# contig2read and scaf2contig mappings (GFF)
# and contig and scaffold sequences (FASTA)
############################################

# Output files of this assembly. Three-argument open keeps the mode separate
# from the path, so characters in the path can never change how the file is
# opened. The handles stay package-level barewords because the rest of the
# script prints to them by name.
open(CONTIG2READ, '>', $contig2read) || die "Cannot open $contig2read: $!";
open(SCAF2CONTIG, '>', $scaf2contig) || die "Cannot open $scaf2contig: $!";
open(CONTIG_FA,   '>', $contig_fa)   || die "Cannot open $contig_fa: $!";
open(SCAF_FA,     '>', $scaf_fa)     || die "Cannot open $scaf_fa: $!";

############################################
# Contigs!
############################################

# Names of all reads that are placed in some contig (from the ACE file or the
# contig GFF). The singleton step later excludes exactly these reads.
my %AssembledReads;

# Matches read names that already start with "<metagenome>.". The metagenome
# name is quoted so that the dots it contains are matched literally instead
# of acting as regex wildcards.
my $has_metagenome_prefix = qr/^\Q$metagenome\E\./;

if ($contig_ace) {

	############################################
	# Read the contig ace file and rename the
	# contigs using SMASH standards.
	############################################

	print $PROGRESS "Processing contigs in ACE file ";
	my $count = 0;
	my $ace = Smash::Utils::ACE->new(FILE => $contig_ace, SOURCE => "$assembler");
	while (my $contig = $ace->nextContig) {
		$count++;
		my $contig_id = "$assembly.C$count";
		$External2Smash{$contig->name} = $contig_id;
		print CONTIG_FA ">$contig_id\n";
		# print, not printf: sequence text must never be used as a format string
		print CONTIG_FA Smash::Core->pretty_fasta($contig->sequence);

		# Re-home every read mapping onto the new contig id
		foreach my $f ($contig->getReadMappingGFF()) {
			my $read = $f->get_attribute("read");
			if ($read !~ $has_metagenome_prefix) {
				$read = "$metagenome.$read";
				$f->set_attribute("read", $read);
			}
			$AssembledReads{$read} = 1;
			$f->set_property("SEQNAME", $contig_id);
			$f->print_feature_gff(\*CONTIG2READ);
		}

		# Without scaffold information each contig is its own scaffold
		if ($fake_scaf) {
			my $scaf_id   = "$assembly.S$count";
			print SCAF2CONTIG join("\t", $scaf_id, "$assembler", "contig", 1, $contig->length, ".", "+", ".", "contig \"$contig_id\";\n");
			print SCAF_FA ">$scaf_id\n";
			print SCAF_FA Smash::Core->pretty_fasta($contig->sequence);
		}
		print $PROGRESS Smash::Core->progress_bar($count);
	}
	print $PROGRESS " done\n";
} else {

	############################################
	# Read the contig fasta file and rename the
	# contigs using SMASH standards.
	############################################

	print $PROGRESS "Processing contigs in FASTA file ";
	open(my $contigs_fh, '<', $contig_fasta) || die "Cannot open $contig_fasta: $!";
	my $fasta = FAlite->new($contigs_fh);
	my $count = 0;
	while (my $entry = $fasta->nextEntry) {
		$count++;
		my $contig_id = "$assembly.C$count";
		my $contig_length = length($entry->seq);
		my $contig_name = $assembler->process_fasta_header($entry->def);
		print CONTIG_FA ">$contig_id\n";
		print CONTIG_FA $assembler->pretty_fasta($entry->seq);
		$External2Smash{$contig_name} = $contig_id;

		# Without scaffold information each contig is its own scaffold
		if ($fake_scaf) {
			my $scaf_id   = "$assembly.S$count";
			print SCAF2CONTIG join("\t", $scaf_id, "$assembler", "contig", 1, $contig_length, ".", "+", ".", "contig \"$contig_id\";\n");
			print SCAF_FA ">$scaf_id\n";
			# print, not printf: sequence text must never be used as a format string
			print SCAF_FA Smash::Core->pretty_fasta($entry->seq);
		}
		print $PROGRESS Smash::Core->progress_bar($count);
	}
	close($contigs_fh);
	print $PROGRESS " done\n";

	############################################
	# Read the contig2read gff file and remap
	# the names then make the proper contig2read
	# file in SMASH repository
	############################################

	print $PROGRESS "Processing mappings in GFF file ...";
	open(my $mapping_fh, '<', $contig_gff) || die "Cannot open $contig_gff: $!";
	my $gff = Smash::Utils::GFFlite->new($mapping_fh);
	while (my $f = $gff->nextFeature) {
		my $read = $f->get_attribute("read");
		if ($read !~ $has_metagenome_prefix) {
			$read = "$metagenome.$read";
			$f->set_attribute("read", $read);
		}
		$AssembledReads{$read} = 1;

		# The GFF and the FASTA file must use the same contig names; refuse to
		# write a mapping line with an empty contig id.
		my $external_name = $f->seqname;
		my $contig_id     = defined($external_name) ? $External2Smash{$external_name} : undef;
		if (!defined($contig_id)) {
			$external_name = "(undefined)" unless defined($external_name);
			die "$SMASH_SCRIPT_NAME: contig '$external_name' in $contig_gff was not found in $contig_fasta\n";
		}
		print CONTIG2READ join("\t", $contig_id, "$assembler", "read", $f->start, $f->end, $f->score, $f->strand, $f->frame, $f->flattened_attributes)."\n";
	}
	close($mapping_fh);
	print $PROGRESS " done\n";
}

# Unassembled reads: every read of the metagenome that is not part of a
# contig is written out as a single-read contig and single-contig scaffold
# named <assembly>.R<n>.

print $PROGRESS "Parsing singletons ...";

my $maui_dir        = $assembler->maui_dir;
my $tmp             = File::Temp->new();
# The File::Temp object stringifies to its file name; the three work files
# below are derived from that name and removed explicitly at the end.
my $contig_reads    = "$tmp.cont";
my $singleton_fasta = "$tmp.sngl";
my $reads           = "$tmp.read";

# List of assembled read names, one per line, for filterFasta --exclude
open(my $asm_reads_fh, '>', $contig_reads) || die "Cannot open $contig_reads: $!";
foreach my $read (keys %AssembledReads) {
	print {$asm_reads_fh} "$read\n";
}
# Buffered write errors only show up at close; an incomplete list would let
# assembled reads slip through as singletons.
close($asm_reads_fh) || die "Cannot write $contig_reads: $!";

$assembler->execute("cat @{[join(' ', $assembler->fasta_files($metagenome))]} > $reads");
$assembler->execute("$maui_dir/filterFasta --exclude --input=$reads --list=$contig_reads --output=$singleton_fasta");

open(my $singleton_fh, '<', $singleton_fasta) || die "Cannot open singleton file $singleton_fasta: $!";
my $fasta = FAlite->new($singleton_fh);
my $singleton = 0;
while (my $entry = $fasta->nextEntry) {
	$singleton++;
	my $def    = $entry->def;
	my $seq    = $entry->seq;
	my $length = length($seq);
	my $ctg    = sprintf("%s.R%d", $assembly, $singleton);

	# Reduce the header to the bare read name: drop everything after the
	# first whitespace, then the leading ">"
	$def =~ s/\s.*//;
	$def =~ s/^>//;

	print CONTIG_FA ">$ctg\n";
	print CONTIG_FA $assembler->pretty_fasta($seq);
	print SCAF_FA   ">$ctg\n";
	print SCAF_FA   $assembler->pretty_fasta($seq);
	print CONTIG2READ join("\t", $ctg, "$assembler", "read",   1, $length, ".", "+", ".", "read \"$def\";\n");
	print SCAF2CONTIG join("\t", $ctg, "$assembler", "contig", 1, $length, ".", "+", ".", "contig \"$ctg\";\n");
}
close($singleton_fh);
unlink $singleton_fasta;
unlink $contig_reads;
unlink $reads;
print $PROGRESS " done\n";

############################################
# Scaffolds!
############################################

# Scaffold name as used by the external assembler => scaffold id assigned here
my %ScafExternal2Smash;

if ($scaffold_agp) {
	print $PROGRESS "Processing scaffolds in AGP file ...";
	my $count = 0;
	open(my $agp_fh, '<', $scaffold_agp) || die "Cannot open $scaffold_agp: $!";
	while (my $line = <$agp_fh>) {
		$line =~ s/\r?\n\z//;
		# Skip comment and blank lines instead of splitting them into undefined fields
		next if $line =~ /^#/ || $line !~ /\S/;
		my ($escaf, $start, $end, $order, $type, $econtig, $cstart, $cend, $strand) = split(/\t/, $line);
		next unless defined($type) && $type eq "W"; # only contigs are processed

		# First time this scaffold is seen: assign the next scaffold id
		my $scaf_id   = $ScafExternal2Smash{$escaf};
		if (!$scaf_id) {
			$count++;
			$scaf_id = "$assembly.S$count";
			$ScafExternal2Smash{$escaf} = $scaf_id;
		}

		# Refuse to write a mapping line that points at an unknown contig
		my $contig = defined($econtig) ? $External2Smash{$econtig} : undef;
		if (!defined($contig)) {
			$econtig = "(undefined)" unless defined($econtig);
			die "$SMASH_SCRIPT_NAME: contig '$econtig' in $scaffold_agp does not match any loaded contig\n";
		}
		print SCAF2CONTIG join("\t", $scaf_id, "$assembler", "contig", $start, $end, ".", $strand, ".", "contig \"$contig\";\n");
	}
	close($agp_fh);
	print $PROGRESS " done\n";
} elsif ($scaffold_gff) {
	print $PROGRESS "Processing scaffolds in GFF file ...";
	open(my $scaf_gff_fh, '<', $scaffold_gff) || die "Cannot open $scaffold_gff: $!";
	my $gff = Smash::Utils::GFFlite->new($scaf_gff_fh);
	my $count = 0;
	while (my $f = $gff->nextFeature) {
		my $escaf = $f->seqname;

		# First time this scaffold is seen: assign the next scaffold id and
		# remember it, so that all features of one scaffold share the id
		my $scaf_id   = $ScafExternal2Smash{$escaf};
		if (!$scaf_id) {
			$count++;
			$scaf_id = "$assembly.S$count";
			$ScafExternal2Smash{$escaf} = $scaf_id;
		}

		# Replace the external contig name by the assigned contig id
		my $econtig = $f->get_attribute("contig");
		my $contig  = defined($econtig) ? $External2Smash{$econtig} : undef;
		if (!defined($contig)) {
			$econtig = "(undefined)" unless defined($econtig);
			die "$SMASH_SCRIPT_NAME: contig '$econtig' in $scaffold_gff does not match any loaded contig\n";
		}
		$f->set_attribute("contig", $contig);
		print SCAF2CONTIG join("\t", $scaf_id, "$assembler", "contig", $f->start, $f->end, $f->score, $f->strand, $f->frame, $f->flattened_attributes)."\n";
	}
	# Close only after the last feature has been read
	close($scaf_gff_fh);
	print $PROGRESS " done\n";
}

# Copy the scaffold sequences into the scaffold FASTA file under the scaffold
# ids assigned while reading the scaffold mapping (%ScafExternal2Smash).
if ($scaffold_fasta) {
	print $PROGRESS "Processing scaffolds in FASTA file ...";
	open(my $scaffolds_fh, '<', $scaffold_fasta) || die "Cannot open $scaffold_fasta: $!";
	my $fasta   = FAlite->new($scaffolds_fh);
	my $count   = 0;
	my $skipped = 0;
	while (my $entry = $fasta->nextEntry) {
		$count++;
		my $escaf   = $assembler->process_fasta_header($entry->def);
		my $scaf_id = defined($escaf) ? $ScafExternal2Smash{$escaf} : undef;

		# A scaffold without an assigned id would be written under an empty
		# header; leave it out and report the total afterwards.
		if (!defined($scaf_id)) {
			$skipped++;
			next;
		}
		print SCAF_FA ">$scaf_id\n";
		# print, not printf: sequence text must never be used as a format string
		print SCAF_FA Smash::Core->pretty_fasta($entry->seq);
		print $PROGRESS Smash::Core->progress_bar($count);
	}
	close($scaffolds_fh);
	print $PROGRESS " done\n";
	if ($skipped) {
		warn "$SMASH_SCRIPT_NAME: skipped $skipped of $count sequences in $scaffold_fasta that have no scaffold mapping\n";
	}
}

# Write errors on buffered handles (e.g. a full disk) are only reported by
# close, so check it before the files are loaded into the database.
close(CONTIG_FA)   || die "Cannot close $contig_fa: $!";
close(CONTIG2READ) || die "Cannot close $contig2read: $!";
close(SCAF_FA)     || die "Cannot close $scaf_fa: $!";
close(SCAF2CONTIG) || die "Cannot close $scaf2contig: $!";
$assembler->finish();

# Report the assigned assembly id on STDOUT
print "<output>$assembly</output>\n";
print "********************************************************\n";
print "   Assembly id assigned for this assembly: $assembly\n";
print "********************************************************\n";

load_smash("assembly", (ASSEMBLY => $assembly));

############################################
# Load gene annotations as well!
############################################

# Optionally register an external gene prediction on the freshly loaded
# assembly: rewrite the gene GFF onto the assigned contig ids and load it.
if (defined($gene_gff)) {
	# Build the constructor arguments explicitly. Upper-casing all of %options
	# after adding VERSION to it would yield two VERSION entries (from
	# "version" and "VERSION") in hash order, so which one the predictor
	# receives would be unpredictable. Keys that are already upper-case are
	# therefore left out of the map and set by hand below.
	my %predictor_args = map { (uc($_), $options{$_}) } grep { $_ ne uc($_) } keys %options;
	$predictor_args{OPTIONS}  = $options{genepred_parameters};
	$predictor_args{VERSION}  = $options{genepred_version};
	$predictor_args{ASSEMBLY} = $assembly;

	my $predictor = Smash::Analyses::GenePredictor::external->new(%predictor_args);
	$predictor->init();
	my $prediction   = $predictor->name;
	my $genepred_dir = $predictor->genepred_dir($prediction);
	my $contig2gene  = "$genepred_dir/$prediction.contig2gene.gff";
	open(my $contig2gene_fh, '>', $contig2gene) || die "Cannot open $contig2gene: $!";
	open(my $gene_gff_fh, '<', $gene_gff) || die "Cannot open $gene_gff: $!";
	my $gff = Smash::Utils::GFFlite->new($gene_gff_fh);
	while (my $f = $gff->nextFeature) {
		# Gene coordinates must refer to contig names known from the contig input
		my $external_name = $f->seqname;
		my $seqname       = defined($external_name) ? $External2Smash{$external_name} : undef;
		if (!defined($seqname)) {
			$external_name = "(undefined)" unless defined($external_name);
			die "$SMASH_SCRIPT_NAME: contig '$external_name' in $gene_gff does not match any loaded contig\n";
		}
		$f->set_property("SEQNAME", $seqname);
		$f->set_property("SOURCE", "$predictor");

		# gene_id becomes <prediction><contig suffix>.<original gene_id>, where
		# the suffix is the contig id with the assembly name stripped off.
		# \Q..\E keeps the dots in the assembly name literal.
		(my $contig_suffix = $seqname) =~ s/^\Q$assembly\E//;
		$f->set_attribute("gene_id", "$prediction$contig_suffix.".$f->get_attribute("gene_id"));
		Smash::Utils::GFF::Feature::print_feature_gff($f, $contig2gene_fh);
	}
	close($gene_gff_fh);
	# Buffered write errors only surface at close; check before loading
	close($contig2gene_fh) || die "Cannot close $contig2gene: $!";
	$predictor->finish();

	print "<output>$prediction</output>\n";
	print "********************************************************\n";
	print "   Gene Prediction id assigned for this assembly: $prediction\n";
	print "********************************************************\n";
	load_smash("geneprediction", ('GENEPRED' => $prediction));
}

exit(0);

############################################
# End Execution
############################################

=head1 Name

loadExternalAssembly.pl - Wrapper script to load an external assembly into SMASH repository

=head1 Synopsis

	loadExternalAssembly.pl [options]

=head1 Options

=over 4

=item B<C<--metagenome>>

(required) name of the metagenome this assembly belongs to. This metagenome must be present in the repository, and reads
should already have been loaded.

=item B<C<--assembler>>

(required) name of the assembly software used to generate this assembly outside of SMASH. 

=item B<C<--version>>

(required) version of the assembly software used to generate this assembly outside of SMASH. 

=item B<C<--parameters>>

(optional) special parameters used to generate this assembly outside of SMASH. 

=item B<C<--contig_ace>>

ace file containing the assembly information

=item B<C<--contig_fasta>>

fasta file containing the assembled contig sequences

=item B<C<--contig_gff>>

tab-delimited GFF file containing read to contig mapping 

=item B<C<--scaffold_agp>>

AGP file containing contig to scaffold mapping

=item B<C<--scaffold_fasta>>

fasta file containing the assembled scaffold sequences

=item B<C<--scaffold_gff>>

tab-delimited GFF file containing contig to scaffold mapping

=item B<C<--gene_gff>>

GFF file containing gene coordinates in the standard GFF format

=item B<C<--help>>

Prints this manual.

=back

=head1 Description

B<loadExternalAssembly.pl> is a wrapper script to load an external assembly into SMASH
repository and database. 

=head2 Using ACE assembly files

A typical use of B<loadExternalAssembly.pl> follows B<loadMetaGenome.pl>.
It can handle the ACE file format for the contig assembly information and the AGP 
format for the scaffolding information. 
For example, if you have an assembly from Newbler and want to load it into SMASH,

	loadExternalAssembly.pl --metagenome=MC99.MG1 \
	    --assembler=Newbler --version=2.3 \
	    --contig_ace=454Contigs.ace \
	    --scaffold_agp=454Scaffolds.txt \
	    --scaffold_fasta=454Scaffolds.fna

=head2 Without ACE assembly files

If you have a program that does not generate ACE files, then you have to either make one
yourself, or create a GFF file that explains the contig assembly
in the following format:

	<contig_id>  <assembler>  read  <start>  <end>  <contig_length>  <strand>  .  \
	read "<read_name>"; [mate_pair "<mate_name>"; [contig "<contig_name>"; insert_size "<insert_size>";]]

E.g.,

	contig1	Newbler	read	131	259	2023	+	.	\
	read "MC99.MG1.ABC.y"; mate_pair "MC99.MG1.ABC.z"; contig "contig1"; insert_size 1000;
	contig1	Newbler	read	198	363	2023	+	.	\
	read "MC99.MG1.XYZ.y";
	contig1	Newbler	read	910	1030	2023	+	.	\
	read "MC99.MG1.ABC.z"; mate_pair "MC99.MG1.ABC.y"; contig "contig1"; insert_size 1000;

Once you have it, then you can load the assembly as follows:

	loadExternalAssembly.pl --metagenome=MC99.MG1 \
	    --assembler=Unknown --version=0.0 \
	    --contig_fasta=assembly.fasta \
	    --contig_gff=contigs.gff \
	    --scaffold_gff=scaffolds.gff

If you don't have scaffolding information, then you could use

	loadExternalAssembly.pl --metagenome=MC99.MG1 \
	    --assembler=Unknown --version=0.0 \
	    --contig_fasta=assembly.fasta \
	    --contig_gff=contigs.gff

The GFF file can have contig names generated by the external assembly program. The script
will rename the contigs to SMASH format. However, the contig names in the contig_fasta
file and the contig_gff file MUST match, since the script uses these names to associate
each read mapping with its contig.

=head2 Scaffolding information

Some assemblies have scaffold information, and some don't. For example, Newbler generates the
scaffold information in AGP format and writes it to a file called F<454Scaffolds.txt>.
If you do have scaffold information, but it is not in AGP format, then you must create either 
the AGP file or a GFF file similar to the one above. If you don't have scaffold information, then do not create a
scaffold_gff file. Without a C<--scaffold_gff> or C<--scaffold_agp> option, a "fake" scaffold is created for each contig. 

=head2 Gene predictions

For external assemblies, we recommend loading the external assembly into SMASH first, and using SMASH to
predict genes. If you have a gene-calling program that is not supported by SMASH, then of course this
won't work. In that case we recommend using the contig FASTA file from the
repository to predict the genes. The location of the contig FASTA file can be obtained as follows:

	showLocations.pl --item=MC99.MG1.AS1

Then load the genes using L<loadExternalGenePrediction.pl|loadExternalGenePrediction>,
but if that is not possible you can load both the assembly and gene prediction together by specifying
C<--gene_gff>. Please make sure that the contig name in the C<gene_gff> file matches that from
C<contig_ace> or C<contig_fasta>.

=cut
