#! /usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use File::Basename;
use FAlite;
use Smash::Global qw($SMASH_SCRIPT_NAME);
use Smash::CommandLineParser qw(parse_options check_required_options);
use Smash::Analyses::Assembler;
use Smash::Databases::MetaGenomeDB::Loader qw(load_smash);

##############
# Set up command line parsing
##############

# Option specifications this script accepts, and the subset that must
# always be supplied by the caller.
my @option_specs     = qw(metagenome=s fakescaf regexp=s help);
my @mandatory_fields = qw(metagenome);

##############
# Parse the command line
##############

my ($status, %options) = parse_options(\@option_specs);

# --help takes precedence over everything else: show the full manual and
# exit successfully.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

# Any parser status other than 1 is reported as a usage error.
unless ($status == 1) {
	pod2usage(-message => "", -exitstatus => 2, -verbose => 1);
}

#print_options(%options);

# check_required_options reports a status and the name of a missing option.
($status, my $missing) = check_required_options(\@mandatory_fields, %options);
unless ($status == 1) {
	pod2usage(
		-message    => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n",
		-exitstatus => 2,
		-verbose    => 1,
	);
}

##############
# Pull the individual option values out of the parsed hash.
# (The presence of required arguments has already been verified above by
# check_required_options.)
##############

my ($metagenome, $fakescaf) = @options{qw(metagenome fakescaf)};

# Pattern applied to read deflines when faking scaffolds; fall back to the
# default when --regexp was not given (or is a false value).
my $regexp = $options{regexp} ? $options{regexp} : '^(\d+)\.';

############################################
# Instantiate a default assembler
############################################

# Build the "external" assembler object used to stand in for a real assembly
# run, passing "Fake" as the assembler name and "none" for version/options.
# The constructor is called with direct method syntax (Class->new) rather
# than the indirect object form (new Class ...), which perl can mis-parse.
my $assembler = Smash::Analyses::Assembler::external->new(
	METAGENOME => $metagenome,
	ASSEMBLER  => "Fake",
	VERSION    => "none",
	OPTIONS    => "none",
);
$assembler->init();
$assembler->prepare(); # Get a new name, init the directory structure etc.

############################################
# Get the required files and locations and
# other parameters
############################################

# Directory holding the reads of this metagenome. It is not referenced again
# in this script: the input FASTA files are obtained from
# $assembler->fasta_files() further down.
# NOTE(review): the call is kept in case read_dir() has side effects --
# confirm and drop it if it is a plain accessor.
my $read_dir     = $assembler->read_dir($metagenome);

# Name and directory assigned to this assembly by the assembler object, and
# the three output files that are written into that directory.
my $assembly     = $assembler->name;
my $assembly_dir = $assembler->assembly_dir;
my $contig2read  = "$assembly_dir/$assembly.contig2read.gff";
my $scaf2contig  = "$assembly_dir/$assembly.scaf2contig.gff";
my $contig_fa    = "$assembly_dir/$assembly.contigs.fa";


############################################
# Process the read fasta file(s): make one
# contig per read and write the CONTIGS,
# CONTIG2READ and SCAF2CONTIG files
############################################

# Scaffold bookkeeping, only filled in when --fakescaf is given. It is kept
# across ALL input files so that reads sharing a scaffold key end up in the
# same scaffold regardless of which file they came from.
my %ContigLength;   # contig id    => contig length
my %ScafLength;     # scaffold key => summed length of its contigs
my %ScafContigs;    # scaffold key => [ contig ids belonging to it ]

# Contig numbering runs across all input files so that the generated ids
# stay unique even when the metagenome has more than one FASTA file.
my $contig_count = 0;

open(my $contig2read_fh, '>', $contig2read) || die "Cannot open $contig2read: $!";
open(my $scaf2contig_fh, '>', $scaf2contig) || die "Cannot open $scaf2contig: $!";
open(my $contig_fa_fh,   '>', $contig_fa)   || die "Cannot open $contig_fa: $!";

foreach my $read_fa ($assembler->fasta_files($metagenome)) {
	open(my $read_fh, '<', $read_fa) || die "Cannot open $read_fa: $!";

	# For each read,
	#  1. make a contig
	#  2. a. get the defline if we need to fake scaffolds
	#     b. parse the defline to group reads that belong together
	# Scaffolds themselves are written once all files have been read.

	my $fasta = FAlite->new($read_fh);
	my $dbh = $assembler->get_db_handle();
	my $sth = $dbh->prepare("SELECT defline FROM readinfo r WHERE read_id=?") || die "Couldn't prepare statement: " .$dbh->errstr;
	while (my $entry = $fasta->nextEntry) {
		$contig_count++;
		my $contig_id = "$assembly.C$contig_count";
		my $contig_length = length($entry->seq);
		my $read_name = $assembler->process_fasta_header($entry->def);

		# The contig is simply the read sequence under a new id
		print {$contig_fa_fh} ">$contig_id\n";
		print {$contig_fa_fh} $assembler->pretty_fasta($entry->seq);

		# NOTE(review): "$assembler" stringifies the assembler object for the
		# second GFF column -- confirm the class overloads stringification.
		print {$contig2read_fh} join("\t", $contig_id, "$assembler", "read",   1, $contig_length, $contig_length, "+", ".", "read \"$read_name\";\n");

		if ($fakescaf) {
			# Look up the stored defline of this read and derive the
			# scaffold key from it.
			$sth->execute($read_name) || die "Cannot execute statement: ".$sth->errstr;
			my ($def_line) = $sth->fetchrow_array();
			$sth->fetchrow_array(); # to shush the "handle Active error"
			defined($def_line) || die "Cannot find defline for read $read_name in readinfo!";

			my $scaf_key;
			if ($def_line =~ m/$regexp/) {
				$scaf_key = $1;
			} else {
				die "Cannot find $regexp in $def_line!";
			}
			# A user-supplied pattern without a capture group matches but
			# leaves $1 undefined; refuse it instead of grouping under undef.
			defined($scaf_key) || die "Pattern $regexp matched $def_line but captured nothing (it needs a capture group)!";

			$ContigLength{$contig_id} = $contig_length;
			$ScafLength{$scaf_key}   += $contig_length;
			push(@{$ScafContigs{$scaf_key}}, $contig_id);
		} else {
			# Without --fakescaf every contig is its own scaffold
			my $scaf_id = "$assembly.S$contig_count";
			print {$scaf2contig_fh} join("\t", $scaf_id,   "$assembler", "contig", 1, $contig_length, $contig_length, "+", ".", "contig \"$contig_id\";\n");
		}
	}
	$sth->finish();
	$assembler->close_db_handle();
	close($read_fh);
}

if ($fakescaf) {
	# Number the scaffolds in string-sorted order of their keys and lay the
	# member contigs (string-sorted by id) end to end inside each scaffold.
	my $scaf_count = 0;
	foreach my $scaf_key (sort {$a cmp $b} keys %ScafContigs) {
		$scaf_count++;
		my $contig_begin = 0;
		my $smash_scaf_id = "$assembly.S$scaf_count";
		foreach my $contig_id (sort {$a cmp $b} @{$ScafContigs{$scaf_key}}) {
			print {$scaf2contig_fh} join("\t", $smash_scaf_id,   "$assembler", "contig", $contig_begin+1, $contig_begin+$ContigLength{$contig_id}, $ScafLength{$scaf_key}, "+", ".", "contig \"$contig_id\";\n");
			$contig_begin += $ContigLength{$contig_id};
		}
	}
}

# Buffered write errors only surface at close, so check them
close($contig_fa_fh)   || die "Cannot close $contig_fa: $!";
close($contig2read_fh) || die "Cannot close $contig2read: $!";
close($scaf2contig_fh) || die "Cannot close $scaf2contig: $!";
# Let the assembler object wrap up, then report the assembly id: once in a
# tagged <output> line and once in a banner.
$assembler->finish();

my $banner = "********************************************************\n";
print "<output>$assembly</output>\n",
      $banner,
      "   Assembly id assigned for this assembly: $assembly\n",
      $banner;


exit 0;

############################################
# End Execution
############################################

__END__

=head1 Name

makeFakeAssembly.pl - Wrapper script to make a dummy/fake assembly for a metagenome.

=head1 Synopsis

	makeFakeAssembly.pl [options]

=head1 Options

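=over 4

=item B<C<--metagenome>>

(required) name of the metagenome to create a fake assembly for.
This metagenome must be present in the repository, and its reads
should already have been loaded.

=item B<C<--fakescaf>>

(optional) group contigs into fake scaffolds instead of creating one
scaffold per contig. The defline of each read is looked up in the
C<readinfo> table and matched against the C<--regexp> pattern; contigs
whose deflines yield the same captured value are placed in the same
scaffold.

=item B<C<--regexp>>

(optional) regular expression applied to the read defline when
C<--fakescaf> is given. Its first capture group is used as the scaffold
key. Defaults to C<^(\d+)\.>.

=item B<C<--help>>

Print the full documentation and exit.

=back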

=head1 Description

B<makeFakeAssembly.pl> is a wrapper script that makes a fake assembly by creating one contig
for each read in the metagenome sample. This is useful for samples where assembly is not
required or not preferred, e.g. a highly complex sample where assembly is not useful.
When you run this script, Smash makes the result look like a real assembly
generated through B<doAssembly.pl>, so you can proceed with B<loadAssembly.pl> to load this
assembly into the Smash database.

=cut
