#!/usr/bin/env perl

use strict;
use warnings;

use Pod::Usage;
use Bio::SeqIO;
use Smash::Core;
use Smash::Global qw($SMASH_SCRIPT_NAME $SMASH_SCRIPT_LOCATION);
use Smash::CommandLineParser qw(parse_options check_required_options print_options);
use Smash::Analyses::GenePredictor;
use File::Find;

##############
# Set up command line parsing
##############

my @allowed  = qw(input=s source=s prefix=s format=s extension=s directory recursive db! help);
my @required = qw(input   source   prefix);

##############
# Parse command line options
##############

my $status;
my $missing;
my %options;

($status, %options) = parse_options(\@allowed);
if ($options{help}) {
	pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($status != 1) {
	pod2usage(-message => "", -exitstatus => 2, -verbose => 1);
}
($status, $missing) = check_required_options(\@required, %options);
if ($status != 1) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n", -exitstatus => 2, -verbose => 1);
}

# get the parameters from commandline

$options{db}= 1 unless defined($options{db});

my $source  = $options{source};
my $prefix  = $options{prefix};
my $input   = $options{input};
my $is_dir  = $options{directory};
my $recurse = $options{recursive};
my $ext     = $options{extension} || "gbff";
my $format  = $options{format} || "genbank";
my $db      = $options{db};

# init smash object and prepare database connections

my $smash = new Smash::Core();
   $smash->init();

my $dbh;
my ($tax_sth, $proj_sth, $seq_sth, $gene_sth, $update_gene_sth, $rna_sth, $feature_sth);

# Keep track of rRNAs processed so far to avoid the stupid 23S rRNA split annotations with same GeneID.
# They should have used join() to put them together instead of two rRNA features separated by
# "intervening sequence"

my %ProcessedGene = ();
my %ProcessedTaxId = ();
my %ProcessedProjId = ();

if ($db) {
	$dbh             = $smash->get_refgenomedb_handle;
	$tax_sth         = $dbh->prepare("INSERT INTO taxonomy(taxonomy_id, organism) VALUES(?,?);");
	$proj_sth        = $dbh->prepare("INSERT INTO project(project_id, taxonomy_id, source, accession) VALUES(?,?,?,?);");
	$seq_sth         = $dbh->prepare("INSERT INTO sequence(sequence_id, project_id, seq_type, mol_type, length, definition) VALUES(?,?,?,?,?,?);");
	$gene_sth        = $dbh->prepare("INSERT INTO gene(external_id, gene_info, sequence_id, type, length, start, end, strand, start_codon, stop_codon, gc) VALUES(?,?,?,?,?,?,?,?,?,?,?);");
	$update_gene_sth = $dbh->prepare("UPDATE gene SET end=? WHERE gene_id=?;");
	$rna_sth         = $dbh->prepare("INSERT INTO mature_rna(gene_id, start, end) VALUES(?,?,?);");
	$feature_sth     = $dbh->prepare("INSERT INTO feature(sequence_id, start, end, type, info) VALUES(?,?,?,?,?);");
}

# prepare output files

open(PROTEIN, ">$prefix.proteins.fa");
open(GENE, ">$prefix.genes.fa");
open(SEQUENCE, ">$prefix.sequences.fa");

select(STDOUT); $| = 1;
select(STDERR); $| = 1;
select(STDOUT);

# generate list of genbank files, if necessary

my @genbanks;

if ($is_dir) {
	@genbanks = <$input/*.$ext>;
} elsif ($recurse) {
	find sub {-f && m/\.${ext}/ && push(@genbanks, $File::Find::name);}, $input;
} else {
	@genbanks = ($input);
}

# parse genbank file

foreach my $genbank (@genbanks) {
	warn "PROGRESS: $genbank\n";
	my $seqio = Bio::SeqIO->new(-file => $genbank, -format=>$format);
	my $project_id;
	SEQUENCE: while(my $seq = $seqio->next_seq) {
		my $sequence_id = $seq->id;
		my $definition  = $seq->desc();
		my $length      = $seq->length;
		my $sequence    = $seq->seq();
		my $circular    = $seq->is_circular?"circular":"linear";
		my $tax_id;
		my $organism;
		my $accession;
		my $gene_counter = 1;
		if (!$sequence) {
			warn "SKIP: SEQUENCE EMPTY $sequence_id ($definition)\n"; 
			next SEQUENCE;
		} elsif ($definition =~ /\btransposon\b/i) {
			warn "SKIP: SEQUENCE TRANSPOSON $sequence_id ($definition)\n"; 
			next SEQUENCE;
		} elsif ($definition =~ /\bextrachromosomal\b/i) {
			warn "SKIP: SEQUENCE EXTRACHROMOSOMAL $sequence_id ($definition)\n"; 
			next SEQUENCE;
		}

		# Get genome project id. This has been hacked in my local copy of BioPerl.
		# Won't work on a general BioPerl installation

		my @annotations = $seq->annotation->get_Annotations('project');
		if (@annotations == 1) {
			$project_id = $annotations[0]->value;
		} 

		# It is possible that only the first entry in the project has the project id and the others dont.
		# In that case, you pass the value on till the end of the file. So you only die if it is not
		# defined at this point.

		if (!$project_id) {
			die "ERROR: $sequence_id PROJECT NOT FOUND\n";
		}

		# Get secondary accessions, if any.

		my @accessions = $seq->annotation->get_Annotations('secondary_accession');
		if (@accessions) {
			$accession = $accessions[0]->value;
		}

		# If there are multiple sources, then we have to catch information from the primary source

		my $gb_source;

		# Get the source with /focus tag

		my @sources = grep {$_->primary_tag eq "source"} $seq->get_SeqFeatures;
		SOURCE:foreach my $feature (@sources) {
			if ($feature->has_tag("focus")) { # don't worry - /focus can only occur once in a record
				$gb_source = $feature;
				last SOURCE;
			}
		}

		# If no /focus, get the first source

		if (!$gb_source) {
			$gb_source = shift @sources;
		}

		# Skip plasmid

		if ($gb_source->has_tag("plasmid")) {
			warn "SKIP: SEQUENCE PLASMID $sequence_id ($definition)\n"; 
			next SEQUENCE;
		}

		# Find out who this is!

		if ($gb_source->has_tag("db_xref")) {
			if ($gb_source->has_tag("organism")) {
				$organism = ($gb_source->get_tag_values("organism"))[0];

				# Is this phage?
				# /focus tag tells you which is the primary source. Having phage in the record is ok, as long as the primary source is not a phage.

				if ($organism =~ /\bphage\b/i) {
					warn "SKIP: SEQUENCE PHAGE $sequence_id ($definition)\n"; 
					next SEQUENCE;
				}
			}
			foreach my $buf ($gb_source->get_tag_values("db_xref")) {
				if ($buf =~ /taxon:(\d+)/) {
					$tax_id = $1;
				}
			}
		}

		$gb_source = undef;

		# Insert into taxonomy and project tables, if the entry doesnt exist yet!

		if ($tax_id) {
			$sequence_id = "$tax_id.$project_id.$sequence_id";
			print join("\t", $genbank, $sequence_id, $length)."\n";

			# eval so that if the tax_id already exists, this will throw an error, but not die!
			if ($db) {
				if (!$ProcessedTaxId{$tax_id}) {
					eval {
						no warnings 'all';
						local $SIG{'__DIE__'};
						$tax_sth->execute($tax_id, $organism);
					};
					if ($@) {
						warn "NOTE: DUPLICATE TAXID $tax_id\n";
					}
				}
				$ProcessedTaxId{$tax_id} = 1;

				if (!$ProcessedProjId{$project_id}) {
					eval {
						no warnings 'all';
						local $SIG{'__DIE__'};
						$proj_sth->execute($project_id, $tax_id, $source, $accession);
					};
					if ($@) {
						warn "NOTE: DUPLICATE PROJECT $project_id\n";
					}
				}
				$ProcessedProjId{$project_id} = 1;

				# eval so that if the sequence already exists, this will throw an error, but not die!
				eval {
					no warnings 'all';
					local $SIG{'__DIE__'};
					$seq_sth->execute($sequence_id, $project_id, "chromosome", $circular, $length, $definition);
				};
				if ($@) {
					warn "SKIP: SEQUENCE DUPLICATE $sequence_id ($definition)\n"; 
					next SEQUENCE;
				}
			}
		} else {
			die "ERROR: TAXID MISSING $genbank\n";
		}

		FEATURE: foreach my $feature ($seq->get_SeqFeatures) {
			my $primary_tag = $feature->primary_tag;
			my $external_id;   # the order of preference for the external_id is documented later
			my $protein;

			if ($primary_tag eq "source") {
				if ($feature->has_tag("proviral")) {
					if ($feature->has_tag("db_xref")) {
						my %info;
						if ($feature->has_tag("organism")) {
							$info{organism} = ($feature->get_tag_values("organism"))[0];
						}
						foreach my $buf ($feature->get_tag_values("db_xref")) {
							if ($buf =~ /taxon:(\d+)/) {
								$info{taxon} = $1;
							}
						}
						my $info = join(" ", map {sprintf("%s=\"%s\"", $_, $info{$_})} keys %info);
						$info = "<annotation $info />";
						$feature_sth->execute($sequence_id, $feature->location->start, $feature->location->end, "proviral", $info);
					}
				}
			} elsif (scalar(grep {$primary_tag eq $_} qw(misc_RNA rRNA tRNA CDS)) > 0) {

				# 16S hack again
				if (0 == 1) {
					my $skip = 1;
					if ($feature->has_tag("product")) {
						foreach my $value ($feature->get_tag_values("product")) {
							if ($value =~ /^16S rRNA$/i || $value =~ /^16S ribosomal RNA$/i || $value =~ /^Small Subunit Ribosomal RNA$/i || $value =~ /^SSU Ribosomal RNA/i) {
								$skip = 0;
							}
						}
					}
					next FEATURE if $skip;
				}
				# end hack

				###############################################################
				# Some checks that will lead to skipping this feature/sequence
				###############################################################

				# Sanity check - was the GenBank file incomplete?

				my $location = $feature->location;
				if ($location->end > $length) {
					warn "ERROR: SEQUENCE INCOMPLETE $genbank:$sequence_id\n";
					next SEQUENCE;
				}

				# Get the gene name (I call it "external_id").
				# Priorities are: 
				#   1. locus_tag
				#   2. GI
				#   3. GeneID
				#   4. gene<number>
				# This goes into the database as tax_id.external_id anyway!

				# %gene_info stores annotation information for each gene

				my %gene_info;
				my $gene_annotation = "";

				if ($feature->has_tag("locus_tag")) {
					($external_id) = $feature->get_tag_values("locus_tag");
				}
				if ($feature->has_tag("db_xref")) {
					my %dbxref;
					foreach my $buf ($feature->get_tag_values("db_xref")) {
						my ($key, $value) = split(/:/, $buf);
						$dbxref{$key} = $value;
						$gene_info{$key} = $value if ($key eq "GI" || $key eq "GeneID");
					}
					if (!$external_id) {
						if ($dbxref{"GI"}) {
							$external_id = $dbxref{"GI"};
						} elsif ($dbxref{"GeneID"}) {
							$external_id = "GeneID:".$dbxref{"GeneID"};
							$gene_info{GeneId} = $external_id;
						}
					}
				}
				$external_id = $external_id || ($seq->id.".gene$gene_counter");

				# Get information about the gene from the following fields:
				#    gene, gene_synonym, product, protein_id, EC_number

				my $product;
				if ($feature->has_tag("product")) {
					$product = ($feature->get_tag_values("product"))[0];
				}
				foreach my $tag qw(gene gene_synonym product protein_id EC_number) {
					if ($feature->has_tag($tag)) {
						foreach my $value ($feature->get_tag_values($tag)) {
							if ($value ne "hypothetical protein") {
								$gene_info{$tag} = $value;
							}
						}
					}
				}

				# Sometimes, COG annotation is given in the /note field

				if ($feature->has_tag("note")) {
					my @cogs = ();
					my @notes = ();
					foreach my $value ($feature->get_tag_values("note")) {
						if ($value =~ /\b(COG[0-9]{4})\b/) {
							push(@cogs, $1);
						}
						push(@notes, $value);
					}
					if (@cogs) {
						$gene_info{COG} = join(";", @cogs);
					}
					$gene_info{note} = join("; ", @notes);
				}


				# Watch out: inside the "if(pseudo)" clause, this will be recreated.
				# Any changes here should also be made there.

				if (%gene_info) {
					$gene_annotation = join(" ", map {sprintf("%s=\"%s\"", $_, $gene_info{$_})} keys %gene_info);
				}

				# Is this a pseudo gene?

				if ($feature->has_tag("pseudo")) {
					my $warning  = "NOTE: GENE PSEUDO $external_id $primary_tag";
					   $warning .= " $gene_annotation" if $gene_annotation;
					   $warning .= "\n";
					warn $warning;
					#next FEATURE;

					# Don't skip it, but add a pseudo field
					$gene_annotation = "pseudogene=\"yes\" $gene_annotation";
				}

				# Hack to fix the 23S split annotations

				if (my $gene_id = $ProcessedGene{"$tax_id.$project_id.$external_id"}) {

					# Skip the introns

					if ($primary_tag eq "misc_RNA" && $product && $product =~ /intervening sequence/i) {
						warn "SKIP: GENE SPLIT 23S $external_id $primary_tag='$product'\n";
						next FEATURE;
					}

					# Fuse the exons.
					# We know that start < end in both gene and mature_rna tables.
					# So just update the gene table with the higher coordinate, and then add the feature anyway.

					if ($primary_tag eq "rRNA" && $product && $product eq "23S ribosomal RNA") {
						warn "NOTE: GENE MERGE 23S $external_id $primary_tag='$product'\n";
						$update_gene_sth->execute($location->end, $gene_id);
					}
				}

				# Is translation (of course for CDS genes) already in there, or do we have to do it later?

				my $translation_table = 1; # default GenBank trans_table
				if ($feature->has_tag("translation")) {
					$protein = ($feature->get_tag_values("translation"))[0];
				} else {
					if ($feature->has_tag("transl_table")) {
						$translation_table = ($feature->get_tag_values("transl_table"))[0];
						if ($translation_table != 11 && $translation_table != 4) {
							warn "ERROR: TRANSTABLE $translation_table $external_id\n";
						}
					}
				}

				# Now we are ready to process the feature and add it to the database

				process_gene($primary_tag, $tax_id, $sequence_id, \$sequence, $gene_annotation, $external_id, $location, $translation_table, $protein);
				$gene_counter++;
			}
		}

		# write the nuc sequence, with tax_id if possible

		print SEQUENCE ">$sequence_id\n";
		print SEQUENCE $smash->pretty_fasta($sequence);
	}
}

# close connections

if ($db) {
	$tax_sth->finish();
	$proj_sth->finish();
	$seq_sth->finish();
	$gene_sth->finish();
	$update_gene_sth->finish();
	$rna_sth->finish();
	$feature_sth->finish();
	$dbh->commit();
}

$smash->finish();

# close files

close(PROTEIN);
close(GENE);
close(SEQUENCE);

exit(0);

########################
# Subroutines
########################

sub process_gene {
	my ($type, $gene_prefix, $sequence_id, $sequence_ref, $gene_annotation, $external_id, $location, $translation_table, $protein) = @_;

	my $gene_id; # for the gene created later
	my $complete5 = 0;
	my $complete3 = 0;
	my @cds_locations = ();

	# Check if 5' and 3' complete
	if ($location->start_pos_type() eq "EXACT") {
		$complete5 = 1;
	} elsif ($location->start_pos_type() eq "BEFORE") {
		$complete5 = 0;
	} else {
		die "Cannot handle position type ", $location->start_pos_type(), "\n";
	}
	if ($location->end_pos_type() eq "EXACT") {
		$complete3 = 1;
	} elsif ($location->end_pos_type() eq "AFTER") {
		$complete3 = 0;
	} else {
		die "Cannot handle position type ", $location->end_pos_type(), "\n";
	}

	# get the length of the gene
	my $length = 0;
	if ($location->isa('Bio::Location::SplitLocationI')) {
		foreach my $sublocation ($location->sub_Location) {
			$length += ($sublocation->end-$sublocation->start+1);
			push(@cds_locations, $sublocation->start, $sublocation->end);
		}
	} else {
		$length += ($location->end-$location->start+1);
		push(@cds_locations, $location->start, $location->end);
	}

	# If they are not exact and run outside of the range, get the sequence that's a triplet set
	# This only applies for CDS

	if ($type eq "CDS") {
		if ($complete5 == 1 && $complete3 != 1) {
			$cds_locations[$#cds_locations] -= ($length%3);
		} elsif ($complete5 != 1 && $complete3 == 1) {
			$cds_locations[0] += ($length%3);
		}
	}

	# convert them into presence/absence of start/stop codons!
	my $strand = get_strand_char($location->strand);
	if ($strand eq ".") {
		warn "NOTE: GENE STRAND $gene_prefix.$external_id.\n";
	}

	my ($start_codon, $stop_codon);
	if ($strand eq "+") {
		($start_codon, $stop_codon) = ($complete5, $complete3);
	} else {
		($start_codon, $stop_codon) = ($complete3, $complete5);
	}

	# Get the transcript 
	# Reestimate the length at the same time!

	my $transcript = "";
	$length = 0;
	for (my $i=0; $i<@cds_locations; $i+=2) {
		my $cds_start = $cds_locations[$i]; 
		my $cds_end   = $cds_locations[$i+1];
		$transcript  .= substr($$sequence_ref, $cds_start-1, $cds_end-$cds_start+1);
		$length += ($cds_end-$cds_start+1);
	}
	if ($strand eq "-") {
		$transcript =~ y/ACTG/TGAC/;
		$transcript = reverse($transcript);
	}

	# Get protein if it is not there

	if ($type eq "CDS" && !$protein) {

		my $translator = Smash::Analyses::GenePredictor::get_translator($translation_table);
		my $inframestop = 0;

		# Fix the frame if CDS open-ended on both sides

		# Find ORF if length is not multiple of 3

		if ($length%3 != 0) {
			warn "NOTE: GENE CODON $gene_prefix.$external_id. Finding longest ORF.\n";
			my $frame   = $translator->get_best_frame($transcript);
			if ($frame < 0) {
				warn "ERROR: GENE INFRAMESTOP $gene_prefix.$external_id\n";
				$frame = -$frame;
				$inframestop = 1;
			}
			$frame--; # Convert from [1,3] to [0,2]
			$transcript = substr($transcript, $frame, ($length-$frame) - ($length-$frame)%3);
			if ($strand eq "-") {
				$cds_locations[$#cds_locations] -= $frame;
			} else {
				$cds_locations[0] += $frame;
			}
		}

		# Translate

		$protein = $translator->translate($transcript, $start_codon, $external_id);

		if (index($protein, '*') != -1) {
			$gene_annotation = "partial=\"inframestop\" $gene_annotation";
			$protein = $translator->get_longest_protein($protein);
		}
	}

	#print join(":", $external_id, $gene_annotation, $sequence_id, $length, $cds_locations[0], $cds_locations[$#cds_locations], $strand, $start_codon, $stop_codon)."\n";

	if ($db) {
		my $gc = $smash->get_gc_percent($transcript, 0, 100);

		# insert into gene/ncgene table

		my $table_external_id = "$gene_prefix.$external_id";
		$gene_id = $ProcessedGene{$table_external_id};
		if (!defined($gene_id)) {
			$gene_sth->execute($table_external_id, "<annotation $gene_annotation />", $sequence_id, $type, $length, $cds_locations[0], $cds_locations[$#cds_locations], $strand, $start_codon, $stop_codon, $gc);
			$gene_id = $smash->last_refgenomedb_insert_id($dbh);
			$ProcessedGene{$table_external_id} = $gene_id;
		} else {
			warn "SKIP: GENE DUPLICATE $table_external_id\t$type\t$gene_annotation\n";
		}

		# insert into mature_rna table

		for (my $i=0; $i<@cds_locations; $i+=2) {
			my $cds_start = $cds_locations[$i]; 
			my $cds_end   = $cds_locations[$i+1];
			eval {
				no warnings 'all';
				local $SIG{'__DIE__'};
				$rna_sth->execute($gene_id, $cds_start, $cds_end);
			};
			if ($@) {
				warn "SKIP: CDS DUPLICATE $table_external_id\t$sequence_id:$cds_start-$cds_end\n";
			}
		}
	}

	print GENE ">$gene_prefix.$external_id\n";
	print GENE $smash->pretty_fasta($transcript);

	if ($type eq "CDS") {
		print PROTEIN ">$gene_prefix.$external_id $gene_annotation\n";
		print PROTEIN $smash->pretty_fasta($protein);
	}
}

sub get_strand_char {
	my $n = shift;
	return '.' unless $n;
	if ($n < 0) {
		return '-';
	} elsif ($n > 0) {
		return '+';
	} else {
		return '.';
	}
}

__END__

=head1 Name

addRefGenomeSequences.pl - Wrapper script to parse genbank/embl files containing 
reference genomes and possibly annotations. It loads the information into Smash 
refgenome database.

=head1 Synopsis

	addRefGenomeSequences.pl [options]

=head1 Options

=over 4

=item B<C<--input>> (required)

genbank file to be parsed, or the directory containing multiple genbank files
to be parsed.

=item B<C<--directory>>

input is a directory and not a GenBank file. The script will then
look for files with an extension .gbff (or given by C<--extension>) 
in that directory.

=item B<C<--format>>

format of the input file (supported: genbank, embl)

=item B<C<--extension>>

file name extension to look for under directory if C<--directory>
is given. (default: gbff)

=item B<C<--source>> (required)

source for the genbank file (e.g., NCBI, HMP, etc)

=item B<C<--prefix>> (required)

prefix for the output files. The following files will be created:

=over 4

=item F<E<lt>prefixE<gt>.sequences.fa>

all the DNA sequences in the genbank file

=item F<E<lt>prefixE<gt>.genes.fa>

all the coding and non-coding genes in the genbank file

=item F<E<lt>prefixE<gt>.proteins.fa>

protein sequences of the coding genes in the genbank file (translation
in the genbank file is used if available, otherwise the CDS regions are
translated by this script).

=back

=item B<C<--help>>

Prints this manual.

=back

=cut
