#! /usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use Smash::Global qw($SMASH_SCRIPT_NAME);
use Smash::CommandLineParser qw(parse_options check_required_options);
use Smash::Databases::MetaGenomeDB::Loader qw(load_smash);

##############
# Set up command line parsing
##############

# Option specifications this script accepts.
my @allowed = qw(
	metagenome=s sample=s library=s insert_size=n insert_stdev=n
	type=s tech=s weird_fasta_header quality_trim=s cluster=s
	unload wipeout help
);

# The multi-valued file options are added separately as plain strings:
# their specs contain a comma, which would make qw() emit a
# "Possible attempt to separate words with commas" warning.
push @allowed, "reads=s{,}", "quals=s{,}", "xmls=s{,}", "sffs=s{,}";

# Options that must be present no matter which action is requested.
my @required = ("metagenome");

##############
# Parse command line options
##############

# parse_options returns a status flag followed by the option key/value pairs.
my ($status, %options) = parse_options(\@allowed);

# --help is honoured first, even if parsing reported a problem.
pod2usage(-exitstatus => 0, -verbose => 2) if $options{help};

# Any status other than 1 means parsing failed: show the usage and give up.
pod2usage(-message => "", -exitstatus => 2, -verbose => 1) unless $status == 1;

#print_options(%options);

# Check the options that every invocation needs; on failure $missing holds
# the name of the offending option.
my $missing;
($status, $missing) = check_required_options(\@required, %options);
unless ($status == 1) {
	pod2usage(
		-message    => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n",
		-exitstatus => 2,
		-verbose    => 1,
	);
}

##############
# Handle command line options
# (except checking for presence of --metagenome, which has already
# been done by check_required_options above)
##############

# Only a real load needs the type-specific arguments. --unload and --wipeout
# both just remove data, so neither of them requires --type, --sample or any
# input files (see "Interdependence of options" in the POD).
if (!$options{unload} && !$options{wipeout}) {
	my $message = "";
	@required = qw(type sample);   # arguments required when loading
	($status, $missing) = check_required_options(\@required, %options);

	if ($status != 1) {
		$message = "Missing argument --$missing\n";
		pod2usage(-message => "$SMASH_SCRIPT_NAME: $message", -exitstatus => 2, -verbose => 1);
	}

	# Number of files given for a multi-valued option (reads, quals, xmls, sffs).
	# Falls back to an empty list so that an option with no value in %options
	# does not die with "Can't use an undefined value as an ARRAY reference"
	# under strict refs.
	my $file_count = sub {
		my ($option) = @_;
		return scalar @{ $options{$option} || [] };
	};

	# Collect every problem in $message so that all of them are reported at once.
	my $type = $options{type};
	if ($type eq "external") { # external needs reads
		if (!$file_count->("reads")) {
			$message .= "type=$type needs reads\n";
		}
	} elsif ($type eq "sanger") {
		if (!$file_count->("reads") || !$file_count->("quals") || !$file_count->("xmls")) {
			$message .= "type=$type needs reads, quals and xmls\n";
		}
	} elsif ($type eq "454") {
		if ($file_count->("sffs")) {
			# SFF input: a library name is mandatory, and paired-end data
			# (signalled by --insert_size) additionally needs --tech.
			if (!$options{library}) {
				$message .= "type=454 with SFFs needs library name\n";
			}
			if ($options{insert_size} && !$options{tech}) {
				$message .= "type=454 with paired end SFFs: please specify --tech=flx|titanium\n";
			}
			if (my $tech = $options{tech}) {
				if ($tech ne "flx" && $tech ne "titanium") {
					$message .= "Unknown tech $tech\n";
				}
			}
		} else {
			# Without SFFs, 454 data must come as fasta + quality files.
			if (!$file_count->("reads") || !$file_count->("quals")) {
				$message .= "type=454 needs (reads, quals) or sffs\n";
			}
		}
	} else {
		$message .= "Unknown type $type\n";
	}

	# --quality_trim is optional, but when given it must name a known method.
	if (defined($options{quality_trim}) && !grep { $options{quality_trim} eq $_ } qw(forge lucy xml)) {
		$message .= "$SMASH_SCRIPT_NAME: invalid option '".$options{quality_trim}."' to --quality_trim\n";
	}
	if ($message ne "") {
		pod2usage(-message => "$message", -exitstatus => 2, -verbose => 1);
	}
}

##############
# Load
##############

# Pass every parsed option on to the MetaGenome loader.
load_smash("MetaGenome", %options);

# Frame line printed above and below the summary.
my $banner = "********************************************************\n";

# The summary names the action that was requested; --wipeout takes
# precedence over --unload, and a plain load is the default.
my $summary
	= $options{wipeout} ? "  Metagenome $options{metagenome} wiped out from SMASH\n"
	: $options{unload}  ? "  Metagenome $options{metagenome} unloaded from SMASH\n"
	:                     "  Metagenome $options{metagenome} loaded to SMASH\n";

print "<output>success</output>\n";
print $banner, $summary, $banner;

exit(0);

=head1 Name

loadMetaGenome.pl - Wrapper script to load/unload a metagenome to/from Smash repository and database

=head1 Synopsis

	loadMetaGenome.pl [options]

=head1 Options

=over 4

=item B<C<--metagenome>> (required)

Name of metagenome where data is added.

=item B<C<--sample>> (required when loading)

If you have multiple samples collected from the same source, but you want to consider them as a single
metagenome, you can use the C<sample> parameter to specify that. An example would be samples collected
at two timepoints, which you can process together. If you use C<sample>, you could trace a gene or a 
contig back to which of the samples it comes from. If you pool the samples together and do not use
C<sample>, that information is lost.

=item B<C<--library>>

This is mostly useful for 454 data. If you have constructed multiple libraries and sequenced them
independently, you can specify that here. It is advisable to label everything that was processed
together in the emulsion-PCR step as coming from the same library, so that artificial replicates
from the emulsion-PCR step for each batch can be removed. 

=item B<C<--type>> (required when loading)

Type of metagenome data being added. For raw sequence data, the options should be one of 
C<sanger>, C<454> or C<external>. Use C<external> for preassembled sequences, or reads 
without quality values.

=item B<C<--tech>>

Name of 454 sequencing technology used to generate the sequences. Required when SFF files 
containing paired end 454 sequences are added. Ignored otherwise. Must be one of C<flx> or
C<titanium>.

=item B<C<--insert_size>>

Paired end insert size for this library.
Again valid only for 454 data. Sanger data takes the insert information from the XML files.

=item B<C<--insert_stdev>>

Paired end insert size standard deviation for this library. If not specified, 10% of the insert
size is used.
Again valid only for 454 data. Sanger data takes the insert information from the XML files.

=item B<C<--reads>>

List of fasta files containing DNA sequence reads, separated by whitespace.

=item B<C<--quals>>

List of quality files containing quality values for the reads specified through B<C<--reads>>.

=item B<C<--xmls>>

List of tracearchive style XML files that contain ancillary information about the reads.
The following fields are required to be in the XML file: SEQ_LIB_ID or LIBRARY_ID, PLATE_ID,
WELL_ID, TEMPLATE_ID, TRACE_END, TRACE_NAME, INSERT_SIZE, INSERT_STDEV.

=item B<C<--sffs>>

List of Roche 454 flowgrams (SFF files)

=item B<C<--weird_fasta_header>>

If set, the fasta headers are (what I call) weird. The actual identifier of the sequence
is the last word of the fasta header, and not the first word. E.g., if a fasta file contains
an entry like this:

	>i_am_not_the_id but the real id is ME
	agtcgactacagagcatcagcagctagactg

If B<C<--weird_fasta_header>> is set, B<ME> is the identifier. Otherwise B<i_am_not_the_id>
is the identifier. Most data downloaded from the NCBI trace archive have this format.

=item B<C<--quality_trim>>

Specifies if quality trimming should be performed or not. Possible values are C<forge>, 
C<lucy> or C<xml>. C<forge> uses the quality trimming program part of the B<Forge> assembler, 
and C<lucy> uses the B<lucy> quality/vector trimming software. Be aware that Forge assembler 
or lucy software should be installed in your system and available in the Smash software 
directory or your path, resp., for this to work. The last option is to use the
C<CLIP_LEFT>, C<CLIP_VECTOR_LEFT>, C<CLIP_QUALITY_LEFT> fields for left trim and
C<CLIP_RIGHT>, C<CLIP_VECTOR_RIGHT>, C<CLIP_QUALITY_RIGHT> fields for right trim. In case
any of these operations are performed, the six fields mentioned above, if present, will be 
removed since they are not valid any more after trimming. If no quality trim is chosen, 
these fields will be left intact.

=item B<C<--cluster>>

Name of cluster to run quality trimming procedure (only when using Forge), or name of 
cluster under which Celera assembler has been installed (for handling SFF files).

=item B<C<--unload>>

Unloads the entries corresponding to this metagenome from the database. The entry in the main
C<metagenome> table listing all metagenomes is not modified. This allows you to reload the 
metagenome without adding it first to the main table. All the fasta, quality and xml files 
corresponding to this metagenome are also removed. Use this option if the data are corrupt 
or did not load properly or if you are bored and want to remove the data but would eventually 
load it back again.

=item B<C<--wipeout>>

Unloads the entries corresponding to this metagenome from the database and removes 
the entry in the main table listing all metagenomes. All the fasta, quality and xml files
corresponding to this metagenome are also removed. Once a metagenome is wiped out, it
disappears from Smash. Thus it has to be added to the main C<metagenome> table before it
can be loaded into the database again.

=item B<C<--help>>

Prints this manual.

=back

=head2 Interdependence of options

=over 4

=item *

B<C<--type>> and B<C<--sample>> are required unless you are unloading with B<C<--unload>> or B<C<--wipeout>>

=item *

B<C<--reads>>, B<C<--quals>> and B<C<--xmls>> are required for B<C<--type>>=sanger 

=item *

(B<C<--reads>> and B<C<--quals>>) or B<C<--sffs>> is required for B<C<--type>>=454 

=item *

B<C<--library>> is required for B<C<--type>>=454 with B<C<--sffs>>

=item *

B<C<--reads>> is required for B<C<--type>>=external

=back

=head1 Description

B<loadMetaGenome.pl> is a wrapper script to add given sequence data to a metagenome in Smash.
This script processes the sequence data (see B<C<--quality_trim>>), adds the 
sequence to the data repository and loads the information to the database. 

A typical use of B<loadMetaGenome.pl> is like this:

	loadMetaGenome.pl --metagenome=MC20.MG1 --sample=MC20.MG1.S1 --type=sanger \
	      --reads seq1.fasta seq2.fasta --quals seq1.qual seq2.qual \
	      --xmls seq1.xml seq2.xml --quality_trim=forge

Multiple runs of B<loadMetaGenome.pl> on the same metagenome add the new sequences to the
database and append them to the files in the repository. Therefore, be careful not to run
B<loadMetaGenome.pl> on the same data twice by accident. Although it might seem odd in the
beginning, it is quite a useful feature when we have data from multiple sequencing technologies.

For example, a typical metagenome with both B<Sanger> and B<454> technology data must be loaded 
as follows:

	loadMetaGenome.pl --metagenome=MC20.MG1 --sample=MC20.MG1.S1 --type=sanger \
	      --reads seq1.fasta seq2.fasta --quals seq1.qual seq2.qual \
	      --xmls seq1.xml seq2.xml --quality_trim=forge
	loadMetaGenome.pl --metagenome=MC20.MG1 --sample=MC20.MG1.S1 --type=454 \
	      --reads run1.fna run2.fna --quals run1.qual run2.qual

For reasons far beyond the scope of this manual, a B<sanger+454> hybrid dataset should be added
as mentioned above: first load all the B<sanger> data and then load all the B<454> data. This is
important for consistent execution of all the scripts part of Smash.

=cut
