#!/usr/bin/env perl

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use File::Basename;
use Cwd;
use Smash::Utils::MatrixIO qw(:all);

use Smash::Core;
use Smash::Global qw(:all);
use Smash::CommandLineParser qw(parse_options check_required_options print_options);
use Smash::Utils::Taxonomy qw(:all);
use Smash::Utils::iTOL;
use Smash::Analyses::BootStrap;
use Smash::Analyses::Comparative;

my $UNIFORM_PRIOR = 0.000001;
my $REPLICATES = 100;

##############
# Set up command line parsing
##############

my @allowed  = qw(tag=s analysis=s experiment=s mode=s list=s level=s normalize=s identity=n replicate=n genomesize! filterlow=f conf_threshold=f length_threshold=i tree=s top=i local help); # arguments I expect
   @allowed  = (@allowed, "refdb=s{,}"); 
my @required = qw(tag analysis mode list);   # arguments I require

##############
# Parse command line options
##############

my $status;
my $missing;
my %options;

($status, %options) = parse_options(\@allowed);
if ($options{help}) {
	pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($status != 1) {
	pod2usage(-message => "", -exitstatus => 2, -verbose => 1);
}
#print_options(%options);
($status, $missing) = check_required_options(\@required, %options);
if ($status != 1) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Missing argument --$missing\n", -exitstatus => 2, -verbose => 1);
}

########################
# Command line arg vars
########################


my @opt_refdb = @{$options{refdb}};
my @opt_distance;

# defaults

# genome size normalize is on unless --nogenomesize is specified

$options{genomesize} = 1 unless defined($options{genomesize});


########################
# Parse command line options
########################

############################################
# Often used command line parameters
############################################

my $analysis = $options{analysis};
my $level    = lc($options{level});

####
# Check proper usage
####

if (index(":functional:refgenome:16S:evodistance:refcoverage:refbasecoverage:keggrescoverage:", ":$analysis:") == -1) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: Invalid option --analysis=$analysis", -exitstatus => 2, -verbose => 1);
}

if ($analysis eq "refgenome" && $options{mode} eq "raw" && !defined($options{identity})) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: --analysis=refgenome --mode=raw needs --identity to be specified!", -exitstatus => 2, -verbose => 1);
}

if ($options{mode} eq "summary" && !defined($options{normalize})) {
	pod2usage(-message => "$SMASH_SCRIPT_NAME: --mode=summary needs --normalize to be specified!", -exitstatus => 2, -verbose => 1);
}

############################################
# fix command line parameters
############################################

$options{experiment} = "metagenome" unless $options{experiment};

############################################
# Common
############################################

my $bootstrap_item;   # Item to bootstrap: reads, genes or proteins etc
my $bootstrap_label;  # Name of dir under $read_dir where bootstrap lists are kept
my $GLOBAL_TOP_VAL;   # Y axis range for itol/gnuplot graph
my $reporter;
my $analyzer;
my $refdb;            # Used to generate files in refgenome

my %Labels;
my %DataCount = (); # Gene or read count
my %Assigned = ();
my @entities;

my $PROGRESS = \*STDERR;
select($PROGRESS); $| = 1; select(STDOUT);

# for smash paper and MetaHIT
my %Colors = (
		# 41 sanger samples
		"AM-A" => "#808284", "DA-A" => "#0000FF", "ES-A" => "#000000", "FR-A" => "#006E3A", 
		"IT-A" => "#00ADEF", "JP-A" => "#8E191B", "JP-I" => "#F26522", "AM-F" => "#8CC63F", 

		# sanger/454 comparison
		#"DA-AD-1" => "#8E191B", "DA-AD-3" => "#006E3A",

		# Smash paper figure
		"MN-SOIL" => "#000000", "EBPR" => "#006E3A", "WF-A" => "#00ADEF", "AMD" => "#F26522",

		# misc
		"assigned" => "#AA5B48", "unassigned" => "#B8D7EF",

		);

#################################################################
# Init a SMASH object and destroy it.
# This is to make sure that the smash.conf file is read and the 
# global variables are set.
#################################################################

Smash::Core->new()->init()->finish();

#################################################################
# Get the metagenome details
# @entities contains the list
# %Labels{$entity} contains the label of $entity
#################################################################

open(LIST, "<$options{list}") || die "Cannot open $options{list}: $!";
my @lines = <LIST>;
close(LIST);

# Get the labels for the samples/entities
# This could come from the input list file, then dont worry about the database

%Labels = ();
%DataCount = ();
@entities = ();
map {chomp($_); my ($entity, $count, $label) = split(/\t/); $Labels{$entity} = $label if $label; $DataCount{$entity} = $count if defined $count; push(@entities, $entity);} @lines;

# for local mode, data count must be passed from the file

if ($options{local} && scalar(grep {defined} values %DataCount) < 0) {
	die "List file $options{list} must contain datacount for --local\n";
}

# if labels werent defined, either get from database, or if local, assign id as label

if (scalar(grep {defined} values %Labels) == 0) { # labels are not given in input file
	if ($options{local}) {
		map {$Labels{$_} = $_} keys %Labels; # for genome, this will be undef, so we use the entity as label as well
	} else {
		print $PROGRESS "Getting sample information from database ...";
		my $smash = new Smash::Core();
		for (my $e = 0; $e < scalar(@entities); $e++) {
			my $entity = $entities[$e];
			$smash->init($entity);
			my $label = $smash->get_metagenome_label($smash->metagenome);
			$Labels{$entity} = $label || $entity; # for genome, this will be undef, so we use the entity as label as well
		}
		$smash->finish();
		print $PROGRESS " done\n";
		my $RevLabels = invert_hash(\%Labels);
		if (scalar(keys %Labels) != scalar(keys %$RevLabels)) {
			map {$Labels{$_} .= ".$_"} keys %Labels;
		}
	}
}


@opt_distance = ("jsd"); #, "euclidean");

if ($options{mode} eq "cluster") {
	if (scalar(keys %Labels) >= 3) {
		run_clustering([values %Labels], "neighbor", @opt_distance);
		success(0);
	} else {
		warn "--mode=cluster needs at least 3 samples!\n";
		exit(0);
	}
}

if ($options{mode} eq "bootstrap+cluster") {
	if (scalar(keys %Labels) >= 3) {
		run_bootstrap_clustering([values %Labels], "neighbor", @opt_distance);
		success(0);
	} else {
		warn "--mode=bootstrap+cluster needs at least 3 samples!\n";
		exit(0);
	}
}


if ($analysis eq "refgenome") {
	$options{tree}   = "ncbi" unless $options{tree};
	$bootstrap_label = "reads_refgenomes";
	$bootstrap_item  = "reads";
	$GLOBAL_TOP_VAL  = 0.23;
	if (!@opt_refdb) {
		@opt_refdb  = qw(ALL);
	}
} elsif ($analysis eq "functional") {
	$bootstrap_label = "genes_functional";
	$bootstrap_item  = "genes";
	$GLOBAL_TOP_VAL  = 0.05;
	if (!@opt_refdb) {
		@opt_refdb  = qw(eggnog);
	}
} elsif ($analysis eq "16S") {
	$options{tree}   = "bergey" unless $options{tree};
	$bootstrap_label = "reads_16S";
	$bootstrap_item  = "reads";
	$level           = "genus" unless $level; # default level is genus
	$GLOBAL_TOP_VAL  = 0.50;
	if (!@opt_refdb) {
		@opt_refdb  = qw(RDP);
	}
}

#################################################################
# Process each sample
#################################################################

REFDB: for (my $i=0; $i<@opt_refdb; $i++) {
	$refdb = $opt_refdb[$i];

	####################
	# set up analyzer
	####################

	if ($refdb eq "eggnog" && !$level) {
		$level = "og";
	}
	if ($refdb eq "kegg" && !$level) {
		$level = "ko";
	}

	my $class = get_analyzer($analysis);
	my $analyzer = $class->new(NAME => $options{tag}, TYPE => $analysis, EXPERIMENT => $options{experiment}, NORMALIZE => $options{normalize}, LEVEL => $level, IDENTITY => $options{identity}, REF_DB => $refdb, REPLICATE => $options{replicate}, IS_LOCAL => $options{local}, GENOMESIZE_NORMALIZE => $options{genomesize}, FILTERLOW => $options{filterlow}, CONFIDENCE_THRESHOLD => $options{conf_threshold}, READ_LENGTH_THRESHOLD => $options{length_threshold}, TREE => $options{tree});
	$analyzer->init();
	print $PROGRESS "Processing samples ";
	for (my $e = 0; $e < scalar(@entities); $e++) {

		####################
		# Convert from sample id to user-friendly labels
		# Attach this label/entity as the current sample
		####################

		my $entity = $entities[$e];
		my $label  = $Labels{$entity};
		if ($options{experiment} eq "metagenome") {
			$analyzer->attach_sample($entity, $label);
		} elsif ($options{experiment} eq "genome") {
			$analyzer->attach_genome($entity, $label);
		} else {
		}

		# read the OG annotation information

		if ($analysis eq "functional") {
			$analyzer->parse_annotations();
		}

		####################
		# run the analysis:
		####################

		if ($options{mode} eq "raw") {

			if ($options{experiment} eq "metagenome") {
				$Assigned{$label} = $analyzer->analyze_one_sample($bootstrap_label);
			} elsif ($options{experiment} eq "genome") {
				$Assigned{$label} = $analyzer->analyze_one_genome();
			}

			####################
			# We have used the datacount properly for analysis.
			# Now we might have to modify it for normalization purposes!
			####################

			if ($options{normalize} eq "hits") {
				$analyzer->{DATA_COUNT}->{$label} = $Assigned{$label};
			} else {
				my $this_data_count;
				if ($options{local}) {
					$this_data_count = $DataCount{$entity};
				} else {
					$this_data_count = $analyzer->get_data_count($entity);
				}
				$analyzer->{DATA_COUNT}->{$label} = $this_data_count;
				$analyzer->{FEATURE_COUNT}->{-1}->{$label} = $this_data_count - $Assigned{$label};
			}

			####################
			# Now adjust the datacount so that anything less than 100bp, which doesnt have a chance, is not included!
			####################

			if ($analysis eq "refcoverage") {
				my $dbh;
				my $sth = $dbh->prepare_cached("SELECT COUNT(*) FROM readinfo r INNER JOIN library l USING (library_id) INNER JOIN sample s USING (sample_id) WHERE metagenome_id=? AND r.length >= ?");
				$sth->execute($entity, 100);
				my ($count) = $sth->fetchrow_array();
				$analyzer->data_count->{$label} = $count;
			}
		}
		print $PROGRESS Smash::Core->progress_bar($e+1);
	}
	print $PROGRESS " done\n";

	########################
	# if mode=raw, write the counts and move to the next refdb
	########################

	my $raw_file = "$options{tag}.feature_vector.rawcount.$analysis.$refdb";
	if ($options{mode} eq "raw") {
		zero_fill_matrix($analyzer->feature_count);
		write_R_matrix($raw_file, $analyzer->feature_count);
		$analyzer->finish();
		next REFDB;
	}

	########################
	# read the rawcounts in
	########################

	if (! -f $raw_file) {
		die "Raw file $raw_file missing. Perhaps you need to run $SMASH_SCRIPT_NAME with '--mode=raw'?";
	}

	my $feature_count = read_R_matrix($raw_file);

	########################
	# filter the matrix and remove things not in the list file
	# get the data count while you are at it!
	########################

	print $PROGRESS "Filtering samples ...";
	$feature_count = transpose_matrix($feature_count);
	{
		my @keys = keys %$feature_count;
		my %keep = map {$_ => 1} values %Labels;
		map {delete $feature_count->{$_} unless defined($keep{$_})} @keys;
	}
	foreach my $label (keys %$feature_count) {
		$analyzer->{DATA_COUNT}->{$label} = sum_hash($feature_count->{$label});
	}
	$feature_count = transpose_matrix($feature_count);
	print $PROGRESS " done\n";

	$analyzer->{FEATURE_COUNT} = $feature_count;

	# Add unknown to the tree

	if ($options{normalize} ne "hits") {
		$NCBITree->add_unknown();
		$BergeyTree->add_unknown();
	}

	########################
	# For bootstrapping, we need the raw counts. So we do that before we convert things to abundance.
	########################

	if ($options{mode} eq "bootstrap+distmat") {
		$analyzer->bootstrap_elements();
		$analyzer->finish();
		next REFDB;
	}

	########################
	# Convert to abundance
	# Merge into higher levels, if necessary
	########################

	# normalize by genome size to convert to abundance

	if ($options{genomesize}) {
		$analyzer->{FEATURE_COUNT} = $analyzer->convert_to_abundance($analyzer->feature_count); # now %FeatureCount is abundance
	}

	if ($analyzer->level) {
		$analyzer->{FEATURE_COUNT} = $analyzer->merge_feature_vectors($analyzer->feature_count, $analyzer->level); # now %FeatureCount is merged
	}
	zero_fill_matrix($analyzer->feature_count);

	########################
	# Remap to different tree, if necessary
	########################

	if ($analysis eq "16S" && $options{tree} =~ /NCBI/i) {
		$analyzer->{FEATURE_COUNT} = $analyzer->bergey2ncbi_feature($analyzer->feature_count);
		zero_fill_matrix($analyzer->feature_count);
	}

	my $file;

	########################
	# do the analysis
	########################

	if ($options{mode} eq "summary") {

		my $copy = copy_matrix($analyzer->feature_count);
		   $copy = normalize_cols_by_sum($copy);    
		   $copy = $analyzer->convert_id_to_name($copy); # get names in the matrix

		if ($analyzer->filterlow) {
			$analyzer->filter_low_occurrences($analyzer->filterlow, $copy);
		}
		zero_strip_matrix($copy);
		zero_fill_matrix($copy);

		if ($analysis eq "refgenome" || $analysis eq "16S") {
			$file = sprintf("%s.feature_vector.summary.%s.%s%s.%s", $options{tag}, $analysis, (($options{identity})?($options{identity}."p."):""), $level, $refdb);
			write_R_matrix($file, $copy);
			
			# Special mode for count-based analysis like the over/under-representation analysis
			# refgenome: raw counts, genome size in Mb normalized, rescaled back to total sample size
			# 16S: raw counts, rRNA copy number normalized, rescaled back to total sample size
			# taxa by name

			if ($analysis eq "16S" || $analysis eq "refgenome") {
				foreach my $feature (keys %$copy) {
					foreach my $label (keys %{$copy->{$feature}}) {
						$copy->{$feature}->{$label} *= $analyzer->{DATA_COUNT}->{$label};
					}
				}
			}
			$file = sprintf("%s.feature_vector.summary.rescaledcounts.%s.%s%s.%s", $options{tag}, $analysis, (($options{identity})?($options{identity}."p."):""), $level, $refdb);
			write_R_matrix($file, $copy);

		} else {
			$file = sprintf("%s.feature_vector.summary.%s.%s.%s", $options{tag}, $analysis, $level, $refdb);
			write_R_matrix_fixed_length($file, $copy, 64);
		}

	} elsif ($options{mode} eq "distmat") {
		########################
		# For creating distance matrix, we DONT REALLY need the raw counts. 
		# But this is where I can create the raw counts for the non-raw levels (like module/pathway or genus/phylum).
		# So we do that before we convert things to abundance.
		########################

		# raw counts, genome size in bp normalized, not sample size normalized
		# taxa by tax_id

		$file = sprintf("%s.feature_vector.rawcount.%s.%s.%s", $options{tag}, $analysis, (($options{identity})?($options{identity}."p."):"").$level, $refdb);
		write_R_matrix($file, $analyzer->feature_count);

		if ($options{normalize} eq "hits") {
			delete $analyzer->{FEATURE_COUNT}->{-1};
		}
		$analyzer->{FEATURE_COUNT} = normalize_cols_by_sum($analyzer->feature_count);
		if ($analyzer->filterlow) {
			$analyzer->filter_low_occurrences($analyzer->filterlow, $analyzer->feature_count);
		}

		# raw counts, genome size in bp normalized, sample size normalized
		# filtered for low occurrences, if applicable
		# taxa by tax_id

		$file = sprintf("%s.feature_vector.normalized.%s.%s.%s", $options{tag}, $analysis, (($options{identity})?($options{identity}."p."):"").$level, $refdb);
		write_R_matrix($file, $analyzer->feature_count);

		$analyzer->make_distance_matrices(Cwd::cwd, $analyzer->feature_count);

	} elsif ($options{mode} eq "itol") {
		init_reporter();
		$analyzer->{FEATURE_COUNT} = normalize_cols_by_sum($analyzer->feature_count);
		if ($analyzer->filterlow) {
			$analyzer->filter_low_occurrences($analyzer->filterlow, $analyzer->feature_count);
		}

		# remove samples where >98% is unknown

		my $fc = $analyzer->feature_count;
		$fc = transpose_matrix($fc);
		my $RevLabels = invert_hash(\%Labels);
		foreach my $sample (keys %$fc) {
			if ($fc->{$sample}->{-1} && $fc->{$sample}->{-1} > 0.98) {
				delete $fc->{$sample};
				delete $RevLabels->{$sample};
			}
		}
		$fc = transpose_matrix($fc);
		%Labels = %{invert_hash($RevLabels)};

		# Reset labels, since some might have been lost!

		my $new_labels = [];
		foreach my $e (@entities) {
			if ($Labels{$e}) {
				push(@$new_labels, $Labels{$e});
			}
		}
		$reporter->{LABELS} = $new_labels;

		# Set the tree

		my $tree;
		if ($analysis eq "functional") {
			$tree = $analyzer->get_functional_hierarchy();
		} elsif (($analysis eq "refgenome" || $analysis eq "16S")) {
			if ($options{tree} =~ /bergey/i) {
				$tree = $BergeyTree;
			} else {
				$tree = $NCBITree;
			}
		}
		$reporter->{TREE} = $tree;

		delete $analyzer->feature_count->{-1};
		if ($options{normalize} eq "hits") {
			$analyzer->{FEATURE_COUNT} = normalize_cols_by_sum($analyzer->feature_count);
		}

		my $popup_info = $analyzer->get_popup_info($analyzer->feature_count);
		$reporter->add_dataset(NAME => $refdb, DATA => $analyzer->feature_count, GRAPH_TYPE => "barplot", SCALE => 100, UNIT => "%", KEEP_TOP_N => $options{top}, POPUP_INFO => $popup_info); # convert to percent from fraction

		#$reporter->add_dataset(NAME => $refdb, DATA => $analyzer->feature_count, GRAPH_TYPE => "heatmap", SCALE => 100, KEEP_TOP_N => $options{top}); # convert to percent from fraction

	} elsif ($options{mode} eq "boxplot") {
		make_boxplot_files();
	} elsif ($options{mode} eq "refbasecoverage") {
		#compare_replicates_with_reference_base();
	} elsif ($options{mode} eq "refcoverage") {
		compare_replicates_with_reference(\%DataCount);
	} elsif ($options{mode} eq "qualdistmat") {
		make_qualitative_distance_matrices($analyzer->feature_count, $options{replicate});
	}
	$analyzer->finish();
}

finish_reporter();

success(0);

############################################
# Subroutines
############################################

sub success {
	print "<output>success</output>\n";
	exit(shift);
}

sub get_analyzer {
	my $type = shift;
	my @available = qw(Functional RefGenome 16S);
	my ($name) = grep {lc($type) eq lc($_)} @available;
	return $analyzer = "Smash::Analyses::Comparative::$name";
}

sub init_reporter {
	if ($options{mode} eq "itol") {
		my $desc = "$analysis mapping (";
		if ($options{tag}) {
			$desc .= "Tag:$options{tag}; ";
		}
		if ($level) {
			$desc .= "Level:$level; ";
		}
		$desc .= sprintf("Database:%s)", join("_", @opt_refdb));

		my $smash = new Smash::Core();
		$smash->init();
		my %reporter_options = (
					    NAME => "$options{tag}.$analysis",
					  LABELS => [map {$Labels{$_}} @entities],
				        POST_FIX => 1,
				  DATASET_COLORS => \%Colors,
				    PICTURE_SIZE => 2000,
				     DESCRIPTION => $desc,
				     GRAPH_WIDTH => 0.05, #$GLOBAL_TOP_VAL,
				       UPLOAD_ID => $smash->get_conf_value("iTOL", "uploadID")
				);
		$smash->finish();

		my $DISPLAY_THRESHOLD;
		if (defined $options{filterlow}) {
			$DISPLAY_THRESHOLD = $options{filterlow};
			$DISPLAY_THRESHOLD = 0.0000001 unless $DISPLAY_THRESHOLD;
		}
			
		if ($analysis eq "functional") {
			$reporter_options{GRAPH_WIDTH} = 0.08;
			if ($level eq "ko" || $level eq "og" || $level eq "cog") {
				$reporter_options{GRAPH_WIDTH} = 0.01;
			} elsif ($level eq "funcat") {
				$reporter_options{GRAPH_WIDTH} = 0.20;
			}
			$reporter_options{DISPLAY_THRESHOLD} = $DISPLAY_THRESHOLD || 0.01;
			$reporter_options{NCBI_TAXONOMY_ID} = 0;
		} elsif ($analysis eq "refgenome") {
			$reporter_options{GRAPH_WIDTH} = 0.4;
			$reporter_options{DISPLAY_THRESHOLD} = $DISPLAY_THRESHOLD || 0.01;
			$reporter_options{NCBI_TAXONOMY_ID} = 0;
			#$reporter_options{RE_SORT} = 1;
			$reporter_options{COLOR_CLADES}     = 1;
		} elsif ($analysis eq "16S") {
			$reporter_options{GRAPH_WIDTH} = 0.5;
			$reporter_options{DISPLAY_THRESHOLD} = $DISPLAY_THRESHOLD || 0.01;
			$reporter_options{NCBI_TAXONOMY_ID} = 0;
			$reporter_options{COLOR_CLADES}     = 1;
		}

		$reporter = new Smash::Utils::iTOL(%reporter_options);
		$reporter->init();
	}
}

sub finish_reporter {
	$reporter->finish() if $reporter;
}

# Mani, 01 Apr 2010. No, not a joke!
# run_bootstrap()
# Old style bootstrapping. Will keep it for a while, but if we
# dont use it for a while, then should be deleted from here.

sub run_bootstrap {
	use Config;

	# Bootstrap

	Smash::Analyses::BootStrap::bootstrap_list(@_);

	# Create job script for replicate analysis

	my $perl = $Config{perlpath};
	my $replicate_job_file = "$options{tag}.replicates.queue";
	open(JOB, ">$replicate_job_file") || die "Cannot open $replicate_job_file";
	for (my $i=0; $i<$REPLICATES; $i++) {
		print JOB "$perl $0 --analysis=$analysis --tag=$options{tag} --identity=$options{identity} --normalize=$options{normalize} --list=$options{list} --mode=distmat --replicate=$i --refdb ".join(" ", @opt_refdb);
		if (defined($level)) {
			print JOB " --level=$level";
		}
		print JOB "\n";
	}
	close(JOB);
	print $PROGRESS "I have generated a script '$replicate_job_file' to process all the replicates.\n";
	print $PROGRESS "Run the script to generate all the matrices\n";
}

sub encode_labels {
	my @labels     = @_;
	my $Label2Code = {};
	my $id = 1;
	foreach my $ll (sort {$a cmp $b} @labels) {
		my $code = "S${id}S";
		$Label2Code->{$ll} = $code;
		$id++;
	}
	return $Label2Code;
}

sub apply_encoding {
	my $dist       = shift;
	my $Label2Code = shift;
	my $new_dist   = {};
	foreach my $row (keys %$dist) {
		foreach my $col (keys %{$dist->{$row}}) {
			$new_dist->{$Label2Code->{$row}}->{$Label2Code->{$col}} = $dist->{$row}->{$col};
		}
	}
	return $new_dist;
}

sub decode_labels {
	my $tree       = shift;
	my $Code2Label = shift;
	foreach my $code (keys %$Code2Label) {
		my $label = $Code2Label->{$code};
		$tree =~ s/${code}/${label}/g;
	}
	return $tree;
}

sub cleanup_tree {
	my $tree = shift;
	$tree =~ s/([\d]+)\.0([^\d])/$1$2/g;
	$tree =~ s/\):/\)#/g;
	$tree =~ s/:100//g;
	$tree =~ s/#//g;
	return $tree;
}

sub run_clustering {
	use File::Copy;

	my $labels  = shift;
	my $method  = shift || die "clustering needs method!";
	my @metrics = @_;
	die "clustering needs metric!" if @metrics < 1;

	my %keep = map {$_ => 1} @$labels; # samples to keep

	my $Label2Code = encode_labels(@$labels);

	foreach my $metric (@metrics) {
		foreach my $refdb (@opt_refdb) {

			#####
			# Distance matrix
			#####

			open(DIST, ">infile") || die "Cannot open infile: $!";
			my $dist = read_phylip_matrix("$options{tag}.distance.$metric.$analysis.$level.$refdb");
			my @keys = keys %$dist;

			# delete unlisted in the row
			map {delete $dist->{$_} unless defined $keep{$_}} @keys;

			# delete unlisted in the column (row after transpose)
			$dist = transpose_matrix($dist);
			map {delete $dist->{$_} unless defined $keep{$_}} @keys;
			$dist = transpose_matrix($dist);

			# Remap names to numbers to avoid the 10 character width issue

			my $new_dist = apply_encoding($dist, $Label2Code);

			write_phylip_matrix(\*DIST, $new_dist);
			close(DIST);

			#####
			# Run clustering method
			#####

			unlink qw(outfile outtree);
			my $seed = 1+2*int(rand(49));
			open(NEIGHBOR, "| $method");
			print NEIGHBOR<<EOF;
2
y

EOF
			close(NEIGHBOR);

			#####
			# clean up the tree
			#####

			my $tree = "";
			open(TREE, "<outtree");
			while (<TREE>) {
				chomp();
				$tree .= $_;
			}
			close(TREE);
			#unlink "outtree";

			# remap the codes to names

			$tree = decode_labels($tree, invert_hash($Label2Code));
			$tree = cleanup_tree($tree);

			#####
			# Make the new clean tree
			#####

			open(TREE, ">$options{tag}.$analysis.tree");
			print TREE "$tree\n";
			close(TREE);

			#####
			# Upload to iTOL
			#####

			my $treeName = sprintf("$analysis clustering%s (%sDatabase:%s; Metric:%s; UniformPrior: %f; Method:%s;)", ($options{tag})?" $options{tag}":"", ($level)?"Level:$level; ":"", $refdb, $metric, $UNIFORM_PRIOR, $method);

			my $smash = new Smash::Core();
			$smash->init();
			my %reporter_options = (
						    NAME => "$options{tag}.$analysis",
						  LABELS => [map {$Labels{$_}} @entities],
					     DESCRIPTION => $treeName,
					     GRAPH_WIDTH => 25,
					NCBI_TAXONOMY_ID => 0,
					       UPLOAD_ID => $smash->get_conf_value("iTOL", "uploadID")
					);
			$smash->finish();


			$reporter = new Smash::Utils::iTOL(%reporter_options);
			if (-f "sample.cluster.itol.colors") {
				$reporter->{COLOR_FILE} = "sample.cluster.itol.colors";
			}

			$reporter->init();
			$reporter->upload();
			$reporter->download("svg");
		}
	}
	success(0);
}

sub run_bootstrap_clustering {
	use File::Copy;

	my $labels  = shift;
	my $method  = shift || die "clustering needs method!";
	my @metrics = @_;
	die "clustering needs metric!" if @metrics < 1;

	my %keep = map {$_ => 1} @$labels; # samples to keep

	my $Label2Code = encode_labels(@$labels);

	foreach my $metric (@metrics) {
		foreach my $refdb (@opt_refdb) {

			#####
			# Distance matrix
			#####

			open(DIST, ">infile") || die "Cannot open infile: $!";
			for (my $i=0; $i<$REPLICATES; $i++) {
				my $dist = read_phylip_matrix("bootstrap_files/$options{tag}.distance.$metric.$analysis.$level.$refdb.$i");
				my @keys = keys %$dist;

				# delete unlisted in the row
				map {delete $dist->{$_} unless defined $keep{$_}} @keys;

				# delete unlisted in the column (row after transpose)
				$dist = transpose_matrix($dist);
				map {delete $dist->{$_} unless defined $keep{$_}} @keys;
				$dist = transpose_matrix($dist);

				my $new_dist = apply_encoding($dist, $Label2Code);
				write_phylip_matrix(\*DIST, $new_dist);
			}
			close(DIST);

			#####
			# Run clustering method
			#####

			unlink qw(outfile outtree);
			my $seed = 1+2*int(rand(49));
			open(NEIGHBOR, "| $method");
			print NEIGHBOR<<EOF;
m
$REPLICATES
$seed
2
y

EOF
			close(NEIGHBOR);
			move qw(outtree intree);

			#####
			# run consense
			#####

			unlink qw(outfile);
			open(CONSENSE, "| consense");
			print CONSENSE<<EOF;
2
y

EOF
			close(CONSENSE);

			#####
			# clean up the tree from consense
			#####

			my $tree = "";
			open(TREE, "<outtree");
			while (<TREE>) {
				chomp();
				$tree .= $_;
			}
			close(TREE);
			unlink "outtree";

			$tree = decode_labels($tree, invert_hash($Label2Code));
			$tree = cleanup_tree($tree);

			#####
			# Make the new clean tree
			#####

			open(TREE, ">$options{tag}.$analysis.bootstrapcluster.tree");
			print TREE "$tree\n";
			close(TREE);

			#####
			# Upload to iTOL
			#####

			my $treeName = sprintf("$analysis clustering%s (%sDatabase:%s; Metric:%s; UniformPrior: %f; Method:%s;)", ($options{tag})?" $options{tag}":"", ($level)?"Level:$level; ":"", $refdb, $metric, $UNIFORM_PRIOR, $method);

			my $smash = new Smash::Core();
			$smash->init();
			my %reporter_options = (
						    NAME => "$options{tag}.$analysis.bootstrapcluster",
						  LABELS => [map {$Labels{$_}} @entities],
					     DESCRIPTION => $treeName,
					     GRAPH_WIDTH => 25,
					NCBI_TAXONOMY_ID => 0,
					       UPLOAD_ID => $smash->get_conf_value("iTOL", "uploadID")
					);
			$smash->finish();


			$reporter = new Smash::Utils::iTOL(%reporter_options);
			if (-f "sample.cluster.itol.colors") {
				$reporter->{COLOR_FILE} = "sample.cluster.itol.colors";
			}

			$reporter->init();
			$reporter->upload();
			$reporter->download("svg");
		}
	}
	success(0);
}

############################################
# Common
############################################

# make distance matrices using shared features. Distance measures are from Korbel et al SHOT paper

sub make_qualitative_distance_matrices {
	use Smash::Utils::MatrixIO qw(write_phylip_matrix);
	my $feature_count = shift;
	my $replicate     = shift;
	my @labels   = sort {$a cmp $b} values %Labels; # has to be consistent across header/data
	my @features = keys %$feature_count;
	my %Count; # num features in each label
	my %Shared; # num features shared bw two labels
	foreach my $l1 (@labels) {
		$Count{$l1} = 0;
		foreach my $l2 (@labels) {
			$Shared{$l1}{$l2} = 0;
		}
	}
	foreach my $feature (@features) {
		my @positive_labels;
		foreach my $label (@labels) {
			if (defined($feature_count->{$feature}->{$label}) && $feature_count->{$feature}->{$label} == 1) {
				push(@positive_labels, $label);
				$Count{$label}++;
			}
		}
		foreach my $pos1 (@positive_labels) {
			foreach my $pos2 (@positive_labels) {
				$Shared{$pos1}{$pos2}++;
			}
		}
	}
	my %Dist;
	foreach my $l1 (@labels) {
		foreach my $l2 (@labels) {
			my ($c1, $c2) = ($Count{$l1}, $Count{$l2});
			my $normalization = $c1*$c2*sqrt(2 / ($c1*$c1 + $c2*$c2));
			my $sim = $Shared{$l1}{$l2}/$normalization;
			$Dist{$l1}{$l2} = -log($sim);
		}
	}
	my $distance_file = "$options{tag}.distance.shared.$analysis.$refdb";
	write_phylip_matrix($distance_file, %Dist);
}

1;

=begin html

=head1 doComparativeAnalysis.pl

=head1 Name

doComparativeAnalysis.pl - Comparative analysis of metagenomes

=end html

=begin latex

=head1 Name

doComparativeAnalysis.pl - Comparative analysis of metagenomes

=end latex

=head1 Synopsis

	doComparativeAnalysis.pl [options]

=head1 Options

=over 4

=item B<C<--tag>> (required)

Label for this analysis. Every file created by this run of this step of the 
analysis will have the given label as its prefix, to help you perform a 
series of steps as an analysis unit.

=item B<C<--analysis>> (required)

Type of analysis you would like to perform. Currently available options are 
C<refgenome>, C<16S> and C<functional>.

=item B<C<--list>> (required)

Name of file containing a list of samples, one per line. For C<refgenome> and
C<16S> analysis, this should contain the metagenome ids. For C<functional>
analysis, this list should contain  the gene prediction ids, because there
can be multiple gene predictions per sample.

By default, this script assumes that the samples to be analyzed are in the 
database and will try to get additional information from the database. If you
want to analyze external data using SMASH, then you have to provide this
information through the list file (See L<Analysis types>) for more details).

=item B<C<--mode>> (required)

Which step of the analysis do you want to perform now? Available steps are
C<raw>, C<summary>, C<distmat>, C<cluster>, C<bootstrap+distmat>, 
C<bootstrap+cluster>, C<itol>.

=item B<C<--replicate>>

Replicate to process, if this is part of C<bootstrap> analysis. If this is not
specified, the whole dataset will be processed.

=item B<C<--level>>

Functional or phylogenetic level at which to perform the analysis. For 
C<refgenome> and C<16S>, these are phylogenetic levels. Please keep in mind that
some organisms do not have some taxonomic levels, so asking for that level will
assign the closest rank above the requested level available for that organism.
For C<functional> analysis, this must be one of C<og> (for orthologous groups),
C<module> (for KEGG modules, if the data contains KEGG annotations) or C<pathway>
(for KEGG pathways, if the data contains KEGG annotations).

=item B<C<--refdb>>

Name of the reference database used. Although it is recommended that you use the
real name of databases (e.g., eggnog if the db is the eggNOG proteins, NCBI if 
the db is the NCBI genomes), it is really used in identifying the file to parse.

For example, if file F<list.txt> contains

	MC20.MG1
	MC20.MG2
	MC30.MG5

and you call the script 

	doComparativeAnalysis.pl --analysis=refgenome --list=list.txt \
		--mode=raw --refdb=NCBI --tag=April01 ...

then the script expects F<MC20.MG1.NCBI.blastn>, F<MC20.MG2.NCBI.blastn> and 
F<MC30.MG5.NCBI.blastn> in the present working directory. This also creates 
an output file called F<April01.feature_vector.rawcount.B<refgenome>.B<NCBI>> 
(bold words are from the commandline options).

If file F<list.txt> contains

	MC20.MG1.AS2.GP2
	MC20.MG2.AS3.GP1
	MC30.MG5.AS1.GP3

and you call the script 

	doComparativeAnalysis.pl --analysis=functional --list=list.txt \
		--mode=raw --refdb=eggnog --tag=April01 ...

then the script expects F<MC20.MG1.AS2.GP2.eggnogmapping.txt>, F<MC20.MG2.AS3.GP1.eggnogmapping.txt> and 
F<MC30.MG5.AS1.GP3.eggnogmapping.txt> in the present working directory. This also creates 
an output file called F<April01.feature_vector.rawcount.B<functional>.B<eggnog>> 
(bold words are from the commandline options).

For historic reasons, this option can also take a comma-separated list of 
such C<refdb>'s to perform the analysis on multiple reference databases. 
For example, running the command

	doComparativeAnalysis.pl --analysis=functional --list=list.txt \
		--mode=rawcount --refdb=eggnog,kegg --tag=April01 ...

is equivalent to running the same command with C<--refdb=eggnog> and 
C<--refdb=kegg> sequentially. This was just an example, and unless you just have
one CPU to do all your analysis, we do not recommend running the command above.
Instead, run two analysis for C<--refdb=eggnog> and C<--refdb=kegg>
simultaneously so that you have your results in roughly half as much time.
However, this options comes in very handy when you use C<--mode=itol>. For
example, running the command

	doComparativeAnalysis.pl --analysis=refgenome --list=list.txt \
		--level=genus --mode=itol --refdb=NCBI,MyDB --tag=April01 ...

will parse the read mappings at the genus level using the NCBI database and
MyDB database, and upload them as two datasets to one underlying iTOL tree
which contains the genera hit in either database.

Unfortunately, this won't work for C<functional> analysis using C<kegg>
and C<eggnog>, since the orthologous groups have different names! The 
resulting tree will have all the eggNOG OGs and the KEGG KOs, and you
end up with a bizarre iTOL tree of mutually exclusive datasets that
shouldn't have uploaded together.

=item B<C<--normalize>>

Normalization procedure to be used. Available options are: C<data> that 
will normalize the abundances by the datasize, and C<hits> that will
normalize the abundances by the positive hits. For example, if a dataset
contains 1000 reads, 500 of which are mapped to some genus, and 50 of 
them are mapped to I<Lactobacillus>, then using C<hits> will result in
an abundance of 10% for I<Lactobacillus> (50 out of 500), whereas using 
C<data> will result in 5% (50 out of 1000). You get the idea!

=item B<C<--filterlow>>

filters features (genera, species, orthologous groups) that are below the 
given threshold in ALL samples. If at least one sample is above the 
threshold, the feature remains in all samples. Otherwise, it is removed
from the feature table.

=item B<C<--help>>

Prints this manual.

=back

=head2 Options specific to B<C<--mode=itol>>

These options tell iTOL which features to display in the figure. C<--top> lets
you display all the top C<N> features in each sample. Compare this with 
C<--filterlow> that lets you
filter all the features whose maximum value in any sample is below the threshold.

=over 4

=item B<C<--top>>

Applies only to C<itol> mode. Use only the C<N> most abundant features, be
it genera, species or orthologous groups. This top C<N> filter is applied 
to each sample independently, so the final list could be more than C<N> long
when multiple samples have different sets of top C<N> features.

=back

=head2 Options specific to B<C<--analysis=refgenome>>

=over 4

=item B<C<--identity>>

Percent identity cutoff to be used for this analysis. Only applies to 
C<refgenome> analysis where it is used as the threshold above which
a read is assigned to the organism of the BLAST hit at the taxonomic 
level specified by C<level>. Typical values we use are 65% for C<phylum>,
85% for C<genus> and 95% for C<species>.

=item B<C<--local>>

Applies only to C<refgenome> analysis. SMASH uses a database that maps each
sequence in the reference genome database to its source organism identified
by its NCBI taxonomy id. These sequences are also annotated for protein-coding
genes, non-coding genes, etc. If you do not have this database installed at 
your site, e.g. because you are calling this script from B<SmashCell>, you
probably have downloaded the information from SMASH website. Then this option
is just for you.

=item B<C<--genomesize | --nogenomesize>>

By default the script performs genome size normalization for the refgenome
analysis. If you do not need this, specify C<--nogenomesize> to turn this off.

=back

=head2 Options specific to B<C<--analysis=16S>>

=over 4

=item B<C<--conf_threshold>>

Minimum bootstrap confidence threshold reported by RDP
classifier. (default: 0.6)

=item B<C<--length_threshold>>

Minimum length of a read to be considered mapped to a rank. (default: 250)

=back

=head1 Description

C<doComparativeAnalysis.pl> is a multifunctional script that enables comparative
metagenomic analysis. Before I explain the functionalities of the script, I must
explain its dependencies on external information and/or resources.

=head2 External dependencies

=begin html

<BLOCKQUOTE>
<HR>

=end html

=head3 iTOL batch access

Many functionalities of this script generate cluster or trees and these can be 
uploaded to the iTOL webserver for viewing as well storage. SMASH uses the iTOL
batch access website through scripts kindly provided by iTOL. You must have an 
user account at the iTOL website to use this feature. When you upload a
tree through this script, the tree will be uploaded to a project called 
B<itol_uploader> under your user account. Please create this project under 
your user account, otherwise
the trees might be lost in digital space. Additionally, for iTOL to allow
batch access to your account, you must enable it. Please see 
L<http://itol.embl.de/help/batch_help.shtml> for details (under B<Enabling
batch upload>). Once you enable it, you receive an upload ID, which you should enter 
it in the SMASH config file under the B<iTOL> section as shown below:

	[Smash]
	data_dir          : /some/embl/location/smash/data_repos
	software_dir      : software
	workspace_dir     : workspace
	collection_prefix : MC
	metagenome_prefix : MG
	assembly_prefix   : AS
	genepred_prefix   : GP

	[SmashDB]
	database_engine : mysql
	database_name   : SmashDB
	user : smashdb
	pass : smashdb
	host : server
	port : 9999

	[RefOrganismDB]
	data_dir        : /some/embl/location/smash/data_repos/reference_organisms
	database_engine : sqlite3
	database_name   : /some/embl/location/smash/db/RefOrganismDB.sqlite
	user :
	pass : 
	host : 
	port : 

	[iTOL]
	uploadID : xyzw

=begin html

<HR>

=end html

=head3 NCBI Taxonomy information

Phylogenetic analysis performed by this script uses the taxonomy information from
NCBI. It uses a local repository of the NCBI taxonomy dump (normally available at
L<ftp://ftp.ncbi.nih.gov/pub/taxonomy>) to get detailed information on each 
organism. You must specify the remote and local repository location in the 
configuration file, in the B<Taxonomy> section as shown below:

	[Taxonomy]
	remote_repository : ftp://ftp.ncbi.nih.gov/pub/taxonomy
	local_repository  : /some/embl/location/smash/data_repos/external/taxonomy

When the script needs to parse this taxonomy information, it checks whether 
F<local_repository> contains the required files. If they do not exist, it downloads
them from F<remote_repository>. If you would like to update the files using the 
latest available from NCBI, you could replace the files in F<local_repository>
with the newer ones yourself, or update it automatically by running the script
L<updateNCBITaxonomyFiles.pl|updateNCBITaxonomyFiles> without any arguments:

	updateNCBITaxonomyFiles.pl

=begin html

<HR>

=end html

=head3 PHYLIP package

For clustering the samples, we use the C<neighbor> method from the B<phylip>
package. For generating consensus clusters from bootstrap analysis, we use the
C<consense> program from B<phylip> as well. Please make sure that C<neighbor>
and C<consense> are available in your path. You can get B<phylip> package from
L<http://evolution.genetics.washington.edu/phylip.html>.

=begin html

<HR>
</BLOCKQUOTE>

=end html

=head2 Subroutines

=over 4

=item C<get_analyzer($type)>

returns a C<Smash>>Analyses::Comparative::$type> object, if it exists.

=item C<init_reporter>

inits the reporter for this instance.

=item C<finish_reporter>

closes the reporter after finalizing the report.

=back

=cut

=head1 Dependencies

doComparativeAnalysis.pl requires the following PERL modules:

	Config, Pod::Usage 

The iTOL batch access scripts require the following PERL modules:

	Getopt::Regex, Config::File::Simple, HTTP::Request::Common and LWP::UserAgent.

=cut
