#include <assert.h>
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <argtable2.h>
#include "mDNA.h"
#include "mQual.h"

/* Mutate every read of the input fasta files using interleaved quality files; write to simulation_fasta[0] / simulation_qual[0]. */
void simulateSequencingErrors(int nfasta, const char **in_fasta, int nqual, const char **in_qual, const char **simulation_fasta, const char **simulation_qual);
/* Return base, possibly replaced by a lowercase different base with probability 10^(-qual/10). */
char mSimulateBase(char base, int qual);
/* Apply mSimulateBase() to each position of dna using qual, then write both records to the given streams. */
void mProcessSingleRead(mDNA *dna, mQual *qual, FILE *sim_fasta, FILE *sim_quality);

/*
 * Entry point of the sequencing error simulator.
 *
 * Parses the command line with argtable2:
 *   --outfasta <file>   fasta file for simulated reads            (required)
 *   --outqual  <file>   quality file for simulated reads          (required)
 *   -q/--inqual <file>  quality file(s) supplying quality values  (one or more)
 *   <file> ...          input fasta file(s)                       (one or more)
 *   -h/--help           print usage
 * then seeds rand() with a fixed value and hands the file lists to
 * simulateSequencingErrors().
 *
 * Returns 0 after a completed simulation. The help and parse-error paths
 * end in mQuit(""), which is presumably terminating (defined outside this
 * file -- confirm).
 *
 * Legacy author note kept from the original source:
 *   coverage per read, 454 quality values, Sanger reads, Sanger quality values
 */
int main(int argc, char* argv[]) {

	/* Command line parsing for argtable */

	enum { ARGTABLE_CAPACITY = 8 }; /* slots allocated; only the first i are populated */

	struct arg_file *in_fasta;
	struct arg_file *in_qual;
	struct arg_file *out_fasta;
	struct arg_file *out_qual;
	struct arg_lit  *help;
	struct arg_end  *end;
	int              nerrors;
	void           **argtable;
	int              i;

	out_fasta    = arg_file1(NULL, "outfasta", "<file>",            "fasta file for simulated reads");
	out_qual     = arg_file1(NULL,  "outqual", "<file>",            "quality file for simulated reads");
	in_qual      = arg_filen( "q",   "inqual", "<file>", 1, argc+2, "quality file containing quality values to use");
	in_fasta     = arg_filen(NULL,       NULL, "<file>", 1, argc+2, "fasta file containing input sequences");
	help         = arg_lit0 ( "h",     "help",                      "print this help and exit");
	end          = arg_end(6); /* this needs to be even, otherwise each element in end->parent[] crosses an 8-byte boundary */
									/* Check arg_end() in arg_end.c */
	argtable     = (void**) mMalloc(ARGTABLE_CAPACITY*sizeof(void*));

	i = 0;
	argtable[i++] = out_fasta;
	argtable[i++] = out_qual;
	argtable[i++] = in_qual;
	argtable[i++] = in_fasta;
	argtable[i++] = help;
	argtable[i++] = end;
	/* From here on, i is the number of populated argtable entries. */

	if (arg_nullcheck(argtable) != 0) {
		mDie("insufficient memory");
	}

	/* command line defaults */
	
	nerrors = arg_parse(argc, argv, argtable);

	if (help->count > 0) {
		fprintf(stdout, "Sequencing error simulator\n");
		fprintf(stdout, "Usage: %s", argv[0]);
		arg_print_syntax(stdout, argtable, "\n");
		arg_print_glossary(stdout, argtable, "  %-25s %s\n");
		mQuit("");
	}

	if (nerrors > 0) {
		arg_print_errors(stderr, end, argv[0]);
		fprintf(stderr, "try using -h\n");
		mQuit("");
	}

	/* Fixed seed: repeated runs on the same input draw the same random numbers. */
	srand(3510);

	simulateSequencingErrors(in_fasta->count, in_fasta->filename, in_qual->count, in_qual->filename, out_fasta->filename, out_qual->filename);

	/*
	 * Free up the memory.
	 * Only the populated entries are released: the slots beyond i were never
	 * assigned, so handing them to arg_freetable() would make it inspect
	 * (and possibly free) indeterminate pointers.
	 */
	arg_freetable(argtable, i);
	mFree(argtable);
	return 0;
}

/*************
 * Introduce errors in sequences from nfasta fasta files (in in_fasta[]),
 * using quality values from nqual quality files (in in_qual[]),
 * write the sequence to simulation_fasta[0],
 * write the quality  to simulation_qual[0].
 *
 * The fasta files are consumed one after another. The quality files are all
 * opened up front and used round-robin, one quality record per read; a
 * quality file that reports END_OF_QUAL is rewound and reused.
 *
 * nfasta and nqual must both be at least 1. Any failure to open a file, or
 * to close (flush) an output file, is reported through mDie().
 */

void simulateSequencingErrors(int nfasta, const char **in_fasta, int nqual, const char **in_qual, const char **simulation_fasta, const char **simulation_qual) {
	FILE  *ifasta, *sim_fasta, *sim_quality;
	FILE **iquality;
	mDNA  *dna;
	mQual *qual;
	int    dstatus, qstatus;
	int    i;
	int    fasta_counter, qual_counter;

	/***
	 * Reject empty input lists before touching any file: the code below reads
	 * in_fasta[0], indexes iquality[0] and computes a value modulo nqual.
	 */
	if (nfasta < 1) {
		mDie("No input fasta file given");
	}
	if (nqual < 1) {
		mDie("No input quality file given");
	}

	/* Open output files */

	if ((sim_fasta = fopen(simulation_fasta[0], "w")) == NULL) {
		mDie("Cannot open simulation fasta file: %s", simulation_fasta[0]);
	}
	if ((sim_quality = fopen(simulation_qual[0], "w")) == NULL) {
		mDie("Cannot open simulation quality file: %s", simulation_qual[0]);
	}

	/* Open the first fasta file */

	fasta_counter = 0;
	fprintf(stderr, "Processing fasta: %s\n", in_fasta[fasta_counter]);
	if ((ifasta = fopen(in_fasta[fasta_counter], "r")) == NULL) {
		mDie("Cannot open input fasta file: %s", in_fasta[fasta_counter]);
	}

	/* Open all the quality files so you can interleave */

	iquality = (FILE**) mMalloc(nqual*sizeof(FILE*));
	for (i=0; i<nqual; i++) {
		if ((iquality[i] = fopen(in_qual[i], "r")) == NULL) {
			mDie("Cannot open input quality file: %s", in_qual[i]);
		}
	}
	qual_counter = 0;

	/* Memory for the dna and qual objects */

	dna  = (mDNA*)  mMalloc(sizeof(mDNA));
	qual = (mQual*) mMalloc(sizeof(mQual));

	/***
	 * You will read one entry from the qual file and move on to the next one.
	 * Keep reading until the end of file, and rewind if you reached the end.
	 **/

	for (;;) {
		FILE *current_qual = iquality[qual_counter];

		/***
		 * Read one DNA and one Qual entry from the right files.
		 * Record the status that tells you if EOF was reached.
		 * Apply quality values from qual to the DNA sequence in dna.
		 * Write both DNA and Qual into the right output files.
		 *
		 * NOTE(review): the record is processed before the status is examined,
		 * so this relies on mReadDNALite()/mReadQual() still delivering a
		 * valid record when they report END_OF_FASTA/END_OF_QUAL -- confirm
		 * against their definitions.
		 */
		dstatus = mReadDNALite(ifasta, dna);
		qstatus = mReadQual(current_qual, qual);
		mProcessSingleRead(dna, qual, sim_fasta, sim_quality);
		mFreeDNA(dna);
		mFreeQual(qual);

		/***
		 * Reached end of fasta file.
		 * Close this file and open the next file.
		 */
		if (dstatus == END_OF_FASTA) { 
			fclose(ifasta);
			fasta_counter++;
			if (fasta_counter == nfasta) { /* reached last file */
				break;
			}
			fprintf(stderr, "Processing fasta: %s\n", in_fasta[fasta_counter]);
			if ((ifasta = fopen(in_fasta[fasta_counter], "r")) == NULL) {
				mDie("Cannot open input fasta file: %s", in_fasta[fasta_counter]);
			}
		}

		/***
		 * Reached end of current quality file.
		 * Rewind the current quality file.
		 */
		if (qstatus == END_OF_QUAL) { 
			rewind(current_qual);
			fprintf(stderr, "Exhausted quality sequences in %s. Restarting from the beginning.\n", in_qual[qual_counter]);
		}

		/* Interleave and go to the next quality file */
		qual_counter++;
		qual_counter%=nqual;
	}
	mFree(dna);
	mFree(qual);

	/***
	 * fclose() flushes buffered output, so a failure here means the simulated
	 * data did not fully reach the file; report it instead of ignoring it.
	 */
	if (fclose(sim_quality) != 0) {
		mDie("Cannot finish writing simulation quality file: %s", simulation_qual[0]);
	}
	if (fclose(sim_fasta) != 0) {
		mDie("Cannot finish writing simulation fasta file: %s", simulation_fasta[0]);
	}
	for (i=0; i<nqual; i++) {
		fclose(iquality[i]);
	}
	mFree(iquality);
}

/* mutate base using the quality value */

char mSimulateBase(char base, int qual) {
	float  prob;
	float  number;
	char   sim;
	int    position;
	char   bases[]   = "ACGT";
	char   lcbases[] = "acgt";
	char  *first;

	if (base == 'N') return 'N';
	if (qual == 0)   return 'n';

	first = index(bases, (int) base);
	if (first != NULL) { 
		/***
		 * One of ACGT.
		 * Since index() returns pointer to char, get the int index */
		position = (first - bases); 
	} else {             
		/***
		 * Other ambiguous characters.
		 * So assign it randomly to one base (0-3).
		 */
		position = (int) (4.0 * (rand() / (RAND_MAX + 1.0))); 
	}

	/* Get probability from qual */

	prob = -1.0*qual/10;
	prob = pow(10, prob);

	/* Generate a random number */

	number = (1.0 * (rand() / (RAND_MAX + 1.0)));

	/* If that number is less than the prob value, then mutate base */

	if (number < prob) {
		/***
		 * Make random int between 1 and 3, then move that many times from current, so guaranteed change.
		 * Return the lowercase of the mutated base so that we can distinguish changes.
		 */
		int inumber = 1 + (int) (3.0 * (rand() / (RAND_MAX + 1.0))); 
		sim = lcbases[(position+inumber)%4];
/*
		fprintf(stdout, "Q=%d\tIn=%c|%d\tOut=%c\n", qual, base, position, sim);
*/
		return sim;
	} else {
		return base;
	}
}

/*
 * Push one read through the error simulation and write it out.
 *
 * Each base in dna->data is replaced by mSimulateBase(base, quality) using
 * the quality value at the same position in qual->data. The modified
 * sequence goes to sim_fasta and the quality record to sim_quality.
 * The quality record is written under the DNA record's def (qual->def is
 * borrowed from dna->def for the duration of the writes and restored before
 * returning), so both outputs carry the same def.
 *
 * dna and qual must have the same length; this is enforced with assert().
 * dna->data is modified in place.
 */

void mProcessSingleRead(mDNA *dna, mQual *qual, FILE *sim_fasta, FILE *sim_quality) {
	const int  n_bases = dna->length;
	char      *seq;
	int       *scores;
	char      *saved_def;
	int        pos;

	assert(n_bases == qual->length);

	/* Mutate the sequence in place, position by position */
	seq    = dna->data;
	scores = qual->data;
	pos    = 0;
	while (pos < n_bases) {
		seq[pos] = mSimulateBase(seq[pos], scores[pos]);
		pos++;
	}

	/* Temporarily give the quality record the DNA record's def */
	saved_def = qual->def;
	qual->def = dna->def;

	mWriteFixedLengthDNA(sim_fasta, dna, 80); /* 80: presumably the output line width -- confirm in mDNA */
	mWriteQual(sim_quality, qual);

	/* Put the quality record's own def back */
	qual->def = saved_def;
}
