#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <argtable2.h>
#include "mDNA.h"

#define REVCOMP  (0)
#define BREAK    (1)
#define SEPARATE (2)
#define GC       (3)
#define CLIP_454 (4)
#define TRIM     (5)
#define LENGTH   (6)

/*
 * Write the stretch of `dna` starting at `start` and spanning `len` bases to
 * stdout as its own record named "<def>_part<index>".
 *
 * Returns 1 when a record was written, 0 when mSubDNA() returned NULL.
 */
static int scaf_write_part(mDNA *dna, int start, int len, int index) {
	mDNA *sub = mSubDNA(dna, start, len);
	const char *base;
	char *name;
	size_t name_size;

	if (sub == NULL) {
		return 0;
	}

	/* Size the name buffer from the actual definition line so that long
	 * FASTA headers can neither overflow nor be truncated.
	 * 16 extra bytes cover any decimal int; sizeof("_part") covers the NUL. */
	base      = (sub->def != NULL) ? sub->def : "";
	name_size = strlen(base) + sizeof("_part") + 16;
	name      = (char*) mMalloc(name_size);
	snprintf(name, name_size, "%s_part%d", base, index);

	sub->def = name;
	mWriteDNA(stdout, sub);

	mFree(name);
	/* NOTE(review): only the struct is released, mirroring how generic()
	 * releases an mSubDNA() result; whether the sub-record owns further
	 * buffers is not visible from this file -- confirm against mDNA.h. */
	mFree(sub);
	return 1;
}

/*
 * Break scaffolds into contigs at long runs of N.
 *
 * Reads every record from each of the `count` inputs in `files` ("-" means
 * stdin) and splits it wherever a run of N's is at least `threshold` bases
 * long. Leading N's are skipped; runs shorter than `threshold` stay inside
 * the emitted part. Parts are written to stdout as "<def>_part<k>", with k
 * counting from 0 within each input record.
 *
 * Returns 0.
 */
int scaf_break(int count, const char* files[], int threshold) {
	mDNA *dna = (mDNA*) mMalloc(sizeof(mDNA));
	FILE *stream;
	int i;
	int status;

	for (i=0; i<count; i++) {
		if (strcmp(files[i], "-") == 0) {
			stream = stdin;
		} else if ((stream = fopen(files[i], "r")) == NULL) {
			mDie("Cannot open %s for reading", files[i]);
		}
		while ((status=mReadDNA(stream, dna))) {
			int current = 0, length = (int)dna->length, counter = 0;
			int seq_start;
			int *code = dna->code;

			/* Skip N's at the beginning (code 4 is treated as N throughout) */
			while (current<length && code[current] == 4) {current++;}
			seq_start = current;

			/* From here on, seq_start always points at a non-N base */
			while (current<length) {
				int seq_end;
				int gap_stretch = 0;

				/* Consume the non-N stretch; it ends at seq_end (inclusive) */
				while (current<length && code[current] != 4) {current++;}
				seq_end = current-1;

				/* Measure the run of N's that follows, if any */
				while (current<length && code[current] == 4) {gap_stretch++;current++;}

				/* Emit when the gap is long enough, or when the record is
				 * exhausted (which flushes the final part). Otherwise the
				 * short gap stays inside the part being accumulated. */
				if (gap_stretch >= threshold || current == length) {
					if (scaf_write_part(dna, seq_start, seq_end - seq_start + 1, counter)) {
						counter++;
					}
					seq_start = current;
				}
			}
			/* Nothing is left to flush here: the last loop iteration always
			 * ends with current == length, which forces the emission above. */

			mFreeDNA(dna);
			if (status==END_OF_FASTA) break;
		}
		/* Leave stdin open so that it is not closed behind the caller's back */
		if (stream != stdin) {
			fclose(stream);
		}
	}
	mFree(dna);
	return 0;
}

/*
 * Split sequences into two streams by length.
 *
 * Reads every record from each of the `count` inputs in `files` ("-" means
 * stdin, as in the other modes). Records longer than `size` are written to
 * stdout; records of length `size` or less are written to stderr.
 *
 * Returns 0.
 */
int separate(int count, const char* files[], int size) {
	mDNA *dna = (mDNA*) mMalloc(sizeof(mDNA));
	FILE *stream;
	int i;
	int status;

	for (i=0; i<count; i++) {
		if (strcmp(files[i], "-") == 0) {
			stream = stdin;
		} else if ((stream = fopen(files[i], "r")) == NULL) {
			mDie("Cannot open %s for reading", files[i]);
		}
		while ((status = mReadDNALite(stream, dna))) {
			FILE *out = (dna->length > size) ? stdout : stderr;

			mWriteDNA(out, dna);
			mFreeDNA(dna);
			if (status == END_OF_FASTA) { /* Last entry */
				break;
			}
		}
		/* Leave stdin open so that it is not closed behind the caller's back */
		if (stream != stdin) {
			fclose(stream);
		}
	}
	mFree(dna);
	return 0;
}

/* Initial number of histogram bins; the histogram grows beyond this on demand */
#define LIMIT 5000

/*
 * Return a histogram that can hold index `needed`, preserving the counts in
 * `hist` and zero-filling the new bins. The old buffer is released and
 * `*capacity` is updated to the new bin count.
 */
static int *length_hist_grow(int *hist, size_t *capacity, size_t needed) {
	size_t new_cap = *capacity;
	size_t k;
	int *bigger;

	while (new_cap <= needed) {
		/* Refuse to double if the byte count would overflow size_t */
		if (new_cap > ((size_t)-1) / (2 * sizeof(int))) {
			mDie("Sequence too long for the length histogram");
		}
		new_cap *= 2;
	}

	bigger = (int*) mMalloc(new_cap * sizeof(int));
	for (k = 0; k < *capacity; k++) bigger[k] = hist[k];
	for (; k < new_cap; k++) bigger[k] = 0;

	mFree(hist);
	*capacity = new_cap;
	return bigger;
}

/*
 * Print the length distribution of all sequences in the given inputs.
 *
 * Reads every record from each of the `file_count` inputs in `files` ("-"
 * means stdin) and prints one line per observed length to stdout:
 * "<length>\t<number of sequences>\t<fraction of all sequences>".
 * Lengths are not capped: the histogram starts with LIMIT bins and is
 * enlarged whenever a longer sequence is seen.
 *
 * Returns 0.
 */
int length_dist(int file_count, const char* files[]) {
	mDNA *dna = (mDNA*) mMalloc(sizeof(mDNA));
	FILE *stream;
	size_t capacity = LIMIT;
	int *lengths = (int*) mMalloc(capacity * sizeof(int));
	size_t bin;
	int i;
	int count = 0;
	int status;

	for (bin=0; bin<capacity; bin++) lengths[bin] = 0;

	for (i=0; i<file_count; i++) {
		if (strcmp(files[i], "-") == 0) {
			stream = stdin;
		} else if ((stream = fopen(files[i], "r")) == NULL) {
			mDie("Cannot open %s for reading", files[i]);
		}
		while((status = mReadDNALite(stream, dna))) {
			long seq_len = (long) dna->length;

			if (seq_len < 0) {
				mDie("Invalid sequence length %ld", seq_len);
			}
			bin = (size_t) seq_len;
			/* Indexing past the end of the histogram used to corrupt memory */
			if (bin >= capacity) {
				lengths = length_hist_grow(lengths, &capacity, bin);
			}
			lengths[bin]++;
			count++;

			/* Release the record before the end-of-input check so that the
			 * last entry is released as well */
			mFreeDNA(dna);
			if (status == END_OF_FASTA) { /* Last entry */
				break;
			}
		}
		/* Leave stdin open so that it is not closed behind the caller's back */
		if (stream != stdin) {
			fclose(stream);
		}
	}

	/* count cannot be 0 when a bin is non-zero, so the division is safe */
	for (bin=0; bin<capacity; bin++) {
		if (lengths[bin] > 0) {
			printf("%lu\t%d\t%.6f\n", (unsigned long) bin, lengths[bin], 1.0*lengths[bin]/count);
		}
	}

	mFree(lengths);
	mFree(dna);
	return 0;
}

/*
 * Apply one per-record operation to every sequence in the given inputs.
 *
 * mode  - one of REVCOMP, TRIM, BREAK, LENGTH, GC, CLIP_454; any other value
 *         reads the records but produces no output
 * count - number of entries in `files`
 * files - input paths; "-" means stdin
 * size  - number of bases to keep for TRIM, value handed to
 *         mBreakAndWriteDNA() for BREAK; ignored by the other modes
 *
 * All output goes to stdout. Records for which TRIM or CLIP_454 cannot
 * produce a sub-sequence are skipped with a warning on stderr.
 *
 * Returns 0.
 */
int generic(int mode, int count, const char* files[], int size) {
	mDNA *dna = (mDNA*) mMalloc(sizeof(mDNA));
	mDNA *sub;
	FILE *stream;
	int i;
	int status;

	for (i=0; i<count; i++) {
		if (strcmp(files[i], "-") == 0) {
			stream = stdin;
		} else if ((stream = fopen(files[i], "r")) == NULL) {
			mDie("Cannot open %s for reading", files[i]);
		}
		while((status = mReadDNALite(stream, dna))) {
			int wants_sub = 0; /* set by the modes that emit a sub-sequence */

			sub = NULL;
			switch(mode) {
				case REVCOMP:
					mReverseComplementDNA(dna);
					mWriteDNA(stdout, dna);
					break;
				case TRIM: {
					/* Never ask for more bases than the record holds, so a
					 * short sequence is passed through whole */
					int keep = size;
					if ((long) keep > (long) dna->length) {
						keep = (int) dna->length;
					}
					wants_sub = 1;
					sub = mSubDNA(dna, 0, keep);
					break;
				}
				case BREAK:
					mBreakAndWriteDNA(stdout, dna, size);
					break;
				case LENGTH:
					fprintf(stdout, "%s\t%ld\n", dna->def, dna->length);
					break;
				case GC:
					fprintf(stdout, "%s\t%.4f\n", dna->def, mGetGC(dna));
					break;
				case CLIP_454:
					wants_sub = 1;
					/* A read shorter than 4 bases would yield a negative
					 * length; leave sub NULL so the record is skipped */
					if (dna->length >= 4) {
						sub = mSubDNA(dna, 4, dna->length - 4);
					}
					break;
				default:
					/* Unknown mode: consume the record without output */
					break;
			}

			if (sub != NULL) {
				mWriteDNA(stdout, sub);
				/* NOTE(review): only the struct is released, as the original
				 * did for the last sub-record; whether mSubDNA() results own
				 * further buffers is not visible here -- confirm in mDNA.h. */
				mFree(sub);
			} else if (wants_sub) {
				fprintf(stderr, "Warning: no sequence left for %s; skipping\n", dna->def);
			}

			mFreeDNA(dna);
			if (status == END_OF_FASTA) { /* Last entry */
				break;
			}
		}
		/* Leave stdin open so that it is not closed behind the caller's back */
		if (stream != stdin) {
			fclose(stream);
		}
	}
	mFree(dna);
	return 0;
}

/*
 * Print the usage line, the option glossary of `argtable` and a description
 * of every processing mode to stdout, then call mQuit("").
 *
 * NOTE(review): mQuit() presumably terminates the program; callers in this
 * file still exit explicitly afterwards. Returns 0 if control comes back.
 */
int print_help(void **argtable) {
	/* {mode, description}; rows with a blank mode continue the previous entry */
	static const char *const mode_help[][2] = {
		{"",          ""},
		{"revcomp",   "reverse complement each sequence"},
		{"trim",      "trim each sequence to the first <size> bases"},
		{"break",     "break sequences into fragments of size <size>"},
		/* scafbreak splits on runs of at least <size> N's (>= comparison) */
		{"scafbreak", "break scaffolds at stretches of N's of length <size> or more"},
		{"separate",  "separate sequences into two files"},
		{" ",         "  sequences longer than <size> go to STDOUT"},
		{" ",         "  sequences shorter than or equal to <size> go to STDERR"},
		{"clip454",   "clip first 4 bases from the first cycle in every read"},
		{"gc",        "report GC content of each sequence"},
		{"length",    "report length of each sequence in all files"},
		{"dist",      "report length distribution of sequences in all files"}
	};
	size_t row;

	fprintf(stdout, "Usage:\nprocessFasta");
	arg_print_syntax(stdout, argtable, "\n");
	arg_print_glossary(stdout, argtable, "  %-25s %s\n");
	fprintf(stdout, "\nProcessing modes:\n");
	for (row = 0; row < sizeof(mode_help) / sizeof(mode_help[0]); row++) {
		fprintf(stdout, "%10s    %s\n", mode_help[row][0], mode_help[row][1]);
	}
	mQuit("");
	return 0;
}

int main(int argc, char* argv[]) {
	struct arg_str  *mode;
	struct arg_file *files;
	struct arg_int  *size;
	struct arg_end  *end;
	int              nerrors;
	int              threshold;
	void           **argtable;

	mode        = arg_str1(NULL, "mode", "MODE",       "revcomp|trim|break|scafbreak|separate|clip454|gc|len|dist");
	files       = arg_filen(NULL, NULL,  "FILE", 0, argc+2, NULL);
	size        = arg_int0("s",  "size", "SIZE", "size of output fragments for \"split\" (or) threshold for separating in \"separate\"");
	end         = arg_end(6); /* this needs to be even, otherwise each element in end->parent[] crosses an 8-byte boundary */
	argtable    = (void**) mMalloc(4*sizeof(void*));
	argtable[0] = mode;
	argtable[1] = size;
	argtable[2] = files;
	argtable[3] = end;

	size->ival[0] = -1;

	if (arg_nullcheck(argtable) != 0) {
		mDie("insufficient memory");
	}

	nerrors = arg_parse(argc, argv, argtable);
	if (nerrors > 0) {
		arg_print_errors(stderr, end, "processFasta");
		print_help(argtable);
	}

	threshold = size->ival[0];

	if (strcmp(mode->sval[0], "revcomp") == 0) {
		generic(REVCOMP, files->count, files->filename, 0);
	} else if (strcmp(mode->sval[0], "trim") == 0) {
		generic(TRIM, files->count, files->filename, threshold);
	} else if (strcmp(mode->sval[0], "gc") == 0) {
		generic(GC, files->count, files->filename, 0);
	} else if (strcmp(mode->sval[0], "clip454") == 0) {
		generic(CLIP_454, files->count, files->filename, 0);
	} else if (strcmp(mode->sval[0], "length") == 0) {
		generic(LENGTH, files->count, files->filename, 0);
	} else if (strcmp(mode->sval[0], "break") == 0) {
		if (threshold == -1) {
			print_help(argtable);
			exit(-1);
		}
		generic(BREAK, files->count, files->filename, threshold);
	} else if (strcmp(mode->sval[0], "scafbreak") == 0) {
		if (threshold == -1) {
			print_help(argtable);
			exit(-1);
		}
		scaf_break(files->count, files->filename, threshold);
	} else if (strcmp(mode->sval[0], "separate") == 0) {
		if (threshold == -1) {
			print_help(argtable);
			exit(-1);
		}
		separate(files->count, files->filename, threshold);
	} else if (strcmp(mode->sval[0], "dist") == 0) {
		length_dist(files->count, files->filename);
	}
	arg_freetable(argtable, 3);
	mFree(argtable);
	return 0;
}
