#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "mDNA.h"

#define COLUMNS 80

/* Releases the buffers owned by a DNA record: the header (def), the
 * sequence (data) and the numeric code array (code).
 *
 * The mDNA struct itself is NOT freed; the caller owns it.
 * A NULL dna is ignored.
 *
 * NOTE(review): records produced by mSubDNA borrow their data/code/def
 * pointers from the parent record, so they should presumably not be passed
 * here - confirm against callers.
 */
void mFreeDNA(mDNA *dna) {
	if (dna == NULL) {
		return;
	}
	mFree(dna->def);
	mFree(dna->data);
	mFree(dna->code);
	/* Clear the members so that a repeated call only hands NULL to mFree
	 * instead of freeing the same buffers a second time. */
	dna->def  = NULL;
	dna->data = NULL;
	dna->code = NULL;
}

/* Tears down a vector whose elements are heap-allocated mDNA records.
 * For every element the record's buffers are released with mFreeDNA and
 * then the record itself with mFree; finally the vector is handed to
 * mFreeVector.
 */
void mFreeDNAVector(mVector *vec) {
	int idx = 0;

	while (idx < vec->size) {
		/* buffers first, then the struct that pointed at them */
		mFreeDNA((mDNA*)vec->elem[idx]);
		mFree(vec->elem[idx]);
		idx++;
	}
	mFreeVector(vec);
}

/* Reads one FASTA record from stream into dna without computing the
 * numeric code or the GC fraction.
 *
 * Leading whitespace is skipped; the next character must be '>' or mDie is
 * called. The rest of that line becomes dna->def. Every alphabetic
 * character up to the next '>' or EOF is appended to dna->data; all other
 * characters (newlines, digits, ...) are dropped. A terminating '>' is
 * pushed back onto the stream so the next call starts at the next record.
 *
 * On return dna->def and dna->data are freshly allocated, NUL-terminated
 * buffers, dna->length is the number of sequence characters (terminator
 * excluded), dna->code is NULL and dna->gc is -FLT_MAX (the value the rest
 * of this file tests for to mean "not computed yet").
 *
 * returns END_OF_FASTA      - if there are no more fasta entries left,
 *         HAS_MORE_SEQUENCE - if there are more fasta entries left
 */
int mReadDNALite(FILE *stream, mDNA *dna) {
	int     next = END_OF_FASTA;
	char   *data;
	char   *def;
	int     c;
	coor_t  count = 0;
	coor_t  seq_limit = 512; /* initial mem alloc */
	coor_t  hdr_limit = 512; /* initial mem alloc */

	data = (char*) mCalloc(seq_limit, sizeof(char));
	def  = (char*) mCalloc(hdr_limit, sizeof(char));

	/* Skip blank space in front of the record. isspace(EOF) is false, so
	 * this also stops at end of input. */
	do {
		c = fgetc(stream);
	} while (isspace(c));

	if (c == '>') { /* Header line found */
		while ( (c = fgetc(stream)) != EOF && c != '\n') {
			/* Process header; growing after the store keeps one
			 * free slot for the terminator below. */
			def[count++] = (char) c;
			if (count == hdr_limit) {
				hdr_limit *= 2;
				def = (char*) mRealloc(def, hdr_limit*sizeof(char));
			}
		}
		def[count] = '\0';
	} else {
		/* NOTE(review): mDie presumably does not return - confirm. */
		mDie("Not a valid Fasta file!");
	}

	count = 0;
	for(;;){
		c = fgetc(stream);
		if (c == EOF || c == '>') {
			/* next record */
			break;
		}
		if (isalpha(c)) {
			/* Grow one slot early so there is always room for the
			 * NUL terminator written after the loop. */
			if (count + 1 >= seq_limit) {
				seq_limit *= 2;
				data = (char*) mRealloc(data, seq_limit*sizeof(char));
			}
			data[count] = (char) c;
			count++;
		}
	}
	/* Memory obtained from mRealloc is not zeroed, so terminate explicitly;
	 * previously only sequences shorter than the initial buffer happened
	 * to be NUL-terminated. */
	data[count] = '\0';

	if (c == '>') {
		/* next record */
		ungetc(c, stream);
		next = HAS_MORE_SEQUENCE;
	}

	dna->length = count;
	dna->def  = def;
	dna->data = data;
	dna->code = NULL;
	dna->gc   = -FLT_MAX;
	return next;
}

/* Reads one FASTA record with mReadDNALite and then fills in the numeric
 * code and GC fraction with mCalculateDNACode.
 *
 * The return value is passed through unchanged from mReadDNALite:
 *         END_OF_FASTA      - no more fasta entries left,
 *         HAS_MORE_SEQUENCE - more fasta entries left
 */
int mReadDNA(FILE *stream, mDNA *dna) {
	const int status = mReadDNALite(stream, dna);

	mCalculateDNACode(dna);
	return status;
}

/* Builds the numeric encoding of dna->data and the GC fraction.
 *
 * A newly allocated int array of dna->length entries is stored in
 * dna->code with A/a=0, C/c=1, G/g=2, T/t=3 and every other character
 * (including N/n) = 4. dna->gc is set to gc / (gc + at), counting only the
 * four unambiguous bases; when the sequence contains none of them the
 * fraction is set to 0 rather than dividing by zero.
 *
 * Any pointer previously held in dna->code is overwritten without being
 * freed.
 */
void mCalculateDNACode(mDNA *dna) {
	/* coor_t throughout: the index below is already coor_t, and narrowing
	 * the length or the counters to int would truncate long sequences. */
	coor_t  length = dna->length;
	char   *data   = dna->data;
	int    *code   = (int*)  mCalloc(length, sizeof(int));
	coor_t  at=0, gc=0;
	coor_t  i;

	for(i=0; i<length; i++){
		switch(data[i]) {
			case 'a':
			case 'A':
				code[i] = 0; at++;
				break;
			case 'c':
			case 'C':
				code[i] = 1; gc++;
				break;
			case 'g':
			case 'G':
				code[i] = 2; gc++;
				break;
			case 't':
			case 'T':
				code[i] = 3; at++;
				break;
			default: /* 'n', 'N' and anything else */
				code[i] = 4;
				break;
		}
	}

	dna->code = code;
	dna->gc   = (gc + at > 0) ? 1.0 * gc / (gc + at) : 0.0;
}

/* Reads FASTA records from stream until mReadDNA reports END_OF_FASTA and
 * appends each one, as a newly allocated mDNA, to multi with mPushVector.
 *
 * Returns multi->size after the last record has been pushed.
 *
 * The loop compares the status against END_OF_FASTA explicitly. The
 * previous form used the status itself as the loop condition, which only
 * works if neither status constant is 0; with a zero constant a record
 * would be read but never pushed (and leaked).
 */
int mReadMultipleDNA(FILE *stream, mVector *multi) {
	int status;

	do {
		mDNA *dna = (mDNA*) mCalloc(1, sizeof(mDNA));
		status = mReadDNA(stream, dna);
		mPushVector(multi, dna);
	} while (status != END_OF_FASTA);

	return multi->size;
}

/* Writes every record stored in multi to stream, in vector order, using
 * mWriteDNA for each element.
 */
void mWriteMultipleDNA(FILE *stream, mVector *multi) {
	int idx = 0;

	while (idx < multi->size) {
		mDNA *record = (mDNA*)multi->elem[idx];
		mWriteDNA(stream, record);
		idx++;
	}
}

/* Writes dna to stream as a FASTA record wrapped at the file's default
 * line width, COLUMNS.
 */
void mWriteDNA(FILE *stream, mDNA *dna) {
	const int width = COLUMNS;

	mWriteFixedLengthDNA(stream, dna, width);
}

/* Writes dna to stream as a FASTA record: a ">def" header line followed by
 * the sequence broken into lines of at most `columns` characters, each
 * terminated by a newline. An empty sequence produces the header only.
 *
 * columns - line width; a value <= 0 disables wrapping and puts the whole
 *           sequence on a single line (the original code took a modulo by
 *           this value, which is undefined for 0).
 *
 * Only dna->length characters of dna->data are written, so data does not
 * need to be NUL-terminated.
 */
void mWriteFixedLengthDNA(FILE *stream, mDNA *dna, int columns) {
	long        remaining = dna->length; /* long: do not narrow the length to int */
	const char *cursor    = dna->data;

	fprintf(stream, ">%s\n", dna->def);

	while (remaining > 0) {
		long width = remaining;
		if (columns > 0 && width > columns) {
			width = columns;
		}
		/* One write per output line instead of one fprintf per base. */
		fwrite(cursor, sizeof(char), (size_t) width, stream);
		fputc('\n', stream);
		cursor    += width;
		remaining -= width;
	}
}

/* Complements dna in place (no reversal).
 *
 * In dna->data, A<->T and C<->G are swapped with case preserved; every
 * other character is left untouched. If dna->code is present it is
 * complemented as well: 0<->3, 1<->2, and 4 stays 4.
 *
 * dna->gc is deliberately left alone: complementing turns each C into a G
 * and each G into a C (likewise A and T), so the number of G/C bases - and
 * therefore the GC fraction - is unchanged. The previous code replaced a
 * cached value with 1 - gc, which is wrong for anything but 0.5.
 */
void mComplementDNA(mDNA *dna) {
	long  i;
	long  length = dna->length; /* long: do not narrow the length to int */
	char *data   = dna->data;

	for (i=0; i<length; i++) {
		switch(data[i]) {
			case 'A': data[i] = 'T'; break;
			case 'a': data[i] = 't'; break;
			case 'C': data[i] = 'G'; break;
			case 'c': data[i] = 'g'; break;
			case 'G': data[i] = 'C'; break;
			case 'g': data[i] = 'c'; break;
			case 'T': data[i] = 'A'; break;
			case 't': data[i] = 'a'; break;
			default:  /* N and anything else: unchanged */
				break;
		}
	}

	if (dna->code != NULL) {
		int *code = dna->code;
		for (i=0; i<length; i++) {
			/* 3 - code maps 0<->3 and 1<->2 but gives -1 for code 4,
			 * and in C the % operator keeps the sign of the dividend,
			 * so (3 - 4) % 5 is -1, not 4. Adding 5 first keeps the
			 * dividend non-negative: (5 + 3 - 4) % 5 == 4. */
			code[i] = (5 + 3 - code[i])%5;
		}
	}
}

/* Reverses dna in place (no complementing): the characters of dna->data
 * and, when present, the entries of dna->code are mirrored around the
 * middle. dna->gc and dna->def are not touched.
 */
void mReverseDNA(mDNA *dna) {
	long  i;
	long  length = dna->length; /* long: do not narrow the length to int */
	char *data   = dna->data;

	for (i=0; i<length/2; i++) {
		char tmp = data[length-i-1];
		data[length-i-1] = data[i];
		data[i] = tmp;
	}
	if (dna->code != NULL) {
		int *code = dna->code;
		for (i=0; i<length/2; i++) {
			/* code[] holds ints, so swap through an int; the
			 * temporary used to be a char. */
			int tmp = code[length-i-1];
			code[length-i-1] = code[i];
			code[i] = tmp;
		}
	}
}

/* Converts dna, in place, to its reverse complement by applying
 * mReverseDNA followed by mComplementDNA.
 */
void mReverseComplementDNA(mDNA *dna)
{
	mReverseDNA(dna);     /* step 1: mirror the sequence  */
	mComplementDNA(dna);  /* step 2: complement each base */
}

/* Returns a newly allocated, NUL-terminated copy of the first dna->length
 * characters of dna->data. The buffer comes from mCalloc and belongs to
 * the caller.
 */
char* mDNA2Char(mDNA *dna) {
	char *copy = (char*) mCalloc(dna->length+1, sizeof(char));

	memcpy(copy, dna->data, dna->length*sizeof(char));
	copy[dna->length] = '\0';
	return copy;
}

/* Cuts dna->def down to its first whitespace-delimited word, in place.
 *
 * Every whitespace character in the header is overwritten with '\0', so
 * when read as a C string the header ends at the first whitespace. The
 * buffer is neither reallocated nor shortened.
 */
void mProcessDNADef(mDNA *dna) {
	char   *def    = dna->def;
	size_t  length = strlen(def);
	size_t  i;

	for (i=0; i<length; i++) {
		/* <ctype.h> functions require a value representable as
		 * unsigned char (or EOF); a plain char may be negative. */
		if (isspace((unsigned char) def[i])) {
			def[i] = '\0';
		}
	}
}

/* Creates a view onto a stretch of dna.
 *
 * offset - index of the first base of the view
 * length - requested number of bases; clamped so that the view ends at the
 *          end of dna
 *
 * Returns NULL when offset is at or beyond dna->length. Otherwise returns a
 * newly allocated mDNA whose data, code (if the parent has one) and def
 * members point INTO the parent's buffers - nothing is copied. The view's
 * gc is set to -FLT_MAX so that mGetGC computes it on demand.
 *
 * NOTE(review): because the buffers are shared, only the returned struct
 * itself should be released; releasing its members would free (or point
 * into the middle of) the parent's memory.
 */
mDNA* mSubDNA(mDNA *dna, coor_t offset, coor_t length) {
	mDNA *sub;

	if (offset >= dna->length) return NULL;

	sub         = (mDNA*) mCalloc(1, sizeof(mDNA));
	/* Use offset at its full width; casting it to int truncated offsets
	 * on long sequences. */
	sub->data   = dna->data + offset;
	if (dna->code != NULL)
		sub->code   = dna->code + offset;
	sub->def    = dna->def;

	/* offset < dna->length here, so the subtraction cannot underflow;
	 * comparing against it avoids overflow in offset + length. */
	sub->length = length;
	if (length > dna->length - offset) {
		sub->length = dna->length - offset;
	}

	/* Don't calculate GC percentage. A call to mGetGC will calculate it. */
	sub->gc = -FLT_MAX;
	return sub;
}

/* Returns the GC fraction of dna, computing and caching it on first use.
 *
 * If dna->gc already holds a value other than -FLT_MAX it is returned as
 * is. Otherwise the numeric code is built with mCalculateDNACode when
 * missing, codes 1 and 2 (C, G) are counted against codes 0 and 3 (A, T),
 * and gc / (gc + at) is stored in dna->gc and returned. Code 4 is ignored.
 * A sequence without any A/C/G/T yields 0 instead of dividing by zero.
 */
float mGetGC(mDNA *dna) {
	long  i, at=0, gc=0; /* long: int could overflow on long sequences */
	int  *code;

	if (dna->gc != -FLT_MAX) {
		return dna->gc;
	}
	/* Calculate GC percentage */
	if (dna->code == NULL) {
		mCalculateDNACode(dna);
	}
	code = dna->code;

	for (i=0; i<dna->length; i++) {
		switch(code[i]) {
			case 0: at++; break;
			case 1: gc++; break;
			case 2: gc++; break;
			case 3: at++; break;
			default: break; /* 4: ambiguous base, not counted */
		}
	}
	dna->gc = (gc + at > 0) ? 1.0 * gc/(gc+at) : 0.0;
	return dna->gc;
}

/* Writes dna to stream as a series of FASTA records of at most `chunk`
 * bases each.
 *
 * Each piece is named "<id>:<first>-<last>", where <id> is dna->def up to
 * its first space or tab and <first>/<last> are the 0-based, inclusive
 * coordinates of the piece within dna. A chunk <= 0 disables splitting:
 * the whole sequence is written as one piece.
 *
 * Fixes over the previous version:
 *  - the name buffer was sized from the number of digits in the chunk
 *    COUNT rather than in the coordinates, so short headers overflowed it;
 *    it is now sized for two full-width longs and written with snprintf;
 *  - log10(length/chunk) was evaluated even when the quotient was 0;
 *  - the views returned by mSubDNA were never released;
 *  - an uninitialised, unused counter was incremented.
 */
void mBreakAndWriteDNA(FILE *stream, mDNA *dna, coor_t chunk) {
	char   *def, *dup, *brk;
	size_t  def_size;
	coor_t  i;

	if (chunk <= 0) {
		chunk = dna->length;
	}

	/* Identifier = header up to the first blank. */
	dup  = (char*) mCalloc(strlen(dna->def) + 1, sizeof(char));
	strcpy(dup, dna->def);
	brk = strpbrk(dup, " \t");
	if (brk != NULL) {
		*brk = '\0';
	}

	/* 3*sizeof(long)+1 bounds the decimal width of a long including sign;
	 * the extra 3 cover ':', '-' and the terminating NUL. */
	def_size = strlen(dup) + 2*(3*sizeof(long) + 1) + 3;
	def      = (char*) mCalloc(def_size, sizeof(char));

	for (i=0; i<dna->length; i+=chunk) {
		mDNA *sub = mSubDNA(dna, i, chunk);
		if (sub == NULL) {
			break; /* cannot happen while i < dna->length; defensive */
		}
		snprintf(def, def_size, "%s:%ld-%ld", dup, (long) i, (long) (i+sub->length-1));
		sub->def  = def;
		mWriteDNA(stream, sub);
		/* The view borrows data/code from dna and def from us, so only
		 * the struct itself is released. */
		mFree(sub);
	}
	mFree(def);
	mFree(dup);
}

/* Splits dna into pieces of at most `chunk` bases and writes each piece as
 * a FASTA record to its own file, "<prefix>.<n>" with n = 0, 1, 2, ...
 *
 * Each piece is named "<id>:<first>-<last>", where <id> is dna->def up to
 * its first space or tab and <first>/<last> are the 0-based, inclusive
 * coordinates of the piece within dna. A chunk <= 0 disables splitting:
 * the whole sequence goes into "<prefix>.0".
 *
 * mDie is called when a file cannot be opened or when closing it reports
 * an error.
 *
 * Fixes over the previous version:
 *  - the record-name buffer was sized from the number of digits in the
 *    chunk COUNT rather than in the coordinates, so short headers
 *    overflowed it; both buffers are now sized for full-width integers and
 *    written with snprintf;
 *  - log10(length/chunk) was evaluated even when the quotient was 0;
 *  - the views returned by mSubDNA were never released;
 *  - the result of fclose was ignored, hiding write errors.
 */
void mSplitDNAToFiles(char *prefix, mDNA *dna, coor_t chunk) {
	FILE   *stream;
	char   *def, *dup, *brk, *filename;
	size_t  def_size, filename_size;
	coor_t  i;
	int     count;

	if (chunk <= 0) {
		chunk = dna->length;
	}

	/* Identifier = header up to the first blank. */
	dup  = (char*) mMalloc((strlen(dna->def) + 1)*sizeof(char));
	strcpy(dup, dna->def);
	brk = strpbrk(dup, " \t");
	if (brk != NULL) {
		*brk = '\0';
	}

	/* 3*sizeof(T)+1 bounds the decimal width of an integer type T
	 * including sign. Name: id ':' long '-' long NUL. File: prefix '.'
	 * int NUL. */
	def_size      = strlen(dup) + 2*(3*sizeof(long) + 1) + 3;
	filename_size = strlen(prefix) + (3*sizeof(int) + 1) + 2;
	def      = (char*) mMalloc(def_size*sizeof(char));
	filename = (char*) mMalloc(filename_size*sizeof(char));

	count = 0;
	for (i=0; i<dna->length; i+=chunk) {
		mDNA *sub = mSubDNA(dna, i, chunk);
		if (sub == NULL) {
			break; /* cannot happen while i < dna->length; defensive */
		}
		snprintf(def, def_size, "%s:%ld-%ld", dup, (long) i, (long) (i+sub->length-1));
		snprintf(filename, filename_size, "%s.%d", prefix, count);
		sub->def  = def;
		if ((stream = fopen(filename, "w")) == NULL) {
			mDie("Cannot open %s for writing", filename);
		}
		mWriteDNA(stream, sub);
		/* fclose flushes buffered output, so a failure here means the
		 * file may be incomplete. */
		if (fclose(stream) != 0) {
			mDie("Error while writing %s", filename);
		}
		/* The view borrows data/code from dna and def from us, so only
		 * the struct itself is released. */
		mFree(sub);
		count++;
	}
	mFree(def);
	mFree(dup);
	mFree(filename);
}
