/*
 * Copyright (C) 2014 Daniel Taliun, Johann Gamper and Cristian Pattaro. All rights reserved.
 *
 * This file is part of S-MIG++.
 *
 * S-MIG++ is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * S-MIG++ is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with S-MIG++. If not, see <http://www.gnu.org/licenses/>.
 */

#include <time.h>

#include <cerrno>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <map>
#include <vector>

#include "db/include/Db.h"
#include "algorithms/include/ContourBuilder.h"
#include "algorithms/include/MIG.h"

using namespace std;

/*
 * Names of the command line options recognised by the program.
 * Both the pointers and the characters are constant, so an option name
 * cannot be re-pointed by accident at run time.
 */
static const char* const HELP = "--help";
static const char* const VERSION = "--version";
static const char* const HAPMAP2 = "--hapmap2";
static const char* const VCF = "--vcf";
static const char* const MAF = "--maf";
static const char* const REGION = "--region";
static const char* const LDRATIO = "--ld-ratio";
static const char* const CI = "--ci";
static const char* const SAMPLES = "--samples";
static const char* const PROBABILITY = "--probability";
static const char* const SEED = "--seed";
static const char* const OUT = "--out";

/* Suffix appended to the '--out' prefix to form the name of the blocks file. */
static const char* const BLOCKS_SUFFIX = ".blocks.gz";

/*
 * Releases the argument lists owned by the parsed command line options and
 * then empties the map itself.
 *
 * options - option name -> heap-allocated list of arguments; every list is
 *           deleted and the map is left empty.
 */
void clean_options(map<const char*, vector<const char*>*, bool(*)(const char*, const char*)>& options) {
	typedef map<const char*, vector<const char*>*, bool(*)(const char*, const char*)> options_type;

	options_type::iterator entry = options.begin();
	while (entry != options.end()) {
		vector<const char*>* option_arguments = entry->second;
		delete option_arguments;
		++entry;
	}

	options.clear();
}

/*
 * Writes the program version followed by the copyright and licence notice
 * to the standard output. Every line, including the empty ones, is
 * terminated (and flushed) with endl.
 */
void print_version() {
	static const char* const NOTICE[] = {
		"S-MIG++ 1.0.0",
		"",
		"Copyright (C) 2014 Daniel Taliun, Johann Gamper and Cristian Pattaro.",
		"All rights reserved.",
		"",
		"This is free software: you can redistribute it and/or modify",
		"it under the terms of the GNU General Public License as published by",
		"the Free Software Foundation, either version 3 of the License, or",
		"(at your option) any later version.",
		"",
		"This software is distributed in the hope that it will be useful,",
		"but WITHOUT ANY WARRANTY; without even the implied warranty of",
		"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the",
		"GNU General Public License for more details.",
		"",
		"You should have received a copy of the GNU General Public License",
		"along with S-MIG++. If not, see <http://www.gnu.org/licenses/>.",
		"",
		""
	};
	const unsigned int n_lines = sizeof(NOTICE) / sizeof(NOTICE[0]);

	for (unsigned int line = 0u; line < n_lines; ++line) {
		cout << NOTICE[line] << endl;
	}
}

/*
 * Writes the command line description to the standard output: a short
 * description of the algorithm, the usage lines, the mandatory and optional
 * arguments, and the layout of the output file.
 */
void print_help() {
	// Description of the algorithm.
	cout << "Description:" << endl << endl
		<< " The S-MIG++ algorithm is a sampling based, memory and runtime efficient" << endl
		<< " haplotype blocks recognition algorithm that uses Gabriel et al. (2002)" << endl
		<< " criteria to define blocks based on |D'| LD values between SNPs. It was" << endl
		<< " designed to handle large datasets with millions of SNPs and thousands of" << endl
		<< " samples." << endl
		<< endl << endl;

	// Usage lines, one per supported input format.
	cout << "Usage:" << endl
		<< endl << " (with HapMap format)" << endl
		<< " smigpp " << HAPMAP2 << " <legend file> <phase file> " << OUT << " <output prefix>" << endl
		<< endl << " (with VCF format)" << endl
		<< " smigpp " << VCF << " <file> " << OUT << " <output prefix>" << endl
		<< endl << endl;

	// Mandatory arguments.
	cout << "Mandatory arguments:" << endl << endl;

	cout << " " << HAPMAP2
		<< "\tTwo files in HapMap format: (1) the legend file with marker" << endl
		<< "\t\tpositions; (2) the file with phased genotypes." << endl
		<< endl << endl;

	cout << " " << VCF
		<< "\t\tVCF file with phased genotypes."
		<< endl << endl;

	cout << " " << OUT
		<< "\t\tPrefix for the output file with non-overlapping haplotype" << endl
		<< "\t\tblocks. The output file is compressed with GZIP and is named" << endl
		<< "\t\tas <output prefix>.blocks.gz." << endl
		<< endl << endl;

	// Optional arguments.
	cout << "Optional arguments:" << endl << endl;

	cout << " " << MAF
		<< "\t\tThreshold for the Minor Allele Frequency (MAF). Must be from" << endl
		<< "\t\tthe [0, 0.5) interval. The default value is 0, i.e. all" << endl
		<< "\t\tmonomorphic markers are filtered out." << endl
		<< endl << endl;

	cout << " " << REGION
		<< "\tThe start and end positions (in base-pairs) of the chromosomal" << endl
		<< "\t\tregion to be partitioned." << endl
		<< endl << endl;

	cout << " " << LDRATIO
		<< "\tThreshold for the ratio of high LD SNP pairs to all" << endl
		<< "\t\tinformative SNP pairs inside a haplotype block. The default" << endl
		<< "\t\tand recommended value is 0.95." << endl
		<< endl << endl;

	cout << " " << CI
		<< "\t\t|D'| confidence interval (CI) estimation method. Supported" << endl
		<< "\t\tmethods are WP = Wall and Pritchard (2003) method;" << endl
		<< "\t\tAV = approximate variance estimator by Zapata et al. (1997)." << endl
		<< "\t\tThe default and recommended method is WP." << endl
		<< endl << endl;

	cout << " " << SAMPLES
		<< "\tProportion of SNP pairs to sample. Must be from the (0; 1)" << endl
		<< "\t\tinterval. The recommended values are from 0.01 to 0.05." << endl
		<< "\t\tThe default value is 0.01." << endl
		<< endl << endl;

	cout << " " << PROBABILITY
		<< "\tThe probability of the correctly estimated upper limits for" << endl
		<< "\t\thaplotype blocks boundaries. Must be from the (0; 1) interval." << endl
		<< "\t\tIt is highly recommended to set this value to 0.95 or greater." << endl
		<< "\t\tThe default value is 0.99." << endl
		<< endl << endl;

	cout << " " << SEED
		<< "\t\tSeed for the random sampling. Must be a positive integer" << endl
		<< "\t\tnumber. The default value is generated from the current time." << endl
		<< endl << endl;

	cout << " " << HELP
		<< "\t\tDisplay this information." << endl
		<< endl << endl;

	cout << " " << VERSION
		<< "\tDisplay S-MIG++ version information." << endl
		<< endl << endl;

	// Layout of the output file.
	cout << "Output:" << endl << endl;

	cout << " The first meta-information lines in the output file start with '#' symbol." << endl
		<< " The output file consists of the following 12 columns:" << endl
		<< " BLOCK_NAME\t\tGenerated unique block name." << endl
		<< " FIRST_SNP\t\tName of the first SNP in block." << endl
		<< " LAST_SNP\t\tName of the last SNP in block." << endl
		<< " FIRST_SNP_ID\t\tIndex of the first SNP in block with respect to the" << endl
		<< " \t\t\tfiltered SNPs." << endl
		<< " LAST_SNP_ID\t\tIndex of the last SNP in block with respect to the" << endl
		<< " \t\t\tfiltered SNPs." << endl
		<< " START_BP\t\tThe base-pair position of the first SNP in block." << endl
		<< " END_BP\t\t\tThe base-pair position of the last SNP in block." << endl
		<< " N_SNPS\t\t\tNumber of SNPs in block." << endl
		<< " N_HAPS\t\t\tNumber of haplotypes in block." << endl
		<< " N_UNIQUE_HAPS\t\tNumber of unique haplotypes in block." << endl
		<< " N_COMMON_HAPS\t\tNumber of common (which appear more than once)" << endl
		<< " \t\t\thaplotypes in block." << endl
		<< " N_HAPS_DIVERSITY\tThe haplotype diversity in block (Patil et al., 2001)." << endl
		<< " \t\t\t1 - low diversity, 0 - high diversity." << endl
		<< endl;
}

int main(int args, char** argv) {
	const char* input_files_format = NULL;
	const char* input_phase_file = NULL;
	const char* input_map_file = NULL;
	const char* output_file = NULL;

	double maf_threshold = 0.0;
	unsigned long int start = 0u;
	unsigned long int end = numeric_limits<unsigned long int>::max();
	const char* ci_method = AlgorithmFactory::WP;
	double ld_ratio = 0.95;
	double samples = 0.01;
	double probability = 0.99;
	unsigned long int seed = 0ul;
	bool region = false;
	unsigned int n_segments = 0u;
	unsigned int window = 0u;
	char* endptr = NULL;

	char* blocks_file = NULL;

	unsigned int* estimated_contour = NULL;

	clock_t start_time = 0;
	double elapsed_time = 0.0;
	double sampling_time = 0.0;
	double migpp_time = 0.0;

	unsigned long int matrix_size = 0ul;
	unsigned long int migpp_computations = 0ul;

	map<const char*, vector<const char*>*, bool(*)(const char*, const char*)> options(auxiliary::bool_strcmp_ignore_case);
	map<const char*, vector<const char*>*, bool(*)(const char*, const char*)>::iterator options_it;
	vector<const char*>* arguments = NULL;

	for (int i = 1; i < args; i++) {
		if (auxiliary::strcmp_ignore_case(argv[i], "--", 2) == 0) {
			options_it = options.find(argv[i]);
			if (options_it != options.end()) {
				arguments = options_it->second;
			} else {
				arguments = new vector<const char*>();
				options.insert(pair<const char*, vector<const char*>*>(argv[i], arguments));
			}
		} else if (arguments != NULL) {
			arguments->push_back(argv[i]);
		} else {
			cout << "Vague command line argument '" << argv[i] << "' was specified."  << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
	}

	options_it = options.find(HELP);
	if (options_it != options.end()) {
		if (options_it->second->size() != 0) {
			cout << "Option '" << HELP << "' doesn't require any arguments." << endl;
			clean_options(options);
			return 0;
		}
		print_help();
		clean_options(options);
		return 0;
	}

	options_it = options.find(VERSION);
	if (options_it != options.end()) {
		if (options_it->second->size() != 0) {
			cout << "Option '" << VERSION << "' doesn't require any arguments." << endl;
			clean_options(options);
			return 0;
		}
		print_version();
		clean_options(options);
		return 0;
	}

	if ((options.count(VCF) > 0) && (options.count(HAPMAP2) > 0)) {
		cout << "Specify only one option: '"<< VCF << "' or '" << HAPMAP2 << "'." << endl;
		cout << "Specify '" << HELP << "' for the command line description." << endl;
		clean_options(options);
		return 0;
	}

	if ((options.count(VCF) == 0) && (options.count(HAPMAP2) == 0)) {
		cout << "Specify input file names (command line option '" << VCF << "' or '" << HAPMAP2 << "')." << endl;
		cout << "Specify '" << HELP << "' for the command line description." << endl;
		clean_options(options);
		return 0;
	}

	options_it = options.find(VCF);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify VCF file name after '"<< VCF << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		input_files_format = Db::VCF;
		input_phase_file = arguments->at(0);
	}

	options_it = options.find(HAPMAP2);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 2) {
			cout << "Specify legend file name and phase file name after '" << HAPMAP2 << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		input_files_format = Db::HAPMAP2;
		input_map_file = arguments->at(0);
		input_phase_file = arguments->at(1);
	}

	options_it = options.find(REGION);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 2) {
			cout << "Specify region start and end positions after '" << REGION << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		region = true;
		start = strtoul(arguments->at(0), NULL, 0);
		if (start == 0) {
			cout << "Invalid region start position after '" << REGION << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		end = strtoul(arguments->at(1), NULL, 0);
		if (end == 0) {
			cout << "Invalid region end position after '" << REGION << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		if (start >= end) {
			cout << "Region start position must be greater than region end position after '" << REGION << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
	}

	options_it = options.find(MAF);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify MAF threshold after '" << MAF << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		maf_threshold = strtod(arguments->at(0), &endptr);
		if (*endptr != '\0') {
			cout << "Invalid MAF threshold after '" << MAF << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
	}

	options_it = options.find(LDRATIO);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify ratio of LD to informative SNP pairs after '" << LDRATIO << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		ld_ratio = strtod(arguments->at(0), &endptr);
		if ((*endptr != '\0') || (ld_ratio <= 0.0) || (ld_ratio >= 1.0)) {
			cout << "Invalid ration of LD to informative SNP pairs after '" << LDRATIO << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
	}

	options_it = options.find(CI);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify |D'| CI computation method (" << AlgorithmFactory::WP << " or " << AlgorithmFactory::AV << ") after '" << CI << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		ci_method = arguments->at(0);
	}

	options_it = options.find(SAMPLES);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify proportion of SMP pairs to sample after '" << SAMPLES << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		samples = strtod(arguments->at(0), &endptr);
		if ((*endptr != '\0') || (samples <= 0.0) || (samples >= 1.0)) {
			cout << "Invalid proportion of SNP pairs to sample after '" << SAMPLES << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
	}

	options_it = options.find(PROBABILITY);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify probability of the correct estimations after '" << PROBABILITY << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		probability = strtod(arguments->at(0), &endptr);
		if ((*endptr != '\0') || (probability <= 0.0) || (probability >= 1.0)) {
			cout << "Invalid probability of the correct estimations after '" << PROBABILITY << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
	}

	options_it = options.find(OUT);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify output file name (exactly one) after '" << OUT << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		output_file = arguments->at(0);
	} else {
		cout << "Specify output file name after '" << OUT << "' option." << endl;
		cout << "Specify '" << HELP << "' for the command line description." << endl;
		clean_options(options);
		return 0;
	}

	options_it = options.find(SEED);
	if (options_it != options.end()) {
		arguments = options_it->second;
		if (arguments->size() != 1) {
			cout << "Specify random seed after '"<< SEED << "' option." << endl;
			cout << "Specify '" << HELP << "' for the command line description." << endl;
			clean_options(options);
			return 0;
		}
		seed = strtoul(options_it->second->at(0), NULL, 0);
	}

	try {
		blocks_file = (char*)malloc((strlen(output_file) + strlen(BLOCKS_SUFFIX) + 1u) * sizeof(char));
		if (blocks_file == NULL) {
			throw Exception(__FILE__, __LINE__, "Error in memory allocation.");
		}
		blocks_file[0u] = '\0';
		strcat(blocks_file, output_file);
		strcat(blocks_file, BLOCKS_SUFFIX);

//		BEGIN: LOAD DATA
		Db db;

		start_time = clock();
		if (!region) {
			if (strcmp(input_files_format, Db::VCF) == 0) {
				db.load_vcf(input_phase_file);
			} else if (strcmp(input_files_format, Db::HAPMAP2) == 0) {
				db.load_hapmap2(input_map_file, input_phase_file);
			}
		} else {
			if (strcmp(input_files_format, Db::VCF) == 0) {
				db.load_vcf(input_phase_file, start, end);
			} else if (strcmp(input_files_format, Db::HAPMAP2) == 0) {
				db.load_hapmap2(input_map_file, input_phase_file, start, end);
			}
		}
		elapsed_time = (clock() - start_time)/(double)CLOCKS_PER_SEC;

		cout << "===============================================================================" << endl;
		cout << "Input:" << endl;
		cout << " File: " << input_phase_file << endl;
		if (region) {
			cout << " Region: [" << start << ", " << end << "]" << endl;
		}
		cout << " Markers: " << db.get_n_markers() << endl;
		db.mask(maf_threshold);
		cout << " Non-monomorphic markers (MAF > " << maf_threshold << "): " << db.get_n_markers() << endl;
		cout << " Haplotypes: " << db.get_n_haplotypes() << endl;
		cout << " Memory used (Mb): " << db.get_memory_usage() << endl;
		cout << " Time used (sec): " << elapsed_time << endl;
		cout << endl;

		matrix_size = (((unsigned long int)db.get_n_markers()) * (((unsigned long int)db.get_n_markers()) - 1ul)) / 2.0;
//		END: LOAD DATA

//		BEGIN: SAMPLING
		if (n_segments == 0u) {
			n_segments = db.get_n_markers() * sqrt(samples);
			while (n_segments > 1u) {
				if (pow(db.get_n_markers() / n_segments, 2.0) * samples >= 1.0) {
					break;
				}
				--n_segments;
			}
		}

		ContourBuilder builder(db, ci_method, ProfileFactory::FS, probability, ld_ratio, samples, n_segments, true, seed);

		cout << "===============================================================================" << endl;
		cout << "Sampling:" << endl;
		cout << " LD ratio: " << ld_ratio << endl;
		cout << " D' CI method: " << ci_method << endl;
		cout << " Segments: " << n_segments << endl;
		cout << " Cells: " << builder.get_n_cells() << endl;
		cout << " Mean cell side length (markers): " << ((double)db.get_n_markers() / (double)builder.get_n_segments()) << endl;
		cout << " Samples (\%): " << samples << endl;
		cout << " Probability: " << probability << endl;
		cout << " Seed: " << builder.get_seed() << endl;
		cout << " Significance: " << builder.get_alpha() << endl;

		start_time = clock();
		builder.build();
		sampling_time = (clock() - start_time)/(double)CLOCKS_PER_SEC;;

		cout << " Performed computations: " << builder.get_n_computations() << " (" << ((long double)builder.get_n_computations() / (long double)matrix_size) << ")" << endl;
		cout << " Performed computations per cell: " << (builder.get_n_computations() / (double)builder.get_n_cells()) << endl;
		cout << " Estimated haplotype blocks contour size: " << builder.get_estimated_contour_area() << " (" << ((long double)builder.get_estimated_contour_area() / (long double)matrix_size) << ")" << endl;
		cout << " Memory used (Mb): " << builder.get_memory_usage() << endl;
		cout << " Time used (sec): " << sampling_time << endl;
		cout << endl;

		estimated_contour = builder.get_estimated_contour();
//		END: SAMPLING

//		BEGIN: COMPUTE HAPLOTYPE BLOCKS
		cout << "===============================================================================" << endl;
		cout << "Computing haplotype blocks:" << endl;

		MIG mig(db);

		if (window == 0u) {
			window = (unsigned int)((0.05 * (double)builder.get_estimated_contour_area()) / (double)db.get_n_markers());
		}

		start_time = clock();
		migpp_computations = mig.compute_candidate_blocks_migpp(estimated_contour, ci_method, window);
		migpp_time = (clock() - start_time)/(double)CLOCKS_PER_SEC;

		cout << " D' CI method: " << ci_method << endl;
		cout << " Window: " << window << endl;
		cout << " Performed computations: " << migpp_computations << " (" << ((long double)migpp_computations / (long double)matrix_size) << ")" << endl;
		cout << " Candidate haplotype blocks: " << mig.get_n_strong_pairs() << endl;
		cout << " Memory used (Mb): " << mig.get_max_memory_usage() << endl;
		cout << " Time used (sec): " <<  migpp_time << endl;
		cout << endl;
//		END: COMPUTE HAPLOTYPE BLOCKS

//		BEGIN: SELECTING NON-OVERLAPPING HAPLOTYPE BLOCKS
		cout << "===============================================================================" << endl;
		cout << "Selecting non-overlapping haplotype blocks:" << endl;

		start_time = clock();
		mig.sort_candidate_blocks();
		mig.select_final_blocks();
		elapsed_time = (clock() - start_time)/(double)CLOCKS_PER_SEC;

		cout << " Final haplotype blocks: " << mig.get_n_blocks() << endl;
		cout << " Time used (sec): " << elapsed_time << endl;
		cout << endl;
//		END: SELECTING NON-OVERLAPPING HAPLOTYPE BLOCKS

//		BEGIN: WRITING HAPLOTYPE BLOCKS
		cout << "===============================================================================" << endl;
		cout << "Writing haplotype blocks... " << endl;

		start_time = clock();
		mig.write_blocks(blocks_file, WriterFactory::GZIP, input_phase_file, input_map_file, maf_threshold, region, start, end, ci_method);
		elapsed_time = (clock() - start_time)/(double)CLOCKS_PER_SEC;

		cout << " Non-overlapping haplotype blocks: " << blocks_file << endl;
		cout << " Time used (sec): " << elapsed_time << endl;
		cout << endl;

//		END: WRITING HAPLOTYPE BLOCKS
		cout << "===============================================================================" << endl;
		cout << "Total time used (sec): " << setprecision(10) << (sampling_time + migpp_time) << endl;
		cout << endl;
	} catch (Exception &e) {
		cout << endl;
		cout << e.what() << endl;
	}

	if (blocks_file != NULL) {
		free(blocks_file);
		blocks_file = NULL;
	}

	clean_options(options);

	return 0;
}
