use strict;
use File::Spec;

# Build identifier interpolated into the startup banner below.
my $buildno = '0.1.2013.03.14';

# Print the banner (program name, build number, web site and license notice)
# to STDERR. This runs unconditionally, before any argument is examined.
print(STDERR <<"_END");
clmergeclass $buildno
=======================================================================

Official web site of this script is
http://www.fifthdimension.jp/products/claident/ .
To know script details, see above URL.

Copyright (C) 2011-2013  Akifumi S. Tanabe

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

_END

# no command line arguments at all: show the usage text instead of running
if (!@ARGV) {
	&helpMessage();
}

# settings shared by the rest of the script
my $devnull = File::Spec->devnull();
my $numthreads = 1;
my $assamsoption;
my @inputfiles;
# the last argument is taken as the output folder, which must not exist yet
my $outputfolder = $ARGV[-1];
&errorMessage(__LINE__, "\"$outputfolder\" already exists.") if (-e $outputfolder);

# get command line options
# Every argument except the last one (the output folder) is examined:
#   assams ... end          the words in between are appended to $assamsoption
#   -n=N, --numthreads=N    number of processes
#   anything else           an input file name or glob pattern
{
	my $assamsmode = 0;
	my %inputfiles;
	foreach my $i (0 .. $#ARGV - 1) {
		my $argument = $ARGV[$i];
		if ($argument eq 'end') {
			$assamsmode = 0;
		}
		elsif ($assamsmode) {
			$assamsoption .= " $argument";
		}
		elsif ($argument eq 'assams') {
			$assamsmode = 1;
		}
		elsif ($argument =~ /^-+(?:n|n(?:um)?threads?)=(\d+)$/i) {
			$numthreads = $1;
		}
		else {
			my @matched = glob($argument);
			if (!@matched) {
				&errorMessage(__LINE__, "Input file does not exist.");
			}
			foreach my $file (@matched) {
				# glob() hands back a pattern without wildcards unchanged even
				# when no such file exists, so every name is tested explicitly
				if (!-e $file) {
					&errorMessage(__LINE__, "\"$file\" does not exist.");
				}
				# the same file given twice is an error
				if (exists($inputfiles{$file})) {
					&errorMessage(__LINE__, "\"$file\" is doubly specified.");
				}
				$inputfiles{$file} = 1;
				push(@inputfiles, $file);
			}
		}
	}
}

# the thread count is handed to assams by this script, so it must not also
# appear among the user-supplied assams options
if ($assamsoption =~ /-+(?:n|n(?:um)?threads?)=\d+/i) {
	&errorMessage(__LINE__, "The option for assams is invalid.");
}

# append the default of every assams option the user did not specify;
# the pairs are checked in order, each against the string built so far
{
	my @assamsdefaults = (
		[qr/-+min(?:imum)?n(?:um)?seqs?=\d+/i, ' --minnseq=5000'],
		[qr/-+(?:min(?:imum)?ident(?:ity|ities)?|m)=\d+/i, ' --minident=0.97,0.97,0.7'],
		[qr/-+min(?:imum)?(?:overlap|ovl)(?:length|len)=\d+/i, ' --minovllen=100'],
		[qr/-+max(?:imum)?n(?:um)?hits?=\d+/i, ' --maxnhits=0'],
		[qr/-+(?:strand|s)=(?:forward|plus|single|both|double)/i, ' --strand=both'],
		[qr/-+merge(?:mode)?=(?:draft|rapid|fast|middle|moderate|normal|accurate|slow|strict)/i, ' --mergemode=accurate'],
		[qr/-+(?:(?:aln|align|alignment)wiggle|a)=\d+/i, ' --alnwiggle=11'],
	);
	foreach my $entry (@assamsdefaults) {
		my ($pattern, $default) = @{$entry};
		if ($assamsoption !~ $pattern) {
			$assamsoption .= $default;
		}
	}
}

# at least one input file is required
unless (@inputfiles) {
	&errorMessage(__LINE__, "No input file was specified.");
}

# report the options that will be passed to assams
print(STDERR "Command line options for assams:$assamsoption\n\n");

# Command names of the external tools used below. Each becomes either a
# quoted full path under "<ASSAMSHOME>/bin" or a bare name resolved via PATH.
my $bank2fasta;
my $listReadPlacedStatus;
my $dumpreads;
my $bankreport;
my $banktransact;
{
	# installation prefix: the ASSAMSHOME environment variable wins,
	# otherwise an "ASSAMSHOME=..." line of a configuration file is used
	my $pathto;
	if ($ENV{'ASSAMSHOME'}) {
		$pathto = $ENV{'ASSAMSHOME'};
	}
	else {
		# per-user configuration file first, then the system-wide one;
		# skip the per-user file when HOME is not set at all
		my $temp;
		if (defined($ENV{'HOME'}) && -e "$ENV{'HOME'}/.assams") {
			$temp = "$ENV{'HOME'}/.assams";
		}
		elsif (-e '/etc/assams/.assams') {
			$temp = '/etc/assams/.assams';
		}
		if ($temp) {
			my $filehandle;
			# three-argument open: the path is never parsed for mode characters
			unless (open($filehandle, '<', $temp)) {
				&errorMessage(__LINE__, "Cannot read \"$temp\".");
			}
			while (<$filehandle>) {
				# only the first ASSAMSHOME line is honoured
				if (/^\s*ASSAMSHOME\s*=\s*(\S[^\r\n]*)/) {
					$pathto = $1;
					$pathto =~ s/\s+$//;
					last;
				}
			}
			close($filehandle);
		}
	}
	if ($pathto) {
		# strip surrounding double quotes and one trailing slash
		$pathto =~ s/^"(.+)"$/$1/;
		$pathto =~ s/\/$//;
		$pathto .= '/bin';
		if (!-e $pathto) {
			&errorMessage(__LINE__, "Cannot find \"$pathto\".");
		}
		# quoted because the commands are later interpolated into shell strings
		$bank2fasta = "\"$pathto/bank2fasta\"";
		$listReadPlacedStatus = "\"$pathto/listReadPlacedStatus\"";
		$dumpreads = "\"$pathto/dumpreads\"";
		$bankreport = "\"$pathto/bank-report\"";
		$banktransact = "\"$pathto/bank-transact\"";
	}
	else {
		# no prefix known: rely on the command search path
		$bank2fasta = 'bank2fasta';
		$listReadPlacedStatus = 'listReadPlacedStatus';
		$dumpreads = 'dumpreads';
		$bankreport = 'bank-report';
		$banktransact = 'bank-transact';
	}
}

# create the output folder (its absence was verified at startup)
print(STDERR "Making output folder...\n");
mkdir($outputfolder) or &errorMessage(__LINE__, 'Cannot make output directory.');
print(STDERR "done.\n\n");

# convert every input .afg.gz file into a temporary AMOS bank
# ("temp.<index>.bnk" inside the output folder), one child process per file
print(STDERR "Generating AMOS banks from .afg.gz files...\n");
{
	my $child = 0;
	$| = 1;
	$? = 0;
	for (my $i = 0; $i < scalar(@inputfiles); $i ++) {
		my $pid = fork();
		# fork() returns undef on failure; without this check the parent would
		# fall into the child branch below, convert one file and then exit
		if (!defined($pid)) {
			&errorMessage(__LINE__, "Cannot fork a child process.");
		}
		if ($pid) {
			# parent: once more than $numthreads children were started,
			# wait for one of them before starting the next
			$child ++;
			if ($child >= ($numthreads + 1)) {
				if (wait() == -1) {
					$child = 0;
				} else {
					$child --;
				}
			}
			# $? holds the status of the child reaped last (0 if none yet)
			if ($?) {
				&errorMessage(__LINE__, "The processes did not finished correctly.");
			}
			next;
		}
		else {
			# child: decompress the file and load it into its own bank;
			# the paths are quoted so that names containing spaces survive the shell
			my $inputfile = $inputfiles[$i];
			my $bank = "$outputfolder/temp.$i.bnk";
			if (system("gzip -dc \"$inputfile\" 2> $devnull | $banktransact -c -f -z -b \"$bank\" -m - 2> $devnull 1> $devnull")) {
				&errorMessage(__LINE__, "Cannot run \"gzip -dc $inputfiles[$i] | $banktransact -c -f -z -b $outputfolder/temp.$i.bnk -m -\".");
			}
			exit;
		}
	}
	# join
	while (wait() != -1) {
		if ($?) {
			&errorMessage(__LINE__, "The processes did not finished correctly.");
		}
	}
}
print(STDERR "done.\n\n");

# enter the output folder; the paths used from here on are relative to it
chdir($outputfolder) or &errorMessage(__LINE__, 'Cannot change working directory.');

# merge all temporary banks into "all.bnk" with assams
print(STDERR "Merging all assemblies by assams...\n");
{
	# the wildcard is quoted so the shell passes it on to assams unexpanded
	my $command = "assams$assamsoption --numthreads=$numthreads \"temp.*.bnk\" all.bnk";
	if (system($command) != 0) {
		&errorMessage(__LINE__, "Cannot run assams to merge assemblies.");
	}
}
print(STDERR "done.\n\n");

# delete temporary banks (one child process per bank)
print(STDERR "Delete temporary AMOS banks...\n");
{
	my $child = 0;
	$| = 1;
	$? = 0;
	for (my $i = 0; $i < scalar(@inputfiles); $i ++) {
		my $pid = fork();
		# fork() returns undef on failure; without this check the parent would
		# fall into the child branch below, delete one bank and then exit
		if (!defined($pid)) {
			&errorMessage(__LINE__, "Cannot fork a child process.");
		}
		if ($pid) {
			# parent: once more than $numthreads children were started,
			# wait for one of them before starting the next
			$child ++;
			if ($child >= ($numthreads + 1)) {
				if (wait() == -1) {
					$child = 0;
				} else {
					$child --;
				}
			}
			# $? holds the status of the child reaped last (0 if none yet)
			if ($?) {
				&errorMessage(__LINE__, "The processes did not finished correctly.");
			}
			next;
		}
		else {
			# child: empty the bank directory, then remove the directory itself;
			# failures are not reported (best effort, as before)
			foreach my $bankfile (glob("temp.$i.bnk/*")) {
				unlink($bankfile);
			}
			rmdir("temp.$i.bnk");
			exit;
		}
	}
	# join
	while (wait() != -1) {
		if ($?) {
			&errorMessage(__LINE__, "The processes did not finished correctly.");
		}
	}
}
print(STDERR "done.\n\n");

# read assembly
# %contigmembers maps "contig_<id>" to the list of its member sequences,
# @singletons collects the sequences reported with status 'S'
my %contigmembers;
my @singletons;
{
	print(STDERR "Postprocessing...\n");
	# get the members from the tab-separated output of listReadPlacedStatus
	my $pipehandle;
	open($pipehandle, "$listReadPlacedStatus all.bnk 2> $devnull |")
		or &errorMessage(__LINE__, "Cannot run listReadPlacedStatus.");
	while (my $line = <$pipehandle>) {
		# drop the line terminator (LF or CRLF)
		$line =~ s/\r?\n?$//;
		my @row = split(/\t/, $line);
		# a line without any field carries nothing to classify
		next unless (@row);
		my $ncolumns = scalar(@row);
		if ($ncolumns == 4 && $row[2] eq 'S') {
			# four columns, status 'S': recorded as a singleton
			push(@singletons, $row[1]);
		}
		elsif ($ncolumns == 5 && $row[2] eq 'P') {
			# five columns, status 'P': the fifth column identifies the contig
			push(@{$contigmembers{"contig_$row[4]"}}, $row[1]);
		}
		else {
			&errorMessage(__LINE__, "Invalid assemble results.\nBank: all.bnk\nSequence: $row[1]\nContig: $row[4]\n");
		}
	}
	close($pipehandle);
	print(STDERR "done.\n\n");
}

# make consensus sequence file
print(STDERR "Extracting consensus sequences...\n");
# bank2fasta output starts consensus.fasta and consensus.fasta.qual
system("$bank2fasta -b all.bnk -eid -q consensus.fasta.qual > consensus.fasta 2> $devnull") == 0
	or &errorMessage(__LINE__, "Cannot run $bank2fasta for \"all.bnk\".");
# list written to singletons.iid selects the reads dumped in the next two steps
system("$listReadPlacedStatus -S -I all.bnk > singletons.iid 2> $devnull") == 0
	or &errorMessage(__LINE__, "Cannot run listReadPlacedStatus for \"all.bnk\".");
# dumpreads output for those reads is appended to consensus.fasta ...
system("$dumpreads -r -e -I singletons.iid all.bnk >> consensus.fasta 2> $devnull") == 0
	or &errorMessage(__LINE__, "Cannot run dumpreads for \"all.bnk\".");
# ... and, with -q added, to consensus.fasta.qual
system("$dumpreads -r -e -q -I singletons.iid all.bnk >> consensus.fasta.qual 2> $devnull") == 0
	or &errorMessage(__LINE__, "Cannot run dumpreads for \"all.bnk\".");
# the list was only an intermediate file
unlink("singletons.iid");
print(STDERR "done.\n\n");

# write the bank-report output for the merged bank to all.afg.gz, then remove the bank
print(STDERR "Generating AFG file...\n");
system("$bankreport -p -b all.bnk 2> $devnull | gzip -9 -c > all.afg.gz 2> $devnull") == 0
	or &errorMessage(__LINE__, "Cannot run $bankreport -p|gzip for \"all.bnk\".");
# empty the bank directory first so that rmdir can remove it
foreach my $bankfile (glob("all.bnk/*")) {
	unlink($bankfile);
}
rmdir("all.bnk");
print(STDERR "done.\n\n");

# Write contigmembers.txt: one tab-separated line per contig (contig name
# followed by its members), then one line per singleton with an empty first
# column.
print(STDERR "Generating contig member file...\n");
{
	my $outputhandle;
	# three-argument open: the file name is never parsed for mode characters
	unless (open($outputhandle, '>', 'contigmembers.txt')) {
		&errorMessage(__LINE__, "Cannot open \"contigmembers.txt\".");
	}
	# sorted so that repeated runs produce the lines in the same order
	foreach my $contigname (sort(keys(%contigmembers))) {
		my $line = join("\t", $contigname, @{$contigmembers{$contigname}}) . "\n";
		print($outputhandle $line);
	}
	foreach my $singleton (@singletons) {
		print($outputhandle "\t$singleton\n");
	}
	# buffered write errors (e.g. a full disk) only surface at close
	unless (close($outputhandle)) {
		&errorMessage(__LINE__, "Cannot write \"contigmembers.txt\".");
	}
}
print(STDERR "done.\n\n");

# step back out of the output folder
chdir('..');

# Report a fatal error on STDERR and terminate with exit status 1.
#   first argument:  line number to show (callers pass __LINE__)
#   second argument: message text
sub errorMessage {
	my ($lineno, $message) = @_;
	print(
		STDERR
		"ERROR!: line $lineno\n$message\n",
		"If you want to read help message, run this script without options.\n"
	);
	exit(1);
}

# Print the usage text to STDERR and terminate with exit status 0.
sub helpMessage {
	# the text contains nothing to interpolate, hence the single-quoted heredoc
	my $usage = <<'_END';
Usage
=====
clmergeclass options inputfiles outputfolder

Command line options
====================
assams options end
  Specify commandline options for assams.
(default: --minnseq=5000 --minident=0.97,0.97,0.7 --minovllen=100
 --maxnhits=0 --strand=both --mergemode=accurate --alnwiggle=11)

-n, --numthreads=INTEGER
  Specify the number of processes. (default: 1)

Acceptable input file formats
=============================
AFG.gz
_END
	print(STDERR $usage);
	exit(0);
}
