use strict;
use File::Spec;

# Build identifier of this script; it is interpolated into the banner below.
my $buildno = '0.1.2013.03.14';

# Print the program banner (name, build number, web site and license notice)
# to STDERR. This happens on every run, before any argument is examined.
print(STDERR <<"_END");
clclassseq $buildno
=======================================================================

Official web site of this script is
http://www.fifthdimension.jp/products/claident/ .
To know script details, see above URL.

Copyright (C) 2011-2013  Akifumi S. Tanabe

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

_END

# Without any command line argument there is nothing to do: show the usage
# text (helpMessage terminates the script).
if (scalar(@ARGV) == 0) {
	&helpMessage();
}

# Script-wide settings shared by all of the following processing stages.
my @inputfiles;                       # input files, filled while parsing the options
my ($assamsoption, $uchimeoption);    # extra options handed to assams / uchime
my $numthreads = 1;                   # number of processes unless overridden
my $outputfolder = $ARGV[$#ARGV];     # the last argument names the output folder
my $devnull = File::Spec->devnull();  # sink for the output of external programs

# Never reuse an existing file or folder as the output folder.
&errorMessage(__LINE__, "\"$outputfolder\" already exists.") if (-e $outputfolder);

# get command line options
{
	# Name of the external program ('assams' or 'uchime') whose options are
	# currently being collected; empty while ordinary arguments are read.
	my $collecting = '';
	# Input file names accepted so far, used to reject duplicates.
	my %seen;
	# The last argument is the output folder and is therefore not examined.
	foreach my $argument (@ARGV[0 .. $#ARGV - 1]) {
		if ($argument eq 'end') {
			# "end" closes an "assams ..." or "uchime ..." option section
			$collecting = '';
		}
		elsif ($collecting eq 'assams') {
			$assamsoption .= " $argument";
		}
		elsif ($collecting eq 'uchime') {
			$uchimeoption .= " $argument";
		}
		elsif ($argument eq 'assams' || $argument eq 'uchime') {
			# everything up to the next "end" is passed through to that program
			$collecting = $argument;
		}
		elsif ($argument =~ /^-+(?:n|n(?:um)?threads?)=(\d+)$/i) {
			$numthreads = $1;
		}
		else {
			# anything else is an input file name or a glob pattern of them
			my @matched = glob($argument);
			if (!@matched) {
				&errorMessage(__LINE__, "Input file does not exist.");
			}
			foreach my $file (@matched) {
				if (exists($seen{$file})) {
					&errorMessage(__LINE__, "\"$file\" is doubly specified.");
				}
				$seen{$file} = 1;
				push(@inputfiles, $file);
			}
		}
	}
}

# The thread count of assams is supplied by this script itself, so it must not
# appear among the user-specified assams options.
if ($assamsoption =~ /-+(?:n|n(?:um)?threads?)=\d+/i) {
	&errorMessage(__LINE__, "The option for assams is invalid.");
}

# Append the default value of every assams option that was not given by the
# user. Each entry is a pair of "pattern recognizing the option" and "text to
# append when the pattern does not match"; the entries are applied in order.
foreach my $rule (
	[qr/-+min(?:imum)?n(?:um)?seqs?=\d+/i, ' --minnseq=5000'],
	[qr/-+(?:min(?:imum)?ident(?:ity|ities)?|m)=\d+/i, ' --minident=0.97'],
	[qr/-+min(?:imum)?(?:overlap|ovl)(?:length|len)=\d+/i, ' --minovllen=100'],
	[qr/-+max(?:imum)?n(?:um)?hits?=\d+/i, ' --maxnhits=0'],
	[qr/-+(?:strand|s)=(?:forward|plus|single|both|double)/i, ' --strand=both'],
	[qr/-+merge(?:mode)?=(?:draft|rapid|fast|middle|moderate|normal|accurate|slow|strict)/i, ' --mergemode=accurate'],
	[qr/-+(?:(?:aln|align|alignment)wiggle|a)=\d+/i, ' --alnwiggle=11'],
) {
	my ($pattern, $fallback) = @{$rule};
	if ($assamsoption !~ $pattern) {
		$assamsoption .= $fallback;
	}
}

# Input and output files of uchime are chosen by this script, so the user must
# not specify them.
if ($uchimeoption =~ /\-+input/ || $uchimeoption =~ /\-+uchimeout/) {
	&errorMessage(__LINE__, "The option for uchime is invalid.");
}
unless ($uchimeoption =~ /\-\-minh \d+/) {
	$uchimeoption .= ' --minh 0.1';
}

# At least one input file is required.
if (scalar(@inputfiles) == 0) {
	&errorMessage(__LINE__, "No input file was specified.");
}

# Report the options that will actually be used.
print(STDERR
	"Command line options for assams for assembly:$assamsoption\n"
	. "Command line options for uchime for chimera detection:$uchimeoption\n"
	. "\n"
);

# Commands used to call the bank utilities; set by the block below either to
# quoted full paths or to bare program names that are looked up via PATH.
my $bank2fasta;
my $listReadPlacedStatus;
my $dumpreads;
my $bankreport;
my $banktransact;
{
	# Find the installation directory: the ASSAMSHOME environment variable
	# takes precedence; otherwise the first "ASSAMSHOME=..." line of
	# ~/.assams or, failing that, /etc/assams/.assams is used.
	my $pathto;
	if ($ENV{'ASSAMSHOME'}) {
		$pathto = $ENV{'ASSAMSHOME'};
	}
	else {
		my ($config) = grep { -e $_ } ($ENV{'HOME'} . '/.assams', '/etc/assams/.assams');
		if ($config) {
			my $confighandle;
			open($confighandle, "< $config") or &errorMessage(__LINE__, "Cannot read \"$config\".");
			while (my $line = <$confighandle>) {
				if ($line =~ /^\s*ASSAMSHOME\s*=\s*(\S[^\r\n]*)/) {
					# keep the value without trailing white space
					($pathto = $1) =~ s/\s+$//;
					last;
				}
			}
			close($confighandle);
		}
	}
	my @tools = qw(bank2fasta listReadPlacedStatus dumpreads bank-report bank-transact);
	if ($pathto) {
		# strip surrounding double quotes and a trailing slash, then point to
		# the "bin" subdirectory, which has to exist
		$pathto =~ s/^"(.+)"$/$1/;
		$pathto =~ s/\/$//;
		$pathto .= '/bin';
		unless (-e $pathto) {
			&errorMessage(__LINE__, "Cannot find \"$pathto\".");
		}
		# quote the full paths for use in shell command lines
		@tools = map { "\"$pathto/$_\"" } @tools;
	}
	($bank2fasta, $listReadPlacedStatus, $dumpreads, $bankreport, $banktransact) = @tools;
}

# Create the output folder; the script stops if that is not possible.
print(STDERR "Making output folder...\n");
mkdir($outputfolder) or &errorMessage(__LINE__, 'Cannot make output directory.');
print(STDERR "done.\n\n");

# Run assams once per input file, one file after another; parallelism is
# delegated to assams through its --numthreads option.
# Afterwards @inputfiles holds the base names of the input files, which are
# the names used for the banks and all later intermediate files.
print(STDERR "Running assembly by assams at each file...\n");
{
	my @newinput;
	# Banks are named after the base name of each input file, so two inputs
	# from different directories that share a base name would end up in the
	# same bank and be processed twice later on. Reject such input before any
	# assembly is started.
	my %sourceof;
	foreach my $inputfile (@inputfiles) {
		my $filename = $inputfile;
		# strip the directory part (both / and \ are accepted as separators)
		$filename =~ s/^.+(?:\/|\\)//;
		if (exists($sourceof{$filename})) {
			&errorMessage(__LINE__, "\"$inputfile\" and \"$sourceof{$filename}\" have the same file name \"$filename\".");
		}
		$sourceof{$filename} = $inputfile;
		push(@newinput, $filename);
	}
	foreach my $filename (@newinput) {
		my $inputfile = $sourceof{$filename};
		print(STDERR "Processing $inputfile...\n");
		# system() returns non-zero if assams could not be run or failed
		if (system("assams$assamsoption --numthreads=$numthreads $inputfile $outputfolder/$filename.bnk 2> $devnull 1> $devnull")) {
			&errorMessage(__LINE__, "Cannot run \"assams$assamsoption --numthreads=$numthreads $inputfile $outputfolder/$filename.bnk\".");
		}
	}
	@inputfiles = @newinput;
}
print(STDERR "done.\n\n");

# Work inside the output folder from here on.
chdir($outputfolder) or &errorMessage(__LINE__, 'Cannot change working directory.');

# prepare for chimera detection
# For every bank a FASTA file "<name>.preuchime.fasta" is written in which
# each sequence name carries an abundance annotation "/ab=<number>/":
# contigs get the number of their member reads, single reads get 1.0.
# NOTE: %contigmembers is filled inside the forked children below, so each
# child only sees the members of its own bank and the copy of the parent
# process is never updated.
my %contigmembers;
print(STDERR "Preparing files for chimera detection...\n");
{
	# number of children assumed to be running
	my $child = 0;
	$| = 1;
	$? = 0;
	foreach my $inputfile (@inputfiles) {
		my $pid = fork();
		# fork() returns undef on failure. Without this check the parent would
		# take the child branch below, do the work itself and then exit.
		if (!defined($pid)) {
			&errorMessage(__LINE__, "Cannot fork a child process for \"$inputfile\".");
		}
		if ($pid) {
			# parent: limit the number of simultaneously running children
			$child ++;
			if ($child >= ($numthreads + 1)) {
				if (wait == -1) {
					$child = 0;
				} else {
					$child --;
				}
			}
			# $? holds the status of the child reaped by wait, if any
			if ($?) {
				&errorMessage(__LINE__, "The processes did not finished correctly.");
			}
			next;
		}
		else {
			# child: process one bank and exit
			if (-e "$inputfile.bnk/CTG.ifo") {
				# the bank contains contig data
				my $pipehandle;
				my $outputhandle;
				my $numsingletons;
				# get the members
				unless (open($pipehandle, "$listReadPlacedStatus $inputfile.bnk 2> $devnull |")) {
					&errorMessage(__LINE__, "Cannot run $listReadPlacedStatus.");
				}
				while (<$pipehandle>) {
					s/\r?\n?$//;
					if (my @row = split(/\t/)) {
						# 4 columns with status 'S' are counted as singletons;
						# 5 columns with status 'P' are reads placed in the
						# contig given in the 5th column
						if (scalar(@row) == 4 && $row[2] eq 'S') {
							$numsingletons ++;
						}
						elsif (scalar(@row) == 5 && $row[2] eq 'P') {
							push(@{$contigmembers{$inputfile}{"contig_$row[4]"}}, $row[1]);
						}
						else {
							&errorMessage(__LINE__, "Invalid assemble results.\nBank: $inputfile.bnk\nSequence: $row[1]\nContig: $row[4]\n");
						}
					}
				}
				close($pipehandle);
				# contigs to preuchime
				unless (open($outputhandle, "> $inputfile.preuchime.fasta")) {
					&errorMessage(__LINE__, "Cannot write \"$inputfile.preuchime.fasta\".");
				}
				my @tempcontig = keys(%{$contigmembers{$inputfile}});
				if (@tempcontig) {
					unless (open($pipehandle, "$bank2fasta -b $inputfile.bnk -iid 2> $devnull |")) {
						&errorMessage(__LINE__, "Cannot run $bank2fasta.");
					}
					while (<$pipehandle>) {
						if (/^>(\d+)/) {
							# replace the header by "contig_<number>" plus the
							# number of member reads as abundance
							my $contigname = "contig_$1";
							if ($contigmembers{$inputfile}{$contigname}) {
								my $ab = sprintf("%.1f", scalar(@{$contigmembers{$inputfile}{$contigname}}));
								print($outputhandle ">$contigname\/ab=$ab\/\n");
							}
							else {
								&errorMessage(__LINE__, "Invalid assemble results.\nBank: $inputfile.bnk\nContig: $contigname\n");
							}
						}
						else {
							print($outputhandle $_);
						}
					}
					close($pipehandle);
				}
				# singletons to preuchime
				if ($numsingletons) {
					if (system("$listReadPlacedStatus -S -I $inputfile.bnk > $inputfile.singletons.iid 2> $devnull")) {
						&errorMessage(__LINE__, "Cannot run $listReadPlacedStatus for \"$inputfile.bnk\".");
					}
					unless (open($pipehandle, "$dumpreads -r -e -I $inputfile.singletons.iid $inputfile.bnk 2> $devnull |")) {
						&errorMessage(__LINE__, "Cannot run $dumpreads for \"$inputfile.bnk\".");
					}
					while (<$pipehandle>) {
						if (/^>\S+/) {
							s/^>(\S+)/>$1\/ab=1.0\//;
						}
						print($outputhandle $_);
					}
					close($pipehandle);
					unlink("$inputfile.singletons.iid");
				}
				close($outputhandle);
			}
			else {
				# no contig data: every read is written with abundance 1.0
				my $pipehandle;
				my $outputhandle;
				unless (open($outputhandle, "> $inputfile.preuchime.fasta")) {
					&errorMessage(__LINE__, "Cannot write \"$inputfile.preuchime.fasta\".");
				}
				unless (open($pipehandle, "$dumpreads -r -e $inputfile.bnk 2> $devnull |")) {
					&errorMessage(__LINE__, "Cannot run \"$dumpreads -r -e $inputfile.bnk\".");
				}
				while (<$pipehandle>) {
					if (/^>\S+/) {
						s/^>(\S+)/>$1\/ab=1.0\//;
					}
					print($outputhandle $_);
				}
				close($pipehandle);
				close($outputhandle);
			}
			exit;
		}
	}
	# join
	while (wait != -1) {
		if ($?) {
			&errorMessage(__LINE__, "The processes did not finished correctly.");
		}
	}
}
print(STDERR "done.\n\n");

# run chimera detection in parallel
# One child process per input file runs uchime on "<name>.preuchime.fasta"
# and writes the result to "<name>.uchime.txt".
print(STDERR "Running chimera detection by uchime at each file...\n");
{
	# number of children assumed to be running
	my $child = 0;
	$| = 1;
	$? = 0;
	foreach my $inputfile (@inputfiles) {
		my $pid = fork();
		# fork() returns undef on failure. Without this check the parent would
		# take the child branch below, run uchime itself and then exit.
		if (!defined($pid)) {
			&errorMessage(__LINE__, "Cannot fork a child process for \"$inputfile\".");
		}
		if ($pid) {
			# parent: limit the number of simultaneously running children
			$child ++;
			if ($child >= $numthreads) {
				if (wait == -1) {
					$child = 0;
				} else {
					$child --;
				}
			}
			# $? holds the status of the child reaped by wait, if any
			if ($?) {
				&errorMessage(__LINE__, "The processes did not finished correctly.");
			}
			next;
		}
		else {
			# child: run uchime for one file and exit
			print(STDERR "Processing $inputfile...\n");
			if (-e "$inputfile.preuchime.fasta") {
				if (system("uchime --input $inputfile.preuchime.fasta --uchimeout $inputfile.uchime.txt$uchimeoption 2> $devnull 1> $devnull")) {
					&errorMessage(__LINE__, "Chimera detection by uchime was failed at \"$inputfile.preuchime.fasta\".");
				}
			}
			else {
				&errorMessage(__LINE__, "Unknown error.");
			}
			exit;
		}
	}
	# join
	while (wait != -1) {
		if ($?) {
			&errorMessage(__LINE__, "The processes did not finished correctly.");
		}
	}
}
print(STDERR "done.\n\n");

# delete chimeras
# One child process per input file builds a new bank without the sequences
# that uchime flagged as chimeric, dumps the old and the new bank as gzipped
# AFG files ("<name>.preuchime.afg.gz" / "<name>.postuchime.afg.gz") and then
# removes both banks.
print(STDERR "Deleting chimeras and generating AFG files...\n");
{
	# number of children assumed to be running
	my $child = 0;
	$| = 1;
	$? = 0;
	foreach my $inputfile (@inputfiles) {
		my $pid = fork();
		# fork() returns undef on failure. Without this check the parent would
		# take the child branch below, do the work itself and then exit.
		if (!defined($pid)) {
			&errorMessage(__LINE__, "Cannot fork a child process for \"$inputfile\".");
		}
		if ($pid) {
			# parent: limit the number of simultaneously running children
			$child ++;
			if ($child >= ($numthreads + 1)) {
				if (wait == -1) {
					$child = 0;
				} else {
					$child --;
				}
			}
			# $? holds the status of the child reaped by wait, if any
			if ($?) {
				&errorMessage(__LINE__, "The processes did not finished correctly.");
			}
			next;
		}
		else {
			# child: process one bank and exit
			if (-e "$inputfile.bnk" && -e "$inputfile.preuchime.fasta" && -e "$inputfile.uchime.txt") {
				unlink("$inputfile.preuchime.fasta");
				my $inputhandle;
				# store chimera list
				my %chimeras;
				# contigs flagged as chimeric; their member reads are added below
				my %chimericcontigs;
				if (-z "$inputfile.uchime.txt") {
					&errorMessage(__LINE__, "UCHIME output \"$inputfile.uchime.txt\" is invalid.");
				}
				unless (open($inputhandle, "< $inputfile.uchime.txt")) {
					&errorMessage(__LINE__, "Cannot open \"$inputfile.uchime.txt\".");
				}
				while (<$inputhandle>) {
					my @entry = split(/\t/, $_);
					# the last column starts with "Y" for chimeras; the 2nd
					# column is the sequence name with its "/ab=.../" suffix
					if ($entry[-1] =~ /^Y/ && $entry[1] =~ /^(.+)\/ab=\d+\.\d+\//) {
						$chimeras{$1} = 1;
						if ($entry[1] =~ /^(contig_\d+)\/ab=\d+\.\d+\//) {
							$chimericcontigs{$1} = 1;
						}
					}
				}
				close($inputhandle);
				# mark the member reads of chimeric contigs
				# The contig membership collected while the pre-uchime files
				# were written only exists in the child processes of that
				# step, so it is read from the bank again here.
				# NOTE(review): assumes the 2nd column of listReadPlacedStatus
				# is the same read name that appears as "eid:" in the bank
				# report - confirm against the AMOS documentation.
				if (%chimericcontigs) {
					my $memberhandle;
					unless (open($memberhandle, "$listReadPlacedStatus $inputfile.bnk 2> $devnull |")) {
						&errorMessage(__LINE__, "Cannot run $listReadPlacedStatus.");
					}
					while (<$memberhandle>) {
						s/\r?\n?$//;
						my @row = split(/\t/);
						# 5 columns with status 'P': read placed in the contig
						# given in the 5th column
						if (scalar(@row) == 5 && $row[2] eq 'P' && $chimericcontigs{"contig_$row[4]"}) {
							$chimeras{$row[1]} = 1;
						}
					}
					close($memberhandle);
				}
				# make new banks
				my $pipehandlein;
				unless (open($pipehandlein, "$bankreport -p -b $inputfile.bnk 2> $devnull |")) {
					&errorMessage(__LINE__, "Cannot run $bankreport -p.");
				}
				my $pipehandleout;
				unless (open($pipehandleout, "| $banktransact -c -f -z -b $inputfile.postuchime.bnk -m - 1> $devnull 2> $devnull")) {
					&errorMessage(__LINE__, "Cannot run $banktransact.");
				}
				# Copy the bank report record by record. $temp accumulates the
				# text of the current top-level record, $nesting tracks the
				# depth of "{XXX" ... "}" groups, and $contig / $read count the
				# CTG / RED groups seen inside the current record.
				my $temp;
				my $nesting = 0;
				my $contig;
				my $read;
				while (<$pipehandlein>) {
					if (/^\}/) {
						$nesting --;
					}
					elsif (/^\{([A-Z]+)/) {
						$nesting ++;
						if ($1 eq 'CTG') {
							$contig ++;
						}
						elsif ($1 eq 'RED') {
							$read ++
						}
					}
					$temp .= $_;
					if ($nesting == 0) {
						# keep records that are neither contigs nor reads, and
						# contigs / reads that are not in the chimera list
						if (!$contig && !$read || $contig && $temp =~ /^iid:(\d+)/m && !$chimeras{"contig_$1"} || $read && $temp =~ /^eid:(\S+)/m && !$chimeras{$1}) {
							print($pipehandleout $temp);
						}
						undef($temp);
						undef($contig);
						undef($read);
					}
				}
				close($pipehandleout);
				close($pipehandlein);
				# make preuchime afg
				if (system("$bankreport -p -b $inputfile.bnk 2> $devnull | gzip -9 -c > $inputfile.preuchime.afg.gz 2> $devnull")) {
					&errorMessage(__LINE__, "Cannot run $bankreport -p|gzip for \"$inputfile.bnk\".");
				}
				# delete bank
				while (glob("$inputfile.bnk/*")) {
					unlink;
				}
				rmdir("$inputfile.bnk");
				# make postuchime afg
				if (system("$bankreport -p -b $inputfile.postuchime.bnk 2> $devnull | gzip -9 -c > $inputfile.postuchime.afg.gz 2> $devnull")) {
					&errorMessage(__LINE__, "Cannot run $bankreport -p|gzip for \"$inputfile.postuchime.afg\".");
				}
				# delete bank
				while (glob("$inputfile.postuchime.bnk/*")) {
					unlink;
				}
				rmdir("$inputfile.postuchime.bnk");
			}
			else {
				&errorMessage(__LINE__, "Unknown error.");
			}
			exit;
		}
	}
	# join
	while (wait != -1) {
		if ($?) {
			&errorMessage(__LINE__, "The processes did not finished correctly.");
		}
	}
}
print(STDERR "done.\n\n");

# leave the output folder again
chdir('..');

# Report a fatal error on STDERR and terminate the process with exit status 1.
# Arguments:
#   1. line number at which the error was detected (callers pass __LINE__)
#   2. text describing the error
# This subroutine does not return.
sub errorMessage {
	my ($lineno, $message) = @_;
	print(STDERR
		"ERROR!: line $lineno\n$message\n"
		. "If you want to read help message, run this script without options.\n"
	);
	exit(1);
}

# Print the usage text to STDERR and terminate the process with exit status 0.
# Takes no arguments and does not return.
sub helpMessage {
	# The text contains nothing to interpolate, so it is quoted literally.
	print STDERR <<'_END';
Usage
=====
clclassseq options inputfiles outputfolder

Command line options
====================
assams options end
  Specify commandline options for assams.
(default: --minnseq=5000 --minident=0.97 --minovllen=100 --maxnhits=0
 --strand=both --mergemode=accurate --alnwiggle=11)

uchime options end
  Specify commandline options for uchime.
(default: --minh 0.1)

-n, --numthreads=INTEGER
  Specify the number of processes. (default: 1)

Acceptable input file formats
=============================
FASTA (+.qual)
_END
	exit(0);
}
