use strict;

# build number shown in the start-up banner
my $buildno = '0.1.2014.02.06';

# write the program name, build number and license notice to standard error
print STDERR <<"_BANNER";
clsumclass $buildno
=======================================================================

Official web site of this script is
http://www.fifthdimension.jp/products/claident/ .
To know script details, see above URL.

Copyright (C) 2011-2014  Akifumi S. Tanabe

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

_BANNER

# display usage if command line options were not specified
unless (@ARGV) {
	helpMessage();
}
# the last two arguments are the input file and the output file, so at
# least two arguments are required
if (scalar(@ARGV) < 2) {
	errorMessage(__LINE__, "Both input file and output file must be specified.");
}

# initialize variables
# the output file must not exist yet and the input file must exist
my $outputfile = $ARGV[-1];
if (-e $outputfile) {
	errorMessage(__LINE__, "\"$outputfile\" already exists.");
}
my $inputfile = $ARGV[-2];
if (!-e $inputfile) {
	errorMessage(__LINE__, "\"$inputfile\" does not exist.");
}

# read command line options
my $minnseqcontig = 0;
my $minnseqsample = 0;
my $minntotalseqcontig = 0;
my $minntotalseqsample = 0;
my $minpseqcontig = 0;
my $minpseqsample = 0;
my $runname;
my $old;
my $outformat = 'Column';
# The ratio thresholds are documented as DECIMAL and are compared with
# count / total, so fractional notation (e.g. 0.05 or .05) must be accepted
# in addition to plain integers.
my $decimalpattern = qr/\d+(?:\.\d+)?|\.\d+/;
# every argument except the last two is an option
# numeric values are stored as numbers ("+ 0") so that values such as
# "0.0" or "00" are false when tested for truth
foreach my $option (@ARGV[0 .. $#ARGV - 2]) {
	if ($option =~ /^-+min(?:imum)?n(?:um)?seq(?:uence)?s?con(?:tig)?=(\d+)$/i) {
		$minnseqcontig = $1 + 0;
	}
	elsif ($option =~ /^-+min(?:imum)?n(?:um)?seq(?:uence)?s?sam(?:ple)?=(\d+)$/i) {
		$minnseqsample = $1 + 0;
	}
	elsif ($option =~ /^-+min(?:imum)?n(?:um)?totalseq(?:uence)?s?con(?:tig)?=(\d+)$/i) {
		$minntotalseqcontig = $1 + 0;
	}
	elsif ($option =~ /^-+min(?:imum)?n(?:um)?totalseq(?:uence)?s?sam(?:ple)?=(\d+)$/i) {
		$minntotalseqsample = $1 + 0;
	}
	elsif ($option =~ /^-+min(?:imum)?(?:r|rate|p|percentage)seq(?:uence)?s?con(?:tig)?=($decimalpattern)$/i) {
		$minpseqcontig = $1 + 0;
	}
	elsif ($option =~ /^-+min(?:imum)?(?:r|rate|p|percentage)seq(?:uence)?s?sam(?:ple)?=($decimalpattern)$/i) {
		$minpseqsample = $1 + 0;
	}
	elsif ($option =~ /^-+runname=(.+)$/i) {
		$runname = $1;
	}
	elsif ($option =~ /^-+old$/i) {
		$old = 1;
	}
	elsif ($option =~ /^-+(?:o|output)=(.+)$/i) {
		# copy the capture before running further matches
		my $format = $1;
		if ($format =~ /^Matrix$/i) {
			$outformat = 'Matrix';
		}
		elsif ($format =~ /^Column$/i) {
			$outformat = 'Column';
		}
		else {
			errorMessage(__LINE__, "\"$option\" is unknown option.");
		}
	}
	else {
		errorMessage(__LINE__, "\"$option\" is unknown option.");
	}
}

# read input file
# %table maps sample name => contig name => number of member sequences.
# @contignames lists every contig name once, in order of first appearance.
my %table;
my @contignames;
{
	# Build the sample name ("run__primer" or "run__tag__primer") from a
	# sequence name. The name is split on "_" when --old was given and on
	# "__" otherwise; the first field is dropped and the run name is
	# replaced by --runname if that was given. Returns nothing when the
	# name does not have three or four fields.
	my $samplenameof = sub {
		my $seqname = shift(@_);
		my @field = $old ? split(/_/, $seqname) : split(/__/, $seqname);
		if (scalar(@field) != 3 && scalar(@field) != 4) {
			return;
		}
		shift(@field);
		# "defined" rather than truth so that a run name of "0" is honoured
		if (defined($runname)) {
			$field[0] = $runname;
		}
		return join('__', @field);
	};
	# three-argument open so that the file name is never parsed for a mode
	my $inputhandle;
	unless (open($inputhandle, '<', $inputfile)) {
		errorMessage(__LINE__, "Cannot open \"$inputfile\".");
	}
	my %seencontig;
	while (my $line = <$inputhandle>) {
		$line =~ s/\r?\n?$//;
		my @row = split(/\t/, $line);
		# skip empty lines
		next unless (@row);
		my $contigname;
		my @contigmembers;
		if (scalar(@row) > 2) {
			# first column is the contig name, the rest are its members
			($contigname, @contigmembers) = @row;
		}
		elsif (scalar(@row) == 2) {
			# a two-column row is recorded under the name in its second
			# column, which is also its only member
			$contigname = $row[1];
			@contigmembers = ($row[1]);
		}
		else {
			errorMessage(__LINE__, "Invalid assemble results.\nInput file: $inputfile\nContig: $row[0]\n");
		}
		# record each contig name only once so that a name repeated in the
		# input does not produce duplicate columns or double-counted totals
		unless ($seencontig{$contigname} ++) {
			push(@contignames, $contigname);
		}
		foreach my $contigmember (@contigmembers) {
			my $samplename = $samplenameof->($contigmember);
			unless (defined($samplename)) {
				errorMessage(__LINE__, "\"$contigmember\" is invalid name.");
			}
			$table{$samplename}{$contigname} ++;
		}
	}
	close($inputhandle);
}
my @samplenames = sort(keys(%table));

# select columns and rows
# The selection is repeated until a pass deletes nothing, because the
# totals are recomputed from the samples and contigs that are still kept.
if ($minnseqcontig || $minnseqsample || $minntotalseqcontig || $minntotalseqsample || $minpseqcontig || $minpseqsample) {
	# true if any per-cell threshold (count or ratio) was requested
	my $usecellfilter = ($minnseqcontig || $minnseqsample || $minpseqcontig || $minpseqsample);
	my $changed = 1;
	while ($changed) {
		# total number of sequences of each kept contig and each kept sample
		my %contigsum;
		my %samplesum;
		for my $sample (@samplenames) {
			for my $contig (@contignames) {
				my $nseq = $table{$sample}{$contig};
				$contigsum{$contig} += $nseq;
				$samplesum{$sample} += $nseq;
			}
		}
		my %keepcontig;
		my %keepsample;
		if ($usecellfilter) {
			# keep a contig (sample) if at least one of its cells reaches both
			# the count threshold and the ratio threshold for contigs (samples)
			for my $sample (@samplenames) {
				for my $contig (@contignames) {
					my $nseq = $table{$sample}{$contig};
					if ($contigsum{$contig} && $nseq >= $minnseqcontig && $nseq / $contigsum{$contig} >= $minpseqcontig) {
						$keepcontig{$contig} = 1;
					}
					if ($samplesum{$sample} && $nseq >= $minnseqsample && $nseq / $samplesum{$sample} >= $minpseqsample) {
						$keepsample{$sample} = 1;
					}
				}
			}
		}
		else {
			# no per-cell threshold: start from all samples and all contigs
			@keepsample{@samplenames} = (1) x @samplenames;
			@keepcontig{@contignames} = (1) x @contignames;
		}
		# drop contigs whose total is below $minntotalseqcontig
		if ($minntotalseqcontig) {
			for my $contig (keys(%keepcontig)) {
				delete($keepcontig{$contig}) if ($contigsum{$contig} < $minntotalseqcontig);
			}
		}
		# drop samples whose total is below $minntotalseqsample
		if ($minntotalseqsample) {
			for my $sample (keys(%keepsample)) {
				delete($keepsample{$sample}) if ($samplesum{$sample} < $minntotalseqsample);
			}
		}
		# remove everything that was not kept; any removal forces another pass
		$changed = 0;
		for my $sample (@samplenames) {
			if ($keepsample{$sample}) {
				for my $contig (@contignames) {
					next if ($keepcontig{$contig});
					delete($table{$sample}{$contig});
					$changed = 1;
				}
			}
			else {
				delete($table{$sample});
				$changed = 1;
			}
		}
		@contignames = keys(%keepcontig);
		@samplenames = keys(%keepsample);
	}
}

# output order is alphabetical for both contigs and samples
@contignames = sort(@contignames);
@samplenames = sort(@samplenames);

# save output file
# three-argument open so that the file name is never parsed for a mode
my $filehandle;
unless (open($filehandle, '>', $outputfile)) {
	errorMessage(__LINE__, "Cannot make \"$outputfile\".");
}
if ($outformat eq 'Matrix') {
	# one line per sample, one column per contig; missing cells are 0
	print($filehandle "samplename\t" . join("\t", @contignames) . "\n");
	foreach my $samplename (@samplenames) {
		my @cells = ($samplename);
		foreach my $contigname (@contignames) {
			push(@cells, $table{$samplename}{$contigname} || 0);
		}
		print($filehandle join("\t", @cells) . "\n");
	}
}
elsif ($outformat eq 'Column') {
	# one line per sample/contig pair; missing cells are 0
	print($filehandle "samplename\tcontigname\tnreads\n");
	foreach my $samplename (@samplenames) {
		foreach my $contigname (@contignames) {
			my $nreads = $table{$samplename}{$contigname} || 0;
			print($filehandle "$samplename\t$contigname\t$nreads\n");
		}
	}
}
# buffered write errors (e.g. a full disk) are reported by close
unless (close($filehandle)) {
	errorMessage(__LINE__, "Cannot write \"$outputfile\".");
}

# Report an error on standard error and terminate with exit status 1.
# Arguments: the line number to report, then the message text.
sub errorMessage {
	my ($lineno, $message) = @_;
	print(
		STDERR
		"ERROR!: line $lineno\n$message\n",
		"If you want to read help message, run this script without options.\n"
	);
	exit(1);
}

# Print the usage text to standard error and terminate the script.
# The text lists every option accepted by the command line parser,
# including -o/--output and --old.
sub helpMessage {
	print(STDERR <<"_END");
Usage
=====
clsumclass options inputfile outputfile

Command line options
====================
--minnseqcontig=INTEGER
  Specify minimum number of sequences of contig. If the number of
sequences of a contig is smaller than this value at all samples, the
contig will be omitted. (default: 0)

--minnseqsample=INTEGER
  Specify minimum number of sequences of sample. If the number of
sequences of a sample is smaller than this value at all contigs, the
sample will be omitted. (default: 0)

--minntotalseqcontig=INTEGER
  Specify minimum total number of sequences of contig. If the total
number of sequences of a contig is smaller than this value, the contig
will be omitted. (default: 0)

--minntotalseqsample=INTEGER
  Specify minimum total number of sequences of sample. If the total
number of sequences of a sample is smaller than this value, the sample
will be omitted. (default: 0)

--minpseqcontig=DECIMAL
  Specify minimum percentage of sequences of contig. If the number of
sequences of a contig / the total number of sequences of a contig is
smaller than this value at all samples, the contig will be omitted.
(default: 0)

--minpseqsample=DECIMAL
  Specify minimum percentage of sequences of sample. If the number of
sequences of a sample / the total number of sequences of a sample is
smaller than this value at all contigs, the sample will be omitted.
(default: 0)

--runname=RUNNAME
  Specify run name for replacing run name.
(default: given by sequence name)

-o, --output=Column|Matrix
  Specify output format. (default: Column)

--old
  Specify this option if sequence names in the input file are
delimited by "_" instead of "__".

Acceptable input file formats
=============================
contigmembers.txt
_END
	exit;
}
