use strict;

my $buildno = '0.1.2013.03.14';

print(STDERR <<"_END");
clfilterseq $buildno
=======================================================================

Official web site of this script is
http://www.fifthdimension.jp/products/claident/ .
To know script details, see above URL.

Copyright (C) 2011-2013  Akifumi S. Tanabe

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

_END

# display usage if command line options were not specified
unless (@ARGV) {
	&helpMessage();
}

# initialize variables
my $outputfile = $ARGV[-1];
my $inputfile = $ARGV[-2];
unless (-e $inputfile) {
	&errorMessage(__LINE__, "\"$inputfile\" does not exist.");
}
my $qualfile;

my $append;
my $minlen;
my $maxlen;
my $minqual;
my $replaceinternal;
my %keywords;
my %ngwords;
my $converse;
my $contigmembers;
my %contigmembers;
my $minnread;
for (my $i = 0; $i < scalar(@ARGV) - 2; $i ++) {
	if ($ARGV[$i] =~ /^-+(?:keyword|keywords|k)=(.+)$/i) {
		my $keywords = $1;
		foreach my $keyword (split(/,/, $keywords)) {
			$keywords{$keyword} = 1;
		}
	}
	elsif ($ARGV[$i] =~ /^-+(?:ngword|ngwords|n)=(.+)$/i) {
		my $ngwords = $1;
		foreach my $ngword (split(/,/, $ngwords)) {
			$ngwords{$ngword} = 1;
		}
	}
	elsif ($ARGV[$i] =~ /^-+(?:c|converse)$/i) {
		$converse = 1;
	}
	elsif ($ARGV[$i] =~ /^-+(?:qual|qualfile|q)=(.+)$/i) {
		$qualfile = $1;
	}
	elsif ($ARGV[$i] =~ /^-+contigmembers?=(.+)$/i) {
		$contigmembers = $1;
	}
	elsif ($ARGV[$i] =~ /^-+min(?:imum)?len(?:gth)?=(\d+)$/i) {
		$minlen = $1;
	}
	elsif ($ARGV[$i] =~ /^-+max(?:imum)?len(?:gth)?=(\d+)$/i) {
		$maxlen = $1;
	}
	elsif ($ARGV[$i] =~ /^-+min(?:imum)?qual(?:ity)?=(\d+)$/i) {
		$minqual = $1;
	}
	elsif ($ARGV[$i] =~ /^-+min(?:imum)?(?:n|num)reads?=(\d+)$/i) {
		$minnread = $1;
	}
	elsif ($ARGV[$i] =~ /^-+replaceinternal$/i) {
		$replaceinternal = 1;
	}
	elsif ($ARGV[$i] =~ /^-+(?:a|append)$/i) {
		$append = 1;
	}
	else {
		&errorMessage(__LINE__, "\"$ARGV[$i]\" is unknown option.");
	}
}
if (-e $outputfile && !$append) {
	&errorMessage(__LINE__, "\"$outputfile\" already exists.");
}
if ($contigmembers && !-e $contigmembers) {
	&errorMessage(__LINE__, "\"$contigmembers\" does not exist.");
}
if ($minnread && !$contigmembers) {
	&errorMessage(__LINE__, "The minimum number threshold for reads of contigs requires contigmembers file.");
}
unless ($qualfile) {
	$qualfile = "$inputfile.qual";
}

my $inputhandle;
# read contig members
if ($contigmembers) {
	unless (open($inputhandle, "< $contigmembers")) {
		&errorMessage(__LINE__, "Cannot read \"$contigmembers\".");
	}
	while (<$inputhandle>) {
		s/\r?\n?$//;
		my @temp = split(/\t/, $_);
		if (scalar(@temp) > 2) {
			$contigmembers{$temp[0]} = scalar(@temp) - 1;
		}
		elsif (scalar(@temp) == 2) {
			$contigmembers{$temp[1]} = 1;
		}
		elsif (/.+/) {
			&errorMessage(__LINE__, "The contigmembers file is invalid.");
		}
	}
	close($inputhandle);
}

# open output file(s)
my $outputhandle;
if ($append) {
	unless (open($outputhandle, ">> $outputfile")) {
		&errorMessage(__LINE__, "Cannot write \"$outputfile\".");
	}
}
else {
	unless (open($outputhandle, "> $outputfile")) {
		&errorMessage(__LINE__, "Cannot make \"$outputfile\".");
	}
}
my $outqualhandle;
if (!$replaceinternal || !$minqual) {
	if ($append) {
		unless (open($outqualhandle, ">> $outputfile.qual")) {
			&errorMessage(__LINE__, "Cannot write \"$outputfile.qual\".");
		}
	}
	else {
		unless (open($outqualhandle, "> $outputfile.qual")) {
			&errorMessage(__LINE__, "Cannot make \"$outputfile.qual\".");
		}
	}
}

# read input file
unless (open($inputhandle, "< $inputfile")) {
	&errorMessage(__LINE__, "Cannot open \"$inputfile\".");
}
my $qualhandle;
if (-e $qualfile) {
	unless (open($qualhandle, "< $qualfile")) {
		&errorMessage(__LINE__, "Cannot open \"$qualfile\".");
	}
}
elsif ($minqual) {
	&errorMessage(__LINE__, "There is no quality file.");
}
{
	my $taxon;
	local $/ = "\n>";
	while (<$inputhandle>) {
		if (/^>?\s*(\S[^\r\n]*)\r?\n(.+)\r?\n/s) {
			$taxon = $1;
			my $sequence = $2;
			$taxon =~ s/\s+$//;
			if ($minnread && $contigmembers{$taxon} < $minnread) {
				next;
			}
			$sequence =~ s/[> \r\n]//g;
			my @seq = $sequence =~ /\S/g;
			my @qual;
			if (-e $qualfile) {
				local $/ = "\n>";
				my $qualline = readline($qualhandle);
				if ($qualline =~ /^>?\s*(\S[^\r\n]*)\r?\n(.+)\r?\n/s) {
					my $temp = $1;
					my $qual = $2;
					$temp =~ s/\s+$//;
					if ($temp eq $taxon) {
						@qual = $qual =~ /\d+/g;
					}
				}
				else {
					&errorMessage(__LINE__, "The quality file is invalid.");
				}
			}
			if (%keywords || %ngwords) {
				my $out = 1;
				foreach my $keyword (keys(%keywords)) {
					if ($taxon !~ /$keyword/i) {
						$out = 0;
					}
				}
				foreach my $ngword (keys(%ngwords)) {
					if ($taxon =~ /$ngword/i) {
						$out = 0;
					}
				}
				if (!$out && !$converse || $out && $converse) {
					next;
				}
			}
			# skip short sequence
			if ($minlen && scalar(@seq) < $minlen) {
				next;
			}
			if ($maxlen && scalar(@seq) > $maxlen) {
				next;
			}
			# mask based on quality value
			if ($minqual) {
				# mask end-side characters
				my $num = 0;
				for (my $i = -1; $i >= (-1) * scalar(@qual); $i --) {
					if ($qual[$i] < $minqual) {
						$num ++;
					}
					else {
						last;
					}
				}
				foreach (1..$num) {
					pop(@qual);
					pop(@seq);
				}
				# mask internal characters
				if ($replaceinternal) {
					my @pos;
					for (my $i = 0; $i < scalar(@qual); $i ++) {
						if ($qual[$i] < $minqual) {
							push(@pos, $i);
						}
					}
					foreach my $pos (@pos) {
						splice(@seq, $pos, 1, '?');
					}
				}
			}
			# skip short sequence
			if ($minlen && scalar(@seq) < $minlen) {
				next;
			}
			# output an entry
			print($outputhandle ">$taxon\n");
			print($outputhandle join('', @seq) . "\n");
			if (-e $qualfile) {
				print($outqualhandle ">$taxon\n");
				print($outqualhandle join(' ', @qual) . "\n");
			}
		}
	}
}
close($inputhandle);
if (-e $qualfile) {
	close($qualhandle);
}
close($outputhandle);
if (!$replaceinternal || !$minqual) {
	close($outqualhandle);
}

sub errorMessage {
	my $lineno = shift(@_);
	my $message = shift(@_);
	print("ERROR!: line $lineno\n$message\n");
	print("If you want to read help message, run this script without options.\n");
	exit(1);
}

sub helpMessage {
	print <<"_END";
Usage
=====
clfilterseq options inputfile outputfile

Command line options
====================
-k, --keyword=REGEXP(,REGEXP..)
  Specify regular expression(s) for sequence names. You can use regular
expression but you cannot use comma. All keywords will be used as AND
conditions. (default: none)

-n, --ngword=REGEXP(,REGEXP..)
  Specify regular expression(s) for sequence names. You can use regular
expression but you cannot use comma. All ngwords will be used as AND
conditions. (default: none)

-c, --converse
  If this option is specified, matched sequences will be cut off and
nonmatched sequences will be saved. (default: off)

-q, --qualfile=FILENAME
  Specify .qual file name. (default: inputfile.qual)

-a, --append
  Specify outputfile append or not. (default: off)

--minlen=INTEGER
  Specify minimum length threshold. (default: 1)

--maxlen=INTEGER
  Specify maximum length threshold. (default: Inf)

--minqual=INTEGER
  Specify minimum quality threshold. (default: none)

--replaceinternal
  Specify whether internal low-quality characters replace to missing
data (?) or not. (default: off)

--contigmembers=FILENAME
  Specify file path to contigmembers.txt. (default: none)

--minnread=INTEGER
  Specify the minimum number threshold for reads of contigs.
(default: 0)

Acceptable input file formats
=============================
FASTA (+.qual)
_END
	exit;
}
