#!/bin/env perl

=pod

=head1 NAME

unmatrix - convert a matrix into a list

=head1 SYNOPSIS

  cat matrix.txt | ./unmatrix [-sort] [-nsort] [-bycol] [-byrow] [-sym] [-delim ,]

=head1 DESCRIPTION

Convert a tabular data file

  - a b c
  d 1 - 0
  e 2 5 2
  f 1 2 3

into one in which each table cell is reported individually on separate lines.

  d a 1
  d b -
  d c 0
  e a 2
  ...

This script can function together with the 'matrix' script. For example, if you want to multiply all values in a matrix by 2 on the command line

  > cat 468x468.txt | ./unmatrix | awk '{print $1,$2,2*$3}' | ./matrix -nsort -missing -0

You can replace the awk call with your own filter. Here you can remove lines and modify values. The subsequent call to 'matrix' will cast the data in a table format again.

=head1 OPTIONS

=head2 -byrow

This is the default behaviour. See -bycol below.

In other words, -byrow does nothing and is allowed as a command-line flag to remind you of the output format.

=head2 -bycol

By default, the script is row-dominant - the triplet returned is

  row col value

When -nsort or -sort are used (see below), for each sorted row value the columns are sorted.

For column-dominant behaviour, use -bycol. This will produce

  col row value

and if you asked for sorting, for each sorted column value the rows will be sorted.

=head2 -nsort

Sort row and column names numerically.

=head2 -sort

Sort row and column names asciibetically.

=head2 -sym

If your matrix is symmetric, using this flag will exclude all entries above the diagonal.

    a b c
  a   x x
  b     x    x - not reported
  c

A sanity check is done that the matrix is indeed symmetric (for every cell x,y=z there exists y,x=z). If it is not, the script will quit with an error.

This feature applies only to matrices whose row and column names are the same.

-sym can be combined with -sort/-nsort

=head2 -delim

Using -delim STRING sets the field separator.

By default this is general whitespace, which should be correct for most whitespace-delimited files (space separated, tab separated). For any other separator, such as a comma, pass it explicitly (e.g. -delim ,). To specify a tab delimiter explicitly, use -delim "tab" or the CTRL-V bash trick.

=cut

our $VERSION = "0.11.1";

=pod

=head1 HISTORY

=over 

=item 16 Jun 2014

Fixed -sym bug.

=item 10 Jun 2014

Added -sym.

=item 3 Jun 2014

Added -byrow, -bycol, -nsort and -sort.

=item 6 May 2014

Started.

=back 

=head1 BUGS

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

  Martin Krzywinski
  Genome Sciences Centre
  Vancouver BC Canada
  www.bcgsc.ca
  martink@bcgsc.ca

=cut

use strict;
use Config::General;
use Data::Dumper;
use FindBin;
use File::Basename;
use Math::VecStat qw(min max average sum minabs maxabs);
use Getopt::Long;
use Pod::Usage;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);

use Statistics::Descriptive;
use Math::Round;

# Parse the command-line switches into %OPT.
#
# -delim and -configfile take a string value ("=s"). Without the "=s"
# specifier Getopt::Long treats an option as a boolean flag, stores 1 for it
# and leaves the intended value in @ARGV, where it is later mistaken for the
# input file name.
# -debug may be repeated to raise the debug level ("+").
GetOptions(\%OPT,
					 "man",
					 "bycol",
					 "byrow",
					 "nsort",
					 "sort",
					 "sym",
					 "delim=s",
					 "configfile=s",
					 "help",
					 "version",
					 "debug+");

# Informational switches are handled first: -version prints the version and
# stops, -help and -man hand over to pod2usage.
if($OPT{version}) {
	printinfo($VERSION);
	exit;
}
if($OPT{help}) {
	pod2usage();
}
if($OPT{man}) {
	pod2usage(-verbose=>2);
}

# Build %CONF: optional configuration file first, then the command-line
# options copied on top of it, then defaults and normalisation.
loadconfiguration($OPT{configfile});
populateconfiguration();
validateconfiguration();

# At debug level 2 and above, dump the effective configuration.
if($CONF{debug} > 1) {
	$Data::Dumper::Terse = 1;
	$Data::Dumper::Quotekeys = 0;
	$Data::Dumper::Indent = 1;
	$Data::Dumper::Pad = "debug parameters";
	print Dumper(\%CONF);
}

# Select the input stream: the file named by the first positional argument
# if there is one, otherwise STDIN.
my $inputhandle;
# Test for presence rather than truth so that a file literally named "0"
# is not silently replaced by STDIN.
if(defined $ARGV[0] && length $ARGV[0]) {
  my $file = $ARGV[0];
  die "No such file $file" unless -e $file;
  # Three-argument open on a lexical handle: the file name can never be
  # interpreted as an open mode, and a failure (e.g. permission denied)
  # is reported instead of producing silently empty output.
  open(my $fh, "<", $file) or die "Cannot open file $file: $!";
  $inputhandle = $fh;
} else {
  $inputhandle = \*STDIN;
}

my @header;
my $data;
# Read the matrix line by line.
#
# The first usable line is the header; its fields from index 1 onward are the
# column names. Every later line is a row whose first field is the row name.
# Cells are stored as $data->{row}{col}, or $data->{col}{row} with -bycol.
while(my $line = <$inputhandle>) {
	chomp $line;
	# comment lines are ignored
	next if $line =~ /^\s*#/;
	# Blank lines are ignored everywhere. Previously only those before the
	# header were (accidentally) skipped, and a blank line anywhere after it,
	# such as a trailing one, aborted with the column-count error below.
	next unless $line =~ /\S/;
	my @cols = split($CONF{delim},$line);
	if(! @header) {
		@header = @cols;
		next;
	}
	die "Found lines with different number of columns" unless @cols == @header;
	my $rowname = $cols[0];
	for my $i (1..$#cols) {
		if($CONF{bycol}) {
			$data->{$header[$i]}{$rowname} = $cols[$i];
		} else {
			$data->{$rowname}{$header[$i]} = $cols[$i];
		}
	}
}

# -sym: check that the matrix is symmetric and keep only the diagonal plus
# the entries on one side of it.
if($CONF{sym}) {
	# Positions must come from ONE shared ordering of the names. The key
	# order of two different hashes is not guaranteed to agree, even when
	# they hold the same keys, so comparing an index into keys(%$data) with
	# an index into keys(%{$data->{$row}}) does not define a triangle.
	my @names = sortmat(keys %$data);
	my %rank;
	@rank{@names} = (0..$#names);
	my $report;
	for my $row (@names) {
		my @cols = keys %{$data->{$row}};
		if(@names != @cols) {
			die sprintf("You used the -sym flag but the matrix is not square [%d x %d].",int(@names),int(@cols));
		}
		for my $col (@cols) {
			my $forward = $data->{$row}{$col};
			# exists() keeps the lookup from autovivifying $data->{$col}
			# when $col is not one of the first-level names
			my $mirror  = exists $data->{$col} ? $data->{$col}{$row} : undef;
			if(defined $forward && defined $mirror && $forward eq $mirror) {
				# a defined mirror entry implies $col is a first-level name,
				# so $rank{$col} is always set here
				if($rank{$col} <= $rank{$row}) {
					$report->{$row}{$col} = $forward;
				}
			} else {
				printinfo("You used -sym but the matrix is not symmetric.");
				if(! defined $mirror) {
					printinfo("The entry [$row,$col] exists, but not [$col,$row].");
				} elsif (! defined $forward) {
					printinfo("The entry [$col,$row] exists, but not [$row,$col].");
				} else {
					printinfo(sprintf("The entry [%s,%s]=%s and [%s,%s]=%s do not match.",$row,$col,$forward,$col,$row,$mirror));
				}
				# non-zero status so callers can detect the failure
				exit 1;
			}
		}
	}
	$data = $report;
}

# Emit one line per stored cell: first-level name, second-level name, value.
# Both levels are passed through sortmat, so -sort/-nsort apply to each.
my @rows = sortmat(keys %$data);

foreach my $outer (@rows) {
	my $cells = $data->{$outer};
	foreach my $inner (sortmat(keys %$cells)) {
		printinfo($outer,$inner,$cells->{$inner});
	}
}



# Order a list of names according to the sort options in %CONF.
#   nsort set -> numeric ascending (takes precedence over sort)
#   sort set  -> default string sort
#   neither   -> the list is returned in the order given
sub sortmat {
	return sort { $a <=> $b } @_ if $CONF{nsort};
	return sort @_               if $CONF{sort};
	return @_;
}

# Apply defaults to %CONF and normalise its values.
#
# delim: when absent or empty it falls back to a single space, the value the
# reader hands to split for the default whitespace-separated input. The
# keyword "tab" (any case) is translated to a literal tab character.
sub validateconfiguration {
	# Test for absence rather than truth: a plain ||= would also throw away
	# the legitimate delimiter "0".
	if(! defined $CONF{delim} || $CONF{delim} eq "") {
		$CONF{delim} = " ";
	}
	$CONF{delim} = "\t" if $CONF{delim} =~ /^tab$/i;
}

################################################################
#
# *** Do NOT EDIT BELOW THIS LINE ***
#
################################################################
################################################################
################################################################
################################################################


# Copy every command-line option from %OPT into %CONF, overriding any value
# already stored under the same key; other %CONF keys are left untouched.
sub populateconfiguration {
  $CONF{$_} = $OPT{$_} for keys %OPT;
}

# Locate a configuration file and load it into %CONF.
#
# Candidates are tried in this order and the first one that exists and is
# readable wins:
#   1. the path passed as the argument
#   2. /home/$LOGNAME/.<scriptname>.conf
#   3. <script directory>/<scriptname>.conf
#   4. <script directory>/etc/<scriptname>.conf
#
# Returns undef, leaving %CONF untouched, when none is found. Otherwise the
# chosen path is recorded in $OPT{configfile} and %CONF is replaced by the
# parsed contents of the file.
sub loadconfiguration {
  my $file = shift;
  my ($scriptname) = fileparse($0);
  my @candidates = (
    $file,
    "/home/$ENV{LOGNAME}/.$scriptname.conf",
    "$FindBin::RealBin/$scriptname.conf",
    "$FindBin::RealBin/etc/$scriptname.conf",
  );
  my $chosen;
  for my $candidate (@candidates) {
    if(-e $candidate && -r _) {
      $chosen = $candidate;
      last;
    }
  }
  return undef unless defined $chosen;
  $OPT{configfile} = $chosen;
  my $conf = Config::General->new(-ConfigFile=>$chosen,
				  -AllowMultiOptions=>"yes",
				  -LowerCaseNames=>1,
				  -AutoTrue=>1);
  %CONF = $conf->getall;
}

# Print the arguments through printinfo, prefixed with the word "debug",
# but only when $CONF{debug} is set.
sub printdebug {
  my @message = @_;
  if($CONF{debug}) {
    printinfo("debug",@message);
  }
}

# Print the arguments on one line of standard output, separated by single
# spaces and terminated by a newline.
sub printinfo {
	my $line = join(" ",@_);
	print $line,"\n";
}


