#!/usr/bin/env perl

=pod

=head1 NAME

clustal2link - parse a Clustal multiple alignment file and generate a Circos link file

=head1 SYNOPSIS

  clustal2link -aln alignment.aln 
               [-idref ID] 
               [-dir OUTDIR]  
               [-conf etc/clustal2link.conf] 
               [-debug] [-man] [-help]

=head1 DESCRIPTION

Parse a multiple alignment created by CLUSTAL and generate Circos data files to visualize it.

=head1 OPTIONS

=item * -aln FILE

Specifies the name of the alignment file. The format should be

  CLUSTAL 2.1 multiple sequence alignment

  seq1             XXXXX-----XXXXX-----XXXXX---------------XXXXX-----XXXXX----- 
  seq2             XXXXXXXXXXXXXXXXXXXXXXXXX----------XXXXXXXXXXXXXXXXXXXXXXXXX 
  seq3             -----XXXXX-----XXXXX-----XXXXX-----XXXXX-----XXXXX-----XXXXX 

where X is one of {A,C,G,T}. See C<data/> for sample files.

=item * -dir OUTDIR

Specifies the output directory to which Circos file should be written.

=item * -idref ID

Specifies the ID of the reference sequence. When creating output
files, links from all sequences to the reference sequence will be
reported.

=head1 HISTORY

=over

=item * 27 Sep 2014

Output link file is now in single line format.

=item * 4 Jun 2012

First version.

=back 

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

Martin Krzywinski
Genome Sciences Center
BC Cancer Research Center
100-570 W 7th Ave
Vancouver BC V5Z 4S6

mkweb.bcgsc.ca
martink@bcgsc.ca

=cut

use strict;
use warnings FATAL=>"all";

use Carp;
use Config::General;
use Cwd qw(getcwd abs_path);
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use Math::VecStat qw(sum min max average);
use Pod::Usage;
use Time::HiRes qw(gettimeofday tv_interval);
use Storable;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";

our (%OPT,%CONF,$conf);
our @COMMAND_LINE = ("aln=s",
										 "dir=s",
										 "idref=s",
										 "configfile=s",
										 "help",
										 "cdump",
										 "man",
										 "debug");
our $VERSION = 0.02;

# common and custom module imports below
use Bio::AlignIO;
#use IO::File;
#use List::Util;
#use List::MoreUtils;
use Set::IntSpan;
#use Statistics::Descriptive;

# read and parse configuration file
_parse_config();

my $in = Bio::AlignIO->new(-file=>$CONF{aln});
while(my $aln = $in->next_aln()) {
    my ($alignments,$sizes) = process_aln($aln);
    report_alignments($alignments);
    create_karyotype($sizes);
    create_links($alignments);
    create_conf($sizes);
}

sub create_conf {
    my $sizes = shift;
    #open(F,">$CONF{dir}/sequences.conf");
    #my $seq = join(",",grep($_ ne $CONF{idref}, sort keys %$sizes));
    #printf F ("sequences=%s\n",$seq);
    #close(F);    
}

sub create_links {
	my $alignments = shift;
	my $file  = "$CONF{dir}/links.txt";
	printdebug(1,"creating links",$file);
	open(L,">$file");
	for my $id (sort keys %$alignments) {
		open(H,">$CONF{dir}/highlight.$id.txt");
		for my $a (@{$alignments->{$id}}) {
	    printf H ("%s %d %d\n",
								$CONF{idref},
								$a->{to_set}->min,
								$a->{to_set}->max);
	    printf L ("%s %d %d %s %d %d\n",
								$id,
								$a->{from_set}->min,
								$a->{from_set}->max,
								$CONF{idref},
								$a->{to_set}->min,
								$a->{to_set}->max);
		}
		close(H);
	}
	close(L);
}

sub create_karyotype {
	my $sizes = shift;
	my $file  = "$CONF{dir}/karyotype.txt";
	printdebug(1,"creating karytotype",$file);
	open(F,">$file");
	for my $id (sort keys %$sizes) {
		printf F ("chr - %s %s 0 %d %s\n",
							$id,
							lc $id,
							$sizes->{$id},
							lc $id);
	}
	close(F);
}

sub process_aln {
	my $aln = shift;

	printdebug(1,"processing");
	my %seq = make_seq_table($aln);
	my @ids = sort keys %seq;
	confess "Could not find sequence with id [$CONF{idref}] to use as reference axis. Pick one of [",join(",",@ids),"]" if ! $seq{$CONF{idref}};

	my $idref       = $CONF{idref};
	printdebug(1,"using reference",$CONF{idref});

	my @ids_not_ref = grep($_ ne $CONF{idref}, @ids);
	printdebug(1,"calculating alignment coordinates for",@ids_not_ref);

	my (%alignments,%cursor,%ingap);

    my $len         = max( map { $_->length } values %seq );
    my $seq_ref     = $seq{ $CONF{idref} };

    map { $ingap{$_} = 1 } (@ids);
    
    # resr - residue for reference
    for my $i (1..$len) {
			my $resr = $seq_ref->subseq($i,$i);
			if($resr ne $CONF{gap}) {
				# we are not in a gap
				$cursor{$idref}++;
				if($ingap{$idref}) {
					# just came out of a gap
					#push @alignments, {$idref => init_alignment($cursor{$idref},$resr) };
				} else {
					#add_to_alignment($alignments[-1]{$idref},$cursor{$idref},$resr);
				}
				$ingap{$idref} = 0;
			} else {
				# we are in a gap
				$ingap{$idref} = 1;
			}
			for my $id (@ids_not_ref) {
				my $res = $seq{$id}->subseq($i,$i);
				if($res ne $CONF{gap}) {
		$cursor{$id}++;
	    }
	    my $is_aligned = $res ne $CONF{gap};
	    if($is_aligned) {
		# move to next sequence if the reference has a gap here
		next if $resr eq $CONF{gap};
		if($ingap{$id}) {
		    # just came out of a gap
		    push @{$alignments{$id}}, init_alignment($cursor{$id},
							     $cursor{$idref},
							     $res,
							     $resr);
		} else {
		    add_to_alignment($alignments{$id}[-1],
				     $cursor{$id},
				     $cursor{$idref},
				     $res,
				     $resr);
		}
		$ingap{$id} = 0;
	    } else {
		$ingap{$id} = 1;
	    }
	}
    }
    return (\%alignments,\%cursor);
}

sub report_sizes {
    my $cursor = shift;
    for my $id (sort keys %$cursor) {
	printdebug(1,"length",$id,$cursor->{$id});
    }
}

sub report_alignments {
    my $alignments = shift;
    for my $id (keys %$alignments) {
	for my $a ( @{$alignments->{$id}} ) {
	    printinfo("alignment",$id,
		      coords($a->{from_set}),
		      $CONF{idref},
		      coords($a->{to_set}));
	}
    }
}

sub coords {
    my $set = shift;
    return sprintf("%5d %5d",$set->min,$set->max);
}

sub init_alignment {
    my ($from,$to,$from_res,$to_res) = @_;
    return { from_set => Set::IntSpan->new($from),
	     to_set   => Set::IntSpan->new($to),
	     from_seq => $from_res,
	     to_seq   => $to_res };
}
sub add_to_alignment {
    my ($a,$from,$to,$from_res,$to_res) = @_;
    $a->{from_set}->insert($from);
    $a->{to_set}->insert($to);
    $a->{from_seq} .= $from_res;
    $a->{to_seq} .= $to_res;
}

sub make_seq_table {
    my $aln = shift;
    my %seq;
    printdebug(1,"creating sequence table");
    for my $seq ($aln->each_seq) {
	$seq{ $seq->display_id } = $seq;
    }
    return %seq;
}

sub validateconfiguration {
    $CONF{gap} ||= "-";
    $CONF{dir} ||= "circos";
}

# HOUSEKEEPING ###############################################################

sub _dump_config {
    printdumper(\%OPT,\%CONF);
		exit;
}

sub _parse_config {
  my $dump_debug_level = 3;
  GetOptions(\%OPT,@COMMAND_LINE);
  pod2usage() if $OPT{help};
  pod2usage(-verbose=>2) if $OPT{man};
  loadconfiguration($OPT{configfile});
  populateconfiguration(); # copy command line options to config hash
  validateconfiguration(); 
	_dump_config() if $CONF{cdump};
  pod2usage() if ! $CONF{aln};
}

sub populateconfiguration {
  for my $var (keys %OPT) {
    $CONF{$var} = $OPT{$var};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my $root     = shift;
  return unless ref($root) eq "HASH";
  for my $key (keys %$root) {
      my $value = $root->{$key};
      if(ref($value) eq "HASH") {
	  repopulateconfiguration($value);
      } elsif (ref($value) eq "ARRAY") {
	  for my $item (@$value) {
	      repopulateconfiguration($item);
	  }
      } elsif(defined $value) {
	  while($value =~ /__([^_].+?)__/g) {
	      my $source = "__" . $1 . "__";
	      my $target = eval $1;
	      $value =~ s/\Q$source\E/$target/g;
	  }
	  $root->{$key} = $value;
      }
  }
}

################################################################
#
#

sub loadconfiguration {
  my $file = shift;
  if(defined $file) {
    if(-e $file && -r _) {
      # provided configuration file exists and can be read
      $file = abs_path($file);
    } else {
      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
    }
  } else {
    # otherwise, try to automatically find a configuration file
    my ($scriptname,$path,$suffix) = fileparse($0);
    my $cwd     = getcwd();
    my $bindir  = $FindBin::RealBin;
    my $userdir = $ENV{HOME};
    my @candidate_files = (
	"$cwd/$scriptname.conf",
	"$cwd/etc/$scriptname.conf",
	"$cwd/../etc/$scriptname.conf",
	"$bindir/$scriptname.conf",
	"$bindir/etc/$scriptname.conf",
	"$bindir/../etc/$scriptname.conf",
	"$userdir/.$scriptname.conf",
	);
    my @additional_files = (
	
	);
    for my $candidate_file (@additional_files,@candidate_files) {
	#printinfo("configsearch",$candidate_file);
	if(-e $candidate_file && -r _) {
	    $file = $candidate_file;
	    #printinfo("configfound",$candidate_file);
	    last;
	}
    }
  }
  if(defined $file) {
    $OPT{configfile} = $file;
    $conf = new Config::General(
	-ConfigFile=>$file,
	-IncludeRelative=>1,
	-IncludeAgain=>1,
	-ExtendedAccess=>1,
	-AllowMultiOptions=>"yes",
	-LowerCaseNames=>1,
	-AutoTrue=>1
	);
    %CONF = $conf->getall;
  }
}

sub printdebug {
    my ($level,@msg) = @_;
    my $prefix = "debug";
    if(defined $CONF{debug} && $CONF{debug} >= $level) {
	printinfo(sprintf("%s[%d]",$prefix,$level),@msg);
    }
}

sub printinfo {
    print join(" ",@_),"\n";
}

sub printdumper {
    print Dumper(@_);
}

