#!/usr/bin/env perl
use strict;
use FindBin;
use File::Basename;
use Cwd qw(abs_path);
use File::Spec;
use Path::Tiny;
use Data::Dumper;
use List::MoreUtils qw(zip);
use Cwd qw(abs_path);

#..............................................................................
# Globals needed before --help etc

my $VERSION = "1.0.1";
my $EXE = basename($0);
my $AUTHOR = 'Torsten Seemann';
my $TWITTER = '@torstenseemann';
my $URL = 'https://github.com/tseemann/abricate';

#..............................................................................
# Command line options

my(@Options, $debug, $quiet, $setupdb, $list, $summary, $check,
             $datadir, $db, $minid, $mincov, $threads,
             $noheader, $csv, $nopath, $fofn);
setOptions();

#..............................................................................
# Globals

my $OUTSEP = "\t";
my $ABSENT = '.';
my $FIELD = '%COVERAGE';
my $FIELDSEP = ';';
my $IDSEP = '~~~';

my $CULL = 1;   # BLAST -culling_limit 

my @BLAST_FIELDS = qw(
  qseqid qstart qend qlen
  sseqid sstart send slen sstrand
  evalue length pident gaps gapopen
  stitle
);

my @COLUMNS = qw(
 FILE SEQUENCE START END STRAND GENE COVERAGE COVERAGE_MAP GAPS
 %COVERAGE %IDENTITY DATABASE ACCESSION PRODUCT RESISTANCE
);
$COLUMNS[0] = '#'.$COLUMNS[0];

my @REQUIRE = qw(blastn blastx makeblastdb blastdbcmd any2fasta gzip unzip);

#..............................................................................
# Option parsing

$threads >= 1 or err("Invalid --threads option, must be 1 or higher");
($minid > 0 and $minid <= 100) or err("--minid must be between 1 and 100");
($mincov >= 0 and $mincov <= 100) or err("--mincov must be between 0 and 100");
$OUTSEP = ',' if $csv;  # default is tab

if ($fofn) {
  -r $fofn or err("Could not open --fofn '$fofn'");
  my @filenames = path($fofn)->lines( { chomp => 1 } );
  @filenames or err("No files found in --fofn '$fofn'");
  msg("Using files from --fofn and ignoring any provided on the command line.");
  @ARGV = @filenames;
}

if ($summary) {
  @ARGV >= 1 or err("--summary needs >=1 abricate output files");
  summary_table( @ARGV );
  exit;
}

if ($check) {
  $debug=1;
  msg("Checking dependencies are installed:");
  require_exe( @REQUIRE );
  msg("OK.");
  exit;
}

# check if dependencies are installed
require_exe( @REQUIRE );

# do these now that we know the deps are ok
if ($list or $setupdb) {
  list_databases($datadir, $setupdb);
  exit;
}

# check if blast > 2.2.30 to support 'gaps' custom field
for my $blast ('blastn', 'blastx') {
  my($version) = qx"$blast -version 2>&1";
  $version =~ m/2.(\d+.\d+)\+/ or err("Could not parse $blast version from '$version'");
  $1 >= 2.30 or err("You need to install BLAST+ 2.2.30 or higher");
}

# check database exists
$datadir = abs_path($datadir);
my $db_path = "$datadir/$db/sequences";
my $dbinfo = blast_database_info($db_path);
$dbinfo or err("BLAST database not found: $db_path");
msg("Using", $dbinfo->{DBTYPE}, "database $db: ", $dbinfo->{SEQUENCES}, "sequences - ", $dbinfo->{DATE});

# output format
my $format = "6 @BLAST_FIELDS";
print line(@COLUMNS) unless $noheader;

FILE:
for my $file (@ARGV) {
  my %seen;
  my @hit;
  msg("Processing: $file");
  -r $file or err("'$file' does not exist, or is unreadable");

  my $blastcmd = $dbinfo->{DBTYPE} eq 'nucl'
               ? "blastn -task blastn -dust no -perc_identity $minid"
               : "blastx -task blastx-fast -seg no"
               ;

  my $cmd = "(any2fasta -q -u \Q$file\E |"
          . " $blastcmd -db \Q$db_path\E -outfmt '$format' -num_threads $threads"
          . " -evalue 1E-20 -culling_limit $CULL"
          . " -max_target_seqs 10000"  # https://github.com/tseemann/abricate/issues/135
#          . " -max_target_seqs ".$dbinfo->{SEQUENCES}   # Issue #76
          . ")"
          ;

  msg("Running: $cmd") if $debug;
  
  open BLAST, "$cmd |";
  while (<BLAST>) { 
    chomp;
    my @x = split m/\t/;

    @x == @BLAST_FIELDS or err("can not find sequence data in '$file'");

    my %hit = (map { $BLAST_FIELDS[$_] => $x[$_] } 0 .. @BLAST_FIELDS-1); 
    ($hit{sstart}, $hit{send}) = ($hit{send}, $hit{sstart}) if $hit{sstrand} eq 'minus';

    next if $seen{ join('~', @hit{qw(qseqid qstart qend)}) }++;
    
    my $pccov = 100 * ($hit{length}-$hit{gaps}) / $hit{slen};
    next unless $pccov >= $mincov;

#    $hit{sseqid} =~ s/_.*$//;
    my($database, $gene, $acc, $abx) = split m/$IDSEP/, $hit{sseqid};
    # if it wasn't in the format we expected, try and be sensible
    if (!defined $gene and !defined $acc) {
      $acc = '';
      $gene = $hit{sseqid};
      $database = $db;  # name specified with --db
    }
    msg( Dumper(\%hit) ) if $debug;

    my $product = $hit{'stitle'} || "n/a";
    $product =~ s/[,\t]//g;  # remove output separators

    # https://github.com/tseemann/abricate/issues/95
    $product =~ s/^\S+\s+// if $product =~ m/$IDSEP/;

    my $minimap = minimap( @hit{qw(sstart send slen gapopen)} );
#    $minimap .= '!-'.$hit{gaps} if $hit{gaps} > 0;
    push @hit, [ 
      $nopath ? basename($file) : $file ,
      $hit{qseqid}, 
      $hit{qstart}, 
      $hit{qend},
      ($hit{sstrand} eq 'minus' ? '-' : '+'),
      $gene,  #$hit{sseqid},
      $hit{sstart}.'-'.$hit{send}.'/'.$hit{slen}, 
      $minimap, 
      $hit{gapopen}.'/'.$hit{gaps},
      sprintf("%.2f", $pccov),
      sprintf("%.2f", $hit{pident}),
      $database,
      $acc,
      $product,
      $abx,
    ];
  }
  close BLAST;

  msg("Found", scalar(@hit), "genes in $file");

  # Sort hits
  #0    1   2     3   4    5
  #FILE CHR START END GENE COVERAGE COVERAGE_MAP GAPS %COVERAGE %IDENTITY DATABASE ACCESSION
#  @hit = sort { $a->[4] cmp $b->[4] || $b->[8] <=> $a->[8] } @hit;
  @hit = sort { $a->[1] cmp $b->[1] || $a->[2] <=> $b->[2] } @hit;

  # results to stdout
  print line(@$_) for @hit;

  # Inspiration
  msg( "Tip:", motd() ); # 'message of the day'
  msg("Done.");
}


#----------------------------------------------------------------------

sub motd {
  my @motd = (
    "it's important to realise $EXE uses DNA matching, not AA.",
    "$EXE can also find virulence factors; use --list to see all supported databases.",
    "remember that abricate is unable to find AMR-mediated SNPs.",
    "the --fofn option allows you to feed in a big list of files to run on.",
    "you can use the --summary option to combine reports in a presence/absence matrix.",
    "found a bug in $EXE? Post it at $URL/issues.",
    "have a suggestion for $EXE? Tell me at $URL/issues",
    "the $EXE manual is at $URL/blob/master/README.md",
    "did you know? $EXE was named after 'A'nti 'B'acterial 'R'esistiance",
  );
  srand( (localtime)[0] ); # seed
  return $motd[ int(rand(scalar(@motd))) ] ;
}


#----------------------------------------------------------------------

sub minimap {
  my($x, $y, $L, $broken, $scale, $on, $off) = @_;
  my $WIDTH = 15 - ($broken ? 1 : 0);
  $broken ||= 0;
  $scale ||= ($L/$WIDTH);
  $on  ||= '=';
  $off ||= '.';
  $x = int( $x / $scale );
  $y = int( $y / $scale );
  $L = int( $L / $scale );
  my $map='';
  for my $i (0 .. $WIDTH-1) {
#    msg("$i $x $y $L $scale");
    $map .= ($i >= $x and $i <= $y) ? $on : $off;
    $map .= '/' if $broken and $i==int($WIDTH/2);
  }
  return $map;
}

#----------------------------------------------------------------------

sub summary_table {
  my(@fname) = @_;
  
  # https://github.com/tseemann/abricate/issues/32
  # if we have 1 file, we use the '#FILE' column as the key: "dutch mode"
  # if we have >1 files, we use the $fname[] name
  my $dutch_mode = (@fname == 1);

  my %gene;  # genes observed
  my %data;
  my @hdr;

  for my $fname (@fname) {
    if (exists $data{$fname}) {
      wrn("Skipping duplicate file: $fname");
      next;
    }
    # initialise to empty so it appears in report even if 0 genes found
    $data{$fname} = {} unless $dutch_mode;;
    msg("Parsing: $fname") if $debug;
    open my $fh, '<', $fname or err("Can't open '$fname' to summarize.");
    while (<$fh>) {
      chomp;
      my @col = split m/$OUTSEP/;
      msg("### [$.] @col") if $debug;
      @hdr = @col if ! @hdr;  # first row we see will be header
      next if $col[0] =~ m/^#/;
      $col[0] = basename($col[0]) if $nopath;
      my $file = $dutch_mode ? $col[0] : $fname;
      my $row = { zip @hdr, @col };
      #print STDERR Dumper($row)) if $debug;
      $gene{ $row->{'GENE'} }++;
      push @{ $data{ $file }{ $row->{'GENE'} } } , $row;
    }
    #wrn(Dumper(\%data));
  }

  my @gene = sort { $a cmp $b } keys %gene;
  print line('#FILE', 'NUM_FOUND', @gene);

  for my $fname (sort { $a cmp $b } keys %data) {
    my @present = map { exists $data{$fname}{$_} 
                        ? join( $FIELDSEP, map { $_->{$FIELD} } @{$data{$fname}{$_}} ) 
                        : $ABSENT 
                      } @gene;
    my $label = $nopath ?  basename($fname) : $fname;;
    print line( $label, scalar( keys %{$data{$fname}} ), @present );
  }
}

#----------------------------------------------------------------------

sub blast_database_info {
  my($prefix) = @_;
  my(@res) = qx(blastdbcmd -info -db \Q$prefix\E 2> /dev/null);
  chomp @res;
  my $res = join(' ', @res);
  my $info = { PREFIX => $prefix };

  $res =~ m/\b([\d,]+)\s+sequences;/ or return;
  $info->{SEQUENCES} = $1;
  $info->{SEQUENCES} =~ s/,//g;

  $res =~ m/\btotal\s+(\S+)/ or return;
  $info->{DBTYPE} = $1 eq 'residues' ? 'prot' : 'nucl';

  $res =~ m/\bDatabase: (\S+)/ or return;
  $info->{TITLE} = $1;
  
  $res =~ m/\bDate: (\w+)\s+(\d+),\s+(\d{4})\s/ or return;
  $info->{DATE} = "$3-$1-$2";  # YYYY-MM-DD
  
  return $info;
}

#----------------------------------------------------------------------
# blastdbcmd -list ../db/bacmet2/ -list_outfmt "%f~~~%p~~~%t~~~%d~~~%l~~~%n~~~%U"
# ../db/bacmet2/sequences~~~Nucleotide~~~bacmet2~~~Apr 6, 2018  1:02 PM~~~280935~~~744~~~629421

sub list_databases {
  my($from, $setup_too) = @_;
  my @dir = grep { $_->is_dir } path($from)->realpath->children; # all subfolders
  my $count=0;
  my @list;
  
  for my $d (@dir) {
    my $name = $d->basename; # gives the 'file' part of the pathname
    my $dbname = "$from/$name/sequences";
    if (-r $dbname) {
      $setup_too and make_blast_database($dbname, $name);
      -r "$dbname.pin" || -r "$dbname.nin" or err("Database $name is not indexed, please try: abricate --setupdb");
      my $info = blast_database_info( $dbname );
      # save for end
      push @list, [ $name, $info->{SEQUENCES}, $info->{DBTYPE}, $info->{DATE} ];
    }
  }
  if (@list) {
    # print to STDOUT
    print line("DATABASE", "SEQUENCES", "DBTYPE", "DATE");
    print map { line(@$_) } @list;
  }
  else {
    msg("No databases found in $from");
  }
}

#----------------------------------------------------------------------

sub make_blast_database {
  my($path, $name) = @_;
  -r $path or err("Can't read FASTA file: $path");
  # guess if nucl or prot
  my @lines = grep { ! m/^>/ } path($path)->lines( { chomp=>1 } );
  chomp @lines;
  my $letters = join '', @lines;
  my $dbtype = mol_type($letters);
  msg("Removing old BLAST indices for $path");
  unlink <$path.[np]??>;
  msg("Making $dbtype BLAST index for $path");
  system("makeblastdb -in '$path' -title '$name' -dbtype $dbtype -logfile /dev/null");
}

#----------------------------------------------------------------------

sub mol_type {
  my($s) = @_;
  my $L = length($s);
  $s =~ s/[AGTC]//gi;
  my $M = length($s);
  #msg("mol_type: with_ATGC=$L without_AGTC=$M");
  return $M > 0.5 * $L ? 'prot' : 'nucl';
}

#----------------------------------------------------------------------

sub line {
  return join($OUTSEP, @_)."\n";
}

#----------------------------------------------------------------------

sub version {
  print "$EXE $VERSION\n";
  exit;
}

#----------------------------------------------------------------------

sub msg {
  print STDERR "@_\n" unless $quiet;
}

#----------------------------------------------------------------------

sub wrn {
  msg("WARNING:", @_);
}

#----------------------------------------------------------------------

sub err {
  print STDERR "ERROR: @_\n";
  exit(1);
}

#----------------------------------------------------------------------

sub require_exe {
  my(@arg) = @_;
  for my $exe (@arg) {
    my $where = '';
    for my $dir ( File::Spec->path ) {
      if (-x "$dir/$exe") {
        $where = "$dir/$exe";
        last;
      }
    }
    if ($where) {
      msg("Found '$exe' => $where") if $debug;
    }
    else {
      err("Could not find '$exe'. Please install it and ensure it is in the PATH.");
    }
  }
  return;
}

#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    "GENERAL",
    {OPT=>"help",    VAR=>\&usage,             DESC=>"This help"},
    {OPT=>"debug!",  VAR=>\$debug, DESC=>"Verbose debug output"},
    {OPT=>"quiet!",  VAR=>\$quiet, DESC=>"Quiet mode, no stderr output"},
    {OPT=>"version!",  VAR=>\&version, DESC=>"Print version and exit"},
    {OPT=>"check!",  VAR=>\$check, DESC=>"Check dependencies are installed"},
    {OPT=>"threads=i",  VAR=>\$threads, DEFAULT=>1, DESC=>"Use this many BLAST+ threads"},
    {OPT=>"fofn=s",  VAR=>\$fofn, DEFAULT=>'', DESC=>"Run on files listed in this file"},
    "DATABASES",
    {OPT=>"setupdb!",  VAR=>\$setupdb, DESC=>"Format all the BLAST databases"},
    {OPT=>"list!",  VAR=>\$list, DESC=>"List included databases"},
    {OPT=>"datadir=s",  VAR=>\$datadir, DEFAULT=>abs_path("$FindBin::RealBin/../db"), DESC=>"Databases folder"},
    {OPT=>"db=s",  VAR=>\$db, DEFAULT=>"ncbi", DESC=>"Database to use"},
    "OUTPUT",
    {OPT=>"noheader!",  VAR=>\$noheader, DESC=>"Suppress column header row"},
    {OPT=>"csv!",  VAR=>\$csv, DESC=>"Output CSV instead of TSV"},
    {OPT=>"nopath!",  VAR=>\$nopath, DESC=>"Strip filename paths from FILE column"},
    "FILTERING",
    {OPT=>"minid=f",  VAR=>\$minid, DEFAULT=>80, DESC=>"Minimum DNA %identity"},
    {OPT=>"mincov=f",  VAR=>\$mincov, DEFAULT=>80, DESC=>"Minimum DNA %coverage"},
    "MODE",
    {OPT=>"summary!",  VAR=>\$summary, DESC=>"Summarize multiple reports into a table"},
  );

  @ARGV or usage(1);

  &GetOptions(map {$_->{OPT}, $_->{VAR}} (grep { ref } @Options)) || usage(1);

  # Now setup default values.
  foreach (grep { ref } @Options) {
    if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

sub usage {
  my($exitcode) = @_;
  $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
  $exitcode ||= 0;
  select STDERR if $exitcode; # write to STDERR if exitcode is error

  print "SYNOPSIS\n  Find and collate amplicons in assembled contigs\n";
  print "AUTHOR\n  $AUTHOR ($TWITTER)\n";
  print "USAGE\n";
  print "  % $EXE --list\n";
  print "  % $EXE [options] <contigs.{fasta,gbk,embl}[.gz] ...> > out.tab\n";
  print "  % $EXE [options] --fofn fileOfFilenames.txt > out.tab\n";
  print "  % $EXE --summary <out1.tab> <out2.tab> <out3.tab> ... > summary.tab\n";

  foreach (@Options) {
    if (!ref $_) { print $_,"\n"; next }  # print text lines
    my $opt = $_->{OPT};
    $opt =~ s/!$//;
    $opt =~ s/=s$/ [X]/;
    $opt =~ s/=i$/ [N]/;
    $opt =~ s/=f$/ [n.n]/;
    printf "  --%-13s %s%s.\n",$opt,$_->{DESC},
           defined($_->{DEFAULT}) ? " [$_->{DEFAULT}]" : "";
  }

  print "DOCUMENTATION\n  $URL\n";
  exit($exitcode);
}
 
#----------------------------------------------------------------------
