#!/usr/bin/env python
# 
# Copyright 2009 Mark Fiers, Plant & Food Research
# 
# This file is part of Moa - http://github.com/mfiers/Moa
# 
# Moa is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# 
# Moa is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Moa.  If not, see <http://www.gnu.org/licenses/>.
# 
"""
Convert blastn output to gff
"""
import os
import sys
import optparse
import logging

from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML

logging.basicConfig(level=logging.ERROR, 
                    format = "%(levelname)s - %(message)s")
l = logging #shortcut

################################################################################
parser = optparse.OptionParser()
parser.add_option('-d', '--direction', dest='direction',
    help = 'output "query" based, "subject" based or "both"')
parser.set_defaults(output_blasthit = False)
parser.add_option('-b', '--output_blasthit', dest='output_blasthit',
    help = 'output a parent blasthit line?', action="store_true")
parser.add_option('-s', '--source', dest='source',
    help = 'gff source to use, is also incorporated in the feature names/ids')
parser.add_option('--debug', dest='debug',
    help = 'debugging string - no real fixed use here :)')

(options, args) = parser.parse_args()



################################################################################
#blast_parser = NCBIStandalone.BlastParser()

def cleanup_name(name):
    """
    Clean a name from forbidden characters
    """
    return name.replace(';', ','
            ).replace('=', '_'
            ).replace('%', '_'
            ).replace(',', '_'
            ).replace('__', '_')
            
def handle_blast_record(rec):        
    query_id = cleanup_name(
            rec.query.split()[0].replace(">", "").replace("lcl|", ""))
    full_query_name = cleanup_name(
        rec.query.replace(">", "").replace("lcl|", "")
            
        ) 
    
    l.debug("query: %s" % query_id)
    
    count = 0    
    for a in rec.alignments:
        count +=1
        subject_id = cleanup_name(
            a.title.replace(">", "").split()[0].replace("lcl|", ""))             
        full_subject_name =cleanup_name(a.title.replace(">", "").replace("lcl|", "")) 
    
        l.debug("hit against: %s" % subject_id)
        
        m_ss = a.hsps[0].sbjct_start
        m_se = a.hsps[0].sbjct_end
        m_qs = a.hsps[0].query_start
        m_qe = a.hsps[0].query_end
        
        m_eval = a.hsps[0].expect
        m_score = a.hsps[0].score
        
        if a.hsps[0].strand[0] == a.hsps[0].strand[1]:
            m_strand = "+"
        else:
            m_strand = "-"
            
        gff_queries = []
        gff_subjects = []
        

        bh_basenameQ = "%s__%s__%s" % (options.source, query_id, subject_id)
        bh_basenameS = "%s__%s__%s" % (options.source, subject_id, query_id)
        basenameQ = "%s__%s__%s__%s" % (options.source, query_id, subject_id, count)
        basenameS = "%s__%s__%s__%s" % (options.source, subject_id, query_id, count)
        
        l.debug("basename Q is %s" % basenameQ)
        l.debug("basename S is %s" % basenameS)
        

        hcount = 0
        bhss, bhse = 1e100,0
        bhqs, bhqe = 1e100,0

        for h in a.hsps:
            hcount += 1
            ss,se = h.sbjct_start, h.sbjct_end 
            if se < ss: ss,se = se,ss
            
            qs,qe = h.query_start, h.query_end 
            if qe < qs: qs, qe = qe, qs

            if ss < bhss: bhss = ss
            if qs < bhqs: bhqs = qs
            if se > bhse: bhse = se
            if qe > bhqe: bhqe = qe

            if h.strand[0] == h.strand[1]: strand = '+'
            else: strand = '-'
                
            attribQ = ["ID=%s" % basenameQ,
                       "Name=%s" % basenameQ,
                       "Target=Sequence:%s %s %s" % (subject_id, ss, se),
                       "Hsp_expect=%s" % h.expect,
                       "Hsp_identities=%s" % str(h.identities),
                       "Hsp_bits=%s" % h.bits,
                       'Note="%s"' % full_subject_name
                        ]
            
            attribS = ["ID=%s" % basenameS,
                       "Name=%s" % basenameS,
                       "Target=Sequence:%s %s %s" % (query_id, qs, qe),
                        "Hsp_expect=%s" % h.expect,
                        "Hsp_identities=%s" % h.identities,
                        "Hsp_bits=%s" % h.bits,
                       ]

            if options.output_blasthit:
                attribQ.append("Parent=%s" %  bh_basenameQ)
                attribS.append("Parent=%s" %  bh_basenameS)


            gff_queries.append(                
                [query_id, options.source, 'match', qs, qe, 
                h.score, strand, '.', ";".join(attribQ)])
            gff_subjects.append(                
                [subject_id, options.source, 'match', ss, se, 
                h.score, strand, '.', ";".join(attribS)])
        
                
        bh_attribQ = ["ID=%s" % bh_basenameQ,
                     "Name=%s" % bh_basenameQ,
                     "Target=Sequence:%s %s %s" % (subject_id, bhss, bhse),
                     "Hsp_expect=%s" % m_eval,
                     'Note="%s"' % full_subject_name
                        ]
    
            
        bh_attribS = ["ID=%s" % bh_basenameS,
                      "Name=%s" % bh_basenameS,
                      "Target=Sequence:%s %s %s" % (query_id, bhqs, bhqe),
                      "Hsp_expect=%s" % m_eval
                       ]

        if options.output_blasthit:
            gff_queries.append(                
                [query_id, options.source, 'blasthit', bhqs, bhqe, 
                '.', m_strand, '.', ";".join(bh_attribQ)])
            gff_subjects.append(                
                [subject_id, options.source, 'blasthit', bhss, bhse, 
                '.', m_strand, '.', ";".join(bh_attribS)])


        if options.direction == 'both' or options.direction == 'query':
            for gg in gff_queries:
                print "\t".join([str(x) for x in gg])
                
        if options.direction == 'both' or options.direction == 'subject':
            for ss in gff_subjects:
                print "\t".join([str(x) for x in ss])

print "##gff-version 3"
try:
    for blast_record in NCBIXML.parse(sys.stdin):
        handle_blast_record(blast_record)
except Exception, e:
    print "*" * 80
    print "*" * 80
    print "* "
    print "* Failed : %s" % (e)
    print "* "
    print "*" * 80
    print "*" * 80
