#!/usr/bin/env python
# 
# Copyright 2009 Mark Fiers, Plant & Food Research
# 
# This file is part of Moa - http://github.com/mfiers/Moa
# 
# Moa is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# 
# Moa is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Moa.  If not, see <http://www.gnu.org/licenses/>.
# 
import os
import sys
import logging
import optparse

from Bio.Blast import NCBIStandalone
from Bio.Blast import NCBIXML

logging.basicConfig(level=logging.ERROR, 
                    format = "## %(levelname)s - %(message)s",
                    stream=sys.stdout)
l = logging

parser = optparse.OptionParser()
parser.set_defaults(noalignments=5)
parser.add_option('-n', dest='noalignments', type="int",
        help = "no of alignments per record to parse")
parser.set_defaults(outfile='report.out')
parser.add_option('-o', dest='outfile',
        help = "outputfile")
(options, args) = parser.parse_args()

    
def processAlignment(data, alignment):
    #gather some statistics
    data['expect'] = alignment.hsps[0].expect
    data['identities'] = 0
    data['query_start'] = alignment.hsps[0].query_start
    data['query_end'] = alignment.hsps[0].query_end
    
    for h in alignment.hsps:
        #for k in dir(h):
        #    try: print "%s\t%s" % (k, h.__dict__[k])
        #    except: pass
        if h.expect < data['expect']: 
            data['expect'] = h.expect
        qs, qe = h.query_start, h.query_end
        if qs > qe: qs, qe = qe, qs
        
        if qs < data['query_start']:  
            data['query_start'] = qs
        if qe > data['query_end']:  
            data['query_end'] = qe
        data['identities'] += h.identities
            
    data['accession'] = alignment.accession
    data['hit_def'] = alignment.hit_def
    
    G.write(
        "%(query)s\t%(accession)s\t%(expect).2e\t%(identities)s\t%(hit_def)s\n" % data)

def processRec(rec):
    i = 0     
    data = {
        'query' : rec.query.split()[0]
        }
    
    
    for alignment in rec.alignments:
        i += 1
        l.debug("processing alignment %d" % i)
        processAlignment(data, alignment)
        if i >= options.noalignments: break
        
    return i    
    
#parse all files in dir x
i = 0; r = 0; empty = 0

G = open(options.outfile, 'w')

for infile in os.listdir(args[0]):
    i +=1 ; j = 0
    l.debug("Processing %s" % infile)
    F = open(os.path.join(args[0], infile))
    #try:
    records = NCBIXML.parse(F)
    #except ValueError:
    #    l.error("Error parsing %s" % os.path.join(sys.argv[1], infile))
        
    while True:
        j+=1    
        try:
            rec = records.next()
        except ValueError:
            l.error("#### Error parsing %s" % infile)
        except StopIteration:
            l.debug("#### Done iterating %s" % infile)
            break
            
        l.debug("#### file %d rec %d" % (i,j))
        r += 1
        noals = processRec(rec)
        if noals == 0:
            empty += 1
        
                    
    F.close()
G.close()

l.info("processed %d files" % i)
l.info("processed %d records" % r)
l.info("processed %d empty records" % empty)

