#!/usr/bin/env python
# 
# Copyright 2009 Mark Fiers, Plant & Food Research
# 
# This file is part of Moa - http://github.com/mfiers/Moa
# 
# Moa is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# 
# Moa is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Moa.  If not, see <http://www.gnu.org/licenses/>.
# 
"""
Create a gff3 file to go with a HTG multifasta file
"""

import time
import sys
import copy
import optparse

def fastareader(f):
    F = open(f)
    name, seq = "", []
    while True:
        l = F.readline()
        if not l: break
        
        l = l.strip()
        if not l: continue

        if l[0] == '>':
            if name and seq:
                yield name, "".join(seq)
            seq = []
            name = l[1:]
                        
        else:
            seq.append("".join(l.split()).lower())

    if name and seq:
        yield name, "".join(seq)

    F.close()

print "##gff-version 3"
print "#date %s" % time.ctime()
print "#Autogenerated gff3 file"

parser = optparse.OptionParser()

parser.add_option('-p', '--phase', dest='phase',
    help = 'sequence phase')

parser.add_option('-c', '--chromosome', dest='chromosome',
    help = 'from chromosome')

parser.add_option('-s', '--source', dest='source',
    help = 'sequence source')

parser.add_option('-g', '--gi', dest='gi',
    help = 'gi identifier')

parser.add_option('-a', '--accession', dest='accid',
    help = 'accession identifier')

parser.add_option('-x', '--extraSequenceAttributes', dest='xattrs',
    help = 'Read extra attributes to add to *each* Sequence ' + 
           'field from this file')

parser.add_option('--interpret_htg', dest='interpret_htg',
    help = 'interpret the sequence as an HTG sequence', action="store_true")

parser.add_option('--gff3mode', dest='gff3', help='gff 3 mode - do not ' + 
                 'generate sequence records, but use the ##sequence-region' +
                 'tag', action='store_true')

(options, args) = parser.parse_args()
if not options.source:
    raise Exception("Must define a source!")

inputFile = args[0]

attributes = []

if options.phase:
    attributes.append("Htg_phase=%d" % int(options.phase))

if options.chromosome:
    attributes.append('Chromosome=%s' % options.chromosome)

if options.gi:
    attributes.append('Dbxref="NCBI_gi:%s' % options.gi)

if options.accid:
    attributes.append('Dbxref="NCBI_acc:%s' % options.accid) 

count = 0

xattrs = {}
if options.xattrs:
    FX = open(options.xattrs, 'r')
    while True:
        line = FX.readline()
        if not line: break
        line = line.strip()
        if not line: continue
        if line[0] == '#': continue
        ls = line.split("\t")
        if len(ls) != 2: 
            raise Exception("Invalid line %s in %s" % (line, options.xattrs))
        sid, val = ls
        if not xattrs.has_key(sid): 
            xattrs[sid] = val
            #sys.stderr.write("1" + sid + " : " + xattrs[sid]+ "\n")
        else: 
            xattrs[sid] += ";" + sid + " : " + val
            #sys.stderr.write("+"+ xattrs[sid]+ "\n")            
        
for name,seq in fastareader(inputFile):
    
    thisAttributes = copy.deepcopy(attributes)
    count += 1
    if options.interpret_htg and count > 1:
        raise Exception("Did not expect more than one sequence in the " + 
                        "input when interpreting this as an HGT file")
    if ' ' in name:
        seqid, note = name.split(' ', 1)
        thisAttributes.append("Note=%s" % note)
    else:
        seqid = name
        
        
    ls = [seqid]
    ls.append(options.source)
    ls.append('Sequence')
    ls.append(1)
    ls.append(len(seq))
    ls.append('.')
    ls.append('.')
    ls.append('.')
    atts = "Name=%s;ID=%s" % (seqid, seqid)

    if thisAttributes:
        atts += ";%s" % ";".join(thisAttributes)
        
    if xattrs.has_key(seqid):
        atts += ";" + xattrs[seqid]
        
    ls.append(atts)
    
    if options.gff3:
        print "##sequence-region %s %d %d" % (seqid, 1, len(seq))
    else:
        print "\t".join([str(x) for x in ls])

    if options.interpret_htg:
        contig_template = [seqid, "FastaToGff", "Contig", 0, 0, '.', '.', '.'] 
        gap_template = [seqid, options.source, "Gap", 0, 0, '.', '.', '.', ]
        
        fraglist = seq.split("n" * 100)
    
        #rebuild the structure
        cpos = 1
        fragcount = 0
        for frag in fraglist:
            fragcount += 1
            c = copy.copy(contig_template)
            g = copy.copy(contig_template)
            c[3] = cpos
            c[4] = cpos + len(frag) - 1
            g[3] = cpos + len(frag) 
            g[4] = cpos + len(frag) + 99
            cpos += len(frag) + 100
            c.append("ID=%s_contig_%s;Name=%s_contig_%s;Parent=%s" % (
                seqid, fragcount, seqid, fragcount, seqid))
            g.append("ID=%s_gap_%s;Name=%s_gap_%s;Parent=%s" % (
                seqid, fragcount, seqid, fragcount, seqid))
            
            print "\t".join([str(x) for x in c])






