#!/usr/bin/env python
# 
# Copyright 2009 Mark Fiers, Plant & Food Research
# 
# This file is part of Moa - http://github.com/mfiers/Moa
# 
# Moa is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# 
# Moa is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Moa.  If not, see <http://www.gnu.org/licenses/>.
# 
"""
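Create a gff3 file marking the runs of N/X characters found in an HTG
multifasta file.

Command line arguments: the multifasta file to scan, followed by the
minimum run length (an integer) that should be reported.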
"""

import re
import sys
import time
import copy
import optparse

def fastareader(f):
    """Iterate over the records of a (multi-)FASTA file.

    Parameters:
        f -- path of the FASTA file to read.

    Yields:
        (name, sequence) tuples. `name` is the header line with the
        leading '>' removed (and surrounding whitespace stripped);
        `sequence` is the concatenation of all sequence lines of the
        record, lower-cased and with every whitespace character removed.

    Records with an empty name or without any sequence are skipped, and
    sequence lines appearing before the first header are discarded.

    The file is opened lazily (on the first iteration) and is closed as
    soon as the generator finishes, is closed, or is garbage collected -
    also when the caller stops iterating early.
    """
    name, seq = "", []

    # The `with` block guarantees the handle is released even when the
    # consumer abandons the generator half way or an error is raised.
    with open(f) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # blank lines carry no information
                continue

            if line[0] == '>':
                # A new header: emit the record collected so far, if any
                if name and seq:
                    yield name, "".join(seq)
                seq = []
                name = line[1:]
            else:
                # Drop embedded whitespace and normalise to lower case
                seq.append("".join(line.split()).lower())

    # Emit the last record of the file
    if name and seq:
        yield name, "".join(seq)

# Command line: <multifasta file> <minimum run length>
# Validate the arguments before anything is written, so that a wrong
# invocation does not leave a header-only gff3 file behind.
if len(sys.argv) < 3:
    sys.exit("usage: %s <multifasta file> <minimum N-run length>"
             % sys.argv[0])

inputFile = sys.argv[1]
minLen = int(sys.argv[2])
if minLen < 1:
    # A "{0,}" quantifier would match the empty string at every position
    sys.exit("the minimum N-run length must be at least 1")

# Runs of at least minLen N or X characters (either case)
NRE = re.compile("[NnXx]{%d,}" % minLen)

# NOTE: print is called with a single parenthesised argument so that the
# statements behave identically under Python 2 and Python 3.
print("##gff-version 3")
print("#date %s" % time.ctime())
print("#Autogenerated gff3 file")

for name, seq in fastareader(inputFile):
    # The sequence id is the first whitespace delimited word of the
    # fasta header; splitting on any whitespace keeps tabs out of the
    # (tab separated) gff3 record.
    seqid = name.split(None, 1)[0]
    for i, hit in enumerate(NRE.finditer(seq), 1):
        label = "%s_NRegion_%04d" % (seqid, i)
        print("\t".join(map(str, [
            seqid,
            'NRegion',
            'region',
            # gff3 coordinates are 1-based and inclusive, whereas
            # hit.start() is 0-based and hit.end() is exclusive: only
            # the start needs to be shifted.
            hit.start() + 1,
            hit.end(),
            # the run length is reported in the score column
            hit.end() - hit.start(),
            '.',
            '.',
            "ID=%s;Name=%s" % (label, label)])))



        
        
