#!/usr/bin/env python
# 
# Copyright 2009 Mark Fiers, Plant & Food Research
# 
# This file is part of Moa - http://github.com/mfiers/Moa
# 
# Moa is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# 
# Moa is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Moa.  If not, see <http://www.gnu.org/licenses/>.
# 
"""
Efficient extraction of sequences from a multifasta file
"""

import re
import os
import sys
import time
import copy
import string
import optparse

def fastareader(f):
    """
    Take a filename (`f`), open and read it. 

    This Function yields tuples:

        (seq-id, rest-of-header, sequence)

    """
    F = open(f)
    id, header, seq = "", "", []

    while True:
        l = F.readline()
        if not l: break
        
        l = l.strip()
        if not l: continue

        if l[0] == '>':
            if id  and seq:
                yield id, header, "".join(seq)
            seq = []
            id = l[1:].split()[0]
            header = " ".join(l[1:].split()[1:])
                        
        else:
            seq.append("".join(l.split()).lower())

    if id and seq:
        yield id, header, "".join(seq)

    F.close()

## get & parse command line options
parser = optparse.OptionParser()

parser.add_option('-l', '--list', dest='list',
    help = 'list of ids to extract')

parser.add_option('-f', '--fasta', dest='fasta',
    help = 'fasta file to extract from')

parser.add_option('-t', '--to', dest='to',
    help = 'write to this file (stdout if undefined)')


parser.add_option('-o', '--order', dest='order', action='store_true',
    help = 'keep the order of the list')

parser.add_option('-r', '--rename', dest='rename', action='store_true',
    help = 'Rename the sequences - requires a id file with two columns')

parser.add_option('-R', '--regex', dest='regex',
                   help='regular expression to extract the identifier ' + \
                      'from the fasta headers')

(options, args) = parser.parse_args()

if not options.list:
    raise "Must define a list!"
if not options.fasta:
    raise "Must define a fasta file!"

def g(*a):
    sys.stderr.write("%s\n" % "\t".join(map(str, a)))

# determine what IDs to read
if options.list == '-':
    rawIds = sys.stdin.read().split()
else:
    rawIds = open(options.list, 'U').readlines()

rawIds2 =  [x.split()[0].replace('>', '') for x in rawIds if x.strip()]
ids = [x.replace('*','') for x in rawIds2]
revcomp = set([x[1:] for x in rawIds2 if x[0] == '*'])

if options.rename:
    newIds = [x.split()[1] for x in rawIds if x.strip()]

g(rawIds[:5])
g(ids[:5])
g(list(revcomp)[:5])

g("read %d ids" % len(ids))
g("first three ids: %s" % (" ".join(ids[:3])))
        
def rc(sequence):
    return sequence.upper().translate(string.maketrans("ATCG", "TAGC"))[::-1]

output = {}
idsSeen = []

i = 0
cre = None

OUTHANDLE = sys.stdout
if options.to:
    OUTHANDLE = open(options.to, 'w')

def out(s):
    OUTHANDLE.write("%s\n" % s)

if options.regex:
    cre = re.compile(options.regex)
    
i = 0
for seqId, header, seq in fastareader(options.fasta):

    if cre: testId = cre.search("%s %s" % (seqId, header)).groups()[0]
    else: testId = seqId

    if (not testId in ids):
        continue
    
    if testId in idsSeen:
        continue

    #remember the index id - for later use
    idListIndex = ids.index(testId)

    #remember that we've seen this id - prevent reprocessing
    idsSeen.append(testId)

    if testId in revcomp:
        #g(testId, 'revcomp', list(revcomp)[:5])
        seq = rc(seq)
        header += " reverse complement"
    else:
        pass
        #g(testId, 'forward', list(revcomp)[:5])

    i += 1

    if i < 5:
        g('output', testId, header)
    elif i == 5:
        g('suppressing further output')

    #see if the sequence needs to be renamed 
    if options.rename:
        newId = newIds[idListIndex]
        header += " original_id=%s" % testId
    else:
        newId = testId
            
    if options.order:
        output[seqId] = (newId, header, seq)
    else:
        out(">%s %s" % (newId, header))
        while seq:
            out("%s" % seq[:80])
            seq = seq[80:]

if options.order:
    for seqId in ids:
        if seqId in output:
            newId, header, seq = output[seqId]
        elif '*' + seqId in output:
            newId, header, seq = output[seqId]
        else:
            sys.stderr.write("unknown id %s\n"% seqId)
            sys.exit(-1)
        i += 1
        out(">%s %s" % (newId, header))
        while seq:       
            out("%s" % seq[:80])
            seq = seq[80:]

g("written %d sequences" % i)


