#!/software/Python/2.7.11/bin/python

import argparse

# Command-line interface.  Parsed at import time; every later stage of the
# script reads its settings from `args`.
parser = argparse.ArgumentParser(
    description="obo to tsv parser",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
    "-i", "--input",
    help="go-basic.obo file. Files can be downloaded from http://geneontology.org/page/download-ontology."
)
parser.add_argument(
    "-u", "--url",
    help="If no go-basic.obo input file is specified, a url to a target obo file can be specified instead.",
    default='http://geneontology.org/ontology/go-basic.obo'
)
parser.add_argument(
    "-o", "--output",
    help="Name of output tab separated file.",
    default="go-basic.tsv"
)
# type=int makes argparse reject a non-numeric value with a usage message
# instead of letting it surface later as a ValueError traceback.
parser.add_argument(
    "-c", "--cpus",
    help="Number of cpus.",
    type=int,
    default=36
)
parser.add_argument(
    "--organism",
    help="Optional, merge GO obo.tsv with a GO annotation for an organism: either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective  downloded .gz file.",
    default=None
)
args = parser.parse_args()

import pandas as pd
import numpy as np
import multiprocessing as mp
import sys
import contextlib
import cStringIO
@contextlib.contextmanager
def nostdout():
    """Temporarily replace ``sys.stdout`` with an in-memory buffer.

    Everything written to stdout inside the ``with`` body goes into a
    throw-away ``cStringIO.StringIO`` object.  The previous ``sys.stdout`` is
    put back when the body finishes, including when it raises.
    """
    save_stdout = sys.stdout
    sys.stdout = cStringIO.StringIO()
    try:
        yield
    finally:
        # Restore on the error path too; otherwise a failing body would leave
        # the rest of the process printing into the discarded buffer.
        sys.stdout = save_stdout
# Import AGEpy with stdout silenced, so anything the package prints while it
# is being imported does not reach the console.
with nostdout():
    import AGEpy.AGEpy as age
# NOTE(review): presumably a dependency self-check provided by AGEpy; it runs
# outside the silenced block, so its output is shown -- confirm against AGEpy.
age.checkImport()


# Number of worker processes used for the child-term search further down.
n_processors = int(args.cpus)

# Load the OBO text as a list of newline-terminated lines (`lis`).  `r` holds
# the name of the source for the progress message.  A local file takes
# precedence over the URL.
if args.input:
    # `with` closes the handle as soon as the lines have been read.
    with open(args.input) as f:
        lis = f.readlines()
    r = args.input
elif args.url:
    from urllib import urlopen
    # closing() releases the connection once the body has been read.
    with contextlib.closing(urlopen(args.url)) as response:
        f = response.read().split("\n")
    lis = [str(x) + "\n" for x in f]
    r = args.url
else:
    # Only reachable when --url is explicitly set to an empty value; without
    # this branch the script would fail below with a NameError on `r`.
    parser.error("either --input or --url must be given")

print("Finished importing %s" % r)
sys.stdout.flush()

def getTerm(i,lines):
    """Parse one term stanza whose ``id:`` line is ``lines[i]``.

    Parameters
    ----------
    i : int
        Index of the stanza's ``id: ...`` line (the line after ``[Term]``).
    lines : list of str
        Newline-terminated lines of the OBO file.

    Returns
    -------
    tuple
        ``(next_index, GOid, term, cats)`` where ``next_index`` is the index
        just past the line that ended the stanza, ``GOid`` is the text after
        ``"id: "``, ``term`` maps each tag to its value (values of a repeated
        tag are joined with ``"; "``) and ``cats`` lists every tag in the
        order encountered, repeats included.

    The stanza ends at the first ``"\\n"``-only line or at the end of
    ``lines``.  Each line is split on the first ``": "`` only, so values that
    themselves contain ``": "`` are kept whole.  A line without ``": "`` is
    recorded under its full text with an empty value.
    """
    term = {}
    cats = []
    GOid = lines[i].split("\n")[0].split(": ", 1)[1]
    i += 1
    # Bound the scan so a file that does not end with a blank line does not
    # run past the end of the list.
    while i < len(lines) and lines[i] != "\n":
        key, _, value = lines[i].split("\n")[0].partition(": ")
        cats.append(key)
        if key in term:
            term[key] = term[key] + "; " + value
        else:
            term[key] = value
        i += 1
    return i + 1, GOid, term, cats

def collectUpper(x,GOdic):
    """Return the ids listed in the ``is_a`` entry of term *x*.

    ``GOdic[x]["is_a"]`` is a ``"; "``-separated string of entries.  From each
    entry only the text before the first ``" ! "`` is kept.  Raises
    ``KeyError`` when *x* or its ``is_a`` entry is missing.
    """
    entries = GOdic[x]["is_a"].split("; ")
    return [entry.partition(" ! ")[0] for entry in entries]

def checkTop(x,df):
    """Tell whether *x* is a one-element list holding a namespace value.

    Returns ``True`` only when *x* has exactly one element and that element
    occurs in ``df["namespace"]``; otherwise ``False``.

    NOTE(review): the caller in this file passes lists of ids produced by
    ``collectUpper``; confirm those can actually equal a namespace value.
    """
    known_namespaces = set(df["namespace"].tolist())
    return len(x) == 1 and x[0] in known_namespaces

# Walk the file once.  Every line containing '[Term]' opens a stanza that
# getTerm() consumes; it hands back the index at which scanning resumes.
# GOdic maps each term id to its tag dictionary, and cats collects every tag
# name seen across all stanzas as one flat list.
GOdic = {}
cats = []
i = 0
while i < len(lis):
    if '[Term]' not in lis[i]:
        i += 1
        continue
    i, GOid, term, c = getTerm(i + 1, lis)
    GOdic[GOid] = term
    cats.extend(c)

# Preliminary table of all terms; checkTop() reads its "namespace" column.
df = pd.DataFrame.from_dict(GOdic, orient="index")

print("Collecting information on parent terms")
sys.stdout.flush()

# For every term with an is_a entry, climb the is_a links one level at a time
# and store the de-duplicated ids found on the way as a "; "-joined string
# under "parent_terms".  Terms without is_a get no "parent_terms" key.
for GO in GOdic.keys():
    if "is_a" not in GOdic[GO]:
        continue
    level = collectUpper(GO, GOdic)
    levels = [level]
    while not checkTop(level, df):
        next_level = []
        for parent in level:
            if "is_a" in GOdic[parent]:
                next_level.extend(collectUpper(parent, GOdic))
        if not next_level:
            # No term on this level has a parent of its own: top reached.
            break
        levels.append(next_level)
        level = next_level
    ancestors = list(set([term_id for lvl in levels for term_id in lvl]))
    GOdic[GO]["parent_terms"] = "; ".join(ancestors)

#import json
#with open("/beegfs/group_bit/home/JBoucas/GO_test/out.dic", 'w') as configfile:
#    json.dump(GOdic, configfile)
#import json
#with open("/beegfs/group_bit/home/JBoucas/GO_test/out.dic", 'r') as configfile:
#    GOdic=json.load(configfile)
 
# Rebuild the table now that GOdic carries "parent_terms": one row per term,
# the term id moved from the index into a "term" column, and a fresh 0..n-1
# index in its place.
terms_table = pd.DataFrame.from_dict(GOdic, orient="index")
terms_table["term"] = terms_table.index.tolist()
df = terms_table.reset_index(drop=True)

def getChildren(x,dfn=df):
    """Return the terms that list *x* among their ``parent_terms``.

    Parameters
    ----------
    x : str
        Term id to look for.
    dfn : pandas.DataFrame
        Table with a ``"term"`` column and a ``"parent_terms"`` column holding
        ``"; "``-joined ids (or NaN).  Defaults to the module-level ``df`` as
        it exists when this function is defined.

    Returns
    -------
    str
        The matching terms joined with ``"; "``, the single matching term, or
        the literal string ``"None"`` when nothing matches.

    Ids are compared as whole tokens, so an id that is merely a prefix of
    another id (``T:1`` vs ``T:12``) is not reported as a match.
    """
    children = []
    # Walk the two columns directly rather than doing per-cell `.ix` lookups,
    # which are deprecated in pandas and far slower.
    for parents, term in zip(dfn["parent_terms"], dfn["term"]):
        # Terms without any recorded parent hold NaN in this column.
        if str(parents) == "nan":
            continue
        if x in parents.split("; "):
            children.append(term)
    if len(children) > 1:
        children = "; ".join(children)
    elif len(children) == 1:
        children = str(children[0])
    else:
        children = "None"
    return children

def worker(df):
    """Compute the children string for every term in one slice of the table.

    *df* is wrapped in a ``pandas.DataFrame``; ``getChildren`` is applied to
    its ``"term"`` column and the outcome stored as ``"children"``.  Returns a
    list with one ``"<term>-<children>"`` string per row.
    """
    chunk = pd.DataFrame(df)
    chunk["children"] = chunk['term'].apply(getChildren)
    term_text = chunk["term"].astype(str)
    children_text = chunk["children"].astype(str)
    chunk["result"] = term_text + "-" + children_text
    return chunk["result"].tolist()

def correctNones(x):
    """Map the literal string ``"None"`` to NaN; return anything else as is."""
    return np.nan if x == "None" else x


if __name__ == '__main__':
    print("Collecting information on child terms")
    sys.stdout.flush()

    # Split the term table into one slice per worker process and let each
    # process compute the children of the terms in its slice.
    reader = np.array_split(df, n_processors)
    pool = mp.Pool(n_processors)
    try:
        funclist = [pool.apply_async(worker, [d]) for d in reader]
        # Each result is a list of "<term>-<children>" strings.
        frames = []
        for f in funclist:
            covs_ = f.get()
            frames.append(pd.DataFrame(covs_, index=range(len(covs_))))
    finally:
        # All results have been fetched (or a worker failed): stop the worker
        # processes and reap them instead of leaving the pool dangling.
        pool.terminate()
        pool.join()

    dfCov = pd.concat(frames, axis=0)
    dfCov.columns = ['chil']
    # Undo the "<term>-<children>" packing done in worker().  This relies on
    # every packed string containing exactly one "-".
    TSS = pd.DataFrame(dfCov['chil'].str.split("-").tolist())
    TSS.columns = ["term", "children"]
    TSS["children"] = TSS["children"].apply(correctNones)
    df = pd.merge(df, TSS, on="term", how="outer")
    # Fixed output column order.
    col = ['term','synonym','name', 'relationship', 'namespace', 'is_a', 'def', 'subset', 'comment', 'xref', 'is_obsolete', 'consider', 'alt_id','replaced_by', 'parent_terms', 'children']
    df = df[col]
    if args.organism:
        # NOTE(review): presumably returns a pandas DataFrame of gene
        # associations -- confirm against AGEpy.
        org = age.getGeneAssociation(args.organism)
        if "GO ID" in org.columns.tolist():
            df = pd.merge(org, df, left_on=["GO ID"], right_on=["term"], how="left")
        else:
            # No "GO ID" header: pick the first column whose value in the
            # first row contains "GO:".
            # NOTE(review): if no cell matches, `i` is left at the last
            # position and the merge silently uses that column -- confirm
            # this is acceptable.
            check = org.ix[0]
            for i in range(len(check)):
                if "GO:" in check[i]:
                    break
            df = pd.merge(org, df, left_on=[i], right_on=["term"], how="left")
    df.to_csv(args.output, sep="\t", index=None)
    print("Done")
