#!/software/Python/2.7.11/bin/python

# david - perform DAVID enrichment analysis of gene sets
#
# This program is part of the python package AGEpy
#
# Copyright (c) 2016 - Bioinformatics Core Facility at the
#                      Max Planck Institute for Biology of Ageing,
#                      Cologne, Germany

import argparse

# set
parser = argparse.ArgumentParser(
    formatter_class = argparse.ArgumentDefaultsHelpFormatter, # argparse.RawDescriptionHelpFormatter
    description = "david - perform DAVID enrichment analysis of gene sets",
    epilog = """
    This program is part of the python package AGEpy.
    Author: Sven E. Templer <sven.templer@gmail.com>.
    Copyright (c) 2016 - Bioinformatics Core Facility at the
    Max Planck Institute for Biology of Ageing, Cologne, Germany.
    Please file issues at https://github.com/mpg-age-bioinformatics/AGEpy/issues
    and find documentation at http://agepy.readthedocs.io/en/latest/david/
    """)
# input file
parser.add_argument("table", metavar = "TABLE", nargs = '+',
    help= "Input file name[s], format is text or xls[x], same for each file and auto-detected.")
# input options
parser.add_argument("-d", "--delimiter", metavar = 'CHAR', default = '\\t',
    help = """For text input files, a character used as column delimiter.
    E.g. '\\t' for tsv, ',' for csv, ';' for csv2, etc.""")
parser.add_argument("-c", "--column", metavar = 'N', default = None, type = int,
    help = "Select a column index (0-based) with ids to query. None means all.")
parser.add_argument("-e", "--expression-column", metavar = 'N', default = None, type = int,
    help = "Select a column index (0-based) with gene expression values")
parser.add_argument("-s", "--sheet", metavar = "NAME", default = None, nargs = '+',
    help = "If input is an xls[x] file, select sheets by name. None means all.")
parser.add_argument("-g", "--gtf", metavar = "GTF", default = None,
    help = "A gtf file to convert the ids in the input tables from gene_name to gene_id (as in ENSEMBL gtf files). If no file is provided, no conversion is performed.")
#parser.add_argument("-b", "--background_table", default='NONE',
#   help="Optional background table.Excel file with one sheet. Each column has the name of the set on the column header and the gene names for the genes as in the respective GTF file ")
# query
parser.add_argument("-t", "--DAVIDtypes", metavar = "CATEGORY[,CATEGORY,...]",
    default = 'GOTERM_BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY,BIOCARTA,PFAM,PROSITE',
    help = 'DAVID category types to enrich for. Comma separated string.')
parser.add_argument("-i", "--DAVIDid", metavar = "ID", default = 'ENSEMBL_GENE_ID',
    help = 'DAVID data format id.')
parser.add_argument("-u", "--DAVIDuser", metavar = "EMAIL", default = None,
    help = "DAVID user id (email address).")
parser.add_argument("-p", "--max-pvalue", metavar = "P", default = 0.05, type = float,
    help = "Maximum p-value for enrichment")
parser.add_argument("-n", "--min-genes", metavar = "N", default = 2, type = int,
    help = "Minimum number of genes in term")
#parser.add_argument("-e", "--gene_expression", help="Optional gene expression table.", default='NONE')
#parser.add_argument("-c", "--column", metavar = 'N', help = "Select a column index (0-based) with ids to query. None means all.", default = None, type = int, nargs='+')
# output
parser.add_argument("-o", "--output-folder", metavar = "FOLDER", default = 'david_output',
    help = "Output folder.")
parser.add_argument("-x", "--output-xlsx", action = "store_true",
    help = "Also return xlsx output files, with all categories merged into a single workbook.")
parser.add_argument("-v", "--verbose", action = "store_true",
    help = "Be verbose.")
# parse
args=parser.parse_args()

import time
import codecs
from datetime import datetime
import pandas
import os
import sys
import AGEpy.AGEpy as AGEpy

outsep = '-'
sheets = args.sheet
if sheets is None:
  sheets = [""]

if not os.path.exists(args.output_folder):
  os.makedirs(args.output_folder)

if args.DAVIDuser is None:
  print "error: missing david user id"
  sys.exit(1)

if args.column is None and args.expression_column is not None:
  print "error: expression column (-e) can only be specified when a single column (-c) is used"
  sys.exit(65)

gtf = args.gtf
if gtf is not None:
  if args.verbose:
    print "Reading gtf '" + gtf + "'"
  gtf = AGEpy.readGTF(gtf)
  if args.verbose:
    print "* fetching attributes"
  gname = AGEpy.retrieve_GTF_field('gene_name', gtf)
  gid = AGEpy.retrieve_GTF_field('gene_id', gtf)
  gname.columns = ['2']
  gtf = pandas.concat([gname, gid], axis = 1).drop_duplicates().dropna()
  gtf.columns = ['gene_name','gene_id']
  if args.verbose:
    print gtf.head()

sep = args.delimiter
if sep is not None and sep == '\\t':
    sep = '\t'

def DAVIDannotate(d, q = None):
  """
  Map gene symbols from gtf, and stats to DAVIDenrich tables

  :param d: A pandas data frame, returned from DAVIDenrich
  :param q: A pandas data frame, with the query ids in column 'gene_id' used in DAVIDenrich,
            optionally a column 'gene_expression' for gene expression values, and
            optionally a column 'gene_name' for gene id to name conversion.
            If None or both columns 'gene_expression' and 'gene_name' are missing,
            no changes are made to d.
  """
  add = pandas.DataFrame()
  q.loc[:, 'gene_id'] = q['gene_id'].map(str) # from unicode
  q.loc[:, 'gene_id'] = q['gene_id'].map(str.lower)
  terms = d['termName'].tolist()
  if not 'gene_name' in q.columns and not 'gene_expression' in q.columns:
    return d
  for t in terms:
    #print t # DEVPRINT
    ti = d[d['termName'] == t].copy()
    ti.reset_index(drop = True, inplace = True)
    #print ti # DEVPRINT
    tm = ti.xs(0)['geneIds']
    tm = pandas.DataFrame(data = tm.split(', '))
    tm.columns = ['geneIds']
    tm.loc[:, 'geneIds'] = tm['geneIds'].map(str)
    tm.loc[:, 'geneIds'] = tm['geneIds'].map(str.lower)
    tm = pandas.merge(tm, q, how = 'left', left_on = 'geneIds', right_on = 'gene_id')
    if 'gene_name' in q.columns:
      tg = tm['gene_name'].tolist()
      tg = map(str, tg)
      ti.loc[0, 'geneNames'] = ', '.join(tg)
    if 'gene_expression' in q.columns:
      te = tm['gene_expression'].tolist()
      te = map(str, te)
      ti.loc[0, 'geneExpression'] = ', '.join(te)
    add = pandas.concat([add, ti])
    #print add # DEVPRINT
  add.reset_index(drop = True)
  return add

for f in args.table:
  if args.verbose:
    print "Reading table '" + f + "'"
  f_name, f_suffix = os.path.splitext(os.path.basename(f))
  f_fmt = AGEpy.getFileFormat(f)
  if f_fmt in ["xls", "xlsx"] and args.sheet is None:
    sheets = pandas.ExcelFile(f).sheet_names
  for s in sheets:
    s_name = s
    if s is "":
      s = None
    if args.verbose and s is not None:
      print "* getting sheet '" + s_name + "'"
    d = AGEpy.readDataFrame(f, sheet = s, sep = sep)
    #if args.verbose:
    #  print d.head()
    d_names = list(d)
    ecol = [None]
    if args.column is not None:
      #cols = list(cols[i] for i in args.column) # for nargs='+'
      cols = [d_names[args.column]]
      if args.expression_column is not None:
        ecol = [d_names[args.expression_column]]
    else:
      cols = d_names
    if args.verbose:
      print "* using columns: " + ', '.join(cols)
    for c, e in zip(cols, ecol):
      dc = d[[c]]
      dc.rename(columns = { c : 'gene_id' }, inplace = True)
      if e is not None:
        dc.loc[:,e] = d[[e]]
        dc.rename(columns = { e : 'gene_expression' }, inplace = True)
      if gtf is not None:
        if args.verbose:
          print "* translating ids"
        dc.rename(columns = { 'gene_id' : 'gene_name' }, inplace = True)
        dc = pandas.merge(dc, gtf, left_on = ['gene_name'], right_on = ['gene_name'], how = 'left').dropna()
      i = dc[['gene_id']].drop_duplicates()
      if args.verbose:
        i_n = str(len(i))
        i_nmissing = str(i['gene_id'].isnull().sum())
        i_head = i['gene_id'].head().tolist()
        i_head = map(str, i_head)
        i_head = ', '.join(i_head)
        print "* querying for colum '" + c + "' (missing/total = " + i_nmissing + "/" + i_n + ")"
        print i_head + ', ...'
        sys.stdout.flush()
      i = i['gene_id'].dropna().tolist()
      dc_david = AGEpy.DAVIDenrich(args.DAVIDid, args.DAVIDtypes, args.DAVIDuser,
          i, p = args.max_pvalue, n = args.min_genes, verbose = args.verbose)
      if dc_david is None:
        print "* no results, no output"
        continue
      #print dc_david.head()
      dc_david = DAVIDannotate(dc_david, dc)
      #if gtf is not None:
      #  dc_david = AGEpy.id_nameDAVID(dc_david, name_id = gtf)
      out_name = args.output_folder + '/' + f_name + outsep + s_name + outsep + str(c) #f_suffix
      if args.output_xlsx:
        print "* writing " + out_name + '.xlsx'
        writer = pandas.ExcelWriter(out_name + '.xlsx', engine = 'xlsxwriter')
      cats = list(set(dc_david['categoryName'].tolist()))
      for cat in cats:
        dc_david_t = dc_david[dc_david['categoryName']==cat]
        print "* writing " + out_name + outsep + cat + '.tsv'
        dc_david_t.to_csv(out_name + outsep + cat + '.tsv', sep = '\t', index = False)
        if args.output_xlsx:
          dc_david_t.to_excel(writer, cat, index = False)
      if args.output_xlsx:
        writer.save()

print "Done"
sys.stdout.flush()
