#!python
"""Extract subordinate clauses from a text file or directory of text files"""
import argparse
import logging
import os
from qextract import read_in_chunks, nlp

# Constants
OUTPUT_TEXT_FILE_BASE = 'subordinateClausesFrom{}'


def _document_has_subordinating_conj(doc):
    """Takes a spacy nlp document and returns a True if the document contains a
    subordinate conjunction; else false"""
    for word in doc:
        if word.tag_ == 'IN':
            return True
    return False


def _extract_subordinate_clause_type1(sentence):
    """Extract a subordinate clause that opens the sentence.

    The sentence is cut at commas and each piece is tagged separately. The
    first piece whose leading token is tagged ``IN`` starts the clause.
    Later pieces are appended (and the result is flagged for review) until
    a piece begins with a noun, determiner, adjective or pronoun tag, which
    is treated as the start of the main clause.

    Returns a ``(clause, conjunction, flagged)`` tuple; ``conjunction`` is
    None and ``clause`` is ``''`` when no piece starts with an ``IN`` token.
    """
    main_clause_openers = ('NN', 'NNS', 'NNP', 'NNPS', 'DT',
                           'JJ', 'JJR', 'JJS', 'PRP', 'PRP$')
    pieces = []
    conj = None
    needs_review = False

    for segment in sentence.split(','):
        segment = segment.strip()
        tokens = nlp(segment)
        if len(tokens) == 0:
            # An empty piece (e.g. trailing comma) ends the scan.
            break
        lead_tag = tokens[0].tag_
        if conj:
            if lead_tag in main_clause_openers:
                break  # clause followed by main clause
            # Still inside the subordinate clause: keep the piece but mark
            # the result, since the comma made the boundary ambiguous.
            pieces.append(segment)
            needs_review = True
        elif lead_tag == 'IN':
            conj = str(tokens[0])
            pieces.append(segment)

    return ', '.join(pieces), conj, needs_review


def _extract_subordinate_clause_type2(sentence):
    """Extract a subordinate clause found at the end or middle of a sentence.

    The sentence is cut at commas and each piece is tagged separately. The
    first piece whose leading token is tagged ``IN`` starts the clause.
    Later pieces are appended (and the result is flagged for review) until
    a piece begins with an adverb or verb tag, which is treated as the main
    clause resuming.

    If the subordinating conjunction is not the first word after a comma,
    the returned conjunction is None; in that case
    ``_extract_subordinate_clause_type3`` should be attempted.

    Returns a ``(clause, conjunction, flagged)`` tuple.
    """
    # Penn Treebank adverb and verb tags. The verb family is VB, VBD, VBG,
    # VBN, VBP, VBZ; 'VBD' (past tense) must be present so that a main
    # clause resuming with e.g. "did"/"was" terminates the extraction.
    main_clause_resumers = ('RB', 'RBR', 'RBS',
                            'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    subordinate_clause = ''
    subordinating_conj = None
    flagged = False
    for clause in sentence.split(','):
        clause = clause.strip()
        doc = nlp(clause)
        if len(doc) < 1:
            # An empty piece (e.g. trailing comma) ends the scan.
            break
        first_tag = doc[0].tag_
        if not subordinating_conj and first_tag == 'IN':
            subordinating_conj = str(doc[0])
            subordinate_clause += clause
        elif subordinating_conj and first_tag not in main_clause_resumers:
            # Still inside the subordinate clause; flag because the comma
            # made the boundary ambiguous.
            subordinate_clause += ', ' + clause
            flagged = True
        elif subordinating_conj:
            break  # clause followed by main clause
    return subordinate_clause, subordinating_conj, flagged

def _extract_subordinate_clause_type3(sentence):
    """Extract a subordinate clause that sits mid-sentence, not after a comma.

    The clause runs from the first token tagged ``IN`` up to (but not
    including) the first following comma or period. Example::

        Diane decided to plant tomatoes in the back of the yard where the
        sun blazed the longest during the day.

    Note: using this on sentences that are not sure to contain a
    subordinating conjunction gives unreliable results; when no ``IN``
    token exists at all, ``('', None, False)`` is returned.

    Returns a ``(clause, conjunction, flagged)`` tuple where ``conjunction``
    is the conjunction's text and ``flagged`` is always False.
    """
    doc = nlp(sentence)
    conj = next((token for token in doc if token.tag_ == 'IN'), None)
    if conj is None:
        return '', None, False

    # Slice at the token's character offset instead of splitting on its
    # text: a text split can match inside an earlier word (e.g. "as" in
    # "was") and drops everything after a second occurrence.
    remainder = sentence[conj.idx:]
    subordinate_clause = remainder.split(',')[0].split('.')[0]

    return subordinate_clause, str(conj), False


def _has_subordinate_clause(sentence):
    '''If this has a subordiate clause return an object with the clause,
    subordinate conjuction, and whether or not it is 'flagged' for review'''
    result = {'clause': '', 'subordinating_conj': '', 'flagged': False}
    sentence = sentence.strip()
    # Short circuit questions
    if len(sentence) == 0 or sentence[-1] == '?':
        return None

    # TODO: does not cover clauses that begin with a relative (WP, WP$) pronoun
    
    # If sentence begins with subordinating conjuction:
    # Start at subordinating conj and end with period or
    # a comma followed by a noun/pronoun/adj/determiner
    # ie. Until the end of time, he would love her.
    #     becomes,
    #     Until the end of time
    # NOTE: With short introductory phrases, a comma is not always used. 
    # For example,
    # > After the holidays the girls went to the Blackheath High School.
    # is a sentence that might be found in a book, still, we believe it would be
    # better with a comma after holidays and would suggest the student add one.
    #
    # If subordinating conj is in the middle of the sentence and after a comma:
    # Start at subordinating conj and end with period or comma followed by
    # adverb/verb/
    # ie. Peter, with heroic unselfishness, did not say anything.
    #     becomes,
    #     with heroic unselfishness
    #
    # If the subordinating conj in the middle of a sentence and not after a
    # comma:
    # Start at the subordinating conjunction and end with the first comma or
    # period.
    # ie. Jonathon spent his class time reading comic books since his average
    #     was a 45 one week before final exams.
    #     becomes,
    #     since his average was a 45 one week before final exams.
    

    doc = nlp(sentence)
    if doc[0].tag_ == 'IN':
        subordinate_clause, subordinating_conj, flagged =  _extract_subordinate_clause_type1(sentence)
    elif _document_has_subordinating_conj(doc):
        subordinate_clause, subordinating_conj, flagged =  _extract_subordinate_clause_type2(sentence)
        if not subordinating_conj:
            _extract_subordinate_clause_type3(sentence)
    else:
        subordinating_conj = None
            
    if subordinating_conj:
        result['clause'] = subordinate_clause
        result['subordinating_conj'] = subordinating_conj
        result['flagged'] = flagged
    else:
        result = None

    return result


def _write_clauses(sentences, output):
    """Write one record to ``output`` per sentence with a subordinate clause."""
    for sent in sentences:
        sent = sent.replace('\n', ' ')
        clause = _has_subordinate_clause(sent)
        if clause:
            output.write("{}\n{}\n{}\n"
                         "{}\n\n\n\n\n".format(sent, clause['clause'],
                                               clause['subordinating_conj'],
                                               clause['flagged']))


def _extract_from_file(input_file, output_file='qextract.out'):
    """Write the subordinate clauses found in ``input_file`` to ``output_file``.

    Results are written to ``output_file + '.working'`` while processing is
    under way; the file is renamed to ``output_file`` only after the whole
    input has been handled, so a ``.working`` file marks an unfinished run.

    Any exception is logged and re-raised; both files are closed first and
    the ``.working`` file is left in place.
    """
    working_file = output_file + '.working'
    try:
        with open(input_file, 'r') as f, open(working_file, 'w+') as output:
            # The final sentence of a chunk may be cut off mid-sentence, so
            # it is held back and prepended to the next chunk.
            leftovers = ''
            for chunk in read_in_chunks(f):  # lazy read in case file is large
                # replace semi colons with periods
                chunk = (leftovers + chunk).replace(';', '.')
                doc = nlp(chunk)

                # ``.text`` is used rather than ``.string`` (removed in
                # spaCy v3); after strip() both give the same text. Empty
                # sentences are dropped so rpartition never gets ''.
                sents = [text for text in
                         (sent.text.strip() for sent in doc.sents) if text]
                if len(sents) > 1:
                    leftovers = sents[-1] + chunk.rpartition(sents[-1])[-1]
                    sents = sents[:-1]
                else:
                    # Everything in this chunk is consumed below; do not
                    # carry stale text into the next chunk.
                    leftovers = ''
                _write_clauses(sents, output)

            # Whatever was held back from the last chunk is the real end of
            # the file; process it instead of dropping it.
            if leftovers.strip():
                tail = [text for text in
                        (sent.text.strip() for sent in nlp(leftovers).sents)
                        if text]
                _write_clauses(tail, output)
    except Exception:
        logging.exception('error on %s', input_file)
        raise
    # remove the .working extension to show the file is finished
    os.rename(working_file, output_file)


def _extract_from_directory(inputdir, outputdir='qextract_output'):
    os.makedirs(outputdir, exist_ok=True)
    existing_books = [os.path.join(outputdir, os.fsdecode(f1)) for f1 in
            os.listdir(outputdir)]
    for f in os.listdir(inputdir):
        input_filename = os.path.join(inputdir, os.fsdecode(f))
        output_filename = os.path.join(outputdir,
                OUTPUT_TEXT_FILE_BASE.format(os.fsdecode(f)))
        if output_filename not in existing_books:
            _extract_from_file(input_filename, output_filename)


if __name__ == '__main__':
    # Command-line entry point: a directory input takes precedence over a
    # single-file input; output arguments are optional in both modes.
    cli = argparse.ArgumentParser(
        description='Extract subordinate clauses from arbitrary text.')
    cli.add_argument(
        '-i', '--inputfile',
        help='Extract subordinate clauses from here.')
    cli.add_argument(
        '-I', '--inputdir',
        help='Extract subordinate clauses from files in this input directory.')
    cli.add_argument(
        '-o', '--outputfile',
        help='write output to this file')
    cli.add_argument(
        '-O', '--outputdir',
        help='write output to this directory.')
    args = cli.parse_args()

    if args.inputdir:
        if args.outputdir:
            _extract_from_directory(args.inputdir, args.outputdir)
        else:
            _extract_from_directory(args.inputdir)
    elif args.inputfile:
        if args.outputfile:
            _extract_from_file(args.inputfile, args.outputfile)
        else:
            _extract_from_file(args.inputfile)

    # Generate ignore messages
    file_args_given = args.inputfile or args.outputfile
    if args.inputdir and file_args_given:
        logging.warning('inputfile and outputfile unused when directory is '
                        'specified.')
    if args.outputdir and not args.inputdir:
        logging.warning('inputdir required with outputdir.')
    if args.outputfile and not args.inputfile:
        logging.warning('inputfile required with outputfile.')
