#!/usr/bin/env python

#  Copyright 2015 by Leipzig University Library, http://ub.uni-leipzig.de
#                    The Finc Authors, http://finc.info
#                    Martin Czygan, <martin.czygan@uni-leipzig.de>
#
# This file is part of some open source application.
#
# Some open source application is free software: you can redistribute
# it and/or modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later version.
#
# Some open source application is distributed in the hope that it will
# be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Foobar.  If not, see <http://www.gnu.org/licenses/>.
#
# @license GPL-3.0+ <http://spdx.org/licenses/GPL-3.0+>

# ----
#
# During a software update, processes and tasks change. Instead of keeping track
# of all the things that need to be rebuilt, we purge everything, except a few
# tasks, which take a long time to run (e.g. downloading 70G over HTTP or 300G
# over FTP, etc.).

# The update process then looks like this:

# * update software (e.g. via pip install -U siskin)
# * run `taskpurge` to cleanup all "derived" artifacts
# * run your things as you usually do and expect higher latency during the first run

from __future__ import print_function
from siskin.configuration import Config
import ConfigParser
import os
import re
import shutil
import sys

# Path fragments of task output directories that must survive a purge.
# A directory below the configured home is kept when its path contains one
# of these fragments as a substring (parents of kept directories are kept
# as well). These are the tasks that are expensive to re-run, e.g. large
# downloads.
WHITELIST = [
    # fragments below numeric directories
    '/003/NEPCopy',
    '/028/DOAJDump',
    '/048/GBIDropbox',

    # fragments below named directories
    '/common/FTPMirror',
    '/crossref/CrossrefHarvestChunk',
    '/dblp/DBLPDownload',
    '/hackernews/HNCommentsDownload',
    '/mag/MAGDump', '/mag/MAGFile',
    '/nl/NLSync',
    '/wikipedia/WikipediaArticleDump',
    '/yago/YagoDump',
    '/zdb/ZDBDump',
]

def collect_whitelisted(home, patterns):
    """Return the set of directories below ``home`` matching ``patterns``.

    A directory matches when its path, as yielded by ``os.walk(home)``,
    contains at least one of the given strings as a substring. Every match
    is reported on stderr.
    """
    keep = set()
    for dirpath, _, _ in os.walk(home):
        if any(fragment in dirpath for fragment in patterns):
            print('adding %s to whitelist' % dirpath, file=sys.stderr)
            keep.add(dirpath)
    return keep


def include_parents(paths):
    """Return ``paths`` plus every parent directory of each path.

    The filesystem root (and the empty string for relative paths) is not
    included.
    """
    whitelisted = set()
    for path in paths:
        whitelisted.add(path)
        parent = os.path.dirname(path)
        # Stop at the point where dirname() no longer shortens the path.
        # Comparing against a literal '/' would loop forever on roots such
        # as '//' or a Windows drive root.
        while parent and os.path.dirname(parent) != parent:
            whitelisted.add(parent)
            parent = os.path.dirname(parent)
    return whitelisted


def purge(home, whitelisted, dryrun=True):
    """Remove every directory below ``home`` that is not in ``whitelisted``.

    With ``dryrun`` set, nothing is deleted; instead one ``KEEP`` or
    ``PURGE`` line per visited directory is printed to stdout.

    NOTE: ``home`` itself is visited first, so it is removed as well if it
    is not contained in ``whitelisted`` (e.g. when nothing matched).
    """
    for dirpath, dirnames, _ in os.walk(home):
        if dirpath in whitelisted:
            if dryrun:
                print('KEEP\t%s' % dirpath)
            continue
        if dryrun:
            print('PURGE\t%s' % dirpath)
        else:
            shutil.rmtree(dirpath)
            # The whole subtree is gone; clear the list in place so that
            # os.walk does not try to descend into deleted directories.
            del dirnames[:]


if __name__ == '__main__':
    if len(sys.argv) >= 2 and sys.argv[1] in ('-h', '--help'):
        print('Usage: TASKPURGE=YES taskpurge', file=sys.stderr)
        sys.exit(1)

    # Deleting is opt-in: without TASKPURGE set to 1, TRUE or YES (any
    # case) the script only reports what it would do.
    dryrun = os.environ.get('TASKPURGE', 'NO').upper() not in ('1', 'TRUE', 'YES')

    # Bind the name up front so the error handler below can run even when
    # Config.instance() itself fails.
    config = None
    try:
        config = Config.instance()
        home = config.get('core', 'home')
    except ConfigParser.Error as err:
        config_paths = getattr(config, '_config_paths', None)
        print('invalid configuration @%s: %s' % (config_paths, err), file=sys.stderr)
        sys.exit(1)

    # the set of paths matching the WHITELIST
    keep = collect_whitelisted(home, WHITELIST)

    print('%s paths whitelisted' % len(keep), file=sys.stderr)
    print('including whitelist parents...', file=sys.stderr)

    # whitelisted paths plus all of their parent directories
    whitelisted = include_parents(keep)

    print('%s paths whitelisted' % len(whitelisted), file=sys.stderr)
    print('purging...', file=sys.stderr)

    # actually run the purge
    purge(home, whitelisted, dryrun=dryrun)
