#!/usr/bin/env python

import logging
import argparse
import json
import csv
import sys
import os.path

import cdx_toolkit
import cdx_toolkit.warc

# Deprecation notice, written to stderr on every run.
print('This command is deprecated in favor of cdxt', file=sys.stderr)

# Command line definition.  Options that take a value rely on argparse's
# default 'store' action; dashes in option names become underscores on the
# parsed namespace (e.g. --cc-mirror -> args.cc_mirror).
ARGS = argparse.ArgumentParser(description='cdx_toolkit iterator command line tool')

# -- which index / replay service to talk to
ARGS.add_argument(
    '--cc', action='store_const', const='cc',
    help='direct the query to the Common Crawl CDX/WARCs',
)
ARGS.add_argument(
    '--ia', action='store_const', const='ia',
    help='direct the query to the Internet Archive CDX/wayback',
)
ARGS.add_argument('--source', help='direct the query to this CDX server')
ARGS.add_argument('--wb', help='direct replays for content to this wayback')

# -- query parameters
ARGS.add_argument('--limit', type=int)
ARGS.add_argument('--cc-mirror', help='use this Common Crawl index mirror')
# NOTE(review): --cc-sort is parsed but not referenced again in the visible script.
ARGS.add_argument('--cc-sort', help='default mixed, alternatively: ascending')
ARGS.add_argument('--from')
ARGS.add_argument('--to')
ARGS.add_argument('--closest')  # this is a get() thing, not an iter thing
ARGS.add_argument('--filter', help='see CDX API documentation for usage')

# -- output selection and format
ARGS.add_argument('--all-fields', action='store_true')
ARGS.add_argument(
    '--fields', default='url,status,timestamp',
    help='try --all-fields if you need the list',
)
ARGS.add_argument('--jsonl', action='store_true')
ARGS.add_argument('--csv', action='store_true')

# -- WARC extraction
ARGS.add_argument('--warc', action='store_true', help='create a warc of the output contents')
ARGS.add_argument('--warc-prefix', default='TEST', help='prefix for the warc filename')
ARGS.add_argument('--warc-subprefix', help='subprefix for the warc filename, default None')
ARGS.add_argument(
    '--warc-size', type=int, default=1000000000,
    help='target for the warc filesize in bytes',
)
ARGS.add_argument('--warc-creator', help='creator of the warc: person, organization, service')
ARGS.add_argument('--warc-operator', help='a person, if the creator is an organization')
ARGS.add_argument('--warc-url-fgrep', help='this pattern must be present to warc an url')
ARGS.add_argument(
    '--warc-url-fgrepv',
    help='this pattern must not be present to warc an url, e.g. /robots.txt',
)

# -- mode and diagnostics
ARGS.add_argument(
    '--get', action='store_true',
    help='use a single get instead of a paged iteration',
)
ARGS.add_argument(
    '--verbose', '-v', action='count',
    help='set logging level to INFO (-v) or DEBUG (-vv)',
)

# -- the url (pattern) to look up
ARGS.add_argument('url')

# The raw command line is kept so the WARC branch can record it in the
# 'description' entry of its info dict.
cmdline = ' '.join(sys.argv[1:])
args = ARGS.parse_args()

# Choose the log level.  -vv (or more) selects DEBUG and a single -v selects
# INFO; without -v the LOGLEVEL environment variable is used, falling back
# to WARNING when it is unset or empty.
verbosity = args.verbose or 0  # the 'count' action leaves None when -v is absent
if verbosity > 1:
    loglevel = 'DEBUG'
elif verbosity > 0:
    loglevel = 'INFO'
else:
    loglevel = os.getenv('LOGLEVEL') or 'WARNING'
logging.basicConfig(level=loglevel)
LOGGER = logging.getLogger(__name__)

# An empty url cannot be iterated.
if not args.url:
    raise ValueError('must specify an url to iterate, example: commoncrawl.org/*')

# Resolve the CDX source; when several are given the precedence is
# --cc, then --ia, then --source.
source = args.cc or args.ia or args.source or None
if source is None:
    raise ValueError('must specify --cc, --ia, or a --source')

# Constructor arguments for the fetcher: optional settings are forwarded
# only when they were given a non-empty value.
kwargs = {'source': source}
for option in ('wb', 'cc_mirror'):
    value = getattr(args, option)
    if value:
        kwargs[option] = value

cdx = cdx_toolkit.CDXFetcher(**kwargs)

# Names of the fields kept by winnow_fields() (unless --all-fields is set);
# also the column names of the CSV output below.
fields = set(args.fields.split(','))

# Keyword arguments for the query calls (cdx.get / cdx.iter).  An option is
# forwarded only when it has a truthy value.
kwargs = {}
if args.limit:
    kwargs['limit'] = args.limit
# 'from' is a reserved word, so "args.from" would be a syntax error.
from_ts = getattr(args, 'from', None)
if from_ts:
    kwargs['from_ts'] = from_ts
if args.closest and not args.get:
    LOGGER.info('note: --closest works best with --get')
for option in ('to', 'closest', 'filter'):
    value = getattr(args, option)
    if value:
        kwargs[option] = value

# CSV output: the header row is written up front, columns in sorted order.
# NOTE(review): the columns come from --fields only; with --all-fields the
# rows are unfiltered, and DictWriter's default extrasaction='raise' would
# reject keys outside these columns -- confirm whether --csv --all-fields
# is meant to be supported.
if args.csv:
    writer = csv.DictWriter(sys.stdout, fieldnames=sorted(fields))
    writer.writeheader()


def winnow_fields(obj):
    """Select the part of a record that should be printed.

    With ``--all-fields`` the record is returned as-is (the same object,
    not a copy).  Otherwise a new dict is built that holds only the keys
    named in the module-level ``fields`` set which are actually present
    in ``obj``; requested fields missing from the record are skipped.
    """
    if args.all_fields:
        return obj
    return {name: obj[name] for name in fields if name in obj}


def print_line(printme):
    """Write one record to stdout in the format chosen on the command line.

    ``printme`` is a mapping of field name to value.

    * ``--jsonl``: one JSON object per line, keys sorted.
    * ``--csv``: one row through the module-level ``writer`` (created
      when ``--csv`` was given); ``--jsonl`` wins if both are set.
    * otherwise: ``key value`` pairs joined by ``', '``, keys sorted.
    """
    if args.jsonl:
        print(json.dumps(printme, sort_keys=True))
    elif args.csv:
        writer.writerow(printme)
    else:
        # Format rather than str.join the pair: join() raises TypeError for
        # any non-string value (e.g. an integer status), while string values
        # come out exactly as before.
        pairs = ('{} {}'.format(key, printme[key]) for key in sorted(printme.keys()))
        print(', '.join(pairs))


# Main dispatch.  --get takes precedence over --warc; the WARC branch only
# runs when --warc is given without --get.
if args.warc and not args.get:
    # Metadata handed to the WARC writer.
    info = {
        'software': 'pypi_cdx_toolkit/'+cdx_toolkit.__version__,
        'isPartOf': args.warc_prefix,
        'description': 'warc extraction generated with: cdx_toolkit '+cmdline,
        'format': 'WARC file version 1.0',  # todo: if we directly read a warc, have this match the warc
    }
    # creator/operator are optional and only recorded when supplied
    for key, value in (('creator', args.warc_creator), ('operator', args.warc_operator)):
        if value:
            info[key] = value

    writer = cdx_toolkit.warc.get_writer(
        args.warc_prefix, args.warc_subprefix, info, size=args.warc_size,
    )

    for capture in cdx.iter(args.url, **kwargs):
        url = capture['url']

        # substring filters on the url: --warc-url-fgrep must match,
        # --warc-url-fgrepv must not
        if args.warc_url_fgrep and args.warc_url_fgrep not in url:
            LOGGER.debug('not warcing due to fgrep: %s', url)
            continue
        if args.warc_url_fgrepv and args.warc_url_fgrepv in url:
            LOGGER.debug('not warcing due to fgrepv: %s', url)
            continue

        timestamp = capture['timestamp']
        try:
            record = capture.fetch_warc_record()
        except RuntimeError:  # pragma: no cover
            # a capture whose fetch fails is logged and skipped, not fatal
            LOGGER.warning('skipping capture for RuntimeError 404: %s %s', url, timestamp)
            continue

        # revisit captures are still written; the warning just flags them
        if capture.is_revisit():
            LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
        writer.write_record(record)
else:
    # Plain listing: a single get() call with --get, a paged iteration
    # otherwise.  Each record is filtered and printed the same way.
    if args.get:
        captures = cdx.get(args.url, **kwargs)
    else:
        captures = cdx.iter(args.url, **kwargs)
    for capture in captures:
        print_line(winnow_fields(capture))
