#!/usr/bin/python

import os
import sys
import time
import errno
import fcntl
import atexit
import signal
import struct
import logging
import optparse
import subprocess

os.environ['CONDOR_CONFIG'] = '/etc/condor-ce/condor_config'

import libxml2
import urllib2
import htcondor


# Older condor-python releases expose htcondor.param without a .get() method;
# probe for it and, when it is missing, install a dict.get()-style fallback.
try:
    htcondor.param.get
except AttributeError:
    def _htcondor_param_get(key, default=None):
        """Return htcondor.param[key], or `default` when the key is absent."""
        try:
            value = htcondor.param[key]
        except KeyError:
            value = default
        return value
    htcondor.param.get = _htcondor_param_get

#############################################################
# NOTE: The following locking code was originally authored by
# Brian Bockelman as part of the Gratia project.
##############################################################
# Open file object for the lock file; stays None until ExclusiveLock() runs.
fd = None
# PID of the process that actually acquired the lock.  A fork'ed child
# shares `fd` but must not remove the lock file when it exits.
pid_with_lock = None
def close_and_unlink_lock():
    """
    Exit hook: release the lock file, if one was ever opened.

    The lock file is removed from disk only when the current process is the
    one recorded in pid_with_lock; the file object is closed in every case.
    """
    if not fd:
        return
    owns_lock = (pid_with_lock == os.getpid())
    if owns_lock:
        os.unlink(fd.name)
    fd.close()
atexit.register(close_and_unlink_lock)

def _try_take_lock(lock_fd):
    """
    For internal use only.

    Make a single non-blocking attempt to take the exclusive lock on lock_fd.

    On success the current PID is written to the file and recorded in the
    module-level pid_with_lock, and True is returned.  False is returned when
    the attempt fails with EACCES or EAGAIN; any other IOError propagates.
    """
    global pid_with_lock
    try:
        fcntl.lockf(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        lock_fd.write("%d" % os.getpid())
        pid_with_lock = os.getpid()
        lock_fd.flush()
        return True
    except IOError as ie:
        if ie.errno not in (errno.EACCES, errno.EAGAIN):
            raise
    return False

def ExclusiveLock(given_lock_location = None, timeout=3600):
    """
    Grabs an exclusive lock on $(LOCK)/GeneratorLock.

    If the lock is owned by another process, and that process is older than the
    timeout, then the other process will be signaled.  If the timeout is
    negative, then the other process is never signaled.

    If we are unable to hold the lock, this call will not block on the lock;
    rather, it will throw an exception.

    Parameters:
      given_lock_location: path of the lock file.  When empty/None, the
          GENERATOR_LOCK configuration value is used, defaulting to
          "GeneratorLock" inside the directory named by LOCK.
      timeout: age in seconds passed to check_lock() for the current holder.

    Returns None once the lock is held; the open file object is kept in the
    module-level `fd` and the owning PID in `pid_with_lock`.

    Raises Exception if the lock directory does not exist or the lock could
    not be acquired after all retries; unexpected IOErrors are propagated.
    """

    if not given_lock_location:
        lock_path = htcondor.param['LOCK']
        lock_location = htcondor.param.get(
            'GENERATOR_LOCK', os.path.join(lock_path, "GeneratorLock"))
    else:
        lock_location = given_lock_location

    lock_location = os.path.abspath(lock_location)
    lockdir = os.path.dirname(lock_location)
    if not os.path.isdir(lockdir):
        raise Exception("Lock is to be created in a directory %s which does "
            "not exist." % lockdir)

    global fd
    fd = open(lock_location, "w")

    # POSIX file locking is cruelly crude.  There's nothing to do besides
    # try / sleep to grab the lock, no equivalent of polling.
    # Why hello, thundering herd.

    # An alternate would be to block on the lock, and use signals to interrupt.
    # This would mess up Gratia's flawed use of signals already, and not be
    # able to report on who has the lock.  I don't like indefinite waits!
    max_tries = 5
    for tries in range(1, max_tries+1):
        if _try_take_lock(fd):
            return
        if check_lock(fd, timeout):
            time.sleep(.2) # Fast case; however, we have *no clue* how
                           # long it takes to clean/release the old lock.
                           # Nor do we know if we'd get it if we did
                           # fcntl.lockf w/ blocking immediately.  Blech.
            # Check again immediately, especially if this was the last
            # iteration in the for loop.
            if _try_take_lock(fd):
                return
        # Start the next attempt from a freshly opened file object.
        fd.close()
        fd = open(lock_location, "w")
        logging.error("Unable to acquire lock, try %i; will sleep for %i "
            "seconds and try %i more times." % (tries, tries, max_tries-tries))
        time.sleep(tries)

    raise Exception("Unable to acquire lock")

def check_lock(my_fd, timeout):
    """
    For internal use only.

    Given a fd that is locked, determine which process has the lock.
    Kill said process if it is older than "timeout" seconds.
    This will log the PID of the "other process".

    Parameters:
      my_fd: open file object for the lock file.
      timeout: maximum tolerated age, in seconds, of the lock holder; a
          negative value means the holder is never signaled.

    Returns True when the lock is reported as held by this very process, or
    when the holder was older than the timeout and has been killed (or had
    already exited).  Returns False in every other case.
    """

    pid = get_lock_pid(my_fd)
    if pid == os.getpid():
        return True

    if not pid:
        # get_lock_pid() returns False when it cannot query the lock, and a
        # PID of 0 does not name a single process.  Never signal either.
        logging.warning("Unable to determine which process holds the probe "
            "lockfile.")
        return False

    if timeout < 0:
        logging.warning("Another process, %d, holds the probe lockfile." % pid)
        return False

    try:
        age = get_pid_age(pid)
    except Exception:
        logging.warning("Another process, %d, holds the probe lockfile." % pid)
        logging.warning("Unable to get the other process's age; will not time "
            "it out.")
        return False

    logging.info("Another process, %d (age %d seconds), holds the probe "
        "lockfile." % (pid, age))

    if age <= timeout:
        return False

    try:
        os.kill(pid, signal.SIGKILL)
    except OSError as oe:
        # The holder may have exited between the age check and the kill;
        # that leaves the lock free just the same.
        if oe.errno != errno.ESRCH:
            raise

    return True

# struct-module format used by get_lock_pid() to build and decode the
# `struct flock` buffer on non-Darwin platforms.  Default layout: two shorts,
# four pad bytes, two 64-bit integers, one int, four pad bytes.
linux_struct_flock = "hhxxxxqqixxxx"
try:
    os.O_LARGEFILE
except AttributeError:
    # os.O_LARGEFILE is unavailable: use the layout with long-sized offsets.
    # (The original assigned this to an unused name, so it never took effect.)
    linux_struct_flock = "hhlli"

def get_lock_pid(my_fd):
    """
    Ask the OS (fcntl F_GETLK) which process holds a lock on my_fd.

    Returns the l_pid field of the struct flock filled in by the kernel, or
    False when the query fails with EINVAL (logged as a portability failure).
    Any other IOError is propagated.
    """
    # For reference, here's the definition of struct flock on Linux
    # (/usr/include/bits/fcntl.h).
    #
    # struct flock
    # {
    #   short int l_type;   /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK.  */
    #   short int l_whence; /* Where `l_start' is relative to (like `lseek').  */
    #   __off_t l_start;    /* Offset where the lock begins.  */
    #   __off_t l_len;      /* Size of the locked area; zero means until EOF.  */
    #   __pid_t l_pid;      /* Process holding the lock.  */
    # };
    #
    # Note that things are different on Darwin
    # Assuming off_t is unsigned long long, pid_t is int
    on_darwin = (sys.platform == "darwin")
    try:
        if on_darwin:
            arg = struct.pack("QQihh", 0, 0, 0, fcntl.F_WRLCK, 0)
        else:
            arg = struct.pack(linux_struct_flock, fcntl.F_WRLCK, 0, 0, 0, 0)
        result = fcntl.fcntl(my_fd, fcntl.F_GETLK, arg)
    except IOError as ie:
        if ie.errno != errno.EINVAL:
            raise
        logging.error("Unable to determine which PID has the lock due to a "
            "python portability failure.  Contact the developers with your"
            " platform information for support.")
        return False
    # The PID sits in a different position depending on the platform layout.
    if on_darwin:
        _, _, pid, _, _ = struct.unpack("QQihh", result)
    else:
        _, _, _, _, pid = struct.unpack(linux_struct_flock, result)
    return pid

def get_pid_age(pid):
    """
    Return the number of seconds between now and the st_ctime of /proc/<pid>.

    Raises OSError (from os.stat) when /proc/<pid> cannot be examined.
    """
    current_time = time.time()
    proc_entry = os.stat("/proc/%d" % (pid,))
    elapsed = current_time - proc_entry.st_ctime
    return elapsed
#############################################################
## End locking code from Gratia.
#############################################################

# Text of the generated configuration file.  It has three %-placeholders, in
# order: the generated configuration line(s) (%s), the generation time as an
# integer timestamp (%d), and a human-readable form of that time (%s).
# write_config() fills them in.
CONFIG_TEMPLATE = """\
###############################################################################
#
# HTCondor-CE server *generated* authorization configuration
#
###############################################################################

# DO NOT EDIT:
# This file is AUTO-GENERATED and will be periodically overwritten if the
# condor-ce-collector service is enabled. To force an update, run:
#
#  condor_ce_config_generator
#
# There is one entry per active, registered CE in OSG Topology.

%s

LastCEConfigGenerateTime = %d

COLLECTOR_ATTRS = $(COLLECTOR_ATTRS), LastCEConfigGenerateTime
# GENERATED AT %s
"""

def write_config(config_line):
    """
    Write the generated configuration file into the local config directory.

    The text is CONFIG_TEMPLATE filled in with config_line and the current
    time.  It is first written to a temporary file in the same directory and
    then renamed over "02-ce-collector-auth-generated.conf".

    The target directory is the second comma-separated entry of the
    LOCAL_CONFIG_DIR configuration value (IndexError if there is only one).

    Exits the process with status 2 if the temporary file cannot be opened or
    the rename fails.
    """

    # Surrounding whitespace around the list separator is not part of the path.
    config_dir = htcondor.param['LOCAL_CONFIG_DIR'].split(",")[1].strip()
    config_file = os.path.join(config_dir, "02-ce-collector-auth-generated.conf")
    tmp_config_file = os.path.join(config_dir, "#02-ce-collector-auth-generated.conf")
    logging.info("Will write configuration to %s" % config_file)

    try:
        tmp_fd = open(tmp_config_file, "w")
    except IOError as ie:
        logging.error("Failed to open temporary config file: %s" % str(ie))
        sys.exit(2)
    try:
        generate_time = int(time.time())
        tmp_fd.write(CONFIG_TEMPLATE % (config_line, generate_time, time.ctime(generate_time)))
        # fsync only covers data already handed to the OS, so push Python's
        # own buffer out first.
        tmp_fd.flush()
        os.fsync(tmp_fd.fileno())
    finally:
        # Close the handle even when the write fails.
        tmp_fd.close()
    try:
        os.rename(tmp_config_file, config_file)
    except (IOError, OSError) as ie:
        # os.rename reports failures as OSError.
        logging.error("Failed to rename temporary config file to %s: %s" % (config_file, str(ie)))
        sys.exit(2)
    # Note that we don't call fsync on the directory itself; on a crash,
    # we aren't guaranteed which version (old or new) is present.  Since
    # this is still periodically updated, this should be OK.

def generate_config(hosts):
    """
    Build the COLLECTOR.ALLOW_ADVERTISE_SCHEDD configuration line.

    Each host contributes one "<host>@users.htcondor.org/<host>" entry; the
    entries are appended to the existing value of the setting, separated by a
    comma and a backslash line continuation.
    """
    separator = ", \\\n\t\t"
    entries = []
    for fqdn in hosts:
        entries.append("%s@users.htcondor.org/%s" % (fqdn, fqdn))
    joined = separator.join(entries)
    return "COLLECTOR.ALLOW_ADVERTISE_SCHEDD = $(COLLECTOR.ALLOW_ADVERTISE_SCHEDD), %s" % joined

def parse_config():
    """
    Parse the command line and configure logging.

    Side effects:
      - opts.reconfig is set to True when neither -r nor --no-reconfig was
        given and GENERATOR_RECONFIG (default 'true') equals 'true'.
      - --lock overrides the GENERATOR_LOCK configuration value.
      - The root logger is set up to write to the logfile (--logfile, else
        GENERATOR_LOG, else "GeneratorLog" under the LOG directory); with
        --verbose an extra stream handler is attached.

    Returns the optparse options object.

    Exits with status 1 if the logfile cannot be opened or if positional
    arguments were supplied.
    """
    parser = optparse.OptionParser()
    parser.add_option("-l", "--logfile", dest="logfile", help="Location of the tool's logfile.")
    parser.add_option("--lock", dest="lockfile", help="Location of the tool's lock file.")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Print logging statements to stderr")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", default=False, help="Provide additional logging statements.")
    parser.add_option("--dry-run", dest="dryrun", action="store_true", default=False, help="Dry run only; do not update configuration files.")
    parser.add_option("-r", "--reconfig", dest="reconfig", action="store_true", default=None, help="Force a reconfig to be issued.")
    parser.add_option("--no-reconfig", dest="reconfig", action="store_false", help="Do not reconfig even if GENERATOR_RECONFIG is true.")
    opts, args = parser.parse_args()

    # None means neither -r nor --no-reconfig was given: defer to the config.
    if opts.reconfig is None and htcondor.param.get('GENERATOR_RECONFIG', 'true').lower() == 'true':
        opts.reconfig = True

    if opts.lockfile:
        htcondor.param['GENERATOR_LOCK'] = opts.lockfile

    if not opts.logfile:
        log_dir = htcondor.param['LOG']

        opts.logfile = htcondor.param.get(
            'GENERATOR_LOG', os.path.join(log_dir, "GeneratorLog"))

    level = logging.INFO
    if opts.debug:
        level = logging.DEBUG

    FORMAT = "%(asctime)s (%(levelname)s) %(message)s"
    try:
        logging.basicConfig(filename=opts.logfile, level=level, format=FORMAT)
    except IOError as ie:
        print("Unable to configure logfile:\n%s" % str(ie))
        sys.exit(1)

    if hasattr(htcondor, 'enable_debug'):
        htcondor.enable_debug()

    if opts.verbose:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(fmt=FORMAT))
        rootLogger = logging.getLogger()
        rootLogger.addHandler(handler)

    if args:
        # OptionParser has no print_args(); show the help text and stop
        # instead of failing with an AttributeError.
        print("This tool does not take any positional arguments")
        parser.print_help()
        sys.exit(1)

    return opts

def main():
    """
    Entry point: regenerate the collector authorization configuration.

    Steps: parse options, take the exclusive lock, download the XML document
    at MYOSG_URL, extract every Resource FQDN, build the configuration line
    and, unless --dry-run was given, write the configuration file and
    optionally run condor_ce_reconfig.

    Exits with status 1 when MYOSG_URL is not configured or when
    condor_ce_reconfig returns a non-zero status.
    """
    opts = parse_config()
    logger = logging.getLogger()

    logger.debug("Taking lock on condor_ce_config_generator instance")
    ExclusiveLock()

    url = htcondor.param.get('MYOSG_URL', None)
    if url is None:
        logger.error("MYOSG_URL is missing in HTCondor-CE config.  Unable to proceed.")
        sys.exit(1)

    logger.info("Downloading MyOSG service list from %s" % url)
    fp = urllib2.urlopen(url)
    try:
        payload = fp.read()
    finally:
        # Release the connection whether or not the read succeeded.
        fp.close()
    logger.info("Download complete")

    doc = libxml2.parseMemory(payload, len(payload))
    try:
        ctxt = doc.xpathNewContext()
        try:
            # Convert the nodes to plain strings before the document is freed.
            host_list = [str(text_node) for text_node in ctxt.xpathEval("/ResourceSummary/ResourceGroup/Resources/Resource/FQDN/text()")]
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    config = generate_config(host_list)
    if host_list:
        logger.info("%d CE hosts found" % len(host_list))
        logger.debug("CE Hosts:")
        for host in host_list:
            logger.debug("- %s" % host)
    else:
        logger.warning("No hosts returned")

    if not opts.dryrun:
        logger.debug("Final config:")
        logger.debug(config)
        write_config(config)

        if opts.reconfig:
            logger.info("Configuration file written; now reconfiguring daemons.")
            cmd = ['condor_ce_reconfig']

            # Unless running verbosely, send the command's output to the
            # stream of the first root-logger handler.
            proc_outerr_fd = None
            if not opts.verbose:
                proc_outerr_fd = logger.handlers[0].stream.fileno()

            retcode = subprocess.call(cmd, stdout=proc_outerr_fd, stderr=proc_outerr_fd)
            if retcode != 0:
                logger.warning("condor_ce_reconfig failed.  This may be"
                               " because the condor-ce daemons aren't"
                               " running.  Rerun with --no-reconfig"
                               " to supress this message.")
                sys.exit(1)
    else:
        logger.info("Final config:")
        logger.info(config)
        logger.info("Dry-run enabled; not actually writing configuration file.")

    logger.info("Success!")

# Script entry point: run main() and turn any unexpected failure into a
# logged traceback plus exit status 2.
if __name__ == '__main__':
    try:
        main()
    except SystemExit:
        # Deliberate sys.exit() calls keep their own exit status.
        raise
    except:
        # Anything else is recorded with its traceback via the root logger.
        logging.exception("Uncaught exception:")
        sys.exit(2)

