#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2009 Fermi Research Alliance, LLC
# SPDX-License-Identifier: Apache-2.0

"""
This script downloads HTCondor tarballs from the official condor website.

# Exit codes:
# 0: All good
# 1: Configuration file does not exist

A configuration file is specified through the GET_TARBALLS_CONFIG environment variable.
Alternatively, the script looks for a file named get_tarballs.yaml in the same directory
as the script itself.

This is a sample configuration file:

DESTINATION_DIR: "/var/lib/gwms-factory/condor/"
TARBALL_BASE_URL: "https://research.cs.wisc.edu/htcondor/tarball/"
DEFAULT_TARBALL_VERSION: [ "9.0.16" ] # Can be set to "latest"
CONDOR_TARBALL_LIST:
   - MAJOR_VERSION: "9.0"
     WHITELIST: [ "9.0.7", "9.0.16", "latest" ]
   - MAJOR_VERSION: "10.0"
     WHITELIST: [ "latest" ]
   - MAJOR_VERSION: "10.x"
     DOWNLOAD_LATEST: True # Same as adding "latest" to a WHITELIST. Default False
   - MAJOR_VERSION: "23.0"
     WHITELIST: [ "23.0.0" ]
   - MAJOR_VERSION: "23.x"
     WHITELIST: [ "23.0.0" ]
FILENAME_LIST: [ "condor-{version}-x86_64_CentOS7-stripped.tar.gz", "condor-{version}-x86_64_CentOS8-stripped.tar.gz", "condor-{version}-x86_64_AlmaLinux8-stripped.tar.gz", "condor-{version}-x86_64_Ubuntu18-stripped.tar.gz", "condor-{version}-x86_64_Ubuntu20-stripped.tar.gz", "condor-{version}-aarch64_Stream8-stripped.tar.gz", "condor-{version}-ppc64le_CentOS8-stripped.tar.gz", "condor-{version}-ppc64le_AlmaLinux8-stripped.tar.gz", "condor-{version}-aarch64_AlmaLinux8-stripped.tar.gz" ]
OS_MAP: { "CentOS7":"default,rhel7,linux-rhel7", "CentOS8":"rhel8,linux-rhel8", "AlmaLinux8":"rhel8,linux-rhel8", "Ubuntu18":"ubuntu18,linux-ubuntu18", "Ubuntu20":"ubuntu20,linux-ubuntu20"}
ARCH_MAP: { "x86_64":"default", "ppc64le":"ppc64le", "aarch64":"aarch64" }
XML_OUT: "/etc/gwms-factory/config.d/01-condor-tarballs.xml"

# If neither BLACKLIST nor WHITELIST is specified, every release is downloaded
# The BLACKLIST is ignored if a WHITELIST is specified as well
# WHITELIST: download only those releases
# BLACKLIST: do not download those releases, but download all the rest
# CHECK_LATEST: print a warning if latest version of the major series is not on the factory xml
"""

import argparse
import hashlib
import os
import re
import sys
import tempfile

from collections import UserDict
from distutils.version import StrictVersion
from html.parser import HTMLParser
from urllib import request
from urllib.error import HTTPError
from urllib.parse import urljoin

import yaml


class TarballManager(HTMLParser):
    """This class manages the HTCondor tarballs for a major release (e.g.: 23.0).

    In the constructor, it builds a list of releases by parsing the major
    release web page looking for minor releases (e.g.: 23.0.0) that have
    been released (they have a release directory). The list of releases is
    solely used for validation purposes, and to know the latest release.

    It then offers a method to download a tarball and save it locally,
    and a method to generate the xml snippet to add to the glideinWMS.xml
    tarball configuration section.
    """

    # Tarballs are hashed in pieces of this size so that a large file is never held in memory all at once
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, release_url, filenames, destination, verbose=False):
        """Create a TarballManager object. Parses the tarball htcondor web page to get
        the list of releases. The available releases get saved into self.releases.

        It calls the .feed() function from HTMLParser that in turn calls the overridden
        handle_data() function.

        Args:
          release_url: The main url where to begin looking for releases. It has to be the url
                       of a major release, for example: https://research.cs.wisc.edu/htcondor/tarball/23.0/
                       The list of available releases are here: https://research.cs.wisc.edu/htcondor/tarball/
          filenames: A list of strings indicating the tarballs that need to be downloaded for each release found.
                     As an example, [ "condor-{version}-x86_64_CentOS7-stripped.tar.gz", "condor-{version}-x86_64_CentOS8-stripped.tar.gz" ]
                     will download the x86_64 CentOS7 and CentOS8 tarballs by looking for urls that look like:
                     https://research.cs.wisc.edu/htcondor/tarball/23.0/23.0.0/release/condor-23.0.0-x86_64_CentOS7-stripped.tar.gz
                     The substring {version} gets expanded to the current version.
          destination: the directory where files will be downloaded when the download_tarballs method is called
          verbose: More printouts when files are being downloaded if True
        """
        super().__init__()
        self.releases = []
        self.filenames = filenames
        self.release_url = release_url
        self.destination = destination
        self.downloaded_files = []
        self.latest_version = None  # absolute latest, does not consider whitelists and blacklists
        self.verbose = verbose

        # The response is used as a context manager so the connection is closed as soon as the page is read
        with request.urlopen(self.release_url) as response:
            page = response.read().decode("utf-8")
        self.feed(page)
        if not self.releases:
            print(f"Cannot find any release in {self.release_url}")
        else:
            self.releases.sort(key=StrictVersion)
            self.latest_version = self.releases[-1]

    def handle_data(self, data):
        """Internal method. Override the base class handle_data.

        Text that looks like a version directory ("23.0.0/") is recorded in self.releases
        when its "release" sub-directory can be opened. A 404 means the version has not
        been released and is silently skipped; any other HTTP error is propagated.
        """
        if not re.match(r"\d+\.\d+\.\d+/", data):
            return
        try:
            # Only the outcome of the request matters, so the response is closed right away
            request.urlopen(self.release_url + "/" + data + "release").close()
        except HTTPError as err:
            if err.getcode() != 404:
                raise
        else:
            self.releases.append(data[:-1])  # drop the trailing "/"

    @staticmethod
    def _parse_checksums(lines):
        """Internal function. Turn the lines of a sha256sum.txt file into a {filename: checksum} dictionary.

        Both sha256sum output forms are accepted: "<hash>  <file>" (text mode) and
        "<hash> *<file>" (binary mode). Blank or malformed lines are ignored.
        """
        checksums = {}
        for line in lines:
            fields = line.split(None, 1)
            if len(fields) != 2:
                continue
            fhash, filename = fields
            filename = filename.strip()
            if filename.startswith("*"):  # binary-mode marker, not part of the file name
                filename = filename[1:]
            checksums[filename] = fhash
        return checksums

    def download_tarballs(self, version):
        """Download a specific set of condor tarballs from the release_url link
         All the OS and architecture tarballs for the specified condor version are downloaded.
         The set of OS and architecture files are specified in the constructor using filenames

         The method also checks the tarball checksum (by downloading the sha256sum.txt file)
         If a tarball already exists and its checksum is correct then it is skipped.
         If a specific os/architecture tarball is not available it is skipped, and a message is
         printed on stdout if verbose has been set to True in the constructor.

         Successfully verified files are appended to self.downloaded_files. HTTP errors other
         than a 404 on a tarball are propagated to the caller.

        Args:
          version: The condor version to download among this major release. E.g.: "23.0.1"
        """
        # Built with plain string operations: os.path.join would use the platform path separator
        desturl = "/".join([self.release_url.rstrip("/"), version, "release/"])
        with tempfile.TemporaryDirectory() as tmp_dir:
            hash_file = os.path.join(tmp_dir, "sha256sum.txt")
            request.urlretrieve(urljoin(desturl, "sha256sum.txt"), hash_file)
            with open(hash_file) as hf:
                checksums = self._parse_checksums(hf)

        for fname in self.filenames:
            tname = fname.format(version=version)  # tarball name
            dest_file = os.path.join(self.destination, tname)
            if os.path.isfile(dest_file):
                if self.verify_checksum(tname, checksums):
                    if self.verbose:
                        print(f"\tFile {dest_file} already exists. Continuing with next file")
                    self.downloaded_files.append(dest_file)
                    continue
                print(
                    f"\tRe-downloading {dest_file} since it exists but it has a wrong checksum (or checkusm does not exist)"
                )

            try:
                request.urlretrieve(urljoin(desturl, tname), dest_file)
            except HTTPError as err:
                if err.getcode() != 404:
                    raise
                if self.verbose:
                    print(f"\tFile {tname} is not available at {desturl}. Continuing with next file")
                continue

            isok = self.verify_checksum(tname, checksums)
            if isok:
                print(f"\tFile {tname} successfully downloaded")
                self.downloaded_files.append(dest_file)
            elif isok is False:
                print(f"\tChecksum verification failed for file {tname} at {desturl}. Continuing with next file")
            else:  # None: no checksum published for this file
                print(
                    f"\tFile {tname} successfully downloaded but checksum not available at {desturl} (check 'sha256sum.txt')"
                )

    def verify_checksum(self, tname, checksums):
        """Internal function to verify the checksum of a file

        Args:
          tname: Name of the tarball, relative to the destination directory
          checksums: Dictionary mapping tarball names to their expected sha256 hex digest

        Returns:
          True if the sha256 of the file matches, False if it does not, and None if
          checksums has no entry for tname (the file is not read in that case).
        """
        expected = checksums.get(tname)
        if expected is None:
            return None

        digest = hashlib.sha256()
        with open(os.path.join(self.destination, tname), "rb") as f:
            for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected

    def generate_xml(self, os_map, arch_map, whitelist, blacklist, default_tarball_version):
        """Generate the XML snippet to be used in the <tarball> section of the glideinWMS.xml configuration.

        One <condor_tarball> line is produced for each file in self.downloaded_files. The latest
        version (of the whitelist if given, otherwise of the non-blacklisted releases) also gets
        the "<major>.0.x" or "<major>.x" alias, and default versions get the "default" alias.

        Args:
          os_map: A map that indicates how to translate the OS in the tarball filename to the os attribute in the xml.
                  See OS_MAP in the configuration template.
          arch_map: A map that indicates how to translate the ARCH in the tarball filename to the arch attribute in the xml.
                    See ARCH_MAP in the configuration template.
          whitelist: The whitelist that tells the method which versions of condor have been downloaded.
          blacklist: The blacklist in case it was used.
          default_tarball_version: The default condor tarball version(s), as a list of versions or a single
                                   version string. "default" will be added to the version attribute in the xml

        Returns:
          The xml lines as a single string (empty if no file has been downloaded)

        Raises:
          KeyError: if the OS or the architecture of a downloaded file is not in os_map / arch_map
          ValueError: if a downloaded file name is not made of four "-" separated parts
        """
        xml_snippet = '      <condor_tarball arch="{arch}" os="{os}" tar_file="{dest_file}" version="{version}"/>\n'

        # A bare string would turn the membership test below into a substring match
        # ("23.0.1" in "23.0.10" is True), so a single version is wrapped into a list
        if default_tarball_version is None:
            default_versions = []
        elif isinstance(default_tarball_version, str):
            default_versions = [default_tarball_version]
        else:
            default_versions = default_tarball_version

        candidates = whitelist if whitelist else set(self.releases) - set(blacklist)
        # No candidate at all (e.g. no release found on the web page) simply means no "latest" alias
        latest_version = sorted(candidates, key=StrictVersion)[-1] if candidates else None

        lines = []
        for dest_file in self.downloaded_files:
            # File names look like condor-<version>-<arch>_<os>-stripped.tar.gz
            _, sversion, os_arch, _ = os.path.basename(dest_file).split("-")
            arch, opsystem = os_arch.rsplit("_", 1)  # the arch itself may contain "_" (x86_64)
            version = sversion  # sversion = "split" version
            if sversion == latest_version:
                major, minor, _ = sversion.split(".")
                version += f",{major}.0.x" if minor == "0" else f",{major}.x"
            if sversion in default_versions:
                version += ",default"
            lines.append(
                xml_snippet.format(arch=arch_map[arch], os=os_map[opsystem], dest_file=dest_file, version=version)
            )
        return "".join(lines)


class Config(UserDict):
    """Used to store information about the configuration file. yaml file gets converted
    to a dictionary by using yaml.load.
    """

    def __init__(self):
        """Build the dictionary using yaml.load. The configuration file is located by
        looking at the environment variable GET_TARBALLS_CONFIG, or by looking for
        a file named get_tarballs.yaml in the script directory (os.path.abspath(__file__))

        The process exits with code 1 if the configuration file does not exist.
        """
        script_dir = os.path.dirname(os.path.abspath(__file__))
        # An unset or empty environment variable falls back to the file next to the script
        config_file = os.environ.get("GET_TARBALLS_CONFIG") or os.path.join(script_dir, "get_tarballs.yaml")
        if not os.path.isfile(config_file):
            print(f"Configuration file {config_file} does not exist")
            sys.exit(1)
        with open(config_file) as cf:
            # NOTE(review): FullLoader can build more than plain YAML types; the documented
            # configuration only needs scalars, lists and maps, so yaml.safe_load should be enough
            config = yaml.load(cf, Loader=yaml.FullLoader)
        super().__init__(config)
        self.validate()

    def validate(self):
        """Validate the configuration file, sort the whitelists and blacklists, etc

        For every entry of CONDOR_TARBALL_LIST:
          - WHITELIST and BLACKLIST are set to an empty list when missing or null
          - every "latest" in the WHITELIST is removed and DOWNLOAD_LATEST is set to True instead
          - both lists are sorted by version

        Raises:
          KeyError: if CONDOR_TARBALL_LIST is not in the configuration
        """
        for major_dict in self["CONDOR_TARBALL_LIST"]:
            for key in ("WHITELIST", "BLACKLIST"):
                # A key with no value in the yaml file is loaded as None: treat it like a missing key
                if major_dict.get(key) is None:
                    major_dict[key] = []

            whitelist = major_dict["WHITELIST"]
            if "latest" in whitelist:
                # Drop every occurrence: a leftover "latest" is not a version and would break the sort below
                whitelist[:] = [version for version in whitelist if version != "latest"]
                major_dict["DOWNLOAD_LATEST"] = True
            whitelist.sort(key=StrictVersion)
            major_dict["BLACKLIST"].sort(key=StrictVersion)


def check_xml(release, xml_file="/etc/gwms-factory/glideinWMS.xml"):
    """Tell whether a condor release is mentioned in a condor_tarball line of the factory xml. Deprecate?

    Args:
      release: The condor version to look for, e.g.: "23.0.1"
      xml_file: The file to scan. Defaults to the factory configuration file

    Returns:
      True if a line contains "condor_tarball" followed by the release, False otherwise
    """
    # The release is escaped so that "." only matches a literal dot, and it must not be
    # surrounded by other digits so that "23.0.1" does not match "23.0.10"
    pattern = re.compile(r"condor_tarball.*(?<!\d)" + re.escape(str(release)) + r"(?!\d)")
    with open(xml_file) as myfile:
        return any(pattern.search(line) for line in myfile)


def save_xml(dest_xml, xml):
    """Write the tarball xml snippet to disk, wrapped in the <glidein> and
    <condor_tarballs> tags that make it a complete configuration fragment.

    Args:
      dest_xml: Path of the file to write. An existing file is overwritten
      xml: The <condor_tarball> lines, as a single string
    """
    pieces = (
        "<glidein>\n",
        "   <condor_tarballs>\n",
        xml,
        "   </condor_tarballs>\n",
        "</glidein>\n",
    )
    with open(dest_xml, "w") as out_file:
        out_file.writelines(pieces)


def parse_opts():
    """Read the command line with ArgumentParser; --verbose is the only option.

    The module docstring is shown verbatim as the help description.

    Returns:
      The parsed arguments namespace (with a boolean "verbose" attribute)
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
        prog="get_tarballs",
    )
    cli.add_argument("--verbose", help="Be more loud when downloading tarballs file", action="store_true")
    return cli.parse_args()


def main():
    """Download the configured tarballs for each major version and, if XML_OUT is set,
    save the matching <condor_tarballs> xml snippet.

    For every entry of CONDOR_TARBALL_LIST the whitelisted versions are downloaded or,
    without a whitelist, every release that is not blacklisted.
    """
    args = parse_opts()
    config = Config()
    # urljoin replaces the last path segment of a base url that does not end with "/"
    release_url = config["TARBALL_BASE_URL"].rstrip("/") + "/"
    default_tarball_version = config["DEFAULT_TARBALL_VERSION"]
    xml_out = config.get("XML_OUT")
    xml_parts = []

    for major_dict in config["CONDOR_TARBALL_LIST"]:
        major_version = major_dict["MAJOR_VERSION"]
        print(f"Handling major version {major_version}")
        manager = TarballManager(
            urljoin(release_url, major_version), config["FILENAME_LIST"], config["DESTINATION_DIR"], args.verbose
        )
        latest = manager.latest_version  # None when no release was found for this major version
        whitelist = major_dict["WHITELIST"]

        # If necessary, add the latest version to the whitelist now that we know the latest version for this major set of releases
        if major_dict.get("DOWNLOAD_LATEST", False) and latest is not None and latest not in whitelist:
            whitelist.append(latest)
        # I think CHECK_LATEST can be deprecated now that we have DOWNLOAD_LATEST
        if major_dict.get("CHECK_LATEST", False) and latest is not None and not check_xml(latest):
            print(f"Latest version {latest} not present in the glideinWMS.xml file.")

        if whitelist:
            # Just get whitelisted versions
            to_download = set(whitelist)
        else:
            # Get everything but the blacklisted
            to_download = set(manager.releases) - set(major_dict["BLACKLIST"])
        # Sorted so that the downloads, and therefore the xml lines, always come out in the same order
        for version in sorted(to_download, key=StrictVersion):
            manager.download_tarballs(version)

        # Nothing downloaded means nothing to describe in the xml
        if xml_out is not None and manager.downloaded_files:
            # Always hand over a list of versions: with a bare string the membership test done
            # when generating the xml would be a substring match ("23.0.1" in "23.0.10")
            if default_tarball_version is None:
                defaults = []
            elif isinstance(default_tarball_version, str):
                defaults = [default_tarball_version]
            else:
                defaults = list(default_tarball_version)
            # "latest" stands for the newest release of the major version being handled
            defaults = [latest if version == "latest" else version for version in defaults]
            xml_parts.append(
                manager.generate_xml(
                    config["OS_MAP"],
                    config["ARCH_MAP"],
                    whitelist,
                    major_dict["BLACKLIST"],
                    defaults,
                )
            )

    if xml_out is not None:
        try:
            save_xml(xml_out, "".join(xml_parts))
        except OSError as ioex:
            print(f"Cannot write file {xml_out} when trying to save xml tarball output: {str(ioex)}")


# Run main() only when the file is executed as a script, not when it is imported as a module
if __name__ == "__main__":
    main()
