#!/usr/bin/env python
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import errno
import fcntl
import json
import logging
import os
import time
from swift.common.storage_policy import POLICIES
from swift.common.utils import replace_partition_in_path, config_true_value, \
    audit_location_generator, get_logger, readconf, drop_privileges, \
    RateLimitedIterator, lock_path, PrefixLoggerAdapter, distribute_evenly, \
    non_negative_float, non_negative_int, config_auto_int_value
from swift.obj import diskfile


LOCK_FILE = '.relink.{datadir}.lock'
STATE_FILE = 'relink.{datadir}.json'
STATE_TMP_FILE = '.relink.{datadir}.json.tmp'
STEP_RELINK = 'relink'
STEP_CLEANUP = 'cleanup'
EXIT_SUCCESS = 0
EXIT_NO_APPLICABLE_POLICY = 2
EXIT_ERROR = 1


def policy(policy_name_or_index):
    value = POLICIES.get_by_name_or_index(policy_name_or_index)
    if value is None:
        raise ValueError
    return value


class Relinker(object):
    def __init__(self, conf, logger, device_list=None, do_cleanup=False):
        self.conf = conf
        self.logger = logger
        self.device_list = device_list or []
        self.do_cleanup = do_cleanup
        self.root = self.conf['devices']
        if len(self.device_list) == 1:
            self.root = os.path.join(self.root, list(self.device_list)[0])
        self.part_power = self.next_part_power = None
        self.diskfile_mgr = None
        self.dev_lock = None
        self.diskfile_router = diskfile.DiskFileRouter(self.conf, self.logger)
        self._zero_stats()

    def _zero_stats(self):
        self.stats = {
            'hash_dirs': 0,
            'files': 0,
            'linked': 0,
            'removed': 0,
            'errors': 0,
            'policies': 0,
        }

    def devices_filter(self, _, devices):
        if self.device_list:
            devices = [d for d in devices if d in self.device_list]

        return set(devices)

    def hook_pre_device(self, device_path):
        lock_file = os.path.join(device_path,
                                 LOCK_FILE.format(datadir=self.datadir))

        fd = os.open(lock_file, os.O_CREAT | os.O_WRONLY)
        fcntl.flock(fd, fcntl.LOCK_EX)
        self.dev_lock = fd

        state_file = os.path.join(device_path,
                                  STATE_FILE.format(datadir=self.datadir))
        self.states["state"].clear()
        try:
            with open(state_file, 'rt') as f:
                state_from_disk = json.load(f)
                if state_from_disk["next_part_power"] != \
                        self.states["next_part_power"]:
                    raise ValueError
                on_disk_part_power = state_from_disk["part_power"]
                if on_disk_part_power != self.states["part_power"]:
                    self.states["prev_part_power"] = on_disk_part_power
                    raise ValueError
                self.states["state"].update(state_from_disk["state"])
        except (ValueError, TypeError, KeyError):
            # Bad state file: remove the file to restart from scratch
            os.unlink(state_file)
        except IOError as err:
            # Ignore file not found error
            if err.errno != errno.ENOENT:
                raise

    def hook_post_device(self, _):
        os.close(self.dev_lock)
        self.dev_lock = None

    def partitions_filter(self, datadir_path, partitions):
        # Remove all non partitions first (eg: auditor_status_ALL.json)
        partitions = [p for p in partitions if p.isdigit()]

        relinking = (self.part_power != self.next_part_power)
        if relinking:
            # All partitions in the upper half are new partitions and there is
            # nothing to relink there
            partitions = [part for part in partitions
                          if int(part) < 2 ** self.part_power]
        elif "prev_part_power" in self.states:
            # All partitions in the upper half are new partitions and there is
            # nothing to clean up there
            partitions = [part for part in partitions
                          if int(part) < 2 ** self.states["prev_part_power"]]

        # Format: { 'part': processed }
        if self.states["state"]:
            missing = list(set(partitions) - set(self.states["state"].keys()))
            if missing:
                # All missing partitions were created after the first run of
                # the relinker with this part_power/next_part_power pair. This
                # is expected when relinking, where new partitions appear that
                # are appropriate for the target part power. In such cases,
                # there's nothing to be done. Err on the side of caution
                # during cleanup, however.
                for part in missing:
                    self.states["state"][part] = relinking
            partitions = [
                str(part) for part, processed in self.states["state"].items()
                if not processed]
        else:
            self.states["state"].update({
                str(part): False for part in partitions})

        # Always scan the partitions in reverse order to minimize the amount
        # of IO (it actually only matters for relink, not for cleanup).
        #
        # Initial situation:
        #  objects/0/000/00000000...00000000/12345.data
        #  -> relinked to objects/1/000/10000000...00000000/12345.data
        #
        # If the relinker then scan partition 1, it will listdir that object
        # while it's unnecessary. By working in reverse order of partitions,
        # this is avoided.
        partitions = sorted(partitions, key=int, reverse=True)

        # do this last so that self.states, and thus the state file, has been
        # initiated with *all* partitions before partitions are restricted for
        # this particular run...
        conf_partitions = self.conf.get('partitions')
        if conf_partitions:
            partitions = [p for p in partitions if int(p) in conf_partitions]

        return partitions

    # Save states when a partition is done
    def hook_post_partition(self, partition_path):
        datadir_path, part = os.path.split(os.path.abspath(partition_path))
        device_path, datadir_name = os.path.split(datadir_path)
        device = os.path.basename(device_path)
        state_tmp_file = os.path.join(
            device_path, STATE_TMP_FILE.format(datadir=datadir_name))
        state_file = os.path.join(
            device_path, STATE_FILE.format(datadir=datadir_name))

        # We started with a partition space like
        #   |0              N|
        #   |ABCDEFGHIJKLMNOP|
        #
        # After relinking, it will be more like
        #   |0                             2N|
        #   |AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP|
        #
        # We want to hold off on rehashing until after cleanup, since that is
        # the point at which we've finished with filesystem manipulations. But
        # there's a slight complication: we know the upper half has nothing to
        # clean up, so the cleanup phase only looks at
        #   |0                             2N|
        #   |AABBCCDDEEFFGGHH                |
        #
        # To ensure that the upper half gets rehashed, too, do it as part of
        # relinking; as we finish
        #   |0              N|
        #   |        IJKLMNOP|
        # shift to the new partition space and rehash
        #   |0                             2N|
        #   |                IIJJKKLLMMNNOOPP|
        partition = int(part)
        if not self.do_cleanup and partition >= 2 ** (
                self.states['part_power'] - 1):
            for new_part in (2 * partition, 2 * partition + 1):
                self.diskfile_mgr.get_hashes(
                    device, new_part, [], self.policy)
        elif self.do_cleanup:
            hashes = self.diskfile_mgr.get_hashes(
                device, partition, [], self.policy)
            # In any reasonably-large cluster, we'd expect all old
            # partitions P to be empty after cleanup (i.e., it's unlikely
            # that there's another partition Q := P//2 that also has data
            # on this device).
            #
            # Try to clean up empty partitions now, so operators can use
            # existing rebalance-complete metrics to monitor relinking
            # progress (provided there are few/no handoffs when relinking
            # starts and little data is written to handoffs during the
            # increase).
            if not hashes:
                with lock_path(partition_path):
                    # Same lock used by invalidate_hashes, consolidate_hashes,
                    # get_hashes
                    try:
                        for f in ('hashes.pkl', 'hashes.invalid', '.lock'):
                            try:
                                os.unlink(os.path.join(partition_path, f))
                            except OSError as e:
                                if e.errno != errno.ENOENT:
                                    raise
                    except OSError:
                        pass
                try:
                    os.rmdir(partition_path)
                except OSError:
                    # Most likely, some data landed in here or we hit an error
                    # above. Let the replicator deal with things; it was worth
                    # a shot.
                    pass

        # Then mark this part as done, in case the process is interrupted and
        # needs to resume.
        self.states["state"][part] = True
        with open(state_tmp_file, 'wt') as f:
            json.dump(self.states, f)
            os.fsync(f.fileno())
        os.rename(state_tmp_file, state_file)
        num_parts_done = sum(
            1 for part in self.states["state"].values()
            if part)
        step = STEP_CLEANUP if self.do_cleanup else STEP_RELINK
        num_total_parts = len(self.states["state"])
        self.logger.info("Step: %s Device: %s Policy: %s Partitions: %d/%d" % (
            step, device, self.policy.name, num_parts_done, num_total_parts))

    def hashes_filter(self, suff_path, hashes):
        hashes = list(hashes)
        for hsh in hashes:
            fname = os.path.join(suff_path, hsh)
            if fname == replace_partition_in_path(
                    self.conf['devices'], fname, self.next_part_power):
                hashes.remove(hsh)
        return hashes

    def check_existing(self, new_file):
        existing_link = None
        link_created = False
        start = self.next_part_power - 1
        stop = max(start - self.conf['link_check_limit'], -1)
        for check_part_power in range(start, stop, -1):
            # Try to create the link from each of several previous part power
            # locations. If an attempt succeeds then either a link was made or
            # an existing link with the same inode as the next part power
            # location was found: either is acceptable. The part power location
            # that previously failed with EEXIST is included in the further
            # attempts here for simplicity.
            target_file = replace_partition_in_path(
                self.conf['devices'], new_file, check_part_power)
            try:
                link_created = diskfile.relink_paths(target_file, new_file,
                                                     ignore_missing=False)
                existing_link = target_file
                break
            except OSError:
                pass
        return existing_link, link_created

    def process_location(self, hash_path, new_hash_path):
        # Compare the contents of each hash dir with contents of same hash
        # dir in its new partition to verify that the new location has the
        # most up to date set of files. The new location may have newer
        # files if it has been updated since relinked.
        self.stats['hash_dirs'] += 1

        # Get on disk data for new and old locations, cleaning up any
        # reclaimable or obsolete files in each. The new location is
        # cleaned up *before* the old location to prevent false negatives
        # where the old still has a file that has been cleaned up in the
        # new; cleaning up the new location first ensures that the old will
        # always be 'cleaner' than the new.
        new_df_data = self.diskfile_mgr.cleanup_ondisk_files(new_hash_path)
        old_df_data = self.diskfile_mgr.cleanup_ondisk_files(hash_path)
        # Now determine the most up to date set of on disk files would be
        # given the content of old and new locations...
        new_files = set(new_df_data['files'])
        old_files = set(old_df_data['files'])
        union_files = new_files.union(old_files)
        union_data = self.diskfile_mgr.get_ondisk_files(
            union_files, '', verify=False)
        obsolete_files = set(info['filename']
                             for info in union_data.get('obsolete', []))
        # drop 'obsolete' files but retain 'unexpected' files which might
        # be misplaced diskfiles from another policy
        required_files = union_files.difference(obsolete_files)
        required_links = required_files.intersection(old_files)

        missing_links = 0
        created_links = 0
        unwanted_files = []
        for filename in required_links:
            # Before removing old files, be sure that the corresponding
            # required new files exist by calling relink_paths again. There
            # are several possible outcomes:
            #  - The common case is that the new file exists, in which case
            #    relink_paths checks that the new file has the same inode
            #    as the old file. An exception is raised if the inode of
            #    the new file is not the same as the old file.
            #  - The new file may not exist because the relinker failed to
            #    create the link to the old file and has erroneously moved
            #    on to cleanup. In this case the relink_paths will create
            #    the link now or raise an exception if that fails.
            #  - The new file may not exist because some other process,
            #    such as an object server handling a request, has cleaned
            #    it up since we called cleanup_ondisk_files(new_hash_path).
            #    In this case a new link will be created to the old file.
            #    This is unnecessary but simpler than repeating the
            #    evaluation of what links are now required and safer than
            #    assuming that a non-existent file that *was* required is
            #    no longer required. The new file will eventually be
            #    cleaned up again.
            self.stats['files'] += 1
            old_file = os.path.join(hash_path, filename)
            new_file = os.path.join(new_hash_path, filename)
            try:
                if diskfile.relink_paths(old_file, new_file):
                    self.logger.debug(
                        "Relinking%s created link: %s to %s",
                        ' (cleanup)' if self.do_cleanup else '',
                        old_file, new_file)
                    created_links += 1
                    self.stats['linked'] += 1
            except OSError as exc:
                existing_link = None
                link_created = False
                if exc.errno == errno.EEXIST and filename.endswith('.ts'):
                    # special case for duplicate tombstones in older partition
                    # power locations
                    # (https://bugs.launchpad.net/swift/+bug/1921718)
                    existing_link, link_created = self.check_existing(new_file)
                if existing_link:
                    self.logger.debug(
                        "Relinking%s: link not needed: %s to %s due to "
                        "existing %s", ' (cleanup)' if self.do_cleanup else '',
                        old_file, new_file, existing_link)
                    if link_created:
                        # uncommon case: the retry succeeded in creating a link
                        created_links += 1
                        self.stats['linked'] += 1
                    wanted_file = replace_partition_in_path(
                        self.conf['devices'], old_file, self.part_power)
                    if old_file not in (existing_link, wanted_file):
                        # A link exists to a different file and this file
                        # is not the current target for client requests. If
                        # this file is visited again it is possible that
                        # the existing_link will have been cleaned up and
                        # the check will fail, so clean it up now.
                        self.logger.info(
                            "Relinking%s: cleaning up unwanted file: %s",
                            ' (cleanup)' if self.do_cleanup else '', old_file)
                        unwanted_files.append(filename)
                else:
                    self.logger.warning(
                        "Error relinking%s: failed to relink %s to %s: %s",
                        ' (cleanup)' if self.do_cleanup else '',
                        old_file, new_file, exc)
                    self.stats['errors'] += 1
                    missing_links += 1
        if created_links:
            diskfile.invalidate_hash(os.path.dirname(new_hash_path))

        if self.do_cleanup and not missing_links:
            # use the sorted list to help unit testing
            unwanted_files = old_df_data['files']

        # the new partition hash dir has the most up to date set of on
        # disk files so it is safe to delete the old location...
        rehash = False
        for filename in unwanted_files:
            old_file = os.path.join(hash_path, filename)
            try:
                os.remove(old_file)
            except OSError as exc:
                self.logger.warning('Error cleaning up %s: %r', old_file, exc)
                self.stats['errors'] += 1
            else:
                rehash = True
                self.stats['removed'] += 1
                self.logger.debug("Removed %s", old_file)

        if rehash:
            try:
                diskfile.invalidate_hash(os.path.dirname(hash_path))
            except Exception as exc:
                # note: not counted as an error
                self.logger.warning(
                    'Error invalidating suffix for %s: %r',
                    hash_path, exc)

    def process_policy(self, policy):
        self.logger.info(
            'Processing files for policy %s under %s (cleanup=%s)',
            policy.name, self.root, self.do_cleanup)
        self.part_power = policy.object_ring.part_power
        self.next_part_power = policy.object_ring.next_part_power
        self.diskfile_mgr = self.diskfile_router[policy]
        self.datadir = diskfile.get_data_dir(policy)
        self.states = {
            "part_power": self.part_power,
            "next_part_power": self.next_part_power,
            "state": {},
        }

        locations = audit_location_generator(
            self.conf['devices'],
            self.datadir,
            mount_check=self.conf['mount_check'],
            devices_filter=self.devices_filter,
            hook_pre_device=self.hook_pre_device,
            hook_post_device=self.hook_post_device,
            partitions_filter=self.partitions_filter,
            hook_post_partition=self.hook_post_partition,
            hashes_filter=self.hashes_filter,
            logger=self.logger,
            error_counter=self.stats,
            yield_hash_dirs=True
        )
        if self.conf['files_per_second'] > 0:
            locations = RateLimitedIterator(
                locations, self.conf['files_per_second'])
        for hash_path, device, partition in locations:
            # note, in cleanup step next_part_power == part_power
            new_hash_path = replace_partition_in_path(
                self.conf['devices'], hash_path, self.next_part_power)
            if new_hash_path == hash_path:
                continue
            self.process_location(hash_path, new_hash_path)

    def run(self):
        self._zero_stats()
        for policy in self.conf['policies']:
            self.policy = policy
            policy.object_ring = None  # Ensure it will be reloaded
            policy.load_ring(self.conf['swift_dir'])
            ring = policy.object_ring
            if not ring.next_part_power:
                continue
            part_power_increased = ring.next_part_power == ring.part_power
            if self.do_cleanup != part_power_increased:
                continue

            self.stats['policies'] += 1
            self.process_policy(policy)

        policies = self.stats.pop('policies')
        if not policies:
            self.logger.warning(
                "No policy found to increase the partition power.")
            return EXIT_NO_APPLICABLE_POLICY

        hash_dirs = self.stats.pop('hash_dirs')
        files = self.stats.pop('files')
        linked = self.stats.pop('linked')
        removed = self.stats.pop('removed')
        action_errors = self.stats.pop('errors')
        unmounted = self.stats.pop('unmounted', 0)
        if unmounted:
            self.logger.warning('%d disks were unmounted', unmounted)
        listdir_errors = self.stats.pop('unlistable_partitions', 0)
        if listdir_errors:
            self.logger.warning(
                'There were %d errors listing partition directories',
                listdir_errors)
        if self.stats:
            self.logger.warning(
                'There were unexpected errors while enumerating disk '
                'files: %r', self.stats)

        if action_errors + listdir_errors + unmounted > 0:
            log_method = self.logger.warning
            # NB: audit_location_generator logs unmounted disks as warnings,
            # but we want to treat them as errors
            status = EXIT_ERROR
        else:
            log_method = self.logger.info
            status = EXIT_SUCCESS

        log_method(
            '%d hash dirs processed (cleanup=%s) (%d files, %d linked, '
            '%d removed, %d errors)', hash_dirs, self.do_cleanup, files,
            linked, removed, action_errors + listdir_errors)
        return status


def parallel_process(do_cleanup, conf, logger=None, device_list=None):
    logger = logger or logging.getLogger()
    device_list = sorted(set(device_list or os.listdir(conf['devices'])))
    workers = conf['workers']
    if workers == 'auto':
        workers = len(device_list)
    else:
        workers = min(workers, len(device_list))
    if workers == 0 or len(device_list) in (0, 1):
        return Relinker(
            conf, logger, device_list, do_cleanup=do_cleanup).run()

    start = time.time()
    children = {}
    for worker_devs in distribute_evenly(device_list, workers):
        pid = os.fork()
        if pid == 0:
            dev_logger = PrefixLoggerAdapter(logger, {})
            dev_logger.set_prefix('[pid=%s, devs=%s] ' % (
                os.getpid(), ','.join(worker_devs)))
            os._exit(Relinker(
                conf, dev_logger, worker_devs, do_cleanup=do_cleanup).run())
        else:
            children[pid] = worker_devs

    final_status = EXIT_SUCCESS
    final_messages = []
    while children:
        pid, status = os.wait()
        sig = status & 0xff
        status = status >> 8
        time_delta = time.time() - start
        devs = children.pop(pid, ['unknown device'])
        worker_desc = '(pid=%s, devs=%s)' % (pid, ','.join(devs))
        if sig != 0:
            final_status = EXIT_ERROR
            final_messages.append(
                'Worker %s exited in %.1fs after receiving signal: %s'
                % (worker_desc, time_delta, sig))
            continue

        if status == EXIT_SUCCESS:
            continue

        if status == EXIT_NO_APPLICABLE_POLICY:
            if final_status == EXIT_SUCCESS:
                final_status = status
            continue

        final_status = EXIT_ERROR
        if status == EXIT_ERROR:
            final_messages.append(
                'Worker %s completed in %.1fs with errors'
                % (worker_desc, time_delta))
        else:
            final_messages.append(
                'Worker %s exited in %.1fs with unexpected status %s'
                % (worker_desc, time_delta, status))

    for msg in final_messages:
        logger.warning(msg)
    return final_status


def main(args):
    parser = argparse.ArgumentParser(
        description='Relink and cleanup objects to increase partition power')
    parser.add_argument('action', choices=['relink', 'cleanup'])
    parser.add_argument('conf_file', nargs='?', help=(
        'Path to config file with [object-relinker] section'))
    parser.add_argument('--swift-dir', default=None,
                        dest='swift_dir', help='Path to swift directory')
    parser.add_argument(
        '--policy', default=[], dest='policies',
        action='append', type=policy,
        help='Policy to relink; may specify multiple (default: all)')
    parser.add_argument('--devices', default=None,
                        dest='devices', help='Path to swift device directory')
    parser.add_argument('--user', default=None, dest='user',
                        help='Drop privileges to this user before relinking')
    parser.add_argument('--device',
                        default=[], dest='device_list', action='append',
                        help='Device name to relink (default: all)')
    parser.add_argument('--partition', '-p', default=[], dest='partitions',
                        type=non_negative_int, action='append',
                        help='Partition to relink (default: all)')
    parser.add_argument('--skip-mount-check', default=False,
                        help='Don\'t test if disk is mounted',
                        action="store_true", dest='skip_mount_check')
    parser.add_argument('--files-per-second', default=None,
                        type=non_negative_float, dest='files_per_second',
                        help='Used to limit I/O. Zero implies no limit '
                             '(default: no limit).')
    parser.add_argument(
        '--workers', default=None, type=non_negative_int, help=(
            'Process devices across N workers '
            '(default: one worker per device)'))
    parser.add_argument('--logfile', default=None, dest='logfile',
                        help='Set log file name. Ignored if using conf_file.')
    parser.add_argument('--debug', default=False, action='store_true',
                        help='Enable debug mode')
    parser.add_argument('--link-check-limit', type=non_negative_int,
                        default=None, dest='link_check_limit',
                        help='Maximum number of partition power locations to '
                             'check for a valid link target if relink '
                             'encounters an existing tombstone with different '
                             'inode in the next partition power location '
                             '(default: 2).')

    args = parser.parse_args(args)
    if args.conf_file:
        conf = readconf(args.conf_file, 'object-relinker')
        if args.debug:
            conf['log_level'] = 'DEBUG'
        user = args.user or conf.get('user')
        if user:
            drop_privileges(user)
        logger = get_logger(conf)
    else:
        conf = {'log_level': 'DEBUG' if args.debug else 'INFO'}
        if args.user:
            # Drop privs before creating log file
            drop_privileges(args.user)
            conf['user'] = args.user
        logging.basicConfig(
            format='%(message)s',
            level=logging.DEBUG if args.debug else logging.INFO,
            filename=args.logfile)
        logger = logging.getLogger()

    conf.update({
        'swift_dir': args.swift_dir or conf.get('swift_dir', '/etc/swift'),
        'devices': args.devices or conf.get('devices', '/srv/node'),
        'mount_check': (config_true_value(conf.get('mount_check', 'true'))
                        and not args.skip_mount_check),
        'files_per_second': (
            args.files_per_second if args.files_per_second is not None
            else non_negative_float(conf.get('files_per_second', '0'))),
        'policies': set(args.policies) or POLICIES,
        'partitions': set(args.partitions),
        'workers': config_auto_int_value(
            conf.get('workers') if args.workers is None else args.workers,
            'auto'),
        'link_check_limit': (
            args.link_check_limit if args.link_check_limit is not None
            else non_negative_int(conf.get('link_check_limit', 2))),
    })
    return parallel_process(
        args.action == 'cleanup', conf, logger, args.device_list)