deb-python-cassandra-driver/cassandra/metadata.py

from bisect import bisect_left
from collections import defaultdict
try:
    from collections import OrderedDict
except ImportError:  # Python <2.7
    from cassandra.util import OrderedDict
from hashlib import md5
import json
import logging
import re
from threading import RLock
import weakref

# The murmur3 C extension is optional.  When it cannot be imported the name
# stays bound to None and Murmur3Token.hash_fn raises NoMurmur3 instead.
murmur3 = None
try:
    from murmur3 import murmur3
except ImportError:
    pass

import cassandra.cqltypes as types
from cassandra.marshal import varint_unpack
from cassandra.pool import Host

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

_keywords = set((
    'select', 'from', 'where', 'and', 'key', 'insert', 'update', 'with',
    'limit', 'using', 'use', 'count', 'set',
    'begin', 'apply', 'batch', 'truncate', 'delete', 'in', 'create',
    'keyspace', 'schema', 'columnfamily', 'table', 'index', 'on', 'drop',
    'primary', 'into', 'values', 'timestamp', 'ttl', 'alter', 'add', 'type',
    'compact', 'storage', 'order', 'by', 'asc', 'desc', 'clustering',
    'token', 'writetime', 'map', 'list', 'to'
))

_unreserved_keywords = set((
    'key', 'clustering', 'ttl', 'compact', 'storage', 'type', 'values'
))


class Metadata(object):
    """
    Holds a representation of the cluster schema and topology.
    """

    cluster_name = None
    """ The string name of the cluster. """

    keyspaces = None
    """
    A map from keyspace names to matching :class:`~.KeyspaceMetadata` instances.
    """

    token_map = None
    """ A :class:`~.TokenMap` instance describing the ring topology. """

    def __init__(self, cluster):
        # use a weak reference so that the Cluster object can be GC'ed.
        # Normally the cycle detector would handle this, but implementing
        # __del__ disables that.
        self.cluster_ref = weakref.ref(cluster)
        self.keyspaces = {}
        self._hosts = {}
        self._hosts_lock = RLock()

    def export_schema_as_string(self):
        """
        Returns a string that can be executed as a query in order to recreate
        the entire schema.  The string is formatted to be human readable.
        """
        return "\n".join(ks.export_as_string() for ks in self.keyspaces.values())

    def rebuild_schema(self, keyspace, table, ks_results, cf_results, col_results):
        """
        Rebuild the view of the current schema from a fresh set of rows from
        the system schema tables.

        When `table` is falsy, every row in `ks_results` is turned into a
        keyspace (with its tables) and stored; if `keyspace` is falsy as well,
        keyspaces that were not part of `ks_results` are dropped.  When
        `table` is set, only the tables found in `cf_results` for `keyspace`
        are rebuilt.  If that keyspace is unknown, an error is logged and
        nothing is changed.

        For internal use only.
        """
        # Group table rows by keyspace, and column rows by keyspace and table.
        cf_def_rows = defaultdict(list)
        col_def_rows = defaultdict(lambda: defaultdict(list))

        for row in cf_results:
            cf_def_rows[row["keyspace_name"]].append(row)

        for row in col_results:
            ksname = row["keyspace_name"]
            cfname = row["columnfamily_name"]
            col_def_rows[ksname][cfname].append(row)

        # either table or ks_results must be None
        if not table:
            # ks_results is not None
            added_keyspaces = set()
            for row in ks_results:
                keyspace_meta = self._build_keyspace_metadata(row)
                for table_row in cf_def_rows.get(keyspace_meta.name, []):
                    table_meta = self._build_table_metadata(
                        keyspace_meta, table_row, col_def_rows[keyspace_meta.name])
                    keyspace_meta.tables[table_meta.name] = table_meta

                added_keyspaces.add(keyspace_meta.name)
                self.keyspaces[keyspace_meta.name] = keyspace_meta

            if not keyspace:
                # remove not-just-added keyspaces
                self.keyspaces = dict((name, meta) for name, meta in self.keyspaces.items()
                                      if name in added_keyspaces)
        else:
            # keyspace is not None, table is not None
            keyspace_meta = self.keyspaces.get(keyspace)
            if keyspace_meta is None:
                # we're trying to update a table in a keyspace we don't know
                # about, something went wrong.  Bail out instead of touching
                # an unbound keyspace_meta below.
                # TODO submit schema refresh
                log.error(
                    "Cannot rebuild table %s: keyspace %s is not known",
                    table, keyspace)
                return

            for table_row in cf_def_rows.get(keyspace, ()):
                table_meta = self._build_table_metadata(
                    keyspace_meta, table_row, col_def_rows[keyspace])
                keyspace_meta.tables[table_meta.name] = table_meta

    def _build_keyspace_metadata(self, row):
        """
        Build a :class:`.KeyspaceMetadata` from one keyspace schema row.
        The ``strategy_options`` field is decoded from JSON.
        """
        name = row["keyspace_name"]
        durable_writes = row["durable_writes"]
        strategy_class = row["strategy_class"]
        strategy_options = json.loads(row["strategy_options"])
        return KeyspaceMetadata(name, durable_writes, strategy_class, strategy_options)

    def _build_table_metadata(self, keyspace_metadata, row, col_rows):
        """
        Build a :class:`.TableMetadata` from one table schema row.

        `col_rows` maps table names to the column schema rows of the
        keyspace `row` belongs to.
        """
        cfname = row["columnfamily_name"]

        comparator = types.lookup_casstype(row["comparator"])
        if issubclass(comparator, types.CompositeType):
            column_name_types = comparator.subtypes
            is_composite = True
        else:
            column_name_types = (comparator,)
            is_composite = False

        num_column_name_components = len(column_name_types)
        last_col = column_name_types[-1]

        # Work out whether the table is compact, whether it has a single
        # value column, and how many comparator components are clustering
        # columns.
        column_aliases = json.loads(row["column_aliases"])
        if is_composite:
            if issubclass(last_col, types.ColumnToCollectionType):
                # collections
                is_compact = False
                has_value = False
                clustering_size = num_column_name_components - 2
            elif (len(column_aliases) == num_column_name_components - 1
                    and issubclass(last_col, types.UTF8Type)):
                # aliases?
                is_compact = False
                has_value = False
                clustering_size = num_column_name_components - 1
            else:
                # compact table
                is_compact = True
                has_value = True
                clustering_size = num_column_name_components
        else:
            is_compact = True
            if column_aliases or not col_rows.get(cfname):
                has_value = True
                clustering_size = num_column_name_components
            else:
                has_value = False
                clustering_size = 0

        table_meta = TableMetadata(keyspace_metadata, cfname)
        table_meta.comparator = comparator

        # partition key
        key_aliases = row.get("key_aliases")
        key_aliases = json.loads(key_aliases) if key_aliases else []

        key_type = types.lookup_casstype(row["key_validator"])
        key_types = key_type.subtypes if issubclass(key_type, types.CompositeType) else [key_type]
        for i, col_type in enumerate(key_types):
            # Components without an alias are named "key", "key1", "key2", ...
            if len(key_aliases) > i:
                column_name = key_aliases[i]
            elif i == 0:
                column_name = "key"
            else:
                column_name = "key%d" % i

            col = ColumnMetadata(table_meta, column_name, col_type)
            table_meta.columns[column_name] = col
            table_meta.partition_key.append(col)

        # clustering key
        for i in range(clustering_size):
            if len(column_aliases) > i:
                column_name = column_aliases[i]
            else:
                column_name = "column%d" % i

            col = ColumnMetadata(table_meta, column_name, column_name_types[i])
            table_meta.columns[column_name] = col
            table_meta.clustering_key.append(col)

        # value alias (if present)
        if has_value:
            validator = types.lookup_casstype(row["default_validator"])
            # Use the alias stored in the row when there is one; fall back to
            # "value" when it is missing or empty.  (Checking key_aliases here
            # could name the column None or ignore a real alias.)
            value_alias = row.get("value_alias") or "value"

            col = ColumnMetadata(table_meta, value_alias, validator)
            table_meta.columns[value_alias] = col

        # other normal columns
        if col_rows:
            for col_row in col_rows.get(cfname, ()):
                column_meta = self._build_column_metadata(table_meta, col_row)
                table_meta.columns[column_meta.name] = column_meta

        table_meta.options = self._build_table_options(row, is_compact)
        return table_meta

    def _build_table_options(self, row, is_compact_storage):
        """ Setup the mostly-non-schema table options, like caching settings """
        options = dict((o, row.get(o)) for o in TableMetadata.recognized_options)
        options["is_compact_storage"] = is_compact_storage
        return options

    def _build_column_metadata(self, table_metadata, row):
        """
        Build a :class:`.ColumnMetadata` (including its index, if the row
        describes one) from one column schema row.
        """
        name = row["column_name"]
        data_type = types.lookup_casstype(row["validator"])
        column_meta = ColumnMetadata(table_metadata, name, data_type)
        column_meta.index = self._build_index_metadata(column_meta, row)
        return column_meta

    def _build_index_metadata(self, column_metadata, row):
        """
        Return an :class:`.IndexMetadata` when the row carries an index name
        or index type, otherwise :const:`None`.
        """
        index_name = row.get("index_name")
        index_type = row.get("index_type")
        if index_name or index_type:
            return IndexMetadata(column_metadata, index_name, index_type)
        return None

    def rebuild_token_map(self, partitioner, token_map):
        """
        Rebuild our view of the topology from fresh rows from the
        system topology tables.

        `partitioner` is the partitioner class name and `token_map` maps each
        host to an iterable of its token strings.  For an unrecognized
        partitioner, :attr:`.token_map` is reset to :const:`None`.

        For internal use only.
        """
        if partitioner.endswith('RandomPartitioner'):
            token_class = MD5Token
        elif partitioner.endswith('Murmur3Partitioner'):
            token_class = Murmur3Token
            if murmur3 is None:
                log.warning(
                    "The murmur3 C extension is not available, token awareness "
                    "cannot be supported for the Murmur3Partitioner")
        elif partitioner.endswith('ByteOrderedPartitioner'):
            token_class = BytesToken
        else:
            self.token_map = None
            return

        tokens_to_hosts = defaultdict(set)
        ring = []
        # items() rather than iteritems(): works on every Python version.
        for host, token_strings in token_map.items():
            for token_string in token_strings:
                token = token_class(token_string)
                ring.append(token)
                tokens_to_hosts[token].add(host)

        ring.sort()
        self.token_map = TokenMap(token_class, tokens_to_hosts, ring)

    def get_replicas(self, key):
        """
        Returns the :class:`.Host` instances that are replicas for a given
        partition key.  An empty list is returned when there is no token map
        or when the murmur3 extension needed to hash the key is missing.
        """
        t = self.token_map
        if not t:
            return []
        try:
            return t.get_replicas(t.token_class.from_key(key))
        except NoMurmur3:
            return []

    def add_host(self, address):
        """
        Register a new host for `address` and return it, or return
        :const:`None` if a host with that address is already known.
        """
        cluster = self.cluster_ref()
        with self._hosts_lock:
            if address in self._hosts:
                return None
            new_host = Host(address, cluster.conviction_policy_factory)
            self._hosts[address] = new_host

        # Registration happens outside the lock.
        new_host.monitor.register(cluster)
        return new_host

    def remove_host(self, host):
        """
        Forget `host`.  Returns :const:`True` if it was known, otherwise
        :const:`False`.
        """
        with self._hosts_lock:
            return bool(self._hosts.pop(host.address, False))

    def get_host(self, address):
        """
        Returns the host registered for `address`, or :const:`None`.
        """
        return self._hosts.get(address)

    def all_hosts(self):
        """
        Returns a list of all known :class:`.Host` instances in the cluster.
        """
        with self._hosts_lock:
            # Copy into a real list so callers never see a live dict view
            # that changes while hosts are added or removed.
            return list(self._hosts.values())


class KeyspaceMetadata(object):
    """
    A representation of the schema for a single keyspace.
    """

    name = None
    """ The string name of the keyspace """

    durable_writes = True
    """
    A boolean indicating whether durable writes are enabled for this keyspace
    or not
    """

    replication = None
    """
    A dict holding the replication settings for this keyspace. Typically,
    there will be a "class" entry with the name of the replication strategy
    class.
    """

    tables = None
    """
    A map from table names to instances of :class:`~.TableMetadata`.
    """

    def __init__(self, name, durable_writes, strategy_class, strategy_options):
        self.name = name
        self.durable_writes = durable_writes
        # Copy the options so adding the "class" entry does not modify the
        # dict owned by the caller; None is treated as "no options".
        self.replication = dict(strategy_options or {})
        self.replication["class"] = strategy_class
        self.tables = {}

    def export_as_string(self):
        """
        Returns a string of CQL statements that recreate this keyspace and
        all of its tables, one statement group per line.
        """
        # Use each table's export_as_string() so every statement is
        # terminated with ";" and index definitions are included.
        statements = [self.as_cql_query()]
        statements.extend(t.export_as_string() for t in self.tables.values())
        return "\n".join(statements)

    def as_cql_query(self):
        """
        Returns the ``CREATE KEYSPACE`` statement (terminated with ";") that
        recreates this keyspace.  Replication options other than "class" are
        written in sorted key order.
        """
        ret = "CREATE KEYSPACE %s WITH REPLICATION = { 'class' : '%s'" % \
                (self.name, self.replication["class"])
        for k in sorted(self.replication):
            if k != "class":
                ret += ", '%s': '%s'" % (k, self.replication[k])
        ret += ' } AND DURABLE_WRITES = %s;' % ("true" if self.durable_writes else "false")
        return ret


class TableMetadata(object):
    """
    A representation of the schema for a single table.
    """

    keyspace = None
    """ An instance of :class:`~.KeyspaceMetadata` """

    name = None
    """ The string name of the table """

    partition_key = None
    """
    A list of :class:`.ColumnMetadata` instances representing the columns in
    the partition key for this table.  This will always hold at least one
    column.
    """

    clustering_key = None
    """
    A list of :class:`.ColumnMetadata` instances representing the columns
    in the clustering key for this table.  These are all of the
    :attr:`.primary_key` columns that are not in the :attr:`.partition_key`.

    Note that a table may have no clustering keys, in which case this will
    be an empty list.
    """

    @property
    def primary_key(self):
        """
        A list of :class:`.ColumnMetadata` representing the components of
        the primary key for this table.
        """
        return self.partition_key + self.clustering_key

    columns = None
    """
    A dict mapping column names to :class:`.ColumnMetadata` instances.
    """

    options = None
    """
    A dict mapping table option names to their specific settings for this
    table.
    """

    # Option names copied from the table schema row and emitted in the
    # WITH clause of as_cql_query().
    recognized_options = (
            "comment", "read_repair_chance",  # "local_read_repair_chance",
            "replicate_on_write", "gc_grace_seconds", "bloom_filter_fp_chance",
            "caching", "compaction_strategy_class", "compaction_strategy_options",
            "min_compaction_threshold", "max_compaction_threshold",
            "compression_parameters")

    def __init__(self, keyspace_metadata, name, partition_key=None, clustering_key=None, columns=None, options=None):
        self.keyspace = keyspace_metadata
        self.name = name
        self.partition_key = [] if partition_key is None else partition_key
        self.clustering_key = [] if clustering_key is None else clustering_key
        self.columns = OrderedDict() if columns is None else columns
        self.options = options
        self.comparator = None

    def export_as_string(self):
        """
        Returns a string of CQL queries that can be used to recreate this table
        along with all indexes on it.  The returned string is formatted to
        be human readable.
        """
        ret = self.as_cql_query(formatted=True)
        ret += ";"

        for col_meta in self.columns.values():
            if col_meta.index:
                ret += "\n%s;" % (col_meta.index.as_cql_query(),)

        return ret

    def as_cql_query(self, formatted=False):
        """
        Returns a CQL query that can be used to recreate this table (index
        creations are not included).  If `formatted` is set to :const:`True`,
        extra whitespace will be added to make the query human readable.

        The ``WITH`` clause is only emitted when there is at least one option
        to write; :attr:`.options` may be :const:`None`.
        """
        options = self.options or {}
        ret = "CREATE TABLE %s.%s (%s" % (self.keyspace.name, self.name, "\n" if formatted else "")

        if formatted:
            column_join = ",\n"
            padding = "    "
        else:
            column_join = ", "
            padding = ""

        # A lone partition key column with no clustering columns is declared
        # inline; it is found by name rather than by assuming it comes first.
        inline_primary_key = len(self.partition_key) == 1 and not self.clustering_key
        columns = []
        for col in self.columns.values():
            declaration = "%s %s" % (col.name, col.typestring)
            if inline_primary_key and col.name == self.partition_key[0].name:
                declaration += " PRIMARY KEY"
            columns.append(declaration)

        ret += column_join.join("%s%s" % (padding, col) for col in columns)

        # primary key
        if len(self.partition_key) > 1 or self.clustering_key:
            ret += "%s%sPRIMARY KEY (" % (column_join, padding)

            if len(self.partition_key) > 1:
                ret += "(%s)" % ", ".join(col.name for col in self.partition_key)
            else:
                ret += self.partition_key[0].name

            if self.clustering_key:
                ret += ", %s" % ", ".join(col.name for col in self.clustering_key)

            ret += ")"

        ret += "%s)" % ("\n" if formatted else "")

        # options
        option_strings = []
        if options.get("is_compact_storage"):
            option_strings.append("COMPACT STORAGE")

        if self.clustering_key:
            cluster_str = "CLUSTERING ORDER BY "

            clustering_names = self.protect_names([c.name for c in self.clustering_key])

            if options.get("is_compact_storage") and \
                    not issubclass(self.comparator, types.CompositeType):
                subtypes = [self.comparator]
            else:
                subtypes = self.comparator.subtypes

            inner = []
            for colname, coltype in zip(clustering_names, subtypes):
                ordering = "DESC" if issubclass(coltype, types.ReversedType) else "ASC"
                inner.append("%s %s" % (colname, ordering))

            cluster_str += "(%s)" % ", ".join(inner)
            option_strings.append(cluster_str)

        for option_name in self.recognized_options:
            option_str = self._make_option_str(option_name)
            if option_str is not None:
                option_strings.append(option_str)

        # Skip the WITH clause entirely rather than leave a dangling "WITH".
        if option_strings:
            join_str = "\n    AND " if formatted else " AND "
            ret += " WITH " + join_str.join(option_strings)

        return ret

    def _make_option_str(self, name):
        """
        Returns ``"<name> = <value>"`` for the option `name`, or
        :const:`None` when that option is unset.
        """
        value = (self.options or {}).get(name)
        if value is not None:
            if name == "comment":
                value = value or ""
            return "%s = %s" % (name, self.protect_value(value))

    def protect_name(self, name):
        """
        Returns `name` quoted if it needs quoting.  Unicode names are
        encoded as UTF-8 first.
        """
        if isinstance(name, unicode):
            name = name.encode('utf8')
        return self.maybe_escape_name(name)

    def protect_names(self, names):
        """ Applies :meth:`protect_name` to every entry of `names`. """
        return map(self.protect_name, names)

    def protect_value(self, value):
        """
        Returns `value` rendered as a CQL literal: numbers as-is (floats via
        ``%f``), booleans as quoted lowercase words, and strings single-quoted
        with embedded quotes doubled.
        """
        if value is None:
            return 'NULL' # this totally won't work
        if isinstance(value, bool):
            # checked before int because bool is a subclass of int
            value = str(value).lower()
        elif isinstance(value, float):
            return '%f' % value
        elif isinstance(value, int):
            return str(value)
        return "'%s'" % value.replace("'", "''")

    valid_cql3_word_re = re.compile(r'^[a-z][0-9a-z_]*$')

    def is_valid_name(self, name):
        """
        Returns :const:`True` if `name` can be used unquoted: it matches
        :attr:`valid_cql3_word_re` and is not a reserved keyword.
        """
        if name is None:
            return False
        lowered = name.lower()
        # Same as membership in (_keywords - _unreserved_keywords), without
        # building a new set on every call.
        if lowered in _keywords and lowered not in _unreserved_keywords:
            return False
        return self.valid_cql3_word_re.match(name) is not None

    def maybe_escape_name(self, name):
        """ Returns `name` unchanged if valid, otherwise double-quoted. """
        if self.is_valid_name(name):
            return name
        return self.escape_name(name)

    def escape_name(self, name):
        """ Returns `name` in double quotes with embedded quotes doubled. """
        return '"%s"' % (name.replace('"', '""'),)


class ColumnMetadata(object):
    """
    Describes one column of a table: its name, type and optional index.
    """

    table = None
    """ The owning :class:`.TableMetadata`. """

    name = None
    """ The column name, as a string. """

    # The type object for this column's values.
    data_type = None

    index = None
    """
    An :class:`.IndexMetadata` when this column is indexed,
    otherwise :const:`None`.
    """

    def __init__(self, table_metadata, column_name, data_type, index_metadata=None):
        self.index = index_metadata
        self.data_type = data_type
        self.name = column_name
        self.table = table_metadata

    @property
    def typestring(self):
        """
        The CQL spelling of this column's type, e.g. "varchar" or
        "map<string, int>".  A reversed type is reported as the type it
        wraps.
        """
        cass_type = self.data_type
        if issubclass(cass_type, types.ReversedType):
            cass_type = cass_type.subtypes[0]
        return cass_type.cql_parameterized_type()

    def __str__(self):
        parts = (self.name, self.data_type)
        return "%s %s" % parts


class IndexMetadata(object):
    """
    A representation of a secondary index on a column.
    """

    column = None
    """
    The column (:class:`.ColumnMetadata`) this index is on.
    """

    name = None
    """ A string name for the index, or :const:`None` if it is unnamed. """

    index_type = None
    """ A string representing the type of index. """

    def __init__(self, column_metadata, index_name=None, index_type=None):
        self.column = column_metadata
        self.name = index_name
        self.index_type = index_type

    def as_cql_query(self):
        """
        Returns a CQL query that can be used to recreate this index.  The
        index name is left out of the statement when the index has none.
        """
        table = self.column.table
        target = "%s.%s (%s)" % (table.keyspace.name, table.name, self.column.name)
        if self.name:
            return "CREATE INDEX %s ON %s" % (self.name, target)
        # An index may be known only by its type; never emit the text "None"
        # as an index name.
        return "CREATE INDEX ON %s" % (target,)


class TokenMap(object):
    """
    Information about the layout of the ring.
    """

    token_class = None
    """
    A subclass of :class:`.Token`, depending on what partitioner the cluster uses.
    """

    tokens_to_hosts = None
    """
    A map of :class:`.Token` objects to the set of :class:`.Host` objects
    that own that token.
    """

    ring = None
    """
    An ordered list of :class:`.Token` instances in the ring.
    """

    def __init__(self, token_class, tokens_to_hosts, ring):
        self.token_class = token_class
        self.tokens_to_hosts = tokens_to_hosts
        self.ring = ring

    def get_replicas(self, token):
        """
        Get :class:`.Host` instances representing all of the replica nodes
        for a given :class:`.Token`.

        The owner is the first ring token that is greater than or equal to
        `token`; tokens past the end of the ring wrap around to the first
        ring token.  An empty set is returned when the ring is empty.
        """
        # TODO depending on keyspace and replication strategy options,
        # return full set of replicas
        if not self.ring:
            return set()

        point = bisect_left(self.ring, token)
        if point == len(self.ring):
            # larger than every ring token: wrap around to the start
            point = 0
        # A token smaller than every ring token gets point == 0 from
        # bisect_left, i.e. the first ring token, which owns the wrapping
        # range that starts after the last one.
        return self.tokens_to_hosts[self.ring[point]]


class Token(object):
    """
    Abstract class representing a token.

    Subclasses store the token in ``self.value``.  Tokens are ordered,
    compared and hashed by that value, so tokens with equal values behave as
    the same dict key or set member.
    """

    @classmethod
    def hash_fn(cls, key):
        """ Map a partition key to a raw token value (identity here). """
        return key

    @classmethod
    def from_key(cls, key):
        """ Build a token of this class from a partition key. """
        return cls(cls.hash_fn(key))

    def __cmp__(self, other):
        # Only consulted by Python 2; kept for backward compatibility.
        if self.value < other.value:
            return -1
        elif self.value == other.value:
            return 0
        else:
            return 1

    # Rich comparisons: required for sorting and bisecting on Python 3,
    # where __cmp__ is ignored.

    def __eq__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.value == other.value

    def __ne__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.value != other.value

    def __lt__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.value < other.value

    def __le__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.value <= other.value

    def __gt__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.value > other.value

    def __ge__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self.value >= other.value

    def __hash__(self):
        # Must agree with __eq__: equal tokens hash alike.
        return hash(self.value)

# Bounds of a signed 64-bit integer.
MIN_LONG = -(1 << 63)
MAX_LONG = (1 << 63) - 1


class NoMurmur3(Exception):
    """ Raised when a murmur3 hash is needed but the extension is missing. """


class Murmur3Token(Token):
    """
    Token type used with the ``Murmur3Partitioner``.
    """

    @classmethod
    def hash_fn(cls, key):
        """
        Hash `key` with murmur3.  Raises :class:`NoMurmur3` when the
        extension is not available.
        """
        if murmur3 is None:
            raise NoMurmur3()

        hashed = murmur3(key)
        # MIN_LONG is never returned; it is mapped to MAX_LONG.
        if hashed == MIN_LONG:
            return MAX_LONG
        return hashed

    def __init__(self, token):
        """ `token` is an int, or a string holding one. """
        self.value = int(token)


class MD5Token(Token):
    """
    A token for ``RandomPartitioner``.
    """

    @classmethod
    def hash_fn(cls, key):
        """
        Returns the absolute value of the MD5 digest of `key`, with the
        digest decoded by ``varint_unpack``.
        """
        # Hash the key that was passed in (not a fixed literal), so that
        # different keys map to different tokens.
        return abs(varint_unpack(md5(key).digest()))

    def __init__(self, token):
        """ `token` should be an int or string representing the token """
        self.value = int(token)


class BytesToken(Token):
    """
    Token type used with the ``ByteOrderedPartitioner``.
    """

    def __init__(self, token_string):
        """ Stores `token_string` unchanged as the token value. """
        self.value = token_string