Source code for seccs

"""sec-cs --- the SECure Content Store.

This module provides an implementation of the secure content store data
structure introduced in [LS16]_.

`sec-cs` allows secure and efficient storage of contents in an existing
key-value database, providing the following features:

    * Confidentiality:
        Stored contents are securely encrypted using a symmetric key.
    * Authenticity:
        `sec-cs` guarantees authenticity of all stored contents,
        irrespective of gurantees of the underlying database.
    * Storage Efficiency:
        Data deduplication strategies are applied to all
        stored contents. When storing new contents, overlapping parts of
        existing contents are automatically reused as to avoid redundancy.
        Storage costs of an n-bytes content that differs only slightly from an
        existing content are in O(log n).
        
Note:
    The only `sec-cs` implementation currently included in this module is called
    :class:`SecCSLite`. While it is likely suitable for many projects and can be
    used as is, it is actually intended as a base class for a much more powerful
    variant, :class:`SecCS`, which makes some slight changes to the internal
    storage structure and will be published in the near future.

References:
    .. [LS16] Dominik Leibenger and Christoph Sorge (2016). sec-cs: Getting the
       Most out of Untrusted Cloud Storage.
       `arXiv:1606.03368 <http://arxiv.org/abs/1606.03368>`_
"""
import collections
from itertools import chain
import itertools
import logging
import math
import struct

import seccs.crypto_wrapper
import seccs.rc

__version__ = '0.0.5'


[docs]class UnsupportedChunkSizeError(Exception):
    """Raised when trying to instantiate SecCS with an unsupported chunk size."""
    pass


[docs]class SecCSLite(object):

    """Secure Content Store `lite`.

    Basic implementation of the Secure Content Store data structure, supports
    only insertion (put), retrieval (get) and deletion (delete) of contents.

    Args:
        chunk_size (int): Target chunk size, i.e., expected size of all chunks
            stored in the database.
        database: Persistent database used as backend. Can be any object with
            a dict-like interface, i.e., any object implementing the operations
            __getitem__, __setitem__, __delitem__ and __contains__.
        crypto_wrapper (:class:`crypto_wrapper.BaseCryptoWrapper`): Crypto
            wrapper object that specifies cryptographic operations to be applied
            to data stored in the `database`.
        chunking_strategy (Optional[:class:`fastchunking.BaseChunkingStrategy`]):
            Chunking strategy that shall be applied to contents. Defaults to
            Rabin-Karp-based content defined chunking with 48-bytes window size.
        reference_counter (Optional[:class:`seccs.rc.BaseReferenceCounter`]):
            Reference counting strategy. By default, reference counters are
            stored in `database` under keys `key` || `"r"`, where `key` is the key
            whose references are counted.
        **kwargs: Extra keyword arguments that you should NOT use unless you
            really, really know what you are doing, e.g.:

            * length_to_height_fn: Function that resolves content lengths to
              appropriate chunk tree heights. May be used to modify the
              multi-level chunking approach performed by default, e.g., to
              degrade it to single-level chunking or similar.
            * height_to_chunk_size_fn: Function that computes the target chunk
              size for a specific level (height) of a chunk tree. May be used
              to create `imbalanced` chunk trees.

    Raises:
        seccs.UnsupportedChunkSizeError: If the chosen chunk size would create
            superchunk nodes with less than two expected children as efficiency
            guarantees would fail in this case (see [LS16]_).
    """

    def __init__(
            self, chunk_size, database, crypto_wrapper, chunking_strategy=None,
            reference_counter=None, **kwargs):
        self._logger = logging.getLogger(__name__)

        """ initialize reference representations depending on digest size """
        self._digest_size = crypto_wrapper.DIGEST_SIZE
        self._initialize_reference_representations()

        """ require S >= 2R unless an alternate height_to_chunk_size_fn is given """
        if chunk_size < 2 * self._R and 'height_to_chunk_size_fn' not in kwargs:
            raise UnsupportedChunkSizeError((
                'target average chunk size of {S} bytes is too small, must be '
                'at least {req} bytes').format(S=chunk_size, req=2 * self._R))

        """ function that calculates the appropriate number of chunking levels
            for a specific content length """
        self._content_length_to_level = kwargs.get('length_to_height_fn', None)
        if not self._content_length_to_level:
            def length_to_height_fn(size):
                return int(max(math.ceil(math.log(size * 1.0 / chunk_size)
                                         / math.log(chunk_size * 1.0 / self._R)),
                               0)) if size != 0 else 0
            self._content_length_to_level = length_to_height_fn

        """ dict that resolves a chunking level to its corresponding chunk size """
        self._level_to_chunksize = self._LevelToChunkSizeDict(
            chunk_size, self._R, kwargs.get('height_to_chunk_size_fn', None))

        """
        Crypto wrapper.

          Expected interface:
            value, digest <- wrap_value(value, height, is_root)
            value <- unwrap_value(value, digest, height, is_root, length)
        """
        self._crypto_wrapper = crypto_wrapper

        """
        Key-value database.

          Expected interface:
            __setitem__(key, value)
            __delitem__(key)
            value <- __getitem__(key)
            True/False <- __contains__(key)
        """
        self._database = database

        """
        Reference counter.

          Expected interface:
            new_count <- inc(key)
            new_count <- dec(key)
        """
        if reference_counter is None:
            reference_counter = seccs.rc.KeySuffixDatabaseReferenceCounter(
                database, b'r')
        self._reference_counter = reference_counter

        """
        Chunking strategy.

        Needs to be a context-sensitive chunking strategy meeting the Chunking
        interface of the fastchunking package.
        """
        if chunking_strategy is None:
            import fastchunking
            chunking_strategy = fastchunking.RabinKarpCDC(
                48, seed=0)  # FIXME: seed should not be 0
        self._chunking_strategy = chunking_strategy
        self._W = chunking_strategy.window_size

    def _initialize_reference_representations(self):
        """Define how chunk references are represented."""

        # size of subchunk references
        digest_size = self._digest_size
        self._R = digest_size

        # format of content references
        self._content_reference_format = '!{digest_size}sQ'.format(
            digest_size=digest_size)

        # format of subchunk references
        self._subchunk_reference_format = '!{digest_size}s'.format(
            digest_size=digest_size)

    """
    Public interface.
    """

[docs]    def put_content(self, m, ignore_rc=False):
        """Insert a content into the data structure.

        Args:
            m (str): The message or content that shall be processed and inserted
                into the data structure.
            ignore_rc (Optional[bool]): If True, increase of reference counter
                for the root node of the generated chunk tree is skipped.
                Defaults to False.

        Returns:
            str: Digest of the content that allows its retrieval using
            :meth:`get_content`.
        """
        return self.put_content_and_check_if_new(m, ignore_rc)[0]

[docs]    def put_content_and_check_if_new(self, m, ignore_rc=False):
        """Insert a content into the data structure.

        Like :meth:`put_content`, but return value includes information whether
        the content had been in the data structure before.

        Args:
            m (str): The message or content that shall be processed and inserted
                into the data structure.
            ignore_rc (Optional[bool]): If True, increase of reference counter
                for the root node of the generated chunk tree is skipped.
                Defaults to False.

        Returns:
            tuple: (digest, is_new), where `digest` is the content's digest that
            allows its retrieval using :meth:`get_content`, and `is_new`
            is True if the content has been inserted for the first time and
            False if it had existed before.
        """
        l = len(m)
        h = self._content_length_to_level(l)
        k, is_new = self._put_chunk(m, h, h)
        k = k[0]
        if not ignore_rc:
            self._reference_counter.inc(k)
        return (struct.pack(self._content_reference_format, k, l), is_new)

[docs]    def get_content(self, k):
        """Retrieve a content from the data structure.

        Args:
            k (str): The digest under which the content is stored.

        Returns:
            str: The content bytestring.
        """
        k, l = struct.unpack(self._content_reference_format, k)
        h = self._content_length_to_level(l)
        return self._get_chunk(k, h, h, l)

[docs]    def delete_content(self, k, ignore_rc=False):
        """Delete a content from the data structure.

        Decreases the content's reference counter and deletes its root chunk
        (possibly including children) if no references are left.

        Args:
            k (str): The digest under which the content is stored.
            ignore_rc (Optional[bool]): If True, decrease of reference counter
                of the root node is skipped and root node (possibly including
                children) is deleted straight away.
        """
        k, l = struct.unpack(self._content_reference_format, k)
        h = self._content_length_to_level(l)
        return self._delete_content(k, h, h, l, ignore_rc)

    """
    Helper functions for storing chunk tree nodes in the backend and retrieving
    them without having to deal with (de)serialization issues.
    """

    def _store_node(self, v, h, root_h):
        """Store chunk tree node in database.

        Args:
            v: Node content (str in case of a leaf chunk, list of child
                references otherwise).
            h (int): Height of chunk tree node.
            root_h (int): Height of chunk tree.

        Returns:
            tuple: ((k, ), is_new), where `k` is the digest or key under which
                the node has been stored, and `is_new` is a flag indicating
                whether the node has just been inserted (True) or existed before
                (False).
        """

        if isinstance(v, list):
            """ serialize superchunk:
                use each key as is (and replace None keys by a sequence of zero
                bytes) and concatenate everything """
            serialized_chunk = b''.join([struct.pack(
                self._subchunk_reference_format,
                subchunk_key) for (subchunk_key,) in v])

        else:
            """ leaf chunks do not need any serialization """
            serialized_chunk = v

        """ compute chunk key using digest function """
        serialized_chunk, k = self._crypto_wrapper.wrap_value(
            serialized_chunk, h, h == root_h)

        """ if the chunk already exists, verify its integrity and return its identifier """
        if k in self._database and self._get_node(k, h, root_h) is not None and self._reference_counter.get(k) > 0:
            return ((k,), False)

        """ if we have a new superchunk, increase reference counters for its children """
        if isinstance(v, list):
            for (chunk_id,) in v:
                self._reference_counter.inc(chunk_id)

        """ create the chunk and return its identifier """
        self._database[k] = serialized_chunk

        return ((k,), True)

    def _get_node(self, k, h, root_h, l=-1):
        """Retrieve chunk tree node from database.

        Args:
            k (str): Digest or key of the node.
            h (int): Height of the chunk tree node in its chunk tree.
            root_h (int): Height of the full chunk tree.
            l [Optional(int)]: Size of the node representation, if known.
                Defaults to -1.

        Returns:
            Chunk tree node, i.e., a bytestring in case of a leaf chunk (h = 0),
            and a list of subchunk references in case of a superchunk (h > 0).
        """

        v = self._database[k]

        serialized_chunk = self._crypto_wrapper.unwrap_value(
            v, k, h, h == root_h, l)

        if h > 0:
            """ deserialize superchunk """
            return None if serialized_chunk is None else[
                struct.unpack(
                    self._subchunk_reference_format,
                    serialized_chunk[i: i + self._R])
                for i in range(0, len(serialized_chunk),
                               self._R)]
        else:
            return serialized_chunk

    """
    Functions for realization of operations.
    """

    def _put_chunk(self, m, h, root_h):
        """ Compute subtree of a chunk tree for a specific content part and
        store it in the database.

        Args:
            m (str): Content part (message).
            h (int): Height of the considered content part in its content's
                chunk tree.
            root_h (int): Height of chunk tree of the whole content.

        Returns:
            tuple: ((k, ), is_new), where `k` is the digest or key under which
                the chunk has been stored, and `is_new` is a flag indicating
                whether the chunk has just been inserted (True) or existed
                before (False).
        """
        assert h >= 0
        if __debug__:
            self._logger.debug(
                'Store chunk with length %d bytes (levels: %d)', len(m), h)

        if h == 0:
            """ No chunking required """
            return self._store_node(m, h, root_h)  # return (k), is_new

        else:
            """ Chunking required """
            if __debug__:
                self._logger.debug(
                    'Do chunking with target chunk size: %d', self._level_to_chunksize[h - 1])

            """ Determine chunk boundaries for all levels """

            # first boundary is before the content
            boundaries = [(0, h - 1)]

            # determine subsequent boundaries...
            chunker = self._chunking_strategy.create_multilevel_chunker(
                [self._level_to_chunksize[height] for height in range(0, h)])
            # ...and append them to the boundaries list
            # W-1 zeros are prepended
            boundaries.extend(
                chunker.next_chunk_boundaries_levels(m, self._W - 1))

            # make sure that the last boundary is after the content and is a
            # height (h-1)-boundary
            if boundaries[-1][0] == len(m):
                boundaries.pop()
            boundaries.append((len(m), h - 1))

            """ Build chunk tree and store its nodes """

            # initialize temporary nodes (one for each level)
            nodes_levels = dict([(height, []) for height in range(0, h + 1)])

            # process chunks specified by boundaries
            for ((start_position, _), (end_position, boundary_height)) \
                    in zip(boundaries, boundaries[1:]):
                # place leaf chunk at height 0
                # is guaranteed to be cleared during this loop due to the
                # subsequent inner loop
                nodes_levels[0] = m[start_position:end_position]

                # update superchunks and insert nodes that are already complete
                # (including the leaf chunk)
                for height in range(0, boundary_height + 1):
                    # in order to extend the superchunk, we have to insert its
                    # next subchunk
                    nodes_levels[height + 1].append(self._store_node(nodes_levels[height],
                                                                     height, root_h)[0])
                    # subchunk is complete, so prepare new subchunk
                    nodes_levels[height] = []

            # insert root chunk (return (k), is_new)
            return self._store_node(nodes_levels[h], h, root_h)

    def _get_chunk(self, k, h, root_h, l):
        """ Get content part which is stored under some key in the database.
        If content consists of multiple chunks, put them together.

        Args:
            k (str): Digest or key of the node representing the part of the
                chunk tree that is to be retrieved.
            h (int): Height of the node representing the part of the chunk tree
                that is to be retrieved.
            root_h (int): Height of the chunk tree to which the chunk requested
                here belongs.
            l (int): Length of the content represented by the chunk that is to
                be retrieved.

        Returns:
            Content part.
        """
        chunks = [(k,)]
        for height in range(h, 0, -1):
            chunks = chain.from_iterable([self._get_node(k, height, root_h)
                                          for (k,) in chunks])
        return b''.join([self._get_node(k, 0, root_h) for (k,) in chunks])

    def _delete_content(self, k, h, root_h, l, ignore_rc=True):
        """ Remove reference from content's root node and delete its root node
        (possibly including children) if necessary.

        Args:
            k (str): The digest under which the content is stored.
            h (int): Height of node `k` in its chunk tree.
            root_h (int): Height of the chunk tree `k` is part of.
            l (int): Length of the content represented by `k`.
            ignore_rc (Optional[bool]): If True, decrease of reference counter
                of the root node is skipped and root node (possibly including
                children) is deleted straight away.
        """
        if ignore_rc or self._reference_counter.dec(k) == 0:
            return self._delete_chunk(k, h, root_h, l)

    def _delete_chunk(self, k, h, root_h, l=-1):
        """ Delete a chunk and all children whose reference counters drop to
        zero.

        Args:
            k (str): The digest of the chunk tree node that is to be deleted.
            h (int): The height of the chunk tree node `k`.
            root_h (int): The height of the chunk tree `k` is part of.
            l [Optional(int)]: The length of the content part represented by
                `k`. Defaults to -1.
        """
        if h > 0:
            v = self._get_node(k, h, root_h)
            for (k_child,) in v:
                if self._reference_counter.dec(k_child) == 0:
                    self._delete_chunk(k_child, h - 1, root_h)
        del self._database[k]

    """
    Generic helpers.
    """

    class _LevelToChunkSizeDict(collections.defaultdict):

        """
        Dict that caches the target chunk sizes for different chunk tree heights
        to save some computations.
        """

        def __init__(self, target_chunksize, R, height_to_chunk_size_fn=None):
            collections.defaultdict.__init__(self)
            self._target_chunksize = target_chunksize
            self._R = R

            if not height_to_chunk_size_fn:
                height_to_chunk_size_fn = lambda L: int(
                    self._target_chunksize ** (L + 1) / self._R ** L)
            self._height_to_chunk_size_fn = height_to_chunk_size_fn

        def __missing__(self, L):
            self[L] = chunksize = self._height_to_chunk_size_fn(L)
            return chunksize