Source code for seccs

"""sec-cs --- the SECure Content Store.

This module provides an implementation of the secure content store data
structure introduced in [LS16]_.

`sec-cs` allows secure and efficient storage of contents in an existing
key-value database, providing the following features:

    * Confidentiality:
        Stored contents are securely encrypted using a symmetric key.
    * Authenticity:
        `sec-cs` guarantees authenticity of all stored contents,
        irrespective of guarantees of the underlying database.
    * Storage Efficiency:
        Data deduplication strategies are applied to all
        stored contents. When storing new contents, overlapping parts of
        existing contents are automatically reused so as to avoid redundancy.
        Storage costs of an n-byte content that differs only slightly from an
        existing content are in O(log n).

Note:
    The only `sec-cs` implementation currently included in this module is called
    :class:`SecCSLite`. While it is likely suitable for many projects and can be
    used as is, it is actually intended as a base class for a much more powerful
    variant, :class:`SecCS`, which makes some slight changes to the internal
    storage structure and will be published in the near future.

References:
    .. [LS16] Dominik Leibenger and Christoph Sorge (2016). sec-cs: Getting the
       Most out of Untrusted Cloud Storage.
       `arXiv:1606.03368 <http://arxiv.org/abs/1606.03368>`_
"""
import collections
from itertools import chain
import logging
import math
import struct

import seccs.crypto_wrapper
import seccs.rc

__version__ = '0.0.5'
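

# Usage sketch (added for illustration, not part of the original module):
# round-trips a content through the store using a plain dict as database.
# The AES_SIV_256 wrapper class name and its key length are assumptions
# about seccs.crypto_wrapper and may differ in your version.
def _usage_example():
    database = {}
    crypto_wrapper = seccs.crypto_wrapper.AES_SIV_256(b'k' * 64)  # assumed API
    store = SecCSLite(256, database, crypto_wrapper)
    digest = store.put_content(b'Hello World!')
    assert store.get_content(digest) == b'Hello World!'
    store.delete_content(digest)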


class UnsupportedChunkSizeError(Exception):
    """Raised when trying to instantiate SecCS with an unsupported chunk
    size."""
    pass
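

# Illustration (added, not part of the original module): the constructor
# rejects chunk sizes below twice the digest size R, as superchunks would
# otherwise have fewer than two expected children.  _StubWrapper is a
# hypothetical stand-in exposing only the DIGEST_SIZE attribute that is
# read during construction.
def _chunk_size_guard_example():
    class _StubWrapper(object):
        DIGEST_SIZE = 32  # pretend digests are 32 bytes, i.e., R = 32

    try:
        SecCSLite(63, {}, _StubWrapper())  # 63 < 2 * 32, must be rejected
    except UnsupportedChunkSizeError:
        pass
    SecCSLite(64, {}, _StubWrapper())  # 64 == 2 * 32, accepted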
class SecCSLite(object):
    """Secure Content Store `lite`.

    Basic implementation of the Secure Content Store data structure;
    supports only insertion (put), retrieval (get) and deletion (delete) of
    contents.

    Args:
        chunk_size (int): Target chunk size, i.e., expected size of all
            chunks stored in the database.
        database: Persistent database used as backend. Can be any object
            with a dict-like interface, i.e., any object implementing the
            operations __getitem__, __setitem__, __delitem__ and
            __contains__.
        crypto_wrapper (:class:`crypto_wrapper.BaseCryptoWrapper`): Crypto
            wrapper object that specifies cryptographic operations to be
            applied to data stored in the `database`.
        chunking_strategy (Optional[:class:`fastchunking.BaseChunkingStrategy`]):
            Chunking strategy that shall be applied to contents. Defaults to
            Rabin-Karp-based content-defined chunking with a 48-byte window
            size.
        reference_counter (Optional[:class:`seccs.rc.BaseReferenceCounter`]):
            Reference counting strategy. By default, reference counters are
            stored in `database` under keys `key` || `"r"`, where `key` is
            the key whose references are counted.
        **kwargs: Extra keyword arguments that you should NOT use unless you
            really, really know what you are doing, e.g.:

            * length_to_height_fn: Function that resolves content lengths
              to appropriate chunk tree heights. May be used to modify the
              multi-level chunking approach performed by default, e.g., to
              degrade it to single-level chunking or similar.
            * height_to_chunk_size_fn: Function that computes the target
              chunk size for a specific level (height) of a chunk tree. May
              be used to create `imbalanced` chunk trees.

    Raises:
        seccs.UnsupportedChunkSizeError: If the chosen chunk size would
            create superchunk nodes with fewer than two expected children,
            as efficiency guarantees would fail in this case (see [LS16]_).
    """

    def __init__(self, chunk_size, database, crypto_wrapper,
                 chunking_strategy=None, reference_counter=None, **kwargs):
        self._logger = logging.getLogger(__name__)

        """ initialize reference representations depending on digest size """
        self._digest_size = crypto_wrapper.DIGEST_SIZE
        self._initialize_reference_representations()

        """ require S >= 2R unless an alternate height_to_chunk_size_fn is
        given """
        if chunk_size < 2 * self._R \
                and 'height_to_chunk_size_fn' not in kwargs:
            raise UnsupportedChunkSizeError((
                'target average chunk size of {S} bytes is too small, must '
                'be at least {req} bytes').format(
                    S=chunk_size, req=2 * self._R))

        """ function that calculates the appropriate number of chunking
        levels for a specific content length """
        self._content_length_to_level = kwargs.get(
            'length_to_height_fn', None)
        if not self._content_length_to_level:
            def length_to_height_fn(size):
                return int(max(math.ceil(
                    math.log(size * 1.0 / chunk_size) /
                    math.log(chunk_size * 1.0 / self._R)), 0)) \
                    if size != 0 else 0
            self._content_length_to_level = length_to_height_fn

        """ dict that resolves a chunking level to its corresponding chunk
        size """
        self._level_to_chunksize = self._LevelToChunkSizeDict(
            chunk_size, self._R, kwargs.get('height_to_chunk_size_fn', None))

        """ Crypto wrapper.

        Expected interface:
            value, digest <- wrap_value(value, height, is_root)
            value <- unwrap_value(value, digest, height, is_root, length)
        """
        self._crypto_wrapper = crypto_wrapper

        """ Key-value database.

        Expected interface:
            __setitem__(key, value)
            __delitem__(key)
            value <- __getitem__(key)
            True/False <- __contains__(key)
        """
        self._database = database

        """ Reference counter.

        Expected interface:
            new_count <- inc(key)
            new_count <- dec(key)
            current_count <- get(key)
        """
        if reference_counter is None:
            reference_counter = seccs.rc.KeySuffixDatabaseReferenceCounter(
                database, b'r')
        self._reference_counter = reference_counter

        """ Chunking strategy.

        Needs to be a context-sensitive chunking strategy meeting the
        Chunking interface of the fastchunking package.
        """
        if chunking_strategy is None:
            import fastchunking
            chunking_strategy = fastchunking.RabinKarpCDC(
                48, seed=0)  # FIXME: seed should not be 0
        self._chunking_strategy = chunking_strategy
        self._W = chunking_strategy.window_size

    def _initialize_reference_representations(self):
        """Define how chunk references are represented."""
        # size of subchunk references
        digest_size = self._digest_size
        self._R = digest_size

        # format of content references
        self._content_reference_format = '!{digest_size}sQ'.format(
            digest_size=digest_size)

        # format of subchunk references
        self._subchunk_reference_format = '!{digest_size}s'.format(
            digest_size=digest_size)
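
    """ Example (added sketch): with a 16-byte digest, a content reference
    packs the root digest and the content length into a fixed 24-byte
    struct ('!16sQ'), while subchunk references are bare digests:

        >>> import struct
        >>> ref = struct.pack('!16sQ', b'd' * 16, 1024)
        >>> len(ref)
        24
        >>> struct.unpack('!16sQ', ref)[1]
        1024
    """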
    """
    Public interface.
    """

    def put_content(self, m, ignore_rc=False):
        """Insert a content into the data structure.

        Args:
            m (str): The message or content that shall be processed and
                inserted into the data structure.
            ignore_rc (Optional[bool]): If True, increase of reference
                counter for the root node of the generated chunk tree is
                skipped. Defaults to False.

        Returns:
            str: Digest of the content that allows its retrieval using
            :meth:`get_content`.
        """
        return self.put_content_and_check_if_new(m, ignore_rc)[0]
    def put_content_and_check_if_new(self, m, ignore_rc=False):
        """Insert a content into the data structure.

        Like :meth:`put_content`, but return value includes information
        whether the content had been in the data structure before.

        Args:
            m (str): The message or content that shall be processed and
                inserted into the data structure.
            ignore_rc (Optional[bool]): If True, increase of reference
                counter for the root node of the generated chunk tree is
                skipped. Defaults to False.

        Returns:
            tuple: (digest, is_new), where `digest` is the content's digest
            that allows its retrieval using :meth:`get_content`, and
            `is_new` is True if the content has been inserted for the first
            time and False if it had existed before.
        """
        l = len(m)
        h = self._content_length_to_level(l)
        k, is_new = self._put_chunk(m, h, h)
        k = k[0]
        if not ignore_rc:
            self._reference_counter.inc(k)
        return (struct.pack(self._content_reference_format, k, l), is_new)
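
    """ Example (added sketch, assuming a freshly created store `s` built
    like `store` in the usage sketch near the top of this module):
    identical contents are deduplicated, so a second insertion yields the
    same digest with `is_new` == False:

        >>> d1, new1 = s.put_content_and_check_if_new(b'hello')
        >>> d2, new2 = s.put_content_and_check_if_new(b'hello')
        >>> (d1 == d2, new1, new2)
        (True, True, False)
    """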
    def get_content(self, k):
        """Retrieve a content from the data structure.

        Args:
            k (str): The digest under which the content is stored.

        Returns:
            str: The content bytestring.
        """
        k, l = struct.unpack(self._content_reference_format, k)
        h = self._content_length_to_level(l)
        return self._get_chunk(k, h, h, l)
    def delete_content(self, k, ignore_rc=False):
        """Delete a content from the data structure.

        Decreases the content's reference counter and deletes its root chunk
        (possibly including children) if no references are left.

        Args:
            k (str): The digest under which the content is stored.
            ignore_rc (Optional[bool]): If True, decrease of reference
                counter of the root node is skipped and the root node
                (possibly including children) is deleted straight away.
        """
        k, l = struct.unpack(self._content_reference_format, k)
        h = self._content_length_to_level(l)
        return self._delete_content(k, h, h, l, ignore_rc)
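
    """ Example (added sketch, continuing the deduplication example above):
    reference counting ensures that shared chunks survive until the last
    content referencing them is deleted:

        >>> d = s.put_content(b'data')
        >>> _ = s.put_content(b'data')  # second reference to the same root
        >>> s.delete_content(d)         # counter drops to 1, nothing deleted
        >>> s.get_content(d) == b'data'
        True
        >>> s.delete_content(d)         # counter drops to 0, chunks removed
    """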
""" Helper functions for storing chunk tree nodes in the backend and retrieving them without having to deal with (de)serialization issues. """ def _store_node(self, v, h, root_h): """Store chunk tree node in database. Args: v: Node content (str in case of a leaf chunk, list of child references otherwise). h (int): Height of chunk tree node. root_h (int): Height of chunk tree. Returns: tuple: ((k, ), is_new), where `k` is the digest or key under which the node has been stored, and `is_new` is a flag indicating whether the node has just been inserted (True) or existed before (False). """ if isinstance(v, list): """ serialize superchunk: use each key as is (and replace None keys by a sequence of zero bytes) and concatenate everything """ serialized_chunk = b''.join([struct.pack( self._subchunk_reference_format, subchunk_key) for (subchunk_key,) in v]) else: """ leaf chunks do not need any serialization """ serialized_chunk = v """ compute chunk key using digest function """ serialized_chunk, k = self._crypto_wrapper.wrap_value( serialized_chunk, h, h == root_h) """ if the chunk already exists, verify its integrity and return its identifier """ if k in self._database and self._get_node(k, h, root_h) is not None and self._reference_counter.get(k) > 0: return ((k,), False) """ if we have a new superchunk, increase reference counters for its children """ if isinstance(v, list): for (chunk_id,) in v: self._reference_counter.inc(chunk_id) """ create the chunk and return its identifier """ self._database[k] = serialized_chunk return ((k,), True) def _get_node(self, k, h, root_h, l=-1): """Retrieve chunk tree node from database. Args: k (str): Digest or key of the node. h (int): Height of the chunk tree node in its chunk tree. root_h (int): Height of the full chunk tree. l [Optional(int)]: Size of the node representation, if known. Defaults to -1. Returns: Chunk tree node, i.e., a bytestring in case of a leaf chunk (h = 0), and a list of subchunk references in case of a superchunk (h > 0). """ v = self._database[k] serialized_chunk = self._crypto_wrapper.unwrap_value( v, k, h, h == root_h, l) if h > 0: """ deserialize superchunk """ return None if serialized_chunk is None else[ struct.unpack( self._subchunk_reference_format, serialized_chunk[i: i + self._R]) for i in range(0, len(serialized_chunk), self._R)] else: return serialized_chunk """ Functions for realization of operations. """ def _put_chunk(self, m, h, root_h): """ Compute subtree of a chunk tree for a specific content part and store it in the database. Args: m (str): Content part (message). h (int): Height of the considered content part in its content's chunk tree. root_h (int): Height of chunk tree of the whole content. Returns: tuple: ((k, ), is_new), where `k` is the digest or key under which the chunk has been stored, and `is_new` is a flag indicating whether the chunk has just been inserted (True) or existed before (False). """ assert h >= 0 if __debug__: self._logger.debug( 'Store chunk with length %d bytes (levels: %d)', len(m), h) if h == 0: """ No chunking required """ return self._store_node(m, h, root_h) # return (k), is_new else: """ Chunking required """ if __debug__: self._logger.debug( 'Do chunking with target chunk size: %d', self._level_to_chunksize[h - 1]) """ Determine chunk boundaries for all levels """ # first boundary is before the content boundaries = [(0, h - 1)] # determine subsequent boundaries... 
chunker = self._chunking_strategy.create_multilevel_chunker( [self._level_to_chunksize[height] for height in range(0, h)]) # ...and append them to the boundaries list # W-1 zeros are prepended boundaries.extend( chunker.next_chunk_boundaries_levels(m, self._W - 1)) # make sure that the last boundary is after the content and is a # height (h-1)-boundary if boundaries[-1][0] == len(m): boundaries.pop() boundaries.append((len(m), h - 1)) """ Build chunk tree and store its nodes """ # initialize temporary nodes (one for each level) nodes_levels = dict([(height, []) for height in range(0, h + 1)]) # process chunks specified by boundaries for ((start_position, _), (end_position, boundary_height)) \ in zip(boundaries, boundaries[1:]): # place leaf chunk at height 0 # is guaranteed to be cleared during this loop due to the # subsequent inner loop nodes_levels[0] = m[start_position:end_position] # update superchunks and insert nodes that are already complete # (including the leaf chunk) for height in range(0, boundary_height + 1): # in order to extend the superchunk, we have to insert its # next subchunk nodes_levels[height + 1].append(self._store_node(nodes_levels[height], height, root_h)[0]) # subchunk is complete, so prepare new subchunk nodes_levels[height] = [] # insert root chunk (return (k), is_new) return self._store_node(nodes_levels[h], h, root_h) def _get_chunk(self, k, h, root_h, l): """ Get content part which is stored under some key in the database. If content consists of multiple chunks, put them together. Args: k (str): Digest or key of the node representing the part of the chunk tree that is to be retrieved. h (int): Height of the node representing the part of the chunk tree that is to be retrieved. root_h (int): Height of the chunk tree to which the chunk requested here belongs. l (int): Length of the content represented by the chunk that is to be retrieved. Returns: Content part. """ chunks = [(k,)] for height in range(h, 0, -1): chunks = chain.from_iterable([self._get_node(k, height, root_h) for (k,) in chunks]) return b''.join([self._get_node(k, 0, root_h) for (k,) in chunks]) def _delete_content(self, k, h, root_h, l, ignore_rc=True): """ Remove reference from content's root node and delete its root node (possibly including children) if necessary. Args: k (str): The digest under which the content is stored. h (int): Height of node `k` in its chunk tree. root_h (int): Height of the chunk tree `k` is part of. l (int): Length of the content represented by `k`. ignore_rc (Optional[bool]): If True, decrease of reference counter of the root node is skipped and root node (possibly including children) is deleted straight away. """ if ignore_rc or self._reference_counter.dec(k) == 0: return self._delete_chunk(k, h, root_h, l) def _delete_chunk(self, k, h, root_h, l=-1): """ Delete a chunk and all children whose reference counters drop to zero. Args: k (str): The digest of the chunk tree node that is to be deleted. h (int): The height of the chunk tree node `k`. root_h (int): The height of the chunk tree `k` is part of. l [Optional(int)]: The length of the content part represented by `k`. Defaults to -1. """ if h > 0: v = self._get_node(k, h, root_h) for (k_child,) in v: if self._reference_counter.dec(k_child) == 0: self._delete_chunk(k_child, h - 1, root_h) del self._database[k] """ Generic helpers. """ class _LevelToChunkSizeDict(collections.defaultdict): """ Dict that caches the target chunk sizes for different chunk tree heights to save some computations. 
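
    """ Example (added sketch): with chunk_size S = 256 and digest size
    R = 16, each superchunk references S/R = 16 children on average, so the
    default length_to_height_fn defined in __init__ grows logarithmically
    with the content size:

        >>> import math
        >>> S, R = 256, 16
        >>> h = lambda n: int(max(math.ceil(
        ...     math.log(n * 1.0 / S) / math.log(S * 1.0 / R)), 0)) if n else 0
        >>> [h(n) for n in (0, 255, 257, 4095)]
        [0, 0, 1, 1]
    """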
""" def __init__(self, target_chunksize, R, height_to_chunk_size_fn=None): collections.defaultdict.__init__(self) self._target_chunksize = target_chunksize self._R = R if not height_to_chunk_size_fn: height_to_chunk_size_fn = lambda L: int( self._target_chunksize ** (L + 1) / self._R ** L) self._height_to_chunk_size_fn = height_to_chunk_size_fn def __missing__(self, L): self[L] = chunksize = self._height_to_chunk_size_fn(L) return chunksize