Skip to content

Commit

Permalink
Moving loading and save sigs to rust
Browse files Browse the repository at this point in the history
move json parsing and init to rust
working on loading sigs

55 failing. Now it's failing because the SBT index is saving all signatures
(instead of only the one that was used to build the tree).
This was actually a feature (see #198) but it broke the SBT code
(it wasn't ready for that!)
  • Loading branch information
luizirber committed Dec 7, 2018
1 parent d0b26de commit 99ba8ef
Show file tree
Hide file tree
Showing 10 changed files with 182 additions and 468 deletions.
4 changes: 0 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ branches:
only:
- master
- "/^v.*$/"
addons:
apt:
packages:
- libyajl2
matrix:
fast_finish: true
include:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def build_native(spec):
'sourmash = sourmash.__main__:main'
]
},
"install_requires": ["screed>=0.9", "ijson", "khmer>=2.1", 'milksnake'],
"install_requires": ["screed>=0.9", "khmer>=2.1", 'milksnake'],
"setup_requires": ["setuptools>=38.6.0", "milksnake"],
"extras_require": {
'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy','recommonmark'],
Expand Down
9 changes: 9 additions & 0 deletions sourmash/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,12 @@ def implements_to_string(cls):
itervalues = lambda x: x.values()
NUL = 0
implements_to_string = lambda x: x


def to_bytes(s):
    """Return `s` as bytes, UTF-8 encoding it when it is a text string.

    Values that are already `bytes` are handed back untouched.

    Raises:
        TypeError: if `s` is neither one of `string_types` nor `bytes`.
    """
    accepted = string_types + (bytes,)
    if isinstance(s, accepted):
        # Only text needs encoding; bytes pass straight through.
        return s.encode('utf-8') if isinstance(s, string_types) else s

    raise TypeError("Requires a string-like sequence")
16 changes: 5 additions & 11 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import math
import copy

from ._compat import string_types, range_type
from ._lowlevel import ffi, lib
from ._compat import to_bytes
from .utils import RustObject, rustcall

# default MurmurHash seed
Expand Down Expand Up @@ -40,15 +40,6 @@ def get_scaled_for_max_hash(max_hash):
return int(round(get_minhash_max_hash() / max_hash, 0))


def to_bytes(s):
    """Return `s` as bytes, UTF-8 encoding it when it is a text string.

    Values that are already `bytes` are handed back untouched.

    Raises:
        TypeError: if `s` is neither one of `string_types` nor `bytes`.
    """
    accepted = string_types + (bytes,)
    if isinstance(s, accepted):
        # Only text needs encoding; bytes pass straight through.
        return s.encode('utf-8') if isinstance(s, string_types) else s

    raise TypeError("Requires a string-like sequence")


def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED):
"hash_murmur(string, [,seed])\n\n"
"Compute a hash for a string, optionally using a seed (an integer). "
Expand Down Expand Up @@ -84,7 +75,6 @@ class MinHash(RustObject):

def __init__(self, n, ksize, is_protein=False, track_abundance=False,
seed=MINHASH_DEFAULT_SEED, max_hash=0, mins=None, scaled=0):
self.track_abundance = track_abundance

if max_hash and scaled:
raise ValueError('cannot set both max_hash and scaled')
Expand Down Expand Up @@ -196,6 +186,10 @@ def subtract_mins(self, other):
b = set(other.get_mins())
return a - b

@property
def track_abundance(self):
    """Value reported by the native `kmerminhash_track_abundance` call."""
    native_getter = lib.kmerminhash_track_abundance
    return self._methodcall(native_getter)

@property
def seed(self):
    """Value reported by the native `kmerminhash_seed` call."""
    native_getter = lib.kmerminhash_seed
    return self._methodcall(native_getter)
Expand Down
202 changes: 161 additions & 41 deletions sourmash/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,56 @@
"""
from __future__ import print_function
import hashlib
import weakref

import gzip
import bz2file
import io
import sys

from . import signature_json
from .logging import error


from .logging import error
from .minhash import MinHash

from ._compat import to_bytes
from ._lowlevel import ffi, lib
from .utils import RustObject, rustcall, decode_str


SIGNATURE_VERSION=0.4


class SourmashSignature(object):
sig_refs = weakref.WeakKeyDictionary()
mhs_refs = weakref.WeakKeyDictionary()


class SourmashSignature(RustObject):
"Main class for signature information."
_name = ''
filename = ''

def __init__(self, minhash, name='', filename=''):
    """Create a native signature object and populate it.

    Args:
        minhash: sketch stored through the `minhash` property setter.
        name: optional name; only written when non-empty.
        filename: optional filename; only written when non-empty.
    """
    # The native object must exist before any property setter runs,
    # because the setters go through `_methodcall`.
    self._objptr = lib.signature_new()

    if name:
        self._name = name
    if filename:
        self.filename = filename

    self.minhash = minhash

    # All state lives in the native object; the former `self.d` dict
    # bookkeeping (class/name/filename/license keys) is no longer written.
    self.__dealloc_func__ = lib.signature_free

@property
def minhash(self):
    """MinHash wrapper around the pointer returned by `lib.signature_first_mh`.

    The wrapper is created with `shared=True`.
    """
    first_ptr = self._methodcall(lib.signature_first_mh)
    return MinHash._from_objptr(first_ptr, shared=True)

@minhash.setter
def minhash(self, value):
    # TODO: validate value is a MinHash
    native_mh = value._objptr
    self._methodcall(lib.signature_set_mh, native_mh)

def __hash__(self):
    """Hash the signature by the hex digest that `md5sum()` returns."""
    digest = self.md5sum()
    return hash(digest)
Expand All @@ -42,6 +66,19 @@ def __str__(self):
return "SourmashSignature({})".format(md5pref)
__repr__ = __str__

def minhashes(self):
    """Return a list of MinHash wrappers for this signature.

    `lib.signature_get_mhs` hands back an array of pointers and writes the
    element count into an out-parameter; each pointer is wrapped with
    `shared=True`.
    """
    count_out = ffi.new("uintptr_t *")
    raw_ptrs = self._methodcall(lib.signature_get_mhs, count_out)
    total = ffi.unpack(count_out, 1)[0]

    # NOTE: registering each wrapper in `mhs_refs` is intentionally left
    # disabled here, matching the existing behavior.
    return [
        MinHash._from_objptr(raw_ptrs[idx], shared=True)
        for idx in range(total)
    ]

def md5sum(self):
"Calculate md5 hash of the bottom sketch, specifically."
m = hashlib.md5()
Expand All @@ -51,29 +88,49 @@ def md5sum(self):
return m.hexdigest()

def __eq__(self, other):
    """Compare two signatures through the native `signature_eq` call.

    The earlier key-by-key comparison of the `d` dictionaries and the
    trailing minhash comparison are gone: the native call already returned
    unconditionally before the latter, leaving it unreachable.

    Args:
        other: object exposing `_objptr` (another signature).
    """
    return self._methodcall(lib.signature_eq, other._objptr)
@property
def _name(self):
    """Name fetched via `lib.signature_get_name` and passed to `decode_str`."""
    raw = self._methodcall(lib.signature_get_name)
    return decode_str(raw, free=True)

@_name.setter
def _name(self, value):
    # The native setter is given bytes, so text is converted first.
    encoded = to_bytes(value)
    self._methodcall(lib.signature_set_name, encoded)

def name(self):
    """Return as nice a name as possible, defaulting to md5 prefix.

    Preference order: the stored name, then the filename, then the first
    eight characters of `md5sum()`. Only the properties are consulted; the
    duplicate lookup in the old `d` dictionary is dropped so there is a
    single source of truth.
    """
    # `or` short-circuits, so later fallbacks are only evaluated when the
    # earlier ones are empty.
    return self._name or self.filename or self.md5sum()[:8]

@property
def filename(self):
    """Filename fetched via `lib.signature_get_filename` and passed to `decode_str`."""
    raw = self._methodcall(lib.signature_get_filename)
    return decode_str(raw, free=True)

@filename.setter
def filename(self, value):
    # The native setter is given bytes, so text is converted first.
    encoded = to_bytes(value)
    self._methodcall(lib.signature_set_filename, encoded)

@property
def license(self):
    """License string fetched via `lib.signature_get_license` and passed to `decode_str`."""
    raw = self._methodcall(lib.signature_get_license)
    return decode_str(raw, free=True)

def _display_name(self, max_length):
if 'name' in self.d:
name = self.d['name']
name = self._name
filename = self.filename

if name:
if len(name) > max_length:
name = name[:max_length - 3] + '...'
elif 'filename' in self.d:
name = self.d['filename']
elif filename:
name = filename
if len(name) > max_length:
name = '...' + name[-max_length + 3:]
else:
Expand Down Expand Up @@ -189,40 +246,90 @@ def load_signatures(data, ksize=None, select_moltype=None,
return

is_fp = False
is_filename = False
if hasattr(data, 'find') and data.find('sourmash_signature') == -1: # filename
done = False
try: # is it a file handle?
data.read
is_fp = True
done = True
except AttributeError:
pass
is_filename = True

# not a file handle - treat it like a filename.
if not done:
try:
data = _guess_open(data)
is_fp = True
done = True
except OSError as excinfo:
error(str(excinfo))
if do_raise:
raise
return
else: # file-like
if hasattr(data, 'mode'): # file handler
if 't' in data.mode: # need to reopen handler as binary
if sys.version_info >= (3, ):
data = data.buffer

size = ffi.new("uintptr_t *")

if ksize is None:
ksize = 0

if select_moltype is None:
select_moltype = ffi.NULL
else:
try:
select_moltype = select_moltype.encode('utf-8')
except AttributeError:
pass
try:
# TODO: we still can't pass a file-like object to rust...
buf = data.read()
is_fp = False
data.close()
data = buf
except AttributeError:
pass

try:
# JSON format
for sig in signature_json.load_signatures_json(data,
ignore_md5sum=ignore_md5sum):
if not ksize or ksize == sig.minhash.ksize:
if not select_moltype or \
sig.minhash.is_molecule_type(select_moltype):
yield sig
if is_fp:
sigs_ptr = rustcall(lib.signatures_load_buffer,
data,
len(data),
ignore_md5sum,
ksize,
select_moltype,
size)
#fp_c = ffi.cast("FILE *", data)
#sigs_ptr = rustcall(lib.signatures_load_file, fp_c, ignore_md5sum, size)
elif is_filename:
sigs_ptr = rustcall(lib.signatures_load_path,
data.encode('utf-8'),
ignore_md5sum,
ksize,
select_moltype,
size)

else:
if hasattr(data, 'encode'):
sigs_ptr = rustcall(lib.signatures_load_buffer,
data.encode('utf-8'),
len(data),
ignore_md5sum,
ksize,
select_moltype,
size)
else:
sigs_ptr = rustcall(lib.signatures_load_buffer,
data,
len(data),
ignore_md5sum,
ksize,
select_moltype,
size)

size = ffi.unpack(size, 1)[0]

sigs = []
for i in range(size):
sig = SourmashSignature._from_objptr(sigs_ptr[i], shared=True)
sigs.append(sig)
sig_refs[sig] = sigs

for sig in sigs:
yield sig

except Exception as e:
error("Error in parsing signature; quitting.")
error("Exception: {}", str(e))
Expand Down Expand Up @@ -254,4 +361,17 @@ def load_one_signature(data, ksize=None, select_moltype=None,

def save_signatures(siglist, fp=None):
    """Save multiple signatures into a JSON string (or into file handle 'fp').

    Args:
        siglist: iterable of objects exposing `_get_objptr()`.
        fp: optional file-like object with a `write` method.

    Returns:
        The serialized string when `fp` is None; otherwise None, after the
        string has been written to `fp`.
    """
    collected = [obj._get_objptr() for obj in siglist]
    siglist_c = ffi.new("Signature*[]", collected)

    # TODO: saving straight to a file handle on the native side
    # (`signatures_save_file`) is not enabled, so both cases serialize to
    # a buffer first.
    buf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected))
    result = decode_str(buf, free=True)

    if fp is None:
        return result

    fp.write(result)
    return None
Loading

0 comments on commit 99ba8ef

Please sign in to comment.