Skip to content

Commit

Permalink
add a '--rna' option (#585)
Browse files (browse the repository at this point in the history)
* add --rna and refer to 'nucleotides' instead of DNA
* add tests for --rna, refactoring some existing ones too
  • Loading branch information
ctb authored and luizirber committed Jan 3, 2019
1 parent 4ccdd3d commit bddd23f
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 68 deletions.
12 changes: 6 additions & 6 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def compute(args):
sys.exit(-1)

if args.input_is_protein and args.dna:
notify('WARNING: input is protein, turning off DNA hashing')
notify('WARNING: input is protein, turning off nucleotide hashing')
args.dna = False
args.protein = True

Expand Down Expand Up @@ -148,13 +148,13 @@ def compute(args):

num_sigs = 0
if args.dna and args.protein:
notify('Computing both DNA and protein signatures.')
notify('Computing both nucleotide and protein signatures.')
num_sigs = 2*len(ksizes)
elif args.dna:
notify('Computing only DNA (and not protein) signatures.')
notify('Computing only nucleotide (and not protein) signatures.')
num_sigs = len(ksizes)
elif args.protein:
notify('Computing only protein (and not DNA) signatures.')
notify('Computing only protein (and not nucleotide) signatures.')
num_sigs = len(ksizes)

if args.protein and not args.input_is_protein:
Expand Down Expand Up @@ -1304,12 +1304,12 @@ def watch(args):
set_quiet(args.quiet)

if args.input_is_protein and args.dna:
notify('WARNING: input is protein, turning off DNA hashing.')
notify('WARNING: input is protein, turning off nucleotide hashing.')
args.dna = False
args.protein = True

if args.dna and args.protein:
notify('ERROR: cannot use "watch" with both DNA and protein.')
notify('ERROR: cannot use "watch" with both nucleotide and protein.')

if args.dna:
moltype = 'DNA'
Expand Down
22 changes: 12 additions & 10 deletions sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ def add_moltype_args(parser):
help='do not choose a protein signature')
parser.set_defaults(protein=False)

parser.add_argument('--dna', dest='dna', default=None,
parser.add_argument('--dna', '--rna', dest='dna', default=None,
action='store_true',
help='choose a DNA signature (default: True)')
parser.add_argument('--no-dna', dest='dna', action='store_false',
help='do not choose a DNA signature')
help='choose a nucleotide signature (default: True)')
parser.add_argument('--no-dna', '--no-rna', dest='dna',
action='store_false',
help='do not choose a nucleotide signature')
parser.set_defaults(dna=None)


Expand All @@ -38,11 +39,12 @@ def add_construct_moltype_args(parser):
help='do not build protein signatures')
parser.set_defaults(protein=False)

parser.add_argument('--dna', dest='dna', default=None,
parser.add_argument('--dna', '--rna', dest='dna', default=None,
action='store_true',
help='build DNA signatures (default: True)')
parser.add_argument('--no-dna', dest='dna', action='store_false',
help='do not build DNA signatures')
help='build nucleotide signatures (default: True)')
parser.add_argument('--no-dna', '--no-rna', dest='dna',
action='store_false',
help='do not build nucleotide signatures')
parser.set_defaults(dna=True)


Expand All @@ -65,7 +67,7 @@ def get_moltype(sig, require=False):
def calculate_moltype(args, default=None):
if args.protein:
if args.dna is True:
error('cannot specify both --dna and --protein!')
error('cannot specify both --dna/--rna and --protein!')
sys.exit(-1)
args.dna = False

Expand Down Expand Up @@ -104,7 +106,7 @@ def load_query_signature(filename, ksize, select_moltype):
if len(sl) != 1:
error('When loading query from "{}"', filename)
error('{} signatures matching ksize and molecule type;', len(sl))
error('need exactly one. Specify --ksize or --dna/--protein.')
error('need exactly one. Specify --ksize or --dna, --rna, or --protein.')
sys.exit(-1)

return sl[0]
Expand Down
163 changes: 111 additions & 52 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,24 @@ def test_do_sourmash_compute_multik_protein_bad_ksize():
assert 'protein ksizes must be divisible by 3' in err


@utils.in_tempdir
def test_do_sourmash_compute_multik_only_protein(c):
    """Run 'sourmash compute' with --protein and --no-dna at k=21,30.

    Verifies that 'short.fa.sig' is created under ``c.location`` and that
    it contains exactly two signatures, with ksizes 21 and 30 both present.
    """
    fasta_path = utils.get_test_data('short.fa')
    c.run_sourmash('compute', '-k', '21,30', '--protein', '--no-dna',
                   fasta_path)

    sig_path = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(sig_path)

    # Read the whole file as text and parse the signatures from that string.
    with open(sig_path, 'rt') as sig_file:
        loaded = list(signature.load_signatures(sig_file.read()))

    assert len(loaded) == 2
    seen_ksizes = {sig.minhash.ksize for sig in loaded}
    assert 21 in seen_ksizes
    assert 30 in seen_ksizes


def test_do_sourmash_compute_multik_protein_input_non_div3_ksize():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short-protein.fa')
Expand All @@ -341,24 +359,23 @@ def test_do_sourmash_compute_multik_protein_input_non_div3_ksize():
assert os.path.exists(outfile)


def test_do_sourmash_compute_multik_only_protein():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
status, out, err = utils.runscript('sourmash',
['compute', '-k', '21,30',
'--protein', '--no-dna',
testdata1],
in_directory=location)
outfile = os.path.join(location, 'short.fa.sig')
assert os.path.exists(outfile)
@utils.in_tempdir
def test_do_sourmash_compute_multik_only_protein_no_rna(c):
# test --no-rna as well (otherwise identical to previous test)
testdata1 = utils.get_test_data('short.fa')

with open(outfile, 'rt') as fp:
sigdata = fp.read()
siglist = list(signature.load_signatures(sigdata))
assert len(siglist) == 2
ksizes = set([ x.minhash.ksize for x in siglist ])
assert 21 in ksizes
assert 30 in ksizes
c.run_sourmash('compute', '-k', '21,30',
'--protein', '--no-rna', testdata1)
outfile = os.path.join(c.location, 'short.fa.sig')
assert os.path.exists(outfile)

with open(outfile, 'rt') as fp:
sigdata = fp.read()
siglist = list(signature.load_signatures(sigdata))
assert len(siglist) == 2
ksizes = set([ x.minhash.ksize for x in siglist ])
assert 21 in ksizes
assert 30 in ksizes


def test_do_sourmash_compute_protein_bad_sequences():
Expand Down Expand Up @@ -584,26 +601,42 @@ def test_do_sourmash_check_protein_comparisons():
assert round(sig2_aa.similarity(sig2_trans), 3) == 0.0


def test_do_sourmash_check_knowngood_dna_comparisons():
@utils.in_tempdir
def test_do_sourmash_check_knowngood_dna_comparisons(c):
# this test checks against a known good signature calculated
# by utils/compute-dna-mh-another-way.py
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('ecoli.genes.fna')
status, out, err = utils.runscript('sourmash',
['compute', '-k', '21',
'--singleton', '--dna',
testdata1],
in_directory=location)
sig1 = os.path.join(location, 'ecoli.genes.fna.sig')
assert os.path.exists(sig1)
testdata1 = utils.get_test_data('ecoli.genes.fna')
c.run_sourmash('compute', '-k', '21',
'--singleton', '--dna',
testdata1)
sig1 = c.output('ecoli.genes.fna.sig')
assert os.path.exists(sig1)

x = list(signature.load_signatures(sig1))
sig1, sig2 = sorted(x, key=lambda x: x.name())
x = list(signature.load_signatures(sig1))
sig1, sig2 = sorted(x, key=lambda x: x.name())

knowngood = utils.get_test_data('benchmark.dna.sig')
good = list(signature.load_signatures(knowngood))[0]

assert sig2.similarity(good) == 1.0


@utils.in_tempdir
def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c):
# check the --rna flag; otherwise identical to previous test.
testdata1 = utils.get_test_data('ecoli.genes.fna')
c.run_sourmash('compute', '-k', '21', '--singleton', '--rna',
testdata1)
sig1 = c.output('ecoli.genes.fna.sig')
assert os.path.exists(sig1)

knowngood = utils.get_test_data('benchmark.dna.sig')
good = list(signature.load_signatures(knowngood))[0]
x = list(signature.load_signatures(sig1))
sig1, sig2 = sorted(x, key=lambda x: x.name())

assert sig2.similarity(good) == 1.0
knowngood = utils.get_test_data('benchmark.dna.sig')
good = list(signature.load_signatures(knowngood))[0]

assert sig2.similarity(good) == 1.0


def test_do_sourmash_check_knowngood_input_protein_comparisons():
Expand Down Expand Up @@ -652,31 +685,56 @@ def test_do_sourmash_check_knowngood_protein_comparisons():
assert sig2_trans.similarity(good_trans) == 1.0


def test_do_basic_compare():
@utils.in_tempdir
def test_do_basic_compare(c):
# try doing a basic compare
import numpy
with utils.TempDirectory() as location:
testsigs = utils.get_test_data('genome-s1*.sig')
testsigs = glob.glob(testsigs)
testsigs = utils.get_test_data('genome-s1*.sig')
testsigs = glob.glob(testsigs)

args = ['compare', '-o', 'cmp', '-k', '21', '--dna'] + testsigs
status, out, err = utils.runscript('sourmash', args,
in_directory=location)
c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs)

cmp_outfile = c.output('cmp')
assert os.path.exists(cmp_outfile)
cmp_out = numpy.load(cmp_outfile)

sigs = []
for fn in testsigs:
sigs.append(sourmash_lib.load_one_signature(fn, ksize=21,
select_moltype='dna'))

cmp_calc = numpy.zeros([len(sigs), len(sigs)])
for i, si in enumerate(sigs):
for j, sj in enumerate(sigs):
cmp_calc[i][j] = si.similarity(sj)

assert (cmp_out == cmp_calc).all()


@utils.in_tempdir
def test_do_basic_compare_using_rna_arg(c):
# try doing a basic compare using --rna instead of --dna
import numpy
testsigs = utils.get_test_data('genome-s1*.sig')
testsigs = glob.glob(testsigs)

c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--rna', *testsigs)

cmp_outfile = os.path.join(location, 'cmp')
assert os.path.exists(cmp_outfile)
cmp_out = numpy.load(cmp_outfile)
cmp_outfile = c.output('cmp')
assert os.path.exists(cmp_outfile)
cmp_out = numpy.load(cmp_outfile)

sigs = []
for fn in testsigs:
sigs.append(sourmash_lib.load_one_signature(fn, ksize=21,
select_moltype='dna'))
sigs = []
for fn in testsigs:
sigs.append(sourmash_lib.load_one_signature(fn, ksize=21,
select_moltype='dna'))

cmp_calc = numpy.zeros([len(sigs), len(sigs)])
for i, si in enumerate(sigs):
for j, sj in enumerate(sigs):
cmp_calc[i][j] = si.similarity(sj)
cmp_calc = numpy.zeros([len(sigs), len(sigs)])
for i, si in enumerate(sigs):
for j, sj in enumerate(sigs):
cmp_calc[i][j] = si.similarity(sj)

assert (cmp_out == cmp_calc).all()
assert (cmp_out == cmp_calc).all()


def test_do_compare_quiet():
Expand Down Expand Up @@ -1866,7 +1924,8 @@ def test_do_sourmash_index_bad_args():
'--dna', '--protein'],
in_directory=location, fail_ok=True)

assert "cannot specify both --dna and --protein!" in err
print(out, err)
assert "cannot specify both --dna/--rna and --protein!" in err
assert status != 0


Expand Down

0 comments on commit bddd23f

Please sign in to comment.