Skip to content

Commit

Permalink
support for billions of references
Browse files (browse the repository at this point in the history)
  • Loading branch information
Brian Ondov committed Nov 17, 2015
1 parent 9dc52dd commit bcc0d61
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 30 deletions.
28 changes: 14 additions & 14 deletions src/mash/CommandDistance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ int CommandDistance::run() const

sketch.initFromSequence(refArgVector, parameters);

for ( int i = 0; i < sketch.getReferenceCount(); i++ )
for ( uint64_t i = 0; i < sketch.getReferenceCount(); i++ )
{
uint64_t length = sketch.getReference(i).length;

Expand Down Expand Up @@ -336,16 +336,16 @@ int CommandDistance::run() const

void CommandDistance::writeOutput(CompareOutput * output, bool table) const
{
int refCount = output->sketchRef.getReferenceCount();
uint64_t refCount = output->sketchRef.getReferenceCount();

for ( int i = 0; i < output->sketchQuery->getReferenceCount(); i++ )
for ( uint64_t i = 0; i < output->sketchQuery->getReferenceCount(); i++ )
{
if ( table )
{
cout << output->sketchQuery->getReference(i).name;
}

for ( int j = 0; j < refCount; j++ )
for ( uint64_t j = 0; j < refCount; j++ )
{
const CompareOutput::PairOutput & pair = output->pairs.at(i * refCount + j);

Expand Down Expand Up @@ -390,17 +390,17 @@ CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * data)
sketchQuery->initFromSequence(fileVector, data->parameters);
}

int sketchSize = sketchQuery->getMinHashesPerWindow() < sketchRef.getMinHashesPerWindow() ?
uint64_t sketchSize = sketchQuery->getMinHashesPerWindow() < sketchRef.getMinHashesPerWindow() ?
sketchQuery->getMinHashesPerWindow() :
sketchRef.getMinHashesPerWindow();

output->pairs.resize(sketchRef.getReferenceCount() * sketchQuery->getReferenceCount());

for ( int i = 0; i < sketchQuery->getReferenceCount(); i++ )
for ( uint64_t i = 0; i < sketchQuery->getReferenceCount(); i++ )
{
for ( int j = 0; j < sketchRef.getReferenceCount(); j++ )
for ( uint64_t j = 0; j < sketchRef.getReferenceCount(); j++ )
{
int pairIndex = i * sketchRef.getReferenceCount() + j;
uint64_t pairIndex = i * sketchRef.getReferenceCount() + j;

compareSketches(output->pairs[pairIndex], sketchRef.getReference(j), sketchQuery->getReference(i), sketchSize, sketchRef.getKmerSize(), sketchRef.getKmerSpace(), data->maxDistance, data->maxPValue);
}
Expand All @@ -409,12 +409,12 @@ CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * data)
return output;
}

void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, int sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue)
void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, uint64_t sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue)
{
int i = 0;
int j = 0;
int common = 0;
int denom = 0;
uint64_t i = 0;
uint64_t j = 0;
uint64_t common = 0;
uint64_t denom = 0;
const HashList & hashesSortedRef = refRef.hashesSorted;
const HashList & hashesSortedQry = refQry.hashesSorted;

Expand Down Expand Up @@ -495,7 +495,7 @@ void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const
output.pass = true;
}

double pValue(uint32_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint32_t sketchSize)
double pValue(uint64_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint64_t sketchSize)
{
if ( x == 0 )
{
Expand Down
8 changes: 4 additions & 4 deletions src/mash/CommandDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ class CommandDistance : public Command

struct PairOutput
{
int numer;
int denom;
uint64_t numer;
uint64_t denom;
double distance;
double pValue;
bool pass;
Expand All @@ -73,7 +73,7 @@ class CommandDistance : public Command
};

CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * data);
void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, int sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue);
double pValue(uint32_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint32_t sketchSize);
void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, uint64_t sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue);
double pValue(uint64_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint64_t sketchSize);

#endif
2 changes: 1 addition & 1 deletion src/mash/CommandInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int CommandInfo::run() const
columns[2].push_back("ID");
columns[3].push_back("Comment");

for ( int i = 0; i < sketch.getReferenceCount(); i++ )
for ( uint64_t i = 0; i < sketch.getReferenceCount(); i++ )
{
const Sketch::Reference & ref = sketch.getReference(i);

Expand Down
20 changes: 11 additions & 9 deletions src/mash/Sketch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,11 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append)

capnp::List<capnp::MinHash::ReferenceList::Reference>::Reader referencesReader = referenceListReader.getReferences();

int referencesOffset = append ? references.size() : 0;
uint64_t referencesOffset = append ? references.size() : 0;

references.resize(referencesOffset + referencesReader.size());

for ( int i = 0; i < referencesReader.size(); i++ )
for ( uint64_t i = 0; i < referencesReader.size(); i++ )
{
capnp::MinHash::ReferenceList::Reference::Reader referenceReader = referencesReader[i];

Expand All @@ -135,7 +135,7 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append)

reference.hashesSorted.resize(hashesReader.size());

for ( int j = 0; j < hashesReader.size(); j++ )
for ( uint64_t j = 0; j < hashesReader.size(); j++ )
{
reference.hashesSorted.set64(j, hashesReader[j]);
}
Expand All @@ -146,7 +146,7 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append)

reference.hashesSorted.resize(hashesReader.size());

for ( int j = 0; j < hashesReader.size(); j++ )
for ( uint64_t j = 0; j < hashesReader.size(); j++ )
{
reference.hashesSorted.set32(j, hashesReader[j]);
}
Expand All @@ -158,30 +158,32 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append)

positionHashesByReference.resize(references.size());

for ( int i = 0; i < lociReader.size(); i++ )
for ( uint64_t i = 0; i < lociReader.size(); i++ )
{
capnp::MinHash::LocusList::Locus::Reader locusReader = lociReader[i];
//cout << locusReader.getHash() << '\t' << locusReader.getSequence() << '\t' << locusReader.getPosition() << endl;
positionHashesByReference[locusReader.getSequence() + referencesOffset].push_back(PositionHash(locusReader.getPosition(), locusReader.getHash64()));
}

//cout << endl << "References:" << endl << endl;
/*
cout << endl << "References:" << endl << endl;
vector< vector<string> > columns(3);
columns[0].push_back("ID");
columns[1].push_back("Length");
columns[2].push_back("Name/Comment");
for ( int i = 0; i < references.size(); i++ )
for ( uint64_t i = 0; i < references.size(); i++ )
{
columns[0].push_back(to_string(i));
columns[1].push_back(to_string(references[i].length));
columns[2].push_back(references[i].name + " " + references[i].comment);
}
//printColumns(columns);
//cout << endl;
printColumns(columns);
cout << endl;
*/

/*
printf("\nCombined hash table:\n");
Expand Down
4 changes: 2 additions & 2 deletions src/mash/Sketch.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ class Sketch
float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
int getMinKmerSize(int reference) const;
double getRandomKmerChance(int reference) const;
const Reference & getReference(int index) const {return references.at(index);}
int getReferenceCount() const {return references.size();}
const Reference & getReference(uint64_t index) const {return references.at(index);}
uint64_t getReferenceCount() const {return references.size();}
int getReferenceIndex(std::string id) const;
int getKmerSize() const {return parameters.kmerSize;}
double getKmerSpace() const {return kmerSpace;}
Expand Down

0 comments on commit bcc0d61

Please sign in to comment.