From 4766f92dd85b502ff1106aa64d903626bf0d2db7 Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Sun, 12 Jan 2025 22:36:51 +0900 Subject: [PATCH] Fix masking issue in Index when excluding k-mer --- src/prefiltering/IndexBuilder.cpp | 88 +++++++++----------- src/prefiltering/IndexBuilder.h | 3 +- src/prefiltering/Prefiltering.cpp | 4 +- src/prefiltering/PrefilteringIndexReader.cpp | 81 +++++++++--------- 4 files changed, 83 insertions(+), 93 deletions(-) diff --git a/src/prefiltering/IndexBuilder.cpp b/src/prefiltering/IndexBuilder.cpp index 81965a0b..685a0635 100644 --- a/src/prefiltering/IndexBuilder.cpp +++ b/src/prefiltering/IndexBuilder.cpp @@ -52,8 +52,7 @@ class DbInfo { }; -void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, - SequenceLookup **unmaskedLookup,BaseMatrix &subMat, +void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup ** externalLookup, BaseMatrix &subMat, ScoreMatrix & three, ScoreMatrix & two, Sequence *seq, DBReader *dbr, size_t dbFrom, size_t dbTo, int kmerThr, bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode) { @@ -65,27 +64,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL size_t dbSize = dbTo - dbFrom; DbInfo* info = new DbInfo(dbFrom, dbTo, seq->getEffectiveKmerSize(), *dbr); - SequenceLookup *sequenceLookup; - if (unmaskedLookup != NULL && maskedLookup == NULL) { - *unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - sequenceLookup = *unmaskedLookup; - } else if (unmaskedLookup == NULL && maskedLookup != NULL) { - *maskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - sequenceLookup = *maskedLookup; - } else if (unmaskedLookup != NULL && maskedLookup != NULL) { - *unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - *maskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - sequenceLookup = *maskedLookup; - } else{ - Debug(Debug::ERROR) << "This should not happen\n"; - EXIT(EXIT_FAILURE); - } + *externalLookup = new SequenceLookup(dbSize, info->aaDbSize); + SequenceLookup *sequenceLookup = *externalLookup; // identical scores for memory reduction code char *idScoreLookup = getScoreLookup(subMat); Debug::Progress progress(dbTo-dbFrom); - + bool needMasking = (mask == 1 || maskNrepeats > 0 || maskLowerCaseMode == 1); size_t maskedResidues = 0; size_t totalKmerCount = 0; #pragma omp parallel @@ -96,16 +82,17 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL #endif // need to prune low scoring k-mers through masking Masker *masker = NULL; - if (maskedLookup != NULL) { + if (needMasking) { masker = new Masker(subMat); } - - Indexer idxer(static_cast(indexTable->getAlphabetSize()), seq->getKmerSize()); + unsigned int alphabetSize = (indexTable != NULL) ? static_cast(indexTable->getAlphabetSize()) + : static_cast(subMat.alphabetSize); + Indexer idxer(alphabetSize, seq->getKmerSize()); Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern()); KmerGenerator *generator = NULL; - if (isTargetSimiliarKmerSearch) { + if (isTargetSimiliarKmerSearch && indexTable != NULL) { generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr); if(isProfile){ generator->setDivideStrategy(s.profile_matrix); @@ -132,26 +119,21 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL // count similar or exact k-mers based on sequence type if (isTargetSimiliarKmerSearch) { // Find out if we should also mask profiles - totalKmerCount += indexTable->addSimilarKmerCount(&s, generator); - unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence; - if (unmaskedLookup != NULL) { - (*unmaskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); - } else if (maskedLookup != NULL) { - (*maskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); + if(indexTable != NULL){ + totalKmerCount += indexTable->addSimilarKmerCount(&s, generator); } + unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence; + + sequenceLookup->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); + } else { // Do not mask if column state sequences are used - if (unmaskedLookup != NULL) { - (*unmaskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); - } - maskedResidues += masker->maskSequence(s, mask, maskProb, maskLowerCaseMode, maskNrepeats); + sequenceLookup->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); - if(maskedLookup != NULL){ - (*maskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); + if(indexTable != NULL){ + totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup); } - - totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup); } } @@ -168,14 +150,13 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL Debug(Debug::INFO) << "Index table: Masked residues: " << maskedResidues << "\n"; - if(totalKmerCount == 0) { - Debug(Debug::ERROR) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n" + if(indexTable != NULL && totalKmerCount == 0) { + Debug(Debug::WARNING) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n" << "Maybe the sequences length is less than 14 residues.\n"; if (maskedResidues == true){ - Debug(Debug::ERROR) << " or contains only low complexity regions."; - Debug(Debug::ERROR) << "Use --mask 0 to deactivate the low complexity filter.\n"; + Debug(Debug::WARNING) << " or contains only low complexity regions."; + Debug(Debug::WARNING) << "Use --mask 0 to deactivate the low complexity filter.\n"; } - EXIT(EXIT_FAILURE); } dbr->remapData(); @@ -193,9 +174,10 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL // } // Debug(Debug::INFO) << "Index table: Remove "<< lowSelectiveResidues <<" none selective residues\n"; // Debug(Debug::INFO) << "Index table: init... from "<< dbFrom << " to "<< dbTo << "\n"; - - indexTable->initMemory(info->tableSize); - indexTable->init(); + if(indexTable != NULL){ + indexTable->initMemory(info->tableSize); + indexTable->init(); + } delete info; Debug::Progress progress2(dbTo-dbFrom); @@ -208,7 +190,9 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL thread_idx = static_cast(omp_get_thread_num()); #endif Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern()); - Indexer idxer(static_cast(indexTable->getAlphabetSize()), seq->getKmerSize()); + unsigned int alphabetSize = (indexTable != NULL) ? static_cast(indexTable->getAlphabetSize()) + : static_cast(subMat.alphabetSize); + Indexer idxer(alphabetSize, seq->getKmerSize()); IndexEntryLocalTmp *buffer = static_cast(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp))); size_t bufferSize = seq->getMaxLen(); KmerGenerator *generator = NULL; @@ -229,10 +213,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL unsigned int qKey = dbr->getDbKey(id); if (isTargetSimiliarKmerSearch) { s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id)); - indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer); + if(indexTable != NULL) { + indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer); + } } else { s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom)); - indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup); + if(indexTable != NULL) { + indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup); + } } } @@ -245,6 +233,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL if(idScoreLookup!=NULL){ delete[] idScoreLookup; } - indexTable->revertPointer(); - indexTable->sortDBSeqLists(); + if(indexTable != NULL){ + indexTable->revertPointer(); + indexTable->sortDBSeqLists(); + } } diff --git a/src/prefiltering/IndexBuilder.h b/src/prefiltering/IndexBuilder.h index 46599e60..b384c1e6 100644 --- a/src/prefiltering/IndexBuilder.h +++ b/src/prefiltering/IndexBuilder.h @@ -6,8 +6,7 @@ class IndexBuilder { public: - static void fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, SequenceLookup **unmaskedLookup, - BaseMatrix &subMat, + static void fillDatabase(IndexTable *indexTable, SequenceLookup **externalLookup, BaseMatrix &subMat, ScoreMatrix & three, ScoreMatrix & two, Sequence *seq, DBReader *dbr, size_t dbFrom, size_t dbTo, int kmerThr, bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode); diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 90612550..166d39c1 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -531,11 +531,9 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) { Parameters::isEqualDbtype(targetSeqType,Parameters::DBTYPE_AMINO_ACIDS)) ? alphabetSize -1 : alphabetSize; indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false); - SequenceLookup **unmaskedLookup = maskMode == 0 && maskLowerCaseMode == 0 ? &sequenceLookup : NULL; - SequenceLookup **maskedLookup = maskMode == 1 || maskLowerCaseMode == 1 ? &sequenceLookup : NULL; Debug(Debug::INFO) << "Index table k-mer threshold: " << localKmerThr << " at k-mer size " << kmerSize << " \n"; - IndexBuilder::fillDatabase(indexTable, maskedLookup, unmaskedLookup, *kmerSubMat, + IndexBuilder::fillDatabase(indexTable, &sequenceLookup, *kmerSubMat, _3merSubMatrix, _2merSubMatrix, &tseq, tdbr, dbFrom, dbFrom + dbSize, localKmerThr, maskMode, maskLowerCaseMode, diff --git a/src/prefiltering/PrefilteringIndexReader.cpp b/src/prefiltering/PrefilteringIndexReader.cpp index 65215f4e..f001552b 100644 --- a/src/prefiltering/PrefilteringIndexReader.cpp +++ b/src/prefiltering/PrefilteringIndexReader.cpp @@ -190,14 +190,14 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS)) ? alphabetSize -1: alphabetSize; - const bool noPrefilter = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0; - if (noPrefilter) { - splits = 0; + const bool noKmerIndex = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0; + if (noKmerIndex) { + splits = 1; } ScoreMatrix s3; ScoreMatrix s2; - if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noPrefilter == false) { + if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noKmerIndex == false) { int alphabetSize = subMat->alphabetSize; subMat->alphabetSize = subMat->alphabetSize-1; s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3); @@ -225,36 +225,53 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, continue; } - IndexTable indexTable(adjustAlphabetSize, kmerSize, false); + IndexTable * indexTable; + if(noKmerIndex){ + indexTable = NULL; + } else { + indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false); + } SequenceLookup *sequenceLookup = NULL; - IndexBuilder::fillDatabase(&indexTable, - (maskMode == 1 || maskNrepeats > 0 || maskLowerCase == 1) ? &sequenceLookup : NULL, - (maskMode == 0 && maskNrepeats == 0 && maskLowerCase == 0) ? &sequenceLookup : NULL, + IndexBuilder::fillDatabase(indexTable, &sequenceLookup, *subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr, maskMode, maskLowerCase, maskProb, maskNrepeats, targetSearchMode); - indexTable.printStatistics(subMat->num2aa); if (sequenceLookup == NULL) { Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n"; EXIT(EXIT_FAILURE); } - - // save the entries unsigned int keyOffset = 1000 * s; - Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n"; - char *entries = (char *) indexTable.getEntries(); - size_t entriesSize = indexTable.getTableEntriesNum() * indexTable.getSizeOfEntry(); - writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); - - // save the size - Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n"; - char *offsets = (char*)indexTable.getOffsets(); - size_t offsetsSize = (indexTable.getTableSize() + 1) * sizeof(size_t); - writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); - indexTable.deleteEntries(); - + if(noKmerIndex == false){ + indexTable->printStatistics(subMat->num2aa); + // save the entries + Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n"; + char *entries = (char *) indexTable->getEntries(); + size_t entriesSize = indexTable->getTableEntriesNum() * indexTable->getSizeOfEntry(); + writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); + + // save the size + Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n"; + char *offsets = (char *) indexTable->getOffsets(); + size_t offsetsSize = (indexTable->getTableSize() + 1) * sizeof(size_t); + writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); + indexTable->deleteEntries(); + + // ENTRIESNUM + Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n"; + uint64_t entriesNum = indexTable->getTableEntriesNum(); + char *entriesNumPtr = (char *) &entriesNum; + writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); + + // SEQCOUNT + Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n"; + size_t tablesize = indexTable->getSize(); + char *tablesizePtr = (char *) &tablesize; + writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); + } Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n"; int64_t seqindexDataSize = sequenceLookup->getDataSize(); char *seqindexDataSizePtr = (char *) &seqindexDataSize; @@ -271,20 +288,6 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s); writer.alignToPageSize(SPLIT_INDX + s); delete sequenceLookup; - - // ENTRIESNUM - Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n"; - uint64_t entriesNum = indexTable.getTableEntriesNum(); - char *entriesNumPtr = (char *) &entriesNum; - writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); - - // SEQCOUNT - Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n"; - size_t tablesize = indexTable.getSize(); - char *tablesizePtr = (char *) &tablesize; - writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); } if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {