diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e7e2252 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,8 @@ +[submodule "zlib"] + path = zlib + url = https://github.com/madler/zlib + ignore = dirty +[submodule "seqan2"] + path = seqan2 + url = https://github.com/seqan/seqan + ignore = dirty diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..035a884 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +CC = g++ +SRCDIR = src +SRCEXT = cpp +OBJEXT = o +BUILDDIR = build +LDFLAGS = -L./zlib -lz -lpthread +CFLAGS = -c -O3 -Wall -std=c++2a -pthread -I ./seqan2/include -I./seqan2/include -I./zlib -DSEQAN_HAS_ZLIB=1 +SOURCES = $(shell find $(SRCDIR) -type f -name *.$(SRCEXT)) + + +OBJECTS = $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.o)) +OBJECTS := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.$(OBJEXT))) +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE = bin/FastRemap + +$(EXECUTABLE): $(OBJECTS) + $(CC) -std=c++11 $(OBJECTS) -o $@ $(LDFLAGS) + cp bin/FastRemap ./ + +.cpp.o: + $(CC) $(CFLAGS) $< -o $@ + +clean: + rm -f $(BUILDDIR)/*.o $(SRCDIR)/*.o bin/FastRemap + diff --git a/README.md b/README.md index 62d06fb..4212146 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,43 @@ -# FastRemap -A tool for quickly remapping reads between genome assemblies +# FastRemap: A Tool for Quickly Remapping Reads between Genome Assemblies + +## FastRemap: + - Currently only supports BAM files as input + - if not using gcc 10 or higher, can use the following library: https://github.com/tcbrindle/span + +## To clone: +``` +git clone --recurse-submodules git@github.com:CMU-SAFARI/FastRemap.git FastRemap +``` + +## To compile: +### zlib: +``` +FastRemap/zlib$ ./configure +FastRemap/zlib$ make +``` + +### FastRemap: +may need to use '-lstdc++fs' in LDFLAGS depending on compiler / system. 
+``` +FastRemap$ make +``` + +## To run: +``` +./FastRemap bam [chain file] [input bam file] [unmapped file] [out file] +``` +test using the small sample files in test_data folder +- input / output files should be paths relative to the current directory. +- e.g., + ./FastRemap bam test_data/ce6ToCe10.over.chain test_data/little.bam test.unmapped test.out + +optional arguments +- --append-tags (-a) to append tags in output bam file +- --mean (-m) to set insert size +- --stdev (-s) to set insert_size_stdev +- --times (-t) to set insert_size_fold + +## To validate and compare two bam outputs: +``` +python ./validation/compare_outputs.py [input bam file 1] [input bam file 2] +``` diff --git a/bin/.gitignore b/bin/.gitignore new file mode 100644 index 0000000..f935021 --- /dev/null +++ b/bin/.gitignore @@ -0,0 +1 @@ +!.gitignore diff --git a/build/.gitignore b/build/.gitignore new file mode 100644 index 0000000..f935021 --- /dev/null +++ b/build/.gitignore @@ -0,0 +1 @@ +!.gitignore diff --git a/seqan2 b/seqan2 new file mode 160000 index 0000000..f5f6583 --- /dev/null +++ b/seqan2 @@ -0,0 +1 @@ +Subproject commit f5f658343c366c9c3d44ba358ffc9317e78a09ed diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..9c17811 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,33 @@ +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS_RELEASE "-O3") + +cmake_minimum_required (VERSION 3.4) +project (seqan3_tutorial CXX) +set(CMAKE_CXX_STANDARD 17) + +#set(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "/usr/local/opt/boost\@1.76") +#SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "/usr/local/opt/boost\@1.76/lib") + +#set(BOOST_ROOT "/usr/local/opt/boost\@1.76") +#set(_boost_INCLUDE_SEARCH_DIRS ${_boost_INCLUDE_SEARCH_DIRS} "/usr/local/opt/boost\@1.76") +#set(Boost_DEBUG 1) +#set(Boost_USE_STATIC_LIBS OFF) +#set(Boost_USE_MULTITHREADED ON) +#set(Boost_USE_STATIC_RUNTIME OFF) +#find_package(Boost COMPONENTS) + + +# add 
// Taken from https://github.com/ekg/intervaltree

#ifndef __INTERVAL_TREE_H
#define __INTERVAL_TREE_H

#include <vector>
#include <algorithm>
#include <iostream>
#include <memory>
#include <cassert>
#include <limits>

#ifdef USE_INTERVAL_TREE_NAMESPACE
namespace interval_tree {
#endif

/// A coordinate range plus an arbitrary payload.
/// The constructor normalises the end points so that start <= stop.
template <class Scalar, typename Value>
class Interval {
public:
    Scalar start;
    Scalar stop;
    Value value;
    Interval(const Scalar& s, const Scalar& e, const Value& v)
    : start(std::min(s, e))
    , stop(std::max(s, e))
    , value(v)
    {}
};

/// Returns the start coordinate of an interval.
/// (Returns the coordinate type; the payload type was a typo that only
/// compiled when both types happened to be convertible.)
template <class Scalar, typename Value>
Scalar intervalStart(const Interval<Scalar, Value>& i) {
    return i.start;
}

/// Returns the stop coordinate of an interval.
template <class Scalar, typename Value>
Scalar intervalStop(const Interval<Scalar, Value>& i) {
    return i.stop;
}

/// Streams an interval as "Interval(start, stop): value".
template <class Scalar, typename Value>
std::ostream& operator<<(std::ostream& out, const Interval<Scalar, Value>& i) {
    out << "Interval(" << i.start << ", " << i.stop << "): " << i.value;
    return out;
}

/// Centered interval tree.
///
/// Each node stores the intervals that straddle its `center`; intervals that
/// end before the center live in `left`, intervals that begin after it live
/// in `right`. Node-local intervals are kept sorted by start coordinate.
template <class Scalar, class Value>
class IntervalTree {
public:
    typedef Interval<Scalar, Value> interval;
    typedef std::vector<interval> interval_vector;

    /// Orders intervals by start coordinate.
    struct IntervalStartCmp {
        bool operator()(const interval& a, const interval& b) const {
            return a.start < b.start;
        }
    };

    /// Orders intervals by stop coordinate.
    struct IntervalStopCmp {
        bool operator()(const interval& a, const interval& b) const {
            return a.stop < b.stop;
        }
    };

    /// Builds an empty tree.
    IntervalTree()
        : left(nullptr)
        , right(nullptr)
        , center(0)
    {}

    ~IntervalTree() = default;

    /// Deep-copies this node and all of its descendants.
    std::unique_ptr<IntervalTree> clone() const {
        return std::make_unique<IntervalTree>(*this);
    }

    IntervalTree(const IntervalTree& other)
    :   intervals(other.intervals),
        left(other.left ? other.left->clone() : nullptr),
        right(other.right ? other.right->clone() : nullptr),
        center(other.center)
    {}

    IntervalTree& operator=(IntervalTree&&) = default;
    IntervalTree(IntervalTree&&) = default;

    IntervalTree& operator=(const IntervalTree& other) {
        center = other.center;
        intervals = other.intervals;
        // clone() runs before the old child is released, so self-assignment is safe.
        left = other.left ? other.left->clone() : nullptr;
        right = other.right ? other.right->clone() : nullptr;
        return *this;
    }

    /// Builds a tree from `ivals`, consuming the vector.
    ///
    /// @param ivals       intervals to index (moved from)
    /// @param depth       remaining levels that may still be split
    /// @param minbucket   a node holding fewer intervals than this (and fewer
    ///                    than `maxbucket`) becomes a leaf
    /// @param maxbucket   see `minbucket`
    /// @param leftextent  lower bound inherited from the parent; when both
    ///                    extents are 0 the input is sorted here, otherwise it
    ///                    is expected to be sorted by start already
    /// @param rightextent upper bound inherited from the parent
    IntervalTree(
        interval_vector&& ivals,
        std::size_t depth = 16,
        std::size_t minbucket = 64,
        std::size_t maxbucket = 512,
        Scalar leftextent = 0,
        Scalar rightextent = 0)
        : left(nullptr)
        , right(nullptr)
        , center()  // value-initialised so an empty input never leaves it indeterminate
    {
        --depth;

        // Capture the largest stop as a value: the sort below reshuffles the
        // elements, so an iterator taken here would no longer refer to it.
        Scalar maxStop = Scalar();
        if (!ivals.empty()) {
            const auto minStartIt = std::min_element(ivals.begin(), ivals.end(),
                                                     IntervalStartCmp());
            const auto maxStopIt = std::max_element(ivals.begin(), ivals.end(),
                                                    IntervalStopCmp());
            maxStop = maxStopIt->stop;
            center = (minStartIt->start + maxStop) / 2;
        }

        if (leftextent == 0 && rightextent == 0) {
            // sort intervals by start
            std::sort(ivals.begin(), ivals.end(), IntervalStartCmp());
        } else {
            assert(std::is_sorted(ivals.begin(), ivals.end(), IntervalStartCmp()));
        }

        // An empty input is always a leaf; this also keeps front() below from
        // being called on an empty vector when minbucket == 0.
        const bool smallBucket = ivals.size() < minbucket && ivals.size() < maxbucket;
        if (depth == 0 || ivals.empty() || smallBucket) {
            std::sort(ivals.begin(), ivals.end(), IntervalStartCmp());
            intervals = std::move(ivals);
            assert(is_valid().first);
            return;
        }

        Scalar leftp = 0;
        Scalar rightp = 0;
        if (leftextent || rightextent) {
            leftp = leftextent;
            rightp = rightextent;
        } else {
            leftp = ivals.front().start;
            rightp = maxStop;
        }

        interval_vector lefts;
        interval_vector rights;

        // ivals is owned by this constructor, so elements are moved rather
        // than copied into their destination bucket.
        for (interval& iv : ivals) {
            if (iv.stop < center) {
                lefts.push_back(std::move(iv));
            } else if (iv.start > center) {
                rights.push_back(std::move(iv));
            } else {
                assert(iv.start <= center);
                assert(center <= iv.stop);
                intervals.push_back(std::move(iv));
            }
        }

        if (!lefts.empty()) {
            left.reset(new IntervalTree(std::move(lefts),
                                        depth, minbucket, maxbucket,
                                        leftp, center));
        }
        if (!rights.empty()) {
            right.reset(new IntervalTree(std::move(rights),
                                         depth, minbucket, maxbucket,
                                         center, rightp));
        }
        assert(is_valid().first);
    }

    // Call f on all intervals near the range [start, stop]:
    // every interval of each visited node is passed to f, so callers filter.
    template <class UnaryFunction>
    void visit_near(const Scalar& start, const Scalar& stop, UnaryFunction f) const {
        if (!intervals.empty() && !(stop < intervals.front().start)) {
            for (const auto& iv : intervals) {
                f(iv);
            }
        }
        if (left && start <= center) {
            left->visit_near(start, stop, f);
        }
        if (right && stop >= center) {
            right->visit_near(start, stop, f);
        }
    }

    // Call f on all intervals crossing pos
    template <class UnaryFunction>
    void visit_overlapping(const Scalar& pos, UnaryFunction f) const {
        visit_overlapping(pos, pos, f);
    }

    // Call f on all intervals overlapping [start, stop].
    // The comparison is strict on both sides: an interval that merely touches
    // the query at an end point is not reported.
    template <class UnaryFunction>
    void visit_overlapping(const Scalar& start, const Scalar& stop, UnaryFunction f) const {
        auto filterF = [&](const interval& iv) {
            if (iv.stop > start && iv.start < stop) {
                // Only apply f if overlapping
                f(iv);
            }
        };
        visit_near(start, stop, filterF);
    }

    // Call f on all intervals contained within [start, stop]
    template <class UnaryFunction>
    void visit_contained(const Scalar& start, const Scalar& stop, UnaryFunction f) const {
        auto filterF = [&](const interval& iv) {
            if (start <= iv.start && iv.stop <= stop) {
                f(iv);
            }
        };
        visit_near(start, stop, filterF);
    }

    /// Returns copies of all intervals overlapping [start, stop].
    interval_vector findOverlapping(const Scalar& start, const Scalar& stop) const {
        interval_vector result;
        visit_overlapping(start, stop,
                          [&](const interval& iv) {
                              result.emplace_back(iv);
                          });
        return result;
    }

    /// Returns copies of all intervals fully inside [start, stop].
    interval_vector findContained(const Scalar& start, const Scalar& stop) const {
        interval_vector result;
        visit_contained(start, stop,
                        [&](const interval& iv) {
                            result.push_back(iv);
                        });
        return result;
    }

    /// True when neither this node nor any descendant stores an interval.
    bool empty() const {
        if (left && !left->empty()) {
            return false;
        }
        if (!intervals.empty()) {
            return false;
        }
        if (right && !right->empty()) {
            return false;
        }
        return true;
    }

    /// Calls f on every stored interval: left subtree, this node, right subtree.
    template <class UnaryFunction>
    void visit_all(UnaryFunction f) const {
        if (left) {
            left->visit_all(f);
        }
        std::for_each(intervals.begin(), intervals.end(), f);
        if (right) {
            right->visit_all(f);
        }
    }

    /// Scans every interval and returns {smallest start, largest stop}.
    /// For an empty tree the result is {max(), lowest()}.
    std::pair<Scalar, Scalar> extentBruitForce() const {
        std::pair<Scalar, Scalar> extent = { std::numeric_limits<Scalar>::max(),
                                             std::numeric_limits<Scalar>::lowest() };
        visit_all([&extent](const interval& iv) {
            extent.first  = std::min(extent.first, iv.start);
            extent.second = std::max(extent.second, iv.stop);
        });
        return extent;
    }

    // Check all constraints.
    // If first is false, second is invalid.
    // On success, second is {smallest start, largest stop} of the subtree.
    std::pair<bool, std::pair<Scalar, Scalar>> is_valid() const {
        std::pair<bool, std::pair<Scalar, Scalar>> result =
            {true, { std::numeric_limits<Scalar>::max(),
                     std::numeric_limits<Scalar>::lowest() }};

        if (!intervals.empty()) {
            const auto minStartIt = std::min_element(intervals.begin(), intervals.end(),
                                                     IntervalStartCmp());
            const auto maxStopIt = std::max_element(intervals.begin(), intervals.end(),
                                                    IntervalStopCmp());
            result.second.first  = std::min(result.second.first, minStartIt->start);
            // The upper extent is a maximum; folding it with min() pinned it
            // to the sentinel and disabled the left-subtree check below.
            result.second.second = std::max(result.second.second, maxStopIt->stop);
        }
        if (left) {
            const auto valid = left->is_valid();
            result.first &= valid.first;
            result.second.first  = std::min(result.second.first, valid.second.first);
            result.second.second = std::max(result.second.second, valid.second.second);
            if (!result.first) { return result; }
            // Everything on the left must end strictly before the center.
            if (valid.second.second >= center) {
                result.first = false;
                return result;
            }
        }
        if (right) {
            const auto valid = right->is_valid();
            result.first &= valid.first;
            result.second.first  = std::min(result.second.first, valid.second.first);
            result.second.second = std::max(result.second.second, valid.second.second);
            if (!result.first) { return result; }
            // Everything on the right must begin strictly after the center.
            if (valid.second.first <= center) {
                result.first = false;
                return result;
            }
        }
        if (!std::is_sorted(intervals.begin(), intervals.end(), IntervalStartCmp())) {
            result.first = false;
        }
        return result;
    }

    friend std::ostream& operator<<(std::ostream& os, const IntervalTree& itree) {
        return writeOut(os, itree);
    }

    /// Dumps the tree, one node per indentation level.
    friend std::ostream& writeOut(std::ostream& os, const IntervalTree& itree,
                                  std::size_t depth = 0) {
        auto pad = [&]() { for (std::size_t i = 0; i != depth; ++i) { os << ' '; } };
        pad(); os << "center: " << itree.center << '\n';
        for (const interval& inter : itree.intervals) {
            pad(); os << inter << '\n';
        }
        if (itree.left) {
            pad(); os << "left:\n";
            writeOut(os, *itree.left, depth + 1);
        } else {
            pad(); os << "left: nullptr\n";
        }
        if (itree.right) {
            pad(); os << "right:\n";
            writeOut(os, *itree.right, depth + 1);
        } else {
            pad(); os << "right: nullptr\n";
        }
        return os;
    }

private:
    interval_vector intervals;            // intervals straddling `center`, sorted by start
    std::unique_ptr<IntervalTree> left;   // intervals ending before `center`
    std::unique_ptr<IntervalTree> right;  // intervals starting after `center`
    Scalar center;
};
#ifdef USE_INTERVAL_TREE_NAMESPACE
}
#endif

#endif
tags; // string tags COL 12 tags tags() +} mysam_record; + +#endif diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..6a75e91 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,100 @@ +// C++ Implementation of CrossMap +// For speed and improved accuracy +// by: Jeremie Kim + +#include "utils.h" +#include "mapbam.h" +#include "IntervalTree.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) { + // if enough arguments, check if the second == "bam" + // chain file, in_file, unmapped_file, outfile + + std::string chain_file, in_file, unmapped_file, out_file; + + int insert_size = 200; + int insert_size_stdev = 30; + int insert_size_fold = 3; + bool addtags = false; + + const char* const short_opts = "s:t:m:ha"; + const option long_opts[] = { + {"mean", required_argument, nullptr, 'm'}, + {"stdev", required_argument, nullptr, 's'}, + {"times", required_argument, nullptr, 't'}, + {"append-tags", no_argument, nullptr, 'a'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, no_argument, nullptr, 0} + }; + + while (true) { + const auto opt = getopt_long(argc, argv, short_opts, long_opts, nullptr); + + if (opt == -1) { + break; + } + + switch (opt) { + case 'm': + insert_size = std::stoi(optarg); + break; + case 's': + insert_size_stdev = std::stoi(optarg); + break; + case 't': + insert_size_fold = std::stoi(optarg); + break; + case 'a': + addtags = true; + break; + + case 'h': // -h or --help + case '?': // Unrecognized option + default: + break; + } + } + + int num_args = argc - optind; + if (num_args >= 1) { + if (strcmp(argv[optind], "bam") == 0) { + + if (argc >= 3) { + // TODO: add optional arguments + chain_file = std::string(argv[optind+1]); + in_file = std::string(argv[optind+2]); + unmapped_file = std::string(argv[optind+3]); + if (argc >= 4) { + out_file = std::string(argv[optind+4]); + } + + std::map target_chrom_size; + std::map source_chrom_size; + 
std::map mapTree; + read_chain_file(chain_file, target_chrom_size, source_chrom_size, mapTree); + + std::cout << "Input File: " << in_file << "\n"; + std::cout << "Unmapped File: " << unmapped_file << "\n"; + std::cout << "Output File: " << out_file << "\n"; + std::cout << "mean : " << insert_size << "\n"; + std::cout << "addtags: " << addtags << "\n"; + + + + crossmap_bam_file(mapTree, chain_file, in_file, unmapped_file, out_file, target_chrom_size, insert_size, insert_size_stdev, insert_size_fold, addtags); + } + } + } + + return 0; +} diff --git a/src/mapbam.cpp b/src/mapbam.cpp new file mode 100644 index 0000000..8134f5f --- /dev/null +++ b/src/mapbam.cpp @@ -0,0 +1,511 @@ +#include "mapbam.h" +#include "common.h" +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef typename seqan::BamHeaderRecord::TTag TTag; + +int crossmap_bam_file(std::map& mapping, std::string chainfile, std::string infile, std::string unmapped_file, std::string outfile_prefix, std::map chrom_size, int IS_size, int IS_std, int fold, bool addtag) { + // determine input file format (BAM, CRAM or SAM) + std::string file_type; + std::string file_extension = infile.substr(infile.find_last_of(".") + 1); + std::vector comments; + if (file_extension == "bam") { + file_type = "BAM"; + comments.push_back("ORIGINAL_BAM_FILE=" + std::string(std::filesystem::current_path()) + "/" + infile); + // TODO: check header. + } + else { + std::cout << "File Extension: " << file_extension << " not supported or unknown\n"; + exit(1); + } + comments.push_back("CHAIN_FILE=" + std::string(std::filesystem::current_path()) + "/" + chainfile); + + // get output file names. 
+ std::string outfile_name = outfile_prefix; + if (outfile_prefix != "") { + if (file_type == "BAM") { + outfile_name = outfile_name + ".bam"; + } + } + else { + std::cout << "Output to Screen not yet implemented\n"; + exit(1); + } + + seqan::CharString bamFileName = infile; + + auto test_outfile_name2 = std::filesystem::current_path() / outfile_name; + seqan::BamFileIn bamFileIn(seqan::toCString(bamFileName)); + + std::ofstream sam_out; + sam_out.open(test_outfile_name2); + seqan::BamFileOut bamFileOut(seqan::context(bamFileIn), sam_out, seqan::Bam()); + + + ///////////////////////////////////////////////////////// + // Updating Headers + ///////////////////////////////////////////////////////// + + seqan::BamHeader header; + seqan::BamHeader new_header; + seqan::readHeader(header, bamFileIn); + + seqan::BamAlignmentRecord record; + seqan::BamHeaderRecord header_record; + seqan::CharString tmp; + + std::map name_to_id; + std::map id_to_name; + + seqan::StringSet contigNameStore; + seqan::NameStoreCache > contigNameStoreCache(contigNameStore); + seqan::BamIOContext> bamIOContext(contigNameStore, contigNameStoreCache); + + for (unsigned recIdx = 0; searchRecord(recIdx, header, seqan::BAM_HEADER_REFERENCE, recIdx); recIdx++) { + unsigned this_idx = 0; + + std::string chr_name; + int chr_length; + while(getTagValue(tmp, this_idx, header[recIdx])) { + switch (this_idx) { + case 0: + chr_name = seqan::toCString(tmp); + break; + case 1: + chr_length = atoi(seqan::toCString(tmp)); + break; + } + this_idx++; + } + + seqan::assignValueById(contigLengths(context(bamFileOut)), nameToId(contigNamesCache(context(bamFileOut)), header[recIdx].tags[0].i2), chrom_size[seqan::toCString(header[recIdx].tags[0].i2)]); + + // TODO: This needs to be implemented in seqan to get rid of useless chromosomes in header + // seqan::removeValueById(contigLengths(context(bamFileOut)), nameToId(contigNamesCache(context(bamFileOut)), header[recIdx].tags[0].i2)); + } + + // TODO: or implement this 
in seqan + // seqan::clear(contigLengths(context(bamFileOut))); + + for (auto it : chrom_size) { + seqan::assignValueById(contigLengths(context(bamFileOut)), nameToId(contigNamesCache(context(bamFileOut)), it.first), it.second); + name_to_id[it.first] = nameToId(contigNamesCache(context(bamFileOut)), it.first); + id_to_name[nameToId(contigNamesCache(context(bamFileOut)), it.first)] = it.first; + } + seqan::BamHeaderRecord seqRecord; + seqRecord.type = seqan::BAM_HEADER_PROGRAM; + seqan::appendValue(seqRecord.tags, TTag("ID", "FastRemap")); + seqan::appendValue(seqRecord.tags, TTag("VN", "1.0")); + seqan::appendValue(header, seqRecord); + + seqan::BamHeaderRecord seqRecord1; + seqRecord1.type = seqan::BAM_HEADER_COMMENT; + seqan::appendValue(seqRecord1.tags, TTag("ID", comments[0]), seqan::Exact()); + seqan::appendValue(header, seqRecord1); + + seqan::BamHeaderRecord seqRecord2; + seqRecord2.type = seqan::BAM_HEADER_COMMENT; + seqan::appendValue(seqRecord2.tags, TTag("ID", comments[1]), seqan::Exact()); + seqan::appendValue(header, seqRecord2); + + seqan::writeHeader(bamFileOut, header); + + ///////////////////////////////////////////////////////////////////////// + + std::ofstream UNMAP; + if (unmapped_file != "") { + UNMAP.open(unmapped_file); + } + else { + std::cout << "unmapped file does not exist\n"; + exit(1); + } + + int QF = 0, NN = 0, NU = 0, NM = 0, UN = 0, UU = 0, UM = 0, MN = 0, MU = 0, MM = 0, SN = 0, SM = 0, SU = 0, total_item = 0; + std::string unmap_queryname; + + int read1_start, read1_end; + int read2_start, read2_end; + std::string read1_chr; + std::string read2_chr; + std::string read1_strand, read2_strand; + std::vector> read1_maps; + std::vector> read2_maps; + std::string new_reference_id; + + + seqan::BamAlignmentRecord old_alignment; + while (!seqan::atEnd(bamFileIn)) { + + total_item++; + seqan::readRecord(old_alignment, bamFileIn); + + seqan::BamAlignmentRecord new_alignment = old_alignment; + seqan::BamTagsDict 
tagsDict(new_alignment.tags); + seqan::eraseTag(tagsDict, "RG"); + + auto old_flag = old_alignment.flag; + unmap_queryname = seqan::toCString(old_alignment.qName); + int old_reference_length = get_reference_length(old_alignment.cigar); + //std::cout << "old_reference_length: " << old_reference_length << "\n"; + + + int old_reference_start = old_alignment.beginPos; + int old_reference_end = old_reference_start + old_reference_length; + + // TODO: something about rg tags.. maybe unecessary for seqan3 + + ////////////////////////////// + // Pair-end sequencing + ////////////////////////////// + if (old_flag & seqan::BAM_FLAG_MULTIPLE) { + //std::cout << "Read paired\n"; + new_alignment.flag = seqan::BAM_FLAG_MULTIPLE; + if (old_flag & seqan::BAM_FLAG_FIRST) { + new_alignment.flag |= seqan::BAM_FLAG_FIRST; + unmap_queryname += ".1"; + } + else if (old_flag & seqan::BAM_FLAG_LAST) { + new_alignment.flag |= seqan::BAM_FLAG_LAST; + unmap_queryname += ".2"; + } + + if (old_alignment.flag & 0x800) { + new_alignment.flag |= 0x800; + } + + if (old_flag & seqan::BAM_FLAG_QC_NO_PASS) { + //std::cout << "Failed filter\n"; + new_alignment.flag |= seqan::BAM_FLAG_QC_NO_PASS; + QF++; + if (addtag) { + seqan::setTagValue(tagsDict, "QF", 0); + } + UNMAP << "chr\t" << old_reference_start << "\t" << old_reference_end << "\t" << unmap_queryname << "\n"; + continue; + } + ////////////////////////////////////////// + // R1 originally unmapped + ////////////////////////////////////////// + else if (old_flag & seqan::BAM_FLAG_UNMAPPED) { + //std::cout << "R1 originally unmapped\n"; + NU++; // not accurate. 
could be NU, NN, NM + UNMAP << "chr\t" << old_reference_start << "\t" << old_reference_end << "\t" << unmap_queryname << "\n"; + continue; + } + ///////////////////////////////////////// + // R1 is originally mapped + ///////////////////////////////////////// + else { + //std::cout << "R1 originally mapped\n"; + read1_chr = id_to_name[old_alignment.rID]; + + if (old_flag & seqan::BAM_FLAG_RC) { + read1_strand = "-"; + } + else { + read1_strand = "+"; + } + read1_start = old_reference_start; + read1_end = old_reference_end; + if (map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_maps, read1_strand, false) == -1) { + //std::cout << "failed map_coordinates\n"; + } + + if (!(old_flag & seqan::BAM_FLAG_NEXT_UNMAPPED)) { + read2_chr = id_to_name[old_alignment.rNextId]; + if (old_flag & seqan::BAM_FLAG_NEXT_RC) { + read2_strand = "-"; + } + else { + read2_strand = "+"; + } + read2_start = old_alignment.pNext; + read2_end = read2_start + 1; // TODO: double check this?? + map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_maps, read2_strand, false); + } + + ///////////////////////////////////// + // R1 failed to liftover + ///////////////////////////////////// + if (read1_maps.size() == 0) { + //std::cout << "R1 failed to liftover\n"; + UNMAP << read1_chr << "\t" << read1_start << "\t" << read1_end << "\t" << unmap_queryname << "\n"; + continue; + } + ///////////////////////////////////// + // R1 uniquely mapped + ///////////////////////////////////// + else if (read1_maps.size() == 2) { + //std::cout << "R1 uniquely mapped\n"; + if (std::get<3>(read1_maps[1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_RC; + } + // REQUIRED TO AVOID MARKDUP ISSUES when remapping across chromosomes + new_alignment.rID = name_to_id[std::get<0>(read1_maps[1])]; + new_alignment.beginPos = std::get<1>(read1_maps[1]); + + // opposite strand + if (std::get<3>(read1_maps[0]) != std::get<3>(read1_maps[1])) { + seqan::reverse(new_alignment.cigar); + 
seqan::Dna5String tmp = new_alignment.seq; + seqan::Dna5StringReverseComplement rc(tmp); + new_alignment.seq = seqan::CharString(rc); + + tmp = new_alignment.qual; + seqan::Dna5StringReverse r(tmp); + new_alignment.qual = seqan::CharString(r); + } + + // R2 unmapped before or after conversion + if ((old_flag & seqan::BAM_FLAG_NEXT_UNMAPPED) || (read2_maps.size() == 0)) { + new_alignment.flag |= seqan::BAM_FLAG_NEXT_UNMAPPED; + new_alignment.rNextId = name_to_id[std::get<0>(read1_maps[1])]; // TODO double check. + new_alignment.pNext = std::get<1>(read1_maps[1]); + new_alignment.tLen = 0; + UN += 1; + if (addtag) { + seqan::setTagValue(tagsDict, "UN", 0); + } + } + // R2 is unique mapped + else if (read2_maps.size() == 2) { + if (std::get<3>(read2_maps[1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_NEXT_RC; + } + new_alignment.rNextId = name_to_id[std::get<0>(read2_maps[1])]; + new_alignment.pNext = std::get<1>(read2_maps[1]); + + new_alignment.tLen = abs(new_alignment.beginPos - int(new_alignment.pNext)) + old_reference_length; //(old_alignment.template_length() - old_alignment.mate_position().value_or(0)); // TODO: double check old reference_length + if (std::get<3>(read2_maps[1]) != std::get<3>(read1_maps[1]) && (new_alignment.tLen <= IS_size + fold * IS_std) && (new_alignment.tLen >= IS_size - fold * IS_std)) { + new_alignment.flag |= seqan::BAM_FLAG_ALL_PROPER; + } + UU++; + if (addtag) { + seqan::setTagValue(tagsDict, "UU", 0); + } + } + // R2 is multiple mapped + else { + if (std::get<3>(read2_maps[read2_maps.size()-1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_NEXT_UNMAPPED; + } + new_alignment.flag |= seqan::BAM_FLAG_SECONDARY; + new_alignment.rNextId = name_to_id[std::get<0>(read2_maps[read2_maps.size()-1])]; + new_alignment.pNext = std::get<1>(read2_maps[read2_maps.size()-1]); + new_alignment.tLen = 0; + UM++; + if (addtag) { + seqan::setTagValue(tagsDict, "UM", 0); + } + } + + seqan::writeRecord(bamFileOut, new_alignment); + continue; + } 
+ + ///////////////////////////////////// + // R1 multiple mapped + ///////////////////////////////////// + else if (read1_maps.size() > 2 && read1_maps.size() % 2 == 0) { + //std::cout << "R1 Multiple Mapped\n"; + new_alignment.flag |= seqan::BAM_FLAG_SECONDARY; + if (std::get<3>(read1_maps[1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_RC; + } + new_alignment.rID = name_to_id[std::get<0>(read1_maps[1])]; + new_alignment.beginPos = std::get<1>(read1_maps[1]); + //new_alignment.beginPos = std::get<1>(read1_maps[1]); + new_alignment.mapQ = 255; + + // opposite strand + if (std::get<3>(read1_maps[0]) != std::get<3>(read1_maps[1])) { +// //std::cout << "Opposite strand\n"; + seqan::reverse(new_alignment.cigar); + seqan::Dna5String tmp = new_alignment.seq; + seqan::Dna5StringReverseComplement rc(tmp); + new_alignment.seq = seqan::CharString(rc); + + tmp = new_alignment.qual; + seqan::Dna5StringReverse r(tmp); + new_alignment.qual = seqan::CharString(r); + } + + // R2 is unmapped + if ((old_flag & seqan::BAM_FLAG_NEXT_UNMAPPED) || read2_maps.size() == 0) { + new_alignment.flag |= seqan::BAM_FLAG_NEXT_UNMAPPED; + new_alignment.rNextId = name_to_id[std::get<0>(read1_maps[1])]; + new_alignment.pNext = std::get<1>(read1_maps[1]); + new_alignment.tLen = 0; + MN++; + if (addtag) { + seqan::setTagValue(tagsDict, "MN", 0); + } + } + // R2 is unique mapped + else if (read2_maps.size() == 2) { + if (std::get<3>(read2_maps[1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_NEXT_RC; + } + new_alignment.rNextId = name_to_id[std::get<0>(read2_maps[1])]; + new_alignment.pNext = std::get<1>(read2_maps[1]); + new_alignment.tLen = 0; + MU++; + if (addtag) { + seqan::setTagValue(tagsDict, "MU", 0); + } + } + // R2 is multiple mapped. 
+ else { + if (std::get<3>(read2_maps[read2_maps.size()-1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_NEXT_RC; + } + new_alignment.flag |= seqan::BAM_FLAG_SECONDARY; + new_alignment.rNextId = name_to_id[std::get<0>(read2_maps[read2_maps.size()-1])]; + new_alignment.pNext = std::get<1>(read2_maps[read2_maps.size()-1]); + new_alignment.tLen = 0; + MM++; + if (addtag) { + seqan::setTagValue(tagsDict, "MM", 0); + } + } + + seqan::writeRecord(bamFileOut, new_alignment); + + continue; + } + } + } + // single end sequencing + else { + new_alignment.rNextId = -1; + new_alignment.pNext = 0; + new_alignment.tLen = 0; + + // originally unmapped + if (static_cast(old_flag & seqan::BAM_FLAG_UNMAPPED)) { + UNMAP << "chr\t" << old_reference_start << "\t" << old_reference_end << "\t" << unmap_queryname << "\n"; + continue; + } + else { + new_alignment.flag = 0x0; // clear flag + read1_chr = id_to_name[old_alignment.rID]; + + if (static_cast(old_flag & seqan::BAM_FLAG_RC)) { + read1_strand = "-"; + } + else { + read1_strand = "+"; + } + read1_start = old_reference_start; + read1_end = old_reference_end; + if (map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_maps, read1_strand, false) == -1) { + //std::cout << "failed map_coordinates\n"; + } + + // unmapped after liftover + if (read1_maps.size() == 0) { + UNMAP << read1_chr << "\t" << old_reference_start << "\t" << old_reference_end << "\t" << unmap_queryname << "\n"; + continue; + } + + // unique mapped + if (read1_maps.size() == 2) { + if (std::get<3>(read1_maps[1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_RC; + } + if (std::get<3>(read1_maps[0]) != std::get<3>(read1_maps[1])) { + seqan::reverse(new_alignment.cigar); + seqan::Dna5String tmp = new_alignment.seq; + seqan::Dna5StringReverseComplement rc(tmp); + new_alignment.seq = seqan::CharString(rc); + + tmp = new_alignment.qual; + seqan::Dna5StringReverse r(tmp); + new_alignment.qual = seqan::CharString(r); + } + + new_alignment.rID = 
name_to_id[std::get<0>(read1_maps[1])]; + new_alignment.beginPos = std::get<1>(read1_maps[1]); + SU++; + if (addtag) { + seqan::setTagValue(tagsDict, "SU", 0); + } + seqan::writeRecord(bamFileOut, new_alignment); + continue; + } + + // multiple mapped + if (read1_maps.size() > 2 && read1_maps.size() % 2 == 0) { + new_alignment.flag |= seqan::BAM_FLAG_SECONDARY; + if (std::get<3>(read1_maps[1]) == "-") { + new_alignment.flag |= seqan::BAM_FLAG_RC; + } + if (std::get<3>(read1_maps[0]) != std::get<3>(read1_maps[1])) { + seqan::reverse(new_alignment.cigar); + seqan::Dna5String tmp = new_alignment.seq; + seqan::Dna5StringReverseComplement rc(tmp); + new_alignment.seq = seqan::CharString(rc); + + tmp = new_alignment.qual; + seqan::Dna5StringReverse r(tmp); + new_alignment.qual = seqan::CharString(r); + } + + new_alignment.rID = name_to_id[std::get<0>(read1_maps[1])]; + new_alignment.beginPos = std::get<1>(read1_maps[1]); + SM++; + if (addtag) { + seqan::setTagValue(tagsDict, "SM", 0); + } + seqan::writeRecord(bamFileOut, new_alignment); + continue; + } + } + } + } + + UNMAP.close(); + + // TODO: sort the entries. 
+ + std::cout << "Total alignments: " << total_item << "\n"; + std::cout << " QC failed: " << QF << "\n"; + + if (NN+NU+NM+UN+UU+UM+MN+MU+MM > 0) { + std::cout << " Paired-end reads:\n"; + std::cout << "\tR1 unique, R2 unique (UU): " << UU << "\n"; + std::cout << "\tR1 unique, R2 unmap (UN): " << UN << "\n"; + std::cout << "\tR1 unique, R2 multiple (UM): " << UM << "\n"; + + std::cout << "\tR1 multiple, R2 unique (MU): " << MU << "\n"; + std::cout << "\tR1 multiple, R2 unmap (MN): " << MN << "\n"; + std::cout << "\tR1 multiple, R2 multiple (MM): " << MM << "\n"; + + std::cout << "\tR1 unmap, R2 unique (NU): " << NU << "\n"; + std::cout << "\tR1 unmap, R2 unmap (NN): " << NN << "\n"; + std::cout << "\tR1 unmap, R2 multiple (NM): " << NM << "\n"; + } + if (SN+SU+SM > 0) { + std::cout << " Single-end reads:\n"; + std::cout << "\tUniquely mapped (SU): " << SU << "\n"; + std::cout << "\tUnmapped (SN): " << SN << "\n"; + std::cout << "\tMultiple mapped (SM): " << SM << "\n"; + } + + return 0; +} diff --git a/src/mapbam.h b/src/mapbam.h new file mode 100644 index 0000000..fad4c12 --- /dev/null +++ b/src/mapbam.h @@ -0,0 +1,12 @@ +#ifndef __MAPBAM_H +#define __MAPBAM_H + +#include "common.h" + +#include +#include +#include + +int crossmap_bam_file(std::map&, std::string, std::string, std::string, std::string, std::map, int, int, int, bool); + +#endif diff --git a/src/utils.cpp b/src/utils.cpp new file mode 100644 index 0000000..332f42e --- /dev/null +++ b/src/utils.cpp @@ -0,0 +1,279 @@ +#include "utils.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +//#include +//#include +#include +#include + +using namespace std; +const std::string WHITESPACE = " \n\r\t\f\v"; + + +int read_chain_file(std::string chain_file, std::map& target_chromSize, std::map& source_chromSize, std::map& maps) { + std::string source_name, target_name; + std::string source_strand, target_strand; + int source_size, target_size; + int source_start, 
target_start; + int sfrom, tfrom; + int sgap, tgap; + int size; + + std::map interval_vector_map; + + std::cout << "Reading the chain file: " << chain_file << "\n"; + + std::ifstream infile(chain_file); + if (infile.is_open()) { + std::string line; + int line_number = 0; + while (std::getline(infile, line)) { + trim(line); + + if (line.empty()) continue; + if (line.at(0) == '#') continue; + + vector fields; + std::stringstream ss(line); + std::string token; + while(ss >> token) { + fields.push_back(token); + } + + // found a chainfile header line + if (fields[0] == "chain" && (fields.size() == 12 || fields.size() == 13)) { + source_name = fields[2]; + source_size = stoi(fields[3]); + source_strand = fields[4]; + source_start = stoi(fields[5]); + target_name = fields[7]; + target_size = stoi(fields[8]); + target_strand = fields[9]; + target_start = stoi(fields[10]); + target_chromSize[target_name] = target_size; + source_chromSize[source_name] = source_size; + + if (source_strand != "+") { + std::cout << "ERROR: source_strand must be + (Line: " << line_number << ")\n"; + exit(0); + } + if (target_strand != "+" and target_strand != "-") { + std::cout << "ERROR: target_strand must be + or - (Line: " << line_number << ")\n"; + exit(0); + } + + if (interval_vector_map.find(source_name) == interval_vector_map.end()) { + interval_vector_map[source_name] = ITree::interval_vector(); + } + + sfrom = source_start; + tfrom = target_start; + } + else if (fields[0] != "chain" and fields.size() == 3) { + size = stoi(fields[0]); + sgap = stoi(fields[1]); + tgap = stoi(fields[2]); + + if (target_strand == "+") { + interval_vector_map[source_name].push_back(ITree::interval(sfrom, sfrom+size, make_tuple(target_name, tfrom, tfrom+size, target_strand))); + } + else if (target_strand == "-") { + interval_vector_map[source_name].push_back(ITree::interval(sfrom, sfrom+size, make_tuple(target_name, target_size - (tfrom+size), target_size - tfrom, target_strand))); + } + sfrom += size + 
sgap; + tfrom += size + tgap; + } + else if (fields[0] != "chain" and fields.size() == 1) { + size = stoi(fields[0]); + + if (target_strand == "+") { + interval_vector_map[source_name].push_back(ITree::interval(sfrom, sfrom+size, make_tuple(target_name, tfrom, tfrom+size, target_strand))); + } + else if (target_strand == "-") { + interval_vector_map[source_name].push_back(ITree::interval(sfrom, sfrom+size, make_tuple(target_name, target_size - (tfrom+size), target_size - tfrom, target_strand))); + } + } + else { + std::cout << "ERROR: invalid chain file format (Line: " << line_number << ")\n"; + exit(0); + } + + line_number++; + } + infile.close(); + } + + map::iterator it; + for (it = interval_vector_map.begin(); it != interval_vector_map.end(); it++) { + maps[it->first] = ITree(std::move(it->second), 16, 1); + } + + return 0; +} + + +int map_coordinates(std::map& mapping, std::string q_chr, int q_start, int q_end, std::vector>& matches, std::string q_strand = "+", bool print_match = false) { + // initialize matches by clearing everything + matches.clear(); + std::map complement = {{"+" , "-"}, {"-" , "+"}}; + + ITree::interval_vector targets; + std::string tmp_q_chr = std::regex_replace(q_chr, std::regex("chr"), ""); + std::string tmp2_q_chr = "chr" + q_chr; +// std::cout << "q_chr: " << q_chr; +// std::cout << "tmp_q_chr: " << tmp_q_chr; +// std::cout << "tmp2_q_chr: " << tmp2_q_chr; + std::string mapping_query; + if (mapping.find(q_chr) != mapping.end()) { + mapping_query = q_chr; + } + else if (mapping.find(tmp_q_chr) != mapping.end()) { + mapping_query = tmp_q_chr; + } + else if (mapping.find(tmp2_q_chr) != mapping.end()) { + mapping_query = tmp2_q_chr; + } + else { + return -1; + } + + targets = mapping[mapping_query].findOverlapping(q_start, q_end); + + if (targets.size() == 0) { + return -1; + } + else if (targets.size() >= 1) { + + for (ITree::interval_vector::iterator it = targets.begin(); it != targets.end(); it++) { + //std::cout << "checking\n"; + int 
s_start = it->start; + int s_end = it->stop; + std::string t_chrom = std::get<0>(it->value); + + t_chrom = update_chromID(q_chr, t_chrom); + int t_start = std::get<1>(it->value); + int t_end = std::get<2>(it->value); + std::string t_strand = std::get<3>(it->value); + + std::string chr; + int real_start, real_end; + intersectBed(q_chr, q_start, q_end, q_chr, s_start, s_end, chr, real_start, real_end); + + int l_offset = abs(real_start - s_start); + int size = abs(real_end - real_start); + + //std::cout << "NEW\n"; + //std::cout << s_start << "\n" << s_end << "\n" << t_chrom << "\n" << t_start << "\n" << t_end << "\n" << t_strand << "\n\n"; + + + matches.push_back(std::make_tuple(chr, real_start, real_end, q_strand)); + if (t_strand == "+") { + int i_start = t_start + l_offset; + if (q_strand == "+") { + matches.push_back(std::make_tuple(t_chrom, i_start, i_start + size, t_strand)); + } + else { + matches.push_back(std::make_tuple(t_chrom, i_start, i_start + size, complement[t_strand])); + } + } + else if (t_strand == "-") { // TODO: CHECK VALIDITY HERE? why complement when both - - ? + int i_start = t_end - l_offset - size; + if (q_strand == "+") { + matches.push_back(std::make_tuple(t_chrom, i_start, i_start + size, t_strand)); + } + else { + matches.push_back(std::make_tuple(t_chrom, i_start, i_start + size, complement[t_strand])); + } + } + else { + std::cout << "Unknown strand " << q_strand << ". 
Can only be + or -.\n"; + exit(1); + } + } + } + //std::cout << "matches \n"; + //for (int it = 0; it < matches.size(); it++) { + // std::cout << std::get<0>(matches[it]) << " " << std::get<1>(matches[it]) << " " << std::get<2>(matches[it]) << " " << std::get<3>(matches[it]) << "\n"; + //} + return 0; +} + +std::string update_chromID(std::string c_temp, std::string c_target) { + if (c_temp.find("chr") == 0) { + if (c_target.find("chr") == 0) { + return c_target; + } + else { + return "chr" + c_target; + } + } + else { + if (c_target.find("chr") == 0) { + return std::regex_replace(c_target, std::regex("chr"), ""); + } + else { + return c_target; + } + } + return ""; +} + +int intersectBed(std::string chr1, int st1, int end1, std::string chr2, int st2, int end2, std::string& ret_chr, int& ret_st, int& ret_end) { + + if (st1 > end1 || st2 > end2) { + std::cout << "Start cannot be larger than end\n"; + exit(1); + } + if (chr1 != chr2) { + return -1; + } + if (st1 > end2 || end1 < st2) { + return -1; + } + + ret_chr = chr1; + ret_st = std::max(st1, st2); + ret_end = std::min(end1, end2); + + return 0; +} + +int get_reference_length(seqan::String> cigar) { + int len = 0; + + for (auto it : cigar) { + auto count = it.count; + auto operation = it.operation; + if (operation == (char)('D') || operation == (char)('M')) { + len += count; + } + } + return len; +} + + +// string trimming functions +// https://www.techiedelight.com/trim-string-cpp-remove-leading-trailing-spaces/ +std::string lefttrim(const std::string &s) { + size_t start = s.find_first_not_of(WHITESPACE); + return (start == std::string::npos) ? "" : s.substr(start); +} + +std::string righttrim(const std::string &s) { + size_t end = s.find_last_not_of(WHITESPACE); + return (end == std::string::npos) ? 
"" : s.substr(0, end + 1); +} + +std::string trim(const std::string &s) { + return righttrim(lefttrim(s)); +} + + + diff --git a/src/utils.h b/src/utils.h new file mode 100644 index 0000000..9c7f72a --- /dev/null +++ b/src/utils.h @@ -0,0 +1,24 @@ +#ifndef __UTILS_H +#define __UTILS_H + +#include "common.h" +#include +#include +#include +//#include +#include + + +int read_chain_file(std::string, std::map&, std::map&, std::map&); +int map_coordinates(std::map&, std::string, int, int, std::vector>&, std::string, bool); +int intersectBed(std::string, int, int, std::string, int, int, std::string&, int&, int&); +std::string update_chromID(std::string c_temp, std::string c_target); +int get_reference_length(seqan::String>); + +// string trimming functions +// https://www.techiedelight.com/trim-string-cpp-remove-leading-trailing-spaces/ +std::string righttrim(const std::string&); +std::string lefttrim(const std::string&); +std::string trim(const std::string&); + +#endif diff --git a/test_data/ce6ToCe10.over.chain b/test_data/ce6ToCe10.over.chain new file mode 100644 index 0000000..a8379d2 --- /dev/null +++ b/test_data/ce6ToCe10.over.chain @@ -0,0 +1,182 @@ +chain 1420079490 chrI 15072421 + 0 15072421 chrI 15072423 + 0 15072423 5 +221369 0 1 +10668 1 0 +510810 0 1 +3441789 1 0 +3353183 0 1 +2898637 1 0 +1086017 0 1 +3102274 0 1 +447671 + +chain 1440182246 chrII 15279323 + 0 15279323 chrII 15279345 + 0 15279345 4 +213638 0 1 +1364148 0 1 +25209 0 1 +323844 0 1 +751899 0 1 +117713 0 1 +82594 0 1 +46901 0 1 +595289 0 1 +1232922 1 0 +9136 1 0 +68621 0 1 +196995 1 0 +50351 0 1 +124052 0 1 +68144 0 1 +390381 1 0 +5841 0 1 +2571 0 1 +37422 0 1 +113165 1 0 +118139 0 1 +35566 0 1 +24103 0 1 +195000 0 1 +6529 0 1 +55899 0 1 +244614 0 1 +38770 0 1 +161265 0 1 +48490 1 0 +729895 0 1 +68366 0 1 +94559 0 1 +21 0 1 +97334 1 0 +4370 1 0 +26 1 0 +74 1 0 +1902 1 0 +121136 0 1 +4306 1 0 +1100324 0 1 +263385 1 0 +202478 0 1 +1426671 1 0 +343549 0 1 +177176 0 1 +332939 0 1 +838864 0 1 +2722723 
+ +chain 1298536650 chrIII 13783681 + 0 13783681 chrIII 13783700 + 0 13783700 6 +661016 0 1 +174888 0 1 +2530843 0 1 +692973 1 0 +14419 1 0 +1031284 0 1 +227835 0 1 +109920 0 1 +24511 1 0 +63426 0 1 +502240 1 0 +159842 0 1 +129053 1 0 +40674 0 1 +233 0 1 +49328 0 1 +55919 0 1 +289193 0 1 +130699 0 1 +468992 0 1 +230102 1 0 +160 1 0 +285 1 0 +294226 1 0 +160635 0 1 +25697 0 1 +62455 0 1 +341657 0 1 +80400 0 1 +30452 0 1 +304814 0 1 +88259 1 0 +223205 0 1 +123169 0 1 +27972 0 1 +1018531 0 1 +91082 0 1 +23236 1 0 +21759 6 7 +4364 0 1 +41745 1 0 +1122 0 1 +427 0 1 +3230621 + +chain 1646393646 chrIV 17493785 + 0 17493785 chrIV 17493793 + 0 17493793 3 +1515189 0 1 +1866903 1 0 +1201512 0 1 +217597 0 1 +406841 0 1 +197301 0 1 +600822 0 1 +3189501 0 1 +645184 0 1 +573978 0 1 +479411 0 1 +152268 1 0 +3799 1 0 +1040949 0 1 +342327 1 0 +754723 0 1 +602691 1 0 +2174556 0 1 +1528228 + +chain 1284774 chrM 13794 + 0 13794 chrM 13794 + 0 13794 23 +13794 + +chain 1970373837 chrV 20919568 + 0 20919568 chrV 20924149 + 0 20924149 1 +653488 1 0 +2902897 0 1 +4677208 0 1 +524159 0 1 +356177 0 1 +555593 0 1 +635270 0 1 +1190364 1 0 +1315315 0 1 +696139 0 1 +159259 0 1 +6017674 0 4574 +1236023 + +chain 1668542462 chrX 17718854 + 0 17718854 chrX 17718866 + 0 17718866 2 +425163 1 0 +1809291 1 0 +1179400 0 1 +398334 0 1 +753101 0 1 +38162 1 0 +566717 0 1 +292574 0 1 +64197 0 1 +63512 0 1 +9043 0 1 +460077 1 0 +15907 0 1 +207893 0 1 +856068 1 0 +5894 0 1 +291455 0 1 +549377 0 1 +339187 1 0 +270659 0 1 +44 1 0 +5893 0 1 +37902 1 0 +595372 0 1 +2188474 0 1 +3150810 1 0 +47183 0 1 +12638 0 1 +2594642 0 1 +485431 0 1 +4445 + diff --git a/test_data/little.bam b/test_data/little.bam new file mode 100644 index 0000000..3e683db Binary files /dev/null and b/test_data/little.bam differ diff --git a/test_data/little.sam b/test_data/little.sam new file mode 100644 index 0000000..16bc710 --- /dev/null +++ b/test_data/little.sam @@ -0,0 +1,22 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chrI LN:15072421 +@SQ 
SN:chrII LN:15279323 +@SQ SN:chrIII LN:13783681 +@SQ SN:chrIV LN:17493785 +@SQ SN:chrM LN:13794 +@SQ SN:chrV LN:20919568 +@SQ SN:chrX LN:17718854 +@RG ID:SRR3536210 SM:SRR3536210 PL:illumina LB:SRR3536210 +@PG ID:bwa PN:bwa VN:0.7.17-r1188 CL:bwa mem -R @RG\tID:SRR3536210\tSM:SRR3536210\tPL:illumina\tLB:SRR3536210 -t 22 /mnt/batty-shared/species/caenorhabditis_elegans-6239/reference/ce6/full/ref.fa /mnt/batty-shared/species/caenorhabditis_elegans-6239/sample/SAMN05004936/illumina/SRR3536210_1.fastq /mnt/batty-shared/species/caenorhabditis_elegans-6239/sample/SAMN05004936/illumina/SRR3536210_2.fastq +@PG ID:samtools PN:samtools PP:bwa VN:1.11 CL:samtools view -h -F4 +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.11 CL:samtools sort -l5 -m 8G -@ 10 +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.12 CL:samtools view -h ce6_SRR3536210.bam +@PG ID:samtools.3 PN:samtools PP:samtools.2 VN:1.12 CL:samtools view -bS - +@PG ID:samtools.4 PN:samtools PP:samtools.3 VN:1.12 CL:samtools view -h little.bam +SRR3536210.981192 129 chrI 1 0 101M = 15072261 15072261 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA CCCFFFFFHHHHHJJJJJJJJJJJJJJJJFIGIIJJJGIHIFHGGHIJFIBFCHIHG@CHDDHHDDGHJEEEC@CCCDECEC>CACCCDCCCC>@CCCDC@ NM:i:0 MD:Z:101 MC:Z:101M AS:i:101 XS:i:101 RG:Z:SRR3536210 +SRR3536210.1727635 99 chrI 1 0 101M = 375 475 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA CCCFFFFFHHHHHJJJJJJIJJJJJIJJJJIIJJJJGEIIIIGCGGGIFGHIJJFDGHGGGDFHEEGGGAEFEE>;?@>>CA@CAC>@CCCC:>AA>?ACC NM:i:0 MD:Z:101 MC:Z:101M AS:i:101 XS:i:101 RG:Z:SRR3536210 +SRR3536210.1827380 133 chrI 1 0 101M = 15072259 15072259 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA CCCFFFFFHHHHHIJJJJJIHIGJJIJJJJJGIJJIJEHGCGI>3?DDHB?DGGEBFCFAGC=C@@G=@ECEED9BDDC66@CCCCA>ACCC9>ACD#### NM:i:0 MD:Z:101 MC:Z:101M AS:i:101 XS:i:101 RG:Z:SRR3536210 +SRR3536210.2188889 97 chrI 1 0 
101M = 71 171 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA CCCFFFFFHHHHHJJJJJJJJJIJJJJJJJJJJJJJJJJJJJJIIIJJHIJJJJJIIIGIGGEHIGGHHFECEBB?DDFCEEDCD?CAD@CDCCDCDCCCC NM:i:0 MD:Z:101 MC:Z:101M AS:i:101 XS:i:101 RG:Z:SRR3536210 +SRR3536210.2918244 97 chrI 1 0 101M = 144 244 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA CCCFFFFFHHHHHJJJJJJJJJJJIJJJJJIJJIJJIJJJJJIJIJJJIJIJJJGGHIIHGGIIGGGGGEHEEFEBCECECEC@CDCDCC:?A@AC::::5 NM:i:0 MD:Z:101 MC:Z:101M AS:i:101 XS:i:101 RG:Z:SRR3536210 +SRR3536210.3356959 99 chrI 1 0 91M10S = 374 474 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGACTAATCATA @CCFFFFFFGGGHIJJJJJGIGIIIEGHIJIJJJIGG@DGGGFDBD90*9?8??)=;8EH3@3==).7)=7?A9);;B>AC#################### NM:i:1 MD:Z:84G6 MC:Z:101M AS:i:86 XS:i:88 RG:Z:SRR3536210 +SRR3536210.3977123 163 chrI 1 1 101M = 363 463 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA CCCFFFFFHHHHHJJJJJJJJJJJIIJIJJGEHDEHGEIEGIEGIIHIEBDFDDFGIGBDF>FDGHJEDCHHHFFCDEFAC;;ACC9;@CCCAACD?CCCD NM:i:0 MD:Z:101 MC:Z:101M AS:i:101 XS:i:101 RG:Z:SRR3536210 diff --git a/test_data/little2.bam b/test_data/little2.bam new file mode 100644 index 0000000..3e683db Binary files /dev/null and b/test_data/little2.bam differ diff --git a/validation/compare_outputs.py b/validation/compare_outputs.py new file mode 100644 index 0000000..08214ac --- /dev/null +++ b/validation/compare_outputs.py @@ -0,0 +1,91 @@ +import sys + +f1 = open(sys.argv[1]) +f2 = open(sys.argv[2]) + +l1 = f1.readline() +l2 = f2.readline() + +# only care about @SQ for now. +f1_head = dict() +f2_head = dict() + +# add unmapped later. 
+ +f1_linenum = 1 +f2_linenum = 1 +while True: + + if l1[0] == "@": + if "@SQ SN:" in l1: + f1_head[l1.split()[1]] = l1.split()[2] + l1 = f1.readline() + f1_linenum += 1 + elif l2[0] == "@": + if "@SQ SN:" in l2: + f2_head[l2.split()[1]] = l2.split()[2] + l2 = f2.readline() + f2_linenum += 1 + else: + # we are in the body now. + # first check the header. + for el in f1_head: + if el not in f2_head: + print("HEADER DIFF: f1 " + str(el) + " -> " + str(f1_head[el]) + " f2 " + str(el) + " not found") + elif f1_head[el] != f2_head[el]: + print("HEADER DIFF: f1 " + str(el) + " -> " + str(f1_head[el]) + " f2 " + str(el) + " -> " + str(f2_head[el])) + for el in f2_head: + if el not in f1_head: + print("HEADER DIFF: f1 " + str(el) + " -> " + str(f1_head[el]) + " f2 " + str(el) + " not found") + elif f2_head[el] != f1_head[el]: + print("HEADER DIFF: f1 " + str(el) + " -> " + str(f1_head[el]) + " f2 " + str(el) + " -> " + str(f2_head[el])) + + l1_cols_1_11 = l1.split()[0:11] + l2_cols_1_11 = l2.split()[0:11] + + l1_tags = l1.split()[11:] + l2_tags = l2.split()[11:] + + exit_code = 0 + + for el in l1_tags: + if el not in l2_tags: + print("TAG DIFF: " + el + " not found in f2") + print("f1 line num: " + str(f1_linenum) + " f2 line num: " + str(f2_linenum)) + exit_code = 1 + + for el in l2_tags: + if el not in l1_tags: + print("TAG DIFF: " + el + " not found in f1") + print("f1 line num: " + str(f1_linenum) + " f2 line num: " + str(f2_linenum)) + exit_code = 1 + + if (exit_code): + exit(1) + + + for idx,el in enumerate(l1_cols_1_11): + if idx == 6: + if (l1_cols_1_11[idx] == "=" and (l2_cols_1_11[idx] != l2_cols_1_11[2])) or (l2_cols_1_11[idx] == "=" and (l1_cols_1_11[idx] != l1_cols_1_11[2])): + # RNEXT == "=" seqan3 problem? 
+ print("RNEXT and RNAME DO NOT MATCH") + exit_code = 1 + break + continue + + if el != l2_cols_1_11[idx]: + print("ENTRY DIFF: " + str(el) + " != " + str(l2_cols_1_11[idx])) + exit_code = 1 + + if exit_code: + print("f1 linenum: " + str(f1_linenum) + " -- " + " ".join(l1_cols_1_11)) + print("f2 linenum: " + str(f2_linenum) + " -- " + " ".join(l2_cols_1_11)) + exit(1) + + l1 = f1.readline() + l2 = f2.readline() + + if not l1 or not l2: + break + +print("Found 0 Errors") diff --git a/zlib b/zlib new file mode 160000 index 0000000..cacf7f1 --- /dev/null +++ b/zlib @@ -0,0 +1 @@ +Subproject commit cacf7f1d4e3d44d871b605da3b647f07d718623f