diff --git a/README.md b/README.md index 213beee..63346bf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,12 @@ pyPaSWAS ======== [![DOI](https://zenodo.org/badge/28648467.svg)](https://zenodo.org/badge/latestdoi/28648467) -Extented python version of PaSWAS. Original paper in PLOS ONE: http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0122524 +Extented python version of PaSWAS. Original papers in PLOS ONE: + +http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0122524 + +http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190279 + For DNA/RNA/protein sequence alignment and trimming. @@ -23,6 +28,15 @@ Platforms supported: More information: https://github.com/swarris/pyPaSWAS/wiki +Docker +------ + +The pyPasWAS source contains several docker install files. Clone the repository: + +git clone https://github.com/swarris/pyPaSWAS.git + +Then use one of the docker images in the _docker_ folder. For more information, see the [README](https://github.com/swarris/pyPaSWAS/tree/master/docker) + Installation ------------ In most cases it is enough to clone the repository. After that, please install: diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..ba2bdff --- /dev/null +++ b/docker/README.md @@ -0,0 +1,40 @@ +# pyPaSWAS Docker Containers + +This folder contains the Docker files for building Containers containing the `pyPaSWAS` software. These containers are based on Ubuntu 16.04 and come supplied with Python3 and the Nvidia CUDA software as they are based on the `nvidia/cuda:8.0-devel-ubuntu16.04` container image [supplied by Nvidia](https://hub.docker.com/r/nvidia/cuda/). + +## Running existing Docker Containers + +The Docker engine is required for running the container, see their [excellent installation instructions](https://docs.docker.com/engine/installation/) for further details. +Next, these containers require low-level access to the hardware (i.e. the GPU) and therefore the use of the `nvidia-docker` utility, installation instructions are available on its [github page](https://github.com/NVIDIA/nvidia-docker/tree/2.0). + +`nvidia-docker run --rm -ti mkempenaar/pypaswas:nvidia-opencl_cuda8.0 bash` will download the container, start and attach to a bash session running inside the container. Here you will find the software at `/root/pyPasWAS`. Running the performance tests on a clean container is as simple as (note: this will take a while): + +``` +cd /root/pyPaSWAS +sh data/runPerformanceTests.sh +``` + +* ## Container(s) available on [Docker Hub](https://hub.docker.com/r/mkempenaar/pypaswas/) + + **`mkempenaar/pypaswas:nvidia-opencl_cuda8.0` [*Docker file*](https://raw.githubusercontent.com/swarris/pyPaSWAS/master/docker/nvidia/Dockerfile)** + + This container can be used for testing all availabilities of the `pyPaSWAS` sequence aligner as it contains the Intel and Nvidia OpenCL runtime libraries and Nvidia CUDA support. + + +## Building custom Docker Containers + +As most hardware manufacturers have their own acceleration libraries (multiple versions of OpenCL, Nvidia CUDA, etc.) the available containers might not work for your hardware. Therefore, a few custom build files are available depending on your hardware and requirements (i.e. only CUDA support or only Intel OpenCL). + +### Downloading and Building + +Cloning this repository gives the currently available Dockerfiles for building custom images which can be found in the `pyPaSWAS/docker` folder. Building a container locally can be done by going to the folder of choice (each contains a single `Dockerfile`; a container description) and running: + +``` +docker build -t pypaswas:custom . +``` + +Currently available: + +* [Intel OpenCL + Nvidia CUDA](https://raw.githubusercontent.com/swarris/pyPaSWAS/master/docker/intel/Dockerfile), `pyPaSWAS/data/docker/intel/Dockerfile`: Suitable for Intel Core and Xeon CPUs and GPUs from the 3rd generation (Ivy Bridge) and newer, combined with Nvidia CUDA from the base container image. +* [Intel OpenCL + Nvidia CUDA](https://raw.githubusercontent.com/swarris/pyPaSWAS/master/docker/intel/sandybridge/Dockerfile), `pyPaSWAS/data/docker/intel/sandybridge/Dockerfile`: Only suitable for 2nd generation (Sandy Bridge) Intel Core CPUs, combined with Nvidia CUDA from the base container image. +* [Intel OpenCL + Nvidia OpenCL + Nvidia CUDA](https://raw.githubusercontent.com/swarris/pyPaSWAS/master/docker/nvidia/Dockerfile), `pyPaSWAS/data/docker/nvidia/Dockerfile`: Full package for 3rd generation and newer Intel Core and Xeon CPUs and GPUs, combined with Nvidia OpenCL and CUDA support. diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile new file mode 100644 index 0000000..e89d679 --- /dev/null +++ b/docker/intel/Dockerfile @@ -0,0 +1,32 @@ +FROM nvidia/cuda:8.0-devel-ubuntu16.04 + +MAINTAINER Marcel Kempenaar (m.kempenaar@pl.hanze.nl) + +## OpenCL dependencies, runtime and development packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + beignet ocl-icd-opencl-dev libffi-dev clinfo && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH /usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib:/usr/local/cuda/lib64 + +## Python3 and dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-dev python3-pip python3-setuptools git opencl-headers \ + autoconf libtool pkg-config && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/local/cuda/lib64/libOpenCL* /usr/lib/ && \ + pip3 install --upgrade pip + +RUN pip3 install wheel + +RUN pip3 install numpy + +RUN pip3 install biopython + +RUN pip3 install pyopencl + +## pyPaSWAS installation +RUN git clone https://github.com/swarris/pyPaSWAS.git /root/pyPaSWAS diff --git a/docker/intel/sandybridge/Dockerfile b/docker/intel/sandybridge/Dockerfile new file mode 100644 index 0000000..7e85fa1 --- /dev/null +++ b/docker/intel/sandybridge/Dockerfile @@ -0,0 +1,58 @@ +FROM nvidia/cuda:8.0-devel-ubuntu16.04 + +MAINTAINER Marcel Kempenaar (m.kempenaar@pl.hanze.nl) + +## OpenCL dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + rpm alien libnuma1 curl fakeroot libffi-dev clinfo && \ + rm -rf /var/lib/apt/lists/* + +## Intel 2nd Generation OpenCL 1.2 support +RUN curl http://registrationcenter-download.intel.com/akdlm/irc_nas/9019/opencl_runtime_16.1.1_x64_ubuntu_6.4.0.25.tgz | tar xz + +RUN cd opencl_runtime_16.1.1_x64_ubuntu_6.4.0.25/rpm && \ + fakeroot alien --to-deb opencl-1.2-base-6.4.0.25-1.x86_64.rpm && \ + fakeroot alien --to-deb opencl-1.2-intel-cpu-6.4.0.25-1.x86_64.rpm + +RUN cd opencl_runtime_16.1.1_x64_ubuntu_6.4.0.25/rpm && \ + dpkg -i opencl-1.2-base_6.4.0.25-2_amd64.deb && \ + dpkg -i opencl-1.2-intel-cpu_6.4.0.25-2_amd64.deb && \ + rm -Rf /opencl_runtime_16.1.1_x64_ubuntu_6.4.0.25 + +RUN echo "/opt/intel/opencl-1.2-6.4.0.25/lib64/clinfo" > /etc/ld.so.conf.d/intelOpenCL.conf + +RUN mkdir -p /etc/OpenCL/vendors && \ + ln /opt/intel/opencl-1.2-6.4.0.25/etc/intel64.icd /etc/OpenCL/vendors/intel64.icd && \ + ldconfig + +ENV PATH /usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib:/usr/local/cuda/lib64 + +## Python3 and dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-dev python3-pip python3-setuptools git opencl-headers \ + autoconf libtool pkg-config && \ + ln -s /usr/bin/python3 /usr/bin/python + +RUN ln -s /usr/local/cuda/lib64/libOpenCL* /usr/lib/ && \ + pip3 install --upgrade pip + +RUN pip3 install wheel + +RUN pip3 install numpy + +RUN pip3 install biopython + +RUN export PATH=/usr/local/cuda/bin:$PATH && pip3 install pycuda + +## Custom pyOpenCL installation forcing the use of version 1.2 +RUN export PATH=/usr/local/cuda/bin:$PATH && \ + export LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64 && \ + export LDFLAGS=-L/usr/local/cuda/lib64 && \ + git clone https://github.com/pyopencl/pyopencl.git && \ + cd pyopencl && python3 configure.py && \ + echo 'CL_PRETEND_VERSION = "1.2"' >> siteconf.py && \ + pip3 install . + +## pyPaSWAS installation +RUN git clone https://github.com/swarris/pyPaSWAS.git /root/pyPaSWAS diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile new file mode 100644 index 0000000..6667022 --- /dev/null +++ b/docker/nvidia/Dockerfile @@ -0,0 +1,41 @@ +FROM nvidia/cuda:8.0-devel-ubuntu16.04 + +MAINTAINER Marcel Kempenaar (m.kempenaar@pl.hanze.nl) + +## OpenCL dependencies, runtime and development packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + beignet ocl-icd-opencl-dev libffi-dev clinfo && \ + rm -rf /var/lib/apt/lists/* + +## NVIDIA OpenCL support, taken from: https://gitlab.com/nvidia/opencl/blob/ubuntu14.04/runtime/Dockerfile +RUN mkdir -p /etc/OpenCL/vendors && \ + echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd + +RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf + +ENV PATH /usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib:/usr/local/cuda/lib64 + +## Python3 and dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-dev python3-pip python3-setuptools git opencl-headers \ + autoconf libtool pkg-config && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/local/cuda/lib64/libOpenCL* /usr/lib/ && \ + pip3 install --upgrade pip + +RUN pip3 install wheel + +RUN pip3 install numpy + +RUN pip3 install biopython + +RUN export PATH=/usr/local/cuda/bin:$PATH && pip3 install pycuda + +RUN pip3 install pyopencl + +## pyPaSWAS installation +RUN git clone https://github.com/swarris/pyPaSWAS.git /root/pyPaSWAS diff --git a/pyPaSWAS/Core/Formatters.py b/pyPaSWAS/Core/Formatters.py index 75bfba6..3b753f3 100644 --- a/pyPaSWAS/Core/Formatters.py +++ b/pyPaSWAS/Core/Formatters.py @@ -33,11 +33,16 @@ def _set_name(self): '''Name of the formatter. Used for logging''' self.name = 'defaultformatter' + def _get_hits(self): + '''Returns ordered list of hits''' + hits = self.hitlist.real_hits.values() + return sorted(hits, key=lambda hit: (hit.get_seq_id(), hit.get_target_id(), hit.score)) + def print_results(self): '''sets, formats and prints the results to a file.''' self.logger.debug('printing results...') output = open(self.outputfile, 'w') - for hit in self.hitlist.real_hits.values(): + for hit in self._get_hits(): formatted_hit = self._format_hit(hit) output.write(formatted_hit + "\n") self.logger.debug('finished printing results') @@ -81,7 +86,7 @@ def print_results(self): '''sets, formats and prints the results to a file.''' self.logger.info('formatting results...') #format header and hit lines - for hit in self.hitlist.real_hits.values(): + for hit in self._get_hits(): self._format_hit(hit) self.logger.debug('printing results...') @@ -132,7 +137,7 @@ def print_results(self): '''sets, formats and prints the results to a file.''' self.logger.info('formatting results...') #format header and hit lines - for hit in self.hitlist.real_hits.values(): + for hit in self._get_hits(): self._format_hit(hit) self.logger.debug('printing results...') @@ -173,7 +178,7 @@ def print_results(self): '''sets, formats and prints the results to a file.''' self.logger.info('formatting results...') #format header and hit lines - for hit in self.hitlist.real_hits.values(): + for hit in self._get_hits(): self._format_hit(hit) self.logger.debug('printing results...') diff --git a/pyPaSWAS/Core/PaSWAS.py b/pyPaSWAS/Core/PaSWAS.py index f4013f2..b2f871a 100644 --- a/pyPaSWAS/Core/PaSWAS.py +++ b/pyPaSWAS/Core/PaSWAS.py @@ -18,6 +18,12 @@ def __init__(self, logger): self.score_source = '' self.main_source = '' + def read_source(self, filename): + '''Read source code from the specified file and prefix it + with line number and file name info for better compilation error messages. + ''' + return '#line 1 "{}"\n'.format(filename) + read_file(filename) + def set_shared_xy_code(self, sharedx=8, sharedy=8): ''' Sets the horizontal and the vertical sizes of the smallest alignment matrices in shared memory @@ -25,7 +31,7 @@ def set_shared_xy_code(self, sharedx=8, sharedy=8): :param sharedy: ''' #self.logger.debug('Setting sharedx to {0}, sharedy to {1}'.format(sharedx, sharedy)) - code_t = Template(read_file(self.main_source)) + code_t = Template(self.read_source(self.main_source)) self.shared_xy_code = code_t.safe_substitute(SHARED_X=sharedx, SHARED_Y=sharedy) def set_direction_code(self, no_direction=0, up_left=1, up=2, left=3, stop=4): @@ -39,7 +45,7 @@ def set_direction_code(self, no_direction=0, up_left=1, up=2, left=3, stop=4): ''' #self.logger.debug('Setting directions:\n\tno = {0}\n\tup_left = {1}\n\tup = {2}\n\tleft = {3}\n\t' # 'stop = {3}'.format(no_direction, up_left, up, left, stop)) - direction_t = Template(read_file(self.direction_source)) + direction_t = Template(self.read_source(self.direction_source)) self.directions = direction_t.safe_substitute(NO_DIRECTION=no_direction, UP_LEFT_DIRECTION=up_left, UP_DIRECTION=up, @@ -50,7 +56,7 @@ def set_score_code(self, score): '''Formats information contained in a score. ''' #self.logger.debug('Sourcing the scorepart of the cuda code') - score_part_t = Template(read_file(self.score_source)) + score_part_t = Template(self.read_source(self.score_source)) gap_extension = 0.0 if score.gap_extension != None: gap_extension = score.gap_extension @@ -69,7 +75,7 @@ def set_variable_code(self, number_sequences, number_targets, x_val, y_val, char '''Sets the variable part of the code''' #self.logger.debug('Setting the variable part of the cuda code\n\t(using: n_seq: {}, n_targets: {}, ' # 'x_val: {}, y_val: {})'.format(number_sequences, number_targets, x_val, y_val)) - variable_t = Template(read_file(self.variable_source)) + variable_t = Template(self.read_source(self.variable_source)) self.variable_part = variable_t.safe_substitute(N_SEQUENCES=number_sequences, N_TARGETS=number_targets, X=x_val, @@ -104,7 +110,6 @@ class OCLcode(Code): ''' def __init__(self, logger): Code.__init__(self, logger) - self.variable_source = resource_filename(__name__, 'ocl/default_variable.cl') self.direction_source = resource_filename(__name__, 'ocl/default_direction.cl') self.score_source = resource_filename(__name__, 'ocl/default_score.cl') @@ -116,6 +121,7 @@ class GPUcode(OCLcode): def __init__(self, logger): OCLcode.__init__(self, logger) self.main_source = resource_filename(__name__, 'ocl/default_main_gpu.cl') + self.variable_source = resource_filename(__name__, 'ocl/default_variable_gpu.cl') class CPUcode(OCLcode): ''' @@ -125,6 +131,7 @@ class CPUcode(OCLcode): def __init__(self, logger): OCLcode.__init__(self, logger) self.main_source = resource_filename(__name__, 'ocl/default_main_cpu.cl') + self.variable_source = resource_filename(__name__, 'ocl/default_variable_cpu.cl') def set_shared_xy_code(self, sharedx=8, sharedy=8, workloadx=4, workloady=4): ''' @@ -133,5 +140,5 @@ def set_shared_xy_code(self, sharedx=8, sharedy=8, workloadx=4, workloady=4): :param sharedy: ''' #self.logger.debug('Setting sharedx to {0}, sharedy to {1}'.format(sharedx, sharedy)) - code_t = Template(read_file(self.main_source)) - self.shared_xy_code = code_t.safe_substitute(SHARED_X=sharedx, SHARED_Y=sharedy, WORKLOAD_X=workloadx, WORKLOAD_Y=workloady) \ No newline at end of file + code_t = Template(self.read_source(self.main_source)) + self.shared_xy_code = code_t.safe_substitute(SHARED_X=sharedx, SHARED_Y=sharedy, WORKLOAD_X=workloadx, WORKLOAD_Y=workloady) diff --git a/pyPaSWAS/Core/Programs.py b/pyPaSWAS/Core/Programs.py index 666ef90..ce31f91 100644 --- a/pyPaSWAS/Core/Programs.py +++ b/pyPaSWAS/Core/Programs.py @@ -35,14 +35,9 @@ def __init__(self, logger, score, settings): self.settings = settings if (self.settings.framework.upper() == 'OPENCL'): if(self.settings.device_type.upper() == 'GPU'): - if(self.settings.platform_name.upper() == 'NVIDIA'): - self.logger.debug('Using OpenCL NVIDIA implementation') - from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanNVIDIA - self.smith_waterman = SmithWatermanNVIDIA(self.logger, self.score, settings) - else: - self.logger.debug('Using OpenCL GPU implementation') - from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU - self.smith_waterman = SmithWatermanGPU(self.logger, self.score, settings) + self.logger.debug('Using OpenCL GPU implementation') + from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU + self.smith_waterman = SmithWatermanGPU(self.logger, self.score, settings) elif(self.settings.device_type.upper() == 'CPU'): self.logger.debug('Using OpenCL CPU implementation') from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanCPU @@ -74,11 +69,15 @@ def process(self, records_seqs, targets, pypaswas): self.logger.debug('Aligner processing...') target_index = 0 + all_targets_length = sum(len(s.seq) for s in targets) + all_sequences_length = sum(len(s.seq) for s in records_seqs) + self.smith_waterman.set_total_work_size(all_targets_length * all_sequences_length) + while target_index < len(targets): self.logger.debug('At target: {0} of {1}'.format(target_index, len(targets))) - last_target_index = self.smith_waterman.set_targets(targets, target_index) + last_target_index = self.smith_waterman.set_targets(targets, target_index, records_seqs=records_seqs, use_all_records_seqs=False) # results should be a Hitlist() results = self.smith_waterman.align_sequences(records_seqs, targets, target_index) self.hitlist.extend(results) @@ -104,7 +103,11 @@ def process(self, records_seqs, targets, pypaswas): max_length = len(targets[0]) else: max_length = None - + + all_targets_length = sum(len(s.seq) for s in targets) + all_sequences_length = sum(len(s.seq) for s in records_seqs) + self.smith_waterman.set_total_work_size(all_targets_length * all_sequences_length) + while target_index < len(targets): self.logger.debug('At target: {0} of {1}'.format(target_index, len(targets))) diff --git a/pyPaSWAS/Core/SmithWaterman.py b/pyPaSWAS/Core/SmithWaterman.py index 1d62635..0f0cee7 100644 --- a/pyPaSWAS/Core/SmithWaterman.py +++ b/pyPaSWAS/Core/SmithWaterman.py @@ -12,6 +12,7 @@ import numpy import math import time +import datetime from pyPaSWAS.Core.StartingPoint import StartingPoint from pyPaSWAS.Core.HitList import HitList @@ -177,6 +178,11 @@ def __init__(self, logger, score, settings): if self.mem_fill_factor > 1.0 or self.mem_fill_factor <= 0.0: raise InvalidOptionException('maximux_memory_usage is not a float between 0.0 and 1.0'.format(settings.maximum_memory_usage)) + # Attibutes related to reporting of current progress + self.total_work_size = 0 + self.total_processed = 0 + self.start_time = time.time() + def __del__(self): '''Destructor. Removes the current running context''' pass @@ -192,14 +198,6 @@ def _initialize_device(self, device_number): def _device_global_mem_size(self): ''' defines maximum available mem on device. Should be implemented by subclasses. ''' pass - - def _get_max_length_xy(self): - ''' - _get_max_length_xy gives the maximum length of both X and Y possible based on the total memory. - @return: int value of the maximum length of both X and Y. - ''' - return (math.floor(math.sqrt((self._device_global_mem_size() * self.mem_fill_factor) / - self._get_mem_size_basic_matrix()))) def _get_max_number_sequences(self, length_sequences, length_targets, number_of_targets): ''' @@ -209,13 +207,10 @@ def _get_max_number_sequences(self, length_sequences, length_targets, number_of_ ''' self.logger.debug("Total memory on Device: {}".format(self._device_global_mem_size()/1024.0/1024.0)) value = 1 - gapExtensionFactor = 1 - if self.gap_extension: - gapExtensionFactor = 3 - + try: value = math.floor((self._device_global_mem_size() * self.mem_fill_factor) / #@UndefinedVariable - ((gapExtensionFactor * length_sequences * length_targets * (self._get_mem_size_basic_matrix()) + + ((length_sequences * length_targets * (self._get_mem_size_basic_matrix()) + (length_sequences * length_targets * SmithWaterman.float_size) / (self.shared_x * self.shared_y)) * number_of_targets)) #@UndefinedVariable @IgnorePep8 except: @@ -269,7 +264,7 @@ def _set_max_possible_score(self, target_index, targets, i, index, records_seqs) '''fills the max_possible_score datastructure on the host''' pass - def _get_starting_point_byte_array(self): + def _get_starting_point_byte_array(self, number_of_starting_points): ''' Get the resulting starting points @return gives the resulting starting point array as byte array @@ -372,18 +367,23 @@ def _get_number_of_targets_with_sequences(self, records_seqs): return max_number_of_targets - def set_targets(self, targets, target_index, max_length = None, records_seqs=None): + def set_targets(self, targets, target_index, max_length=None, records_seqs=None, use_all_records_seqs=True): '''Retrieves a block of targets from the target array and returns the index of the last target that will be processed''' if self.max_length == None or target_index == 0: self._set_target_block_length(targets, target_index) - if records_seqs != None and len(records_seqs) > 0: - self.max_number_of_targets = self._get_number_of_targets_with_sequences(records_seqs) - if self.max_number_of_targets < 1: - self.max_number_of_targets = self._get_number_of_targets() - else: - self.max_number_of_targets = self._get_number_of_targets() - + if records_seqs != None and len(records_seqs) > 0: + if use_all_records_seqs: + self.max_number_of_targets = self._get_number_of_targets_with_sequences(records_seqs) + if self.max_number_of_targets < 1: + self.max_number_of_targets = self._get_number_of_targets() + else: + # Find maximum possible number of targets for cases when number of records_seqs + # is small. _get_number_of_targets assumes that many sequences are used, hence + # it may return too small number. + self.max_number_of_targets = max(self._get_number_of_targets(), + self._get_number_of_targets_with_sequences(records_seqs)) + if max_length != None and self.settings.recompile == "F" and target_index > 0: self.max_length = max_length @@ -395,8 +395,8 @@ def set_targets(self, targets, target_index, max_length = None, records_seqs=Non if self.number_of_targets > len(targets): self.number_of_targets = len(targets) - if self.number_of_targets * len(targets[0]) / self.shared_y > self.internal_limit: - self.number_of_targets = int(self.internal_limit * self.shared_y / len(targets[0])) + if self.number_of_targets * len(targets[target_index]) / self.shared_y > self.internal_limit: + self.number_of_targets = int(self.internal_limit * self.shared_y / len(targets[target_index])) # fill the target array with sequences @@ -422,14 +422,21 @@ def _get_mem_size_basic_matrix(self): @return: the amount of memory in bytes for the 1x1 alignment Calculate GPU memory requirements for 1x1 alignment with 1x1 character. """ - # size of each element in a smith waterman matrix (lchar, uchar, luchar, value (is float) and direction) + gapExtensionFactor = 1 + if self.gap_extension: + gapExtensionFactor = 3 + + # size of each element in a smith waterman matrix (direction (is char) and score (is float)) mem_size = 1 - mem_size += 1 - mem_size += self.float_size - mem_size += 1 - mem_size += 1 + mem_size += gapExtensionFactor * self.float_size return mem_size + def set_total_work_size(self, size): + '''Sets total work size (number of cells) and resets current progress. + ''' + self.total_work_size = size + self.total_processed = 0 + self.start_time = time.time() def align_sequences(self, records_seqs, targets, target_index): '''Aligns sequences against the targets. Returns the resulting alignments in a hitlist.''' @@ -437,7 +444,7 @@ def align_sequences(self, records_seqs, targets, target_index): index = 0 prev_seq_length = 0 prev_target_length = 0 - + cont = True # step through all the sequences max_length = 0 @@ -445,6 +452,7 @@ def align_sequences(self, records_seqs, targets, target_index): max_length = len(records_seqs[0]) hitlist=HitList(self.logger) while index < len(records_seqs) and cont: + t0 = time.time() # make sure length of sequences can be divided by shared_x # don't reset when no need to recompile: if self.settings.recompile == "F" : @@ -520,15 +528,34 @@ def align_sequences(self, records_seqs, targets, target_index): self._init_zero_copy() # calculate scores of alignments self._calculate_score() - # perform the traceback - self._traceback_host() - - # TODO: change to returning a value, change _print_alignments to getAlignments in SmithWaterman - # TODO: move _print_alignments to here? This should be a statement to retrieve the results and - # put them into a Hitlist (?) - #hitlist = self._print_alignments(records_seqs, targets, index, target_index) - self._print_alignments(records_seqs, targets, index, target_index, hitlist) - self.logger.info("Time spent on Smith-Waterman > {}".format(time.time()-t)) + + if self._is_traceback_required(): + # perform the traceback + self._traceback_host() + + # TODO: change to returning a value, change _print_alignments to getAlignments in SmithWaterman + # TODO: move _print_alignments to here? This should be a statement to retrieve the results and + # put them into a Hitlist (?) + #hitlist = self._print_alignments(records_seqs, targets, index, target_index) + self._print_alignments(records_seqs, targets, index, target_index, hitlist) + + self.logger.debug("Time spent on Smith-Waterman > {}".format(time.time()-t)) + + if self.total_work_size > 0: + t1 = time.time() + processed = sum(len(s.seq) for s in targets[target_index:(target_index+self.number_targets)]) * \ + sum(len(s.seq) for s in records_seqs[index:(index+self.number_of_sequences)]) + self.total_processed += processed + duration = t1 - t0 + total_duration = t1 - self.start_time + performance = processed / 1e9 / duration + avg_performance = self.total_processed / 1e9 / total_duration + progress = self.total_processed / float(self.total_work_size) + eta = total_duration / progress * (1.0 - progress) + self.logger.info("Duration: {:5.3f} | Total: {} | Performance: {:5.2f} GCUPS | Avg: {:5.2f} GCUPS | Progress: {:7.3%} | ETA: {}" + .format(duration, datetime.timedelta(seconds=round(total_duration)), performance, avg_performance, progress, datetime.timedelta(seconds=round(eta))) + ) + index += self.max_sequences return hitlist @@ -586,6 +613,11 @@ def _calculate_score(self): if (idx < self.x_div_shared_x - 1): idx += 1 + def _is_traceback_required(self): + '''Returns False if it is known after calculating scores that there are no possible + starting points, hence no need to run traceback. + ''' + return True def _traceback_host(self): ''' Performs the traceback on the device ''' @@ -634,15 +666,18 @@ def _print_alignments(self, sequences, targets, start_seq, start_target, hit_lis if hit_list is None: hit_list = HitList(self.logger) self.logger.debug('Printing alignments.') - starting_points = self._get_starting_point_byte_array() - #starting_point = StartingPoint(self.logger) - + number_of_starting_points = self._get_number_of_starting_points() self.logger.debug('Number of starting points is: {0}.'.format(number_of_starting_points)) + if number_of_starting_points == 0: + # No need to read other data from device + return hit_list if number_of_starting_points >= (self.maximum_number_starting_points * self.number_of_sequences * self.number_targets): self.logger.warning("Too many hits returned. Skipping the rest. Please set lower_limit_score higher in config.") number_of_starting_points = self.maximum_number_starting_points * self.number_of_sequences * self.number_targets - + + starting_points = self._get_starting_point_byte_array(number_of_starting_points) + max_score = 0 direction_array = self._get_direction_byte_array() diff --git a/pyPaSWAS/Core/SmithWatermanCuda.py b/pyPaSWAS/Core/SmithWatermanCuda.py index ebffc9a..b77c2b2 100644 --- a/pyPaSWAS/Core/SmithWatermanCuda.py +++ b/pyPaSWAS/Core/SmithWatermanCuda.py @@ -281,7 +281,7 @@ def _set_max_possible_score(self, target_index, targets, i, index, records_seqs) if len(records_seqs[i+index]) < len(targets[tI+target_index]) else len(targets[tI+target_index])) * float(self.filter_factor)) - def _get_starting_point_byte_array(self): + def _get_starting_point_byte_array(self, number_of_starting_points): ''' Get the resulting starting points @return gives the resulting starting point array as byte array diff --git a/pyPaSWAS/Core/SmithWatermanOcl.py b/pyPaSWAS/Core/SmithWatermanOcl.py index 4744fc9..b3917d6 100644 --- a/pyPaSWAS/Core/SmithWatermanOcl.py +++ b/pyPaSWAS/Core/SmithWatermanOcl.py @@ -37,7 +37,9 @@ def __init__(self, logger, score, settings): self._set_device_type(self.settings.device_type) self._set_platform(self.settings.platform_name) self._initialize_device(int(self.settings.device_number)) - + + self.always_reallocate_memory = False + def _init_oclcode(self): # Compiling part of the OpenCL code in advance self.oclcode.set_shared_xy_code(self.shared_x, self.shared_y) @@ -134,6 +136,8 @@ def _device_global_mem_size(self): def _clear_memory(self): '''Clears the claimed memory on the device.''' + if not self.always_reallocate_memory: + return self.logger.debug('Clearing device memory.') self._clear_normal_memory() self._clear_zero_copy_memory() @@ -180,7 +184,13 @@ def _clear_normal_memory(self): except: pass self.d_global_maxima.release() - + if (self.d_index_increment is not None): + try: + self.d_index_increment.finish() + except: + pass + self.d_index_increment.release() + def _clear_zero_copy_memory(self): self.logger.debug('Clearing zero-copy device memory.') if (self.d_starting_points_zero_copy is not None): @@ -189,12 +199,6 @@ def _clear_zero_copy_memory(self): except: pass self.d_starting_points_zero_copy.release() - if (self.d_global_direction_zero_copy is not None): - try: - self.d_global_direction_zero_copy.finish() - except: - pass - self.d_global_direction_zero_copy.release() if (self.d_max_possible_score_zero_copy is not None): try: self.d_max_possible_score_zero_copy.finish() @@ -202,6 +206,20 @@ def _clear_zero_copy_memory(self): pass self.d_max_possible_score_zero_copy.release() + def _need_reallocation(self, buffer, size): + if self.always_reallocate_memory: + return True + if buffer is None: + return True + if buffer.get_info(cl.mem_info.SIZE) < size: + try: + buffer.finish() + except: + pass + buffer.release() + return True + return False + def _init_normal_memory(self): ''' #_init_memory will initialize all required memory on the device based on the current settings. @@ -210,14 +228,19 @@ def _init_normal_memory(self): # Sequence device memory self.logger.debug('Initializing normal device memory.') memory = self.length_of_x_sequences * self.number_of_sequences - self.d_sequences = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=memory) + if self._need_reallocation(self.d_sequences, memory): + self.d_sequences = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=memory) mem_size = memory # Target device memory memory = self.length_of_y_sequences * self.number_targets - self.d_targets = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=memory) + if self._need_reallocation(self.d_targets, memory): + self.d_targets = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=memory) mem_size += memory - + + if self._need_reallocation(self.d_index_increment, SmithWaterman.int_size): + self.d_index_increment = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, size=SmithWaterman.int_size) + return mem_size def _init_zero_copy_memory(self): @@ -226,20 +249,15 @@ def _init_zero_copy_memory(self): # Starting points host memory allocation and device copy memory = (self.size_of_startingpoint * self.maximum_number_starting_points * self.number_of_sequences * self.number_targets) - self.d_starting_points_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.ALLOC_HOST_PTR, size=memory) + if self._need_reallocation(self.d_starting_points_zero_copy, memory): + self.d_starting_points_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.ALLOC_HOST_PTR, size=memory) mem_size = memory - - # Global directions host memory allocation and device copy - memory = (self.length_of_x_sequences * self.number_of_sequences * self.length_of_y_sequences * self.number_targets) - self.d_global_direction_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.ALLOC_HOST_PTR, size=memory) - mem_size += memory - # Maximum zero copy memory allocation and device copy memory = (self.number_of_sequences * self.number_of_targets * SmithWaterman.float_size) #self.d_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.ALLOC_HOST_PTR, size=memory) mem_size += memory - + return mem_size def _init_memory(self): @@ -251,7 +269,6 @@ def _init_memory(self): def _init_zero_copy(self): ''' Initializes the index used for the 'zero copy' of the found starting points ''' - self.d_index_increment = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, size=SmithWaterman.int_size) index = numpy.zeros((1), dtype=numpy.int32) cl.enqueue_write_buffer(self.queue, self.d_index_increment, index) @@ -261,6 +278,10 @@ def _compile_code(self): code = self.oclcode.get_code(self.score, self.number_of_sequences, self.number_targets, self.length_of_x_sequences, self.length_of_y_sequences) #self.logger.debug('Code: \n{}'.format(code)) self.program = cl.Program(self.ctx, code).build() + self.calculateScoreAffineGap_kernel = self.program.calculateScoreAffineGap + self.calculateScore_kernel = self.program.calculateScore + self.tracebackAffineGap_kernel = self.program.tracebackAffineGap + self.traceback_kernel = self.program.traceback def copy_sequences(self, h_sequences, h_targets): ''' @@ -268,8 +289,8 @@ def copy_sequences(self, h_sequences, h_targets): @param h_sequences: the sequences to be copied. Should be a single string containing all sequences @param h_targets: the targets to be copied. Should be a single string containing all sequences ''' - cl.enqueue_copy(self.queue, self.d_sequences, h_sequences) - cl.enqueue_copy(self.queue, self.d_targets, h_targets) + cl.enqueue_copy(self.queue, self.d_sequences, h_sequences, is_blocking=False) + cl.enqueue_copy(self.queue, self.d_targets, h_targets, is_blocking=False) def _get_number_of_starting_points(self): ''' Returns the number of startingpoints. ''' @@ -286,9 +307,10 @@ def _fill_max_possible_score(self, target_index, targets, i, index, records_seqs else len(targets[tI+target_index])) * float(self.filter_factor)) def _copy_min_score(self): - self.d_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.ALLOC_HOST_PTR| cl.mem_flags.COPY_HOST_PTR, hostbuf=self.min_score_np\ -) - #cl.enqueue_copy(self.queue, self.d_max_possible_score_zero_copy, self.min_score_np) + if self._need_reallocation(self.d_max_possible_score_zero_copy, self.min_score_np.nbytes): + self.d_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.ALLOC_HOST_PTR, size=self.min_score_np.nbytes) + cl.enqueue_copy(self.queue, self.d_max_possible_score_zero_copy, self.min_score_np, is_blocking=False) + def _set_max_possible_score(self, target_index, targets, i, index, records_seqs): '''fills the max_possible_score datastructure on the host''' @@ -300,25 +322,17 @@ def _set_max_possible_score(self, target_index, targets, i, index, records_seqs) #Unmap memory object # del self.h_max_possible_score_zero_copy - def _get_starting_point_byte_array(self): + def _get_starting_point_byte_array(self, number_of_starting_points): ''' Get the resulting starting points @return gives the resulting starting point array as byte array ''' self.h_starting_points_zero_copy = cl.enqueue_map_buffer(self.queue, self.d_starting_points_zero_copy, cl.map_flags.READ, 0, (self.size_of_startingpoint * - self.maximum_number_starting_points * - self.number_of_sequences * - self.number_targets, 1), dtype=numpy.byte)[0] + number_of_starting_points, 1), dtype=numpy.byte)[0] return self.h_starting_points_zero_copy - - def _print_alignments(self, sequences, targets, start_seq, start_target, hit_list=None): - return SmithWaterman._print_alignments(self, sequences, targets, start_seq, start_target, hit_list) - #unmap memory objects - #del self.h_global_direction_zero_copy - #del self.h_starting_points_zero_copy - - + + class SmithWatermanCPU(SmithWatermanOcl): ''' classdocs @@ -348,16 +362,19 @@ def _init_normal_memory(self): # Input matrix device memory memory = (SmithWaterman.float_size * (self.length_of_x_sequences + 1) * self.number_of_sequences * (self.length_of_y_sequences + 1) * self.number_targets) - self.d_matrix = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_matrix, memory): + self.d_matrix = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory pattern = numpy.zeros((1),dtype=numpy.float32) cl.enqueue_fill_buffer(self.queue, self.d_matrix, pattern, 0, size = memory) if self.gap_extension: - self.d_matrix_i = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_matrix_i, memory): + self.d_matrix_i = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory - self.d_matrix_j = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_matrix_j, memory): + self.d_matrix_j = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory pattern = numpy.array([-1E10],dtype=numpy.float32) cl.enqueue_fill_buffer(self.queue, self.d_matrix_i, pattern, 0, size = memory) @@ -367,7 +384,8 @@ def _init_normal_memory(self): # Maximum global device memory memory = (SmithWaterman.float_size * self.x_div_shared_x * self.number_of_sequences * self.y_div_shared_y * self.number_targets * self.workload_x * self.workload_y) - self.d_global_maxima = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_global_maxima, memory): + self.d_global_maxima = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory @@ -377,27 +395,56 @@ def _init_normal_memory(self): self.length_of_y_sequences * self.number_targets) - self.d_semaphores = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_semaphores, memory): + self.d_semaphores = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) pattern = numpy.zeros((1),dtype=numpy.int32) cl.enqueue_fill_buffer(self.queue, self.d_semaphores, pattern, 0, size=memory) mem_size += memory return mem_size - + + def _init_zero_copy_memory(self): + mem_size = SmithWatermanOcl._init_zero_copy_memory(self) + + # Global directions host memory allocation and device copy + memory = (self.length_of_x_sequences * self.number_of_sequences * self.length_of_y_sequences * self.number_targets) + if self._need_reallocation(self.d_global_direction_zero_copy, memory): + self.d_global_direction_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.ALLOC_HOST_PTR, size=memory) + mem_size += memory + + return mem_size + + def _clear_normal_memory(self): + SmithWatermanOcl._clear_normal_memory(self) + if (self.d_semaphores is not None): + try: + self.d_semaphores.finish() + except: + pass + self.d_semaphores.release() + + def _clear_zero_copy_memory(self): + SmithWatermanOcl._clear_zero_copy_memory(self) + if (self.d_global_direction_zero_copy is not None): + try: + self.d_global_direction_zero_copy.finish() + except: + pass + self.d_global_direction_zero_copy.release() + def _get_direction_byte_array(self): ''' Get the resulting directions @return gives the resulting direction array as byte array ''' - self.h_global_direction_zero_copy = cl.enqueue_map_buffer(self.queue, self.d_global_direction_zero_copy, cl.map_flags.READ, 0, - (self.number_of_sequences, - self.number_targets, - self.length_of_x_sequences, - self.length_of_y_sequences), dtype=numpy.byte)[0] - return self.h_global_direction_zero_copy + h_global_direction_zero_copy = cl.enqueue_map_buffer(self.queue, self.d_global_direction_zero_copy, cl.map_flags.READ, 0, + (self.number_of_sequences, + self.number_targets, + self.length_of_x_sequences, + self.length_of_y_sequences), dtype=numpy.byte)[0] + return h_global_direction_zero_copy - def _get_direction(self, direction_array, sequence, target, block_x, block_y, value_x, value_y): return direction_array[sequence][target][block_x*self.shared_x + value_x][block_y*self.shared_y + value_y] @@ -411,75 +458,62 @@ def _execute_calculate_score_kernel(self, number_of_blocks, idx, idy): dim_grid_sw = (self.number_of_sequences * self.workgroup_x, self.number_targets * number_of_blocks * self.workgroup_y) if self.gap_extension: - self.program.calculateScoreAffineGap(self.queue, - dim_grid_sw, - dim_block, - self.d_matrix, - self.d_matrix_i, - self.d_matrix_j, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_sequences, - self.d_targets, - self.d_global_maxima, - self.d_global_direction_zero_copy) -# direction_array = self._get_direction_byte_array() -# from pprint import pprint -# pprint(direction_array[0][0], width=1000) - + self.calculateScoreAffineGap_kernel(self.queue, + dim_grid_sw, + dim_block, + self.d_matrix, + self.d_matrix_i, + self.d_matrix_j, + numpy.int32(idx), + numpy.int32(idy), + numpy.int32(number_of_blocks), + self.d_sequences, + self.d_targets, + self.d_global_maxima, + self.d_global_direction_zero_copy) else: - self.program.calculateScore(self.queue, - dim_grid_sw, - dim_block, - self.d_matrix, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_sequences, - self.d_targets, - self.d_global_maxima, - self.d_global_direction_zero_copy) + self.calculateScore_kernel(self.queue, + dim_grid_sw, + dim_block, + self.d_matrix, + numpy.int32(idx), + numpy.int32(idy), + numpy.int32(number_of_blocks), + self.d_sequences, + self.d_targets, + self.d_global_maxima, + self.d_global_direction_zero_copy) def _execute_traceback_kernel(self, number_of_blocks, idx, idy): ''' Executes a single run of the traceback kernel''' dim_block = (self.workgroup_x, self.workgroup_y) dim_grid_sw = (self.number_of_sequences * self.workgroup_x, self.number_targets * number_of_blocks * self.workgroup_y) if self.gap_extension: - self.program.tracebackAffineGap(self.queue, dim_grid_sw, dim_block, - self.d_matrix, - self.d_matrix_i, - self.d_matrix_j, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_global_maxima, - self.d_global_direction_zero_copy, - self.d_index_increment, - self.d_starting_points_zero_copy, - self.d_max_possible_score_zero_copy, - self.d_semaphores) + self.tracebackAffineGap_kernel(self.queue, dim_grid_sw, dim_block, + self.d_matrix, + self.d_matrix_i, + self.d_matrix_j, + numpy.int32(idx), + numpy.int32(idy), + numpy.int32(number_of_blocks), + self.d_global_maxima, + self.d_global_direction_zero_copy, + self.d_index_increment, + self.d_starting_points_zero_copy, + self.d_max_possible_score_zero_copy, + self.d_semaphores) else: - self.program.traceback(self.queue, dim_grid_sw, dim_block, - self.d_matrix, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_global_maxima, - self.d_global_direction_zero_copy, - self.d_index_increment, - self.d_starting_points_zero_copy, - self.d_max_possible_score_zero_copy, - self.d_semaphores) - - def _clear_memory(self): - SmithWatermanOcl._clear_memory(self) - if (self.d_semaphores is not None): - try: - self.d_semaphores.finish() - except: - pass - self.d_semaphores.release() + self.traceback_kernel(self.queue, dim_grid_sw, dim_block, + self.d_matrix, + numpy.int32(idx), + numpy.int32(idy), + numpy.int32(number_of_blocks), + self.d_global_maxima, + self.d_global_direction_zero_copy, + self.d_index_increment, + self.d_starting_points_zero_copy, + self.d_max_possible_score_zero_copy, + self.d_semaphores) class SmithWatermanGPU(SmithWatermanOcl): @@ -494,6 +528,10 @@ def __init__(self, logger, score, settings): ''' SmithWatermanOcl.__init__(self, logger, score, settings) self.oclcode = GPUcode(self.logger) + + self.d_global_direction = None + self.d_is_traceback_required = None + self._init_oclcode() def _init_normal_memory(self): @@ -503,197 +541,161 @@ def _init_normal_memory(self): # Input matrix device memory memory = (SmithWaterman.float_size * self.length_of_x_sequences * self.number_of_sequences * self.length_of_y_sequences * self.number_targets) - self.d_matrix = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_matrix, memory): + self.d_matrix = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory if self.gap_extension: - self.d_matrix_i = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_matrix_i, memory): + self.d_matrix_i = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory - self.d_matrix_j = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_matrix_j, memory): + self.d_matrix_j = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory - - + # Maximum global device memory memory = (SmithWaterman.float_size * self.x_div_shared_x * self.number_of_sequences * self.y_div_shared_y * self.number_targets) - self.d_global_maxima = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + if self._need_reallocation(self.d_global_maxima, memory): + self.d_global_maxima = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) mem_size += memory + memory = (self.length_of_x_sequences * self.number_of_sequences * self.length_of_y_sequences * self.number_targets) + if self._need_reallocation(self.d_global_direction, memory): + self.d_global_direction = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory) + mem_size += memory + + memory = SmithWaterman.int_size + if self._need_reallocation(self.d_is_traceback_required, memory): + self.d_is_traceback_required = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, size=memory) + flag = numpy.zeros((1), dtype=numpy.uint32) + cl.enqueue_fill_buffer(self.queue, self.d_is_traceback_required, flag, 0, size=memory) + return mem_size - + + def _clear_normal_memory(self): + SmithWatermanOcl._clear_normal_memory(self) + if (self.d_global_direction is not None): + try: + self.d_global_direction.finish() + except: + pass + self.d_global_direction.release() + if (self.d_is_traceback_required is not None): + try: + self.d_is_traceback_required.finish() + except: + pass + self.d_is_traceback_required.release() + + def _compile_code(self): + """Compile the device code with current settings""" + if self.program is None: + self.logger.debug('Compiling OpenCL code.') + code = self.oclcode.get_code(self.score, self.number_of_sequences, self.number_targets, self.length_of_x_sequences, self.length_of_y_sequences) + self.program = cl.Program(self.ctx, code).build(options=['-cl-fast-relaxed-math']) + self.calculateScoreAffineGap_kernel = self.program.calculateScoreAffineGap + self.calculateScore_kernel = self.program.calculateScore + self.tracebackAffineGap_kernel = self.program.tracebackAffineGap + self.traceback_kernel = self.program.traceback + def _get_direction_byte_array(self): ''' Get the resulting directions @return gives the resulting direction array as byte array ''' - self.h_global_direction_zero_copy = cl.enqueue_map_buffer(self.queue, self.d_global_direction_zero_copy, cl.map_flags.READ, 0, - (self.number_of_sequences, - self.number_targets, - self.x_div_shared_x, - self.y_div_shared_y, - self.shared_x, - self.shared_y), dtype=numpy.byte)[0] - return self.h_global_direction_zero_copy - + h_global_direction = cl.enqueue_map_buffer(self.queue, self.d_global_direction, cl.map_flags.READ, 0, + (self.number_of_sequences, + self.number_targets, + self.x_div_shared_x, + self.y_div_shared_y, + self.shared_x, + self.shared_y), dtype=numpy.byte)[0] + return h_global_direction def _execute_calculate_score_kernel(self, number_of_blocks, idx, idy): ''' Executes a single run of the calculate score kernel''' - dim_block = (self.shared_x, self.shared_y) - dim_grid_sw = (self.number_of_sequences * self.shared_x, self.number_targets * number_of_blocks * self.shared_y) + dim_block = (self.shared_x, self.shared_y, 1) + dim_grid_sw = (number_of_blocks * self.shared_x, self.number_of_sequences * self.shared_y, self.number_targets) if self.gap_extension: - self.program.calculateScoreAffineGap(self.queue, - dim_grid_sw, - dim_block, - self.d_matrix, - self.d_matrix_i, - self.d_matrix_j, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_sequences, - self.d_targets, - self.d_global_maxima, - self.d_global_direction_zero_copy) + self.calculateScoreAffineGap_kernel(self.queue, dim_grid_sw, dim_block, + numpy.uint32(self.number_of_sequences), + numpy.uint32(self.number_targets), + numpy.uint32(self.x_div_shared_x), + numpy.uint32(self.y_div_shared_y), + self.d_matrix, + self.d_matrix_i, + self.d_matrix_j, + numpy.uint32(idx), + numpy.uint32(idy), + self.d_sequences, + self.d_targets, + self.d_global_maxima, + self.d_global_direction, + self.d_max_possible_score_zero_copy, + self.d_is_traceback_required) else: - self.program.calculateScore(self.queue, - dim_grid_sw, - dim_block, - self.d_matrix, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_sequences, - self.d_targets, - self.d_global_maxima, - self.d_global_direction_zero_copy) - - + self.calculateScore_kernel(self.queue, dim_grid_sw, dim_block, + numpy.uint32(self.number_of_sequences), + numpy.uint32(self.number_targets), + numpy.uint32(self.x_div_shared_x), + numpy.uint32(self.y_div_shared_y), + self.d_matrix, + numpy.uint32(idx), + numpy.uint32(idy), + self.d_sequences, + self.d_targets, + self.d_global_maxima, + self.d_global_direction, + self.d_max_possible_score_zero_copy, + self.d_is_traceback_required) + + def _is_traceback_required(self): + '''Returns False if it is known after calculating scores that there are no possible + starting points, hence no need to run traceback. + ''' + flag = numpy.zeros((1), dtype=numpy.uint32) + cl.enqueue_copy(self.queue, flag, self.d_is_traceback_required) + if flag[0]: + # Clear the flag + flag[0] = 0 + cl.enqueue_fill_buffer(self.queue, self.d_is_traceback_required, flag, 0, size=SmithWaterman.int_size) + return True + else: + return False def _execute_traceback_kernel(self, number_of_blocks, idx, idy): ''' Executes a single run of the traceback kernel''' - dim_block = (self.shared_x, self.shared_y) - dim_grid_sw = (self.number_of_sequences * self.shared_x, self.number_targets * number_of_blocks * self.shared_y) + dim_block = (self.shared_x, self.shared_y, 1) + dim_grid_sw = (number_of_blocks * self.shared_x, self.number_of_sequences * self.shared_y, self.number_targets) + if self.gap_extension: - self.program.tracebackAffineGap(self.queue, dim_grid_sw, dim_block, - self.d_matrix, - self.d_matrix_i, - self.d_matrix_j, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_global_maxima, - self.d_global_direction_zero_copy, - self.d_index_increment, - self.d_starting_points_zero_copy, - self.d_max_possible_score_zero_copy) + self.tracebackAffineGap_kernel(self.queue, dim_grid_sw, dim_block, + numpy.uint32(self.number_of_sequences), + numpy.uint32(self.number_targets), + numpy.uint32(self.x_div_shared_x), + numpy.uint32(self.y_div_shared_y), + self.d_matrix, + self.d_matrix_i, + self.d_matrix_j, + numpy.uint32(idx), + numpy.uint32(idy), + self.d_global_maxima, + self.d_global_direction, + self.d_index_increment, + self.d_starting_points_zero_copy, + self.d_max_possible_score_zero_copy) else: - self.program.traceback(self.queue, dim_grid_sw, dim_block, - self.d_matrix, - numpy.int32(idx), - numpy.int32(idy), - numpy.int32(number_of_blocks), - self.d_global_maxima, - self.d_global_direction_zero_copy, - self.d_index_increment, - self.d_starting_points_zero_copy, - self.d_max_possible_score_zero_copy) - - -class SmithWatermanNVIDIA(SmithWatermanGPU): - ''' - classdocs - ''' - - - def __init__(self, logger, score, settings): - ''' - Constructor - ''' - SmithWatermanGPU.__init__(self, logger, score, settings) - self.pinned_starting_points_zero_copy = None - self.pinned_max_possible_score_zero_copy = None - self.pinned_global_direction_zero_copy = None - self._init_oclcode() - - def _init_zero_copy_memory(self): - self.logger.debug('Initializing NVIDIA zero-copy memory.') - # Starting points host memory allocation and device copy - memory = (self.size_of_startingpoint * self.maximum_number_starting_points * self.number_of_sequences * - self.number_targets) - self.pinned_starting_points_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.ALLOC_HOST_PTR, size=memory) - self.d_starting_points_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, size=memory) - self.h_starting_points_zero_copy = cl.enqueue_map_buffer(self.queue, self.pinned_starting_points_zero_copy, cl.map_flags.READ, 0, - (memory, 1), dtype=numpy.byte)[0] - mem_size = memory - - # Global directions host memory allocation and device copy - memory = (self.length_of_x_sequences * self.number_of_sequences * self.length_of_y_sequences * - self.number_targets) - self.pinned_global_direction_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.ALLOC_HOST_PTR, size=memory) - self.d_global_direction_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, size=memory) - self.h_global_direction_zero_copy = cl.enqueue_map_buffer(self.queue, self.pinned_global_direction_zero_copy, cl.map_flags.READ, 0, - (memory, 1), dtype=numpy.byte)[0] - mem_size += memory - - - # Maximum zero copy memory allocation and device copy - memory = (self.number_of_sequences * self.number_of_targets * SmithWaterman.float_size) -# self.pinned_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.ALLOC_HOST_PTR, size=memory) -# self.d_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=memory) -# self.h_max_possible_score_zero_copy = cl.enqueue_map_buffer(self.queue, self.pinned_max_possible_score_zero_copy, cl.map_flags.WRITE, 0, -# (self.number_of_sequences * self.number_of_targets, 1), dtype=numpy.float32)[0] - mem_size += memory - - # Zero copy buffers are allocated twice in NVIDIA - return 2*mem_size - -# def _copy_min_score(self): -# self.d_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.ALLOC_HOST_PTR| cl.mem_flags.COPY_HOST_PTR, hostbuf=self.min_score_np\ - - def _set_max_possible_score(self, target_index, targets, i, index, records_seqs): - #cl.enqueue_copy(self.queue, self.d_max_possible_score_zero_copy, self.h_max_possible_score_zero_copy) - self._fill_max_possible_score(target_index, targets, i, index, records_seqs) - - def _get_direction_byte_array(self): - self.h_global_direction_zero_copy = cl.enqueue_map_buffer(self.queue, self.d_global_direction_zero_copy, cl.map_flags.READ, 0, - (self.number_of_sequences, - self.number_targets, - self.x_div_shared_x, - self.y_div_shared_y, - self.shared_x, - self.shared_y), dtype=numpy.byte)[0] - - return self.h_global_direction_zero_copy - - - def _clear_zero_copy_memory(self): - SmithWatermanGPU._clear_zero_copy_memory(self) - - if (self.pinned_starting_points_zero_copy is not None): - try: - self.pinned_starting_points_zero_copy.finish() - except: - pass - self.pinned_starting_points_zero_copy.release() - if (self.pinned_global_direction_zero_copy is not None): - try: - self.pinned_global_direction_zero_copy.finish() - except: - pass - self.pinned_global_direction_zero_copy.release() - if (self.pinned_max_possible_score_zero_copy is not None): - try: - self.pinned_max_possible_score_zero_copy.finish() - except: - pass - self.pinned_max_possible_score_zero_copy.release() - - def _compile_code(self): - """Compile the OpenCL code with current settings""" - self.logger.debug('Compiling NVIDIA OpenCL code.') - code = self.oclcode.get_code(self.score, self.number_of_sequences, self.number_targets, self.length_of_x_sequences, self.length_of_y_sequences) - self.program = cl.Program(self.ctx, code).build(options=['-D', 'NVIDIA']) - - - - + self.traceback_kernel(self.queue, dim_grid_sw, dim_block, + numpy.uint32(self.number_of_sequences), + numpy.uint32(self.number_targets), + numpy.uint32(self.x_div_shared_x), + numpy.uint32(self.y_div_shared_y), + self.d_matrix, + numpy.uint32(idx), + numpy.uint32(idy), + self.d_global_maxima, + self.d_global_direction, + self.d_index_increment, + self.d_starting_points_zero_copy, + self.d_max_possible_score_zero_copy) diff --git a/pyPaSWAS/Core/ocl/default_main_gpu.cl b/pyPaSWAS/Core/ocl/default_main_gpu.cl index 8438c22..446faf2 100644 --- a/pyPaSWAS/Core/ocl/default_main_gpu.cl +++ b/pyPaSWAS/Core/ocl/default_main_gpu.cl @@ -6,21 +6,13 @@ /** kernel contains a for-loop in which the score is calculated. */ #define DIAGONAL SHARED_X + SHARED_Y -/** amount of blocks across the X axis */ -#define XdivSHARED_X (X/SHARED_X) -/** amount of blocks across the Y axis */ -#define YdivSHARED_Y (Y/SHARED_Y) - /** character used to fill the sequence if length < X */ #define FILL_CHARACTER '\0' -#define FILL_SCORE -1E10 +#define FILL_SCORE -1E10f /** Set init for affine gap matrices */ #define AFFINE_GAP_INIT -1E10f -/** this value is used to allocate enough memory to store the starting points */ -#define MAXIMUM_NUMBER_STARTING_POINTS (NUMBER_SEQUENCES*NUMBER_TARGETS*1000) - /**** Other definitions ****/ /** bit mask to get the negative value of a float, or to keep it negative */ @@ -28,110 +20,69 @@ /* Scorings matrix for each thread block */ typedef struct { - float value[SHARED_X][SHARED_Y]; -} LocalMatrix; - -/* Scorings matrix for each sequence alignment */ -typedef struct { - LocalMatrix matrix[XdivSHARED_X][YdivSHARED_Y]; -} ScoringsMatrix; - -/* Scorings matrix for entire application */ -typedef struct { - ScoringsMatrix metaMatrix[NUMBER_SEQUENCES][NUMBER_TARGETS]; -} GlobalMatrix; - -typedef struct { - float value[XdivSHARED_X][YdivSHARED_Y]; -} BlockMaxima; - -typedef struct { - BlockMaxima blockMaxima[NUMBER_SEQUENCES][NUMBER_TARGETS]; -} GlobalMaxima; - -typedef struct { - unsigned char value[SHARED_X][SHARED_Y]; -} LocalDirection; + float value[SHARED_X][SHARED_Y]; +} Matrix; +/* Direction matrix for each thread block */ typedef struct { - LocalDirection localDirection[XdivSHARED_X][YdivSHARED_Y]; + unsigned char value[SHARED_X][SHARED_Y]; } Direction; typedef struct { - Direction direction[NUMBER_SEQUENCES][NUMBER_TARGETS]; -} GlobalDirection; - -typedef struct { - unsigned int sequence; - unsigned int target; - unsigned int blockX; - unsigned int blockY; - unsigned int valueX; - unsigned int valueY; - float score; - float maxScore; - float posScore; + unsigned int sequence; + unsigned int target; + unsigned int blockX; + unsigned int blockY; + unsigned int valueX; + unsigned int valueY; + float score; + float maxScore; + float posScore; } StartingPoint; -typedef struct { - StartingPoint startingPoint[MAXIMUM_NUMBER_STARTING_POINTS]; -} StartingPoints; - -__kernel void calculateScore( - __global GlobalMatrix *matrix, - unsigned int x, - unsigned int y, - unsigned int numberOfBlocks, - __global char *sequences, - __global char *targets, - __global GlobalMaxima *globalMaxima, - __global GlobalDirection *globalDirection) { - - /** +__kernel +__attribute__((reqd_work_group_size(SHARED_X, SHARED_Y, 1))) +void calculateScore( + const unsigned int numberOfSequences, + const unsigned int numberOfTargets, + const unsigned int xDivSHARED_X, + const unsigned int yDivSHARED_Y, + __global Matrix *matrix, + const unsigned int x, + const unsigned int y, + const __global char *sequences, + const __global char *targets, + __global float *globalMaxima, + __global Direction *globalDirection, + const __global float *maxPossibleScore, + __global unsigned int *isTracebackRequired) { + + /** * shared memory block for calculations. It requires * extra (+1 in both directions) space to hold * Neighboring cells */ - __local float s_matrix[SHARED_X+1][SHARED_Y+1]; - /** - * shared memory block for storing the maximum value of each neighboring cell. - * Careful: the s_maxima[SHARED_X][SHARED_Y] does not contain the maximum value - * after the calculation loop! This value is determined at the end of this - * function. - */ - __local float s_maxima[SHARED_X][SHARED_Y]; + __local float s_matrix[SHARED_X+1][SHARED_Y+1]; // calculate indices: - //unsigned int yDIVnumSeq = (blockIdx.y/NUMBER_SEQUENCES); - // 1 is in y-direction and 0 is in x-direction - unsigned int blockx = x - get_group_id(1)/NUMBER_TARGETS;//yDIVnumSeq; - unsigned int blocky = y + get_group_id(1)/NUMBER_TARGETS;//yDIVnumSeq; + unsigned int blockx = x - get_group_id(0); + unsigned int blocky = y + get_group_id(0); unsigned int tIDx = get_local_id(0); unsigned int tIDy = get_local_id(1); - unsigned int bIDx = get_group_id(0); - unsigned int bIDy = get_group_id(1)%NUMBER_TARGETS;///numberOfBlocks; + unsigned int bIDx = get_group_id(1); + unsigned int bIDy = get_group_id(2); unsigned char direction = NO_DIRECTION; - // indices of the current characters in both sequences. - int seqIndex1 = tIDx + bIDx * X + blockx * SHARED_X; - int seqIndex2 = tIDy + bIDy * Y + blocky * SHARED_Y; + // Move pointers to current target and sequence + const unsigned int offset = (bIDx * numberOfTargets + bIDy) * (xDivSHARED_X * yDivSHARED_Y); + matrix += offset; + globalMaxima += offset; + globalDirection += offset; + // indices of the current characters in both sequences. + int seqIndex1 = tIDx + (bIDx * xDivSHARED_X + blockx) * SHARED_X; + int seqIndex2 = tIDy + (bIDy * yDivSHARED_Y + blocky) * SHARED_Y; - /* the next block is to get the maximum value from surrounding blocks. This maximum values is compared to the - * first element in the shared score matrix s_matrix. - */ - float maxPrev = 0.0f; - if (!tIDx && !tIDy) { - if (blockx && blocky) { - maxPrev = fmax(fmax(globalMaxima->blockMaxima[bIDx][bIDy].value[blockx-1][blocky-1], globalMaxima->blockMaxima[bIDx][bIDy].value[blockx-1][blocky]), globalMaxima->blockMaxima[bIDx][bIDy].value[blockx][blocky-1]); - } - else if (blockx) { - maxPrev = globalMaxima->blockMaxima[bIDx][bIDy].value[blockx-1][blocky]; - } - else if (blocky) { - maxPrev = globalMaxima->blockMaxima[bIDx][bIDy].value[blockx][blocky-1]; - } - } // local scorings variables: float currentScore, ulS, lS, uS; float innerScore = 0.0f; @@ -152,173 +103,141 @@ __kernel void calculateScore( if (!tIDx) s_seq2[tIDy] = targets[seqIndex2]; - // set both matrices to zero - s_matrix[tIDx][tIDy] = 0.0f; - s_maxima[tIDx][tIDy] = 0.0f; - - if (tIDx == SHARED_X-1 && ! tIDy) - s_matrix[SHARED_X][0] = 0.0f; - if (tIDy == SHARED_Y-1 && ! tIDx) - s_matrix[0][SHARED_Y] = 0.0f; - - /**** sync barrier ****/ - s_matrix[tIDx][tIDy] = 0.0f; barrier(CLK_LOCAL_MEM_FENCE); // initialize outer parts of the matrix: - if (!tIDx || !tIDy) { - if (tIDx == SHARED_X-1) - s_matrix[tIDx+1][tIDy] = 0.0f; - if (tIDy == SHARED_Y-1) - s_matrix[tIDx][tIDy+1] = 0.0f; - if (blockx && !tIDx) { - s_matrix[0][tIDy+1] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy]; - } - if (blocky && !tIDy) { - s_matrix[tIDx+1][0] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1]; - } - if (blockx && blocky && !tIDx && !tIDy){ - s_matrix[0][0] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1]; - } + if (!tIDx) { + s_matrix[0][tIDy+1] = blockx ? matrix[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy] : 0.0f; + } + if (!tIDy) { + s_matrix[tIDx+1][0] = blocky ? matrix[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx][SHARED_Y-1] : 0.0f; } + if (!tIDx && !tIDy){ + s_matrix[0][0] = blockx && blocky ? matrix[(blockx-1) * yDivSHARED_Y + (blocky-1)].value[SHARED_X-1][SHARED_Y-1] : 0.0f; + } + // set inner score (aka sequence match/mismatch score): char charS1 = s_seq1[tIDx]; char charS2 = s_seq2[tIDy]; - - innerScore = charS1 == FILL_CHARACTER || charS2 == FILL_CHARACTER ? FILL_SCORE : scoringsMatrix[charS1-characterOffset][charS2-characterOffset]; + + innerScore = charS1 == FILL_CHARACTER || charS2 == FILL_CHARACTER ? FILL_SCORE : scoringsMatrix[charS1-characterOffset][charS2-characterOffset]; // transpose the index ++tIDx; ++tIDy; - // set shared matrix to zero (starting point!) - s_matrix[tIDx][tIDy] = 0.0f; - - // wait until all elements have been copied to the shared memory block - /**** sync barrier ****/ - barrier(CLK_LOCAL_MEM_FENCE); - - currentScore = 0.0f; - - for (int i=0; i < DIAGONAL; ++i) { - if (i == tXM1+ tYM1) { + for (int i = 0; i < DIAGONAL - 1; ++i) { + barrier(CLK_LOCAL_MEM_FENCE); + if (i == tXM1 + tYM1) { // calculate only when there are two valid characters // this is necessary when the two sequences are not of equal length // this is the SW-scoring of the cell: - ulS = s_matrix[tXM1][tYM1] + innerScore; - lS = s_matrix[tXM1][tIDy] + gapScore; - uS = s_matrix[tIDx][tYM1] + gapScore; + ulS = s_matrix[tXM1][tYM1] + innerScore; + lS = s_matrix[tXM1][tIDy] + gapScore; + uS = s_matrix[tIDx][tYM1] + gapScore; - if (currentScore < lS) { // score comes from left - currentScore = lS; - direction = LEFT_DIRECTION; - } - if (currentScore < uS) { // score comes from above - currentScore = uS; - direction = UPPER_DIRECTION; - } - if (currentScore < ulS) { // score comes from upper left - currentScore = ulS; - direction = UPPER_LEFT_DIRECTION; - } - s_matrix[tIDx][tIDy] = innerScore == FILL_SCORE ? 0.0 : currentScore; // copy score to matrix + currentScore = fmax(fmax(0.0f, ulS), fmax(lS, uS)); + s_matrix[tIDx][tIDy] = currentScore; // copy score to matrix } + } - else if (i-1 == tXM1 + tYM1 ){ - // use this to find fmax - if (i==1) { - s_maxima[0][0] = fmax(maxPrev, currentScore); - } - else if (!tXM1 && tYM1) { - s_maxima[0][tYM1] = fmax(s_maxima[0][tYM1-1], currentScore); - } - else if (!tYM1 && tXM1) { - s_maxima[tXM1][0] = fmax(s_maxima[tXM1-1][0], currentScore); - } - else if (tXM1 && tYM1 ){ - s_maxima[tXM1][tYM1] = fmax(s_maxima[tXM1-1][tYM1], fmax(s_maxima[tXM1][tYM1-1], currentScore)); - } - } - // wait until all threads have calculated their new score - /**** sync barrier ****/ + if (currentScore == ulS) // score comes from upper left + direction = UPPER_LEFT_DIRECTION; + if (currentScore == uS) // score comes from above + direction = UPPER_DIRECTION; + if (currentScore == lS) // score comes from left + direction = LEFT_DIRECTION; + currentScore = innerScore == FILL_SCORE ? 0.0f : currentScore; + + // copy end score to the scorings matrix: + matrix[blockx * yDivSHARED_Y + blocky].value[tXM1][tYM1] = currentScore; + globalDirection[blockx * yDivSHARED_Y + blocky].value[tXM1][tYM1] = direction; + + // Find maximum score + __local float s_maxima[SHARED_X * SHARED_Y]; + + const unsigned int lid = get_local_id(1) * SHARED_X + get_local_id(0); + float m = currentScore; + s_maxima[lid] = m; + + for (int stride = SHARED_X * SHARED_Y / 2; stride > 0; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); + if (lid < stride) { + m = fmax(m, s_maxima[lid + stride]); + s_maxima[lid] = m; + } } - // copy end score to the scorings matrix: - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tXM1][tYM1] = s_matrix[tIDx][tIDy]; - (*globalDirection).direction[bIDx][bIDy].localDirection[blockx][blocky].value[tXM1][tYM1] = direction; - if (tIDx==SHARED_X && tIDy==SHARED_Y) - globalMaxima->blockMaxima[bIDx][bIDy].value[blockx][blocky] = fmax(currentScore, fmax(s_maxima[SHARED_X-2][SHARED_Y-1], s_maxima[SHARED_X-1][SHARED_Y-2])); + if (lid == 0) { + /* the next block is to get the maximum value from surrounding blocks. */ + if (blockx) { + m = fmax(m, globalMaxima[(blockx-1) * yDivSHARED_Y + blocky]); + } + if (blocky) { + m = fmax(m, globalMaxima[blockx * yDivSHARED_Y + (blocky-1)]); + } + globalMaxima[blockx * yDivSHARED_Y + blocky] = m; - // wait until all threads have copied their score: - /**** sync barrier ****/ - barrier(CLK_LOCAL_MEM_FENCE); + if (blockx == xDivSHARED_X - 1 && blocky == yDivSHARED_Y - 1) { + if (m >= MINIMUM_SCORE && m >= maxPossibleScore[bIDy * numberOfSequences + bIDx]) { + *isTracebackRequired = 1; + } + } + } } -__kernel void calculateScoreAffineGap( - __global GlobalMatrix *matrix, - __global GlobalMatrix *matrix_i, - __global GlobalMatrix *matrix_j, - unsigned int x, - unsigned int y, - unsigned int numberOfBlocks, - __global char *sequences, - __global char *targets, - __global GlobalMaxima *globalMaxima, - __global GlobalDirection *globalDirection) { - - /** +__kernel +__attribute__((reqd_work_group_size(SHARED_X, SHARED_Y, 1))) +void calculateScoreAffineGap( + const unsigned int numberOfSequences, + const unsigned int numberOfTargets, + const unsigned int xDivSHARED_X, + const unsigned int yDivSHARED_Y, + __global Matrix *matrix, + __global Matrix *matrix_i, + __global Matrix *matrix_j, + const unsigned int x, + const unsigned int y, + const __global char *sequences, + const __global char *targets, + __global float *globalMaxima, + __global Direction *globalDirection, + const __global float *maxPossibleScore, + __global unsigned int *isTracebackRequired) { + + /** * shared memory block for calculations. It requires * extra (+1 in both directions) space to hold * Neighboring cells */ - __local float s_matrix[SHARED_X+1][SHARED_Y+1]; - __local float s_matrix_i[SHARED_X+1][SHARED_Y+1]; - __local float s_matrix_j[SHARED_X+1][SHARED_Y+1]; - /** - * shared memory block for storing the maximum value of each neighboring cell. - * Careful: the s_maxima[SHARED_X][SHARED_Y] does not contain the maximum value - * after the calculation loop! This value is determined at the end of this - * function. - */ - __local float s_maxima[SHARED_X][SHARED_Y]; + __local float s_matrix[SHARED_X+1][SHARED_Y+1]; + __local float s_matrix_i[SHARED_X+1][SHARED_Y+1]; + __local float s_matrix_j[SHARED_X+1][SHARED_Y+1]; // calculate indices: - //unsigned int yDIVnumSeq = (blockIdx.y/NUMBER_SEQUENCES); - // 1 is in y-direction and 0 is in x-direction - unsigned int blockx = x - get_group_id(1)/NUMBER_TARGETS;//yDIVnumSeq; - unsigned int blocky = y + get_group_id(1)/NUMBER_TARGETS;//yDIVnumSeq; + unsigned int blockx = x - get_group_id(0); + unsigned int blocky = y + get_group_id(0); unsigned int tIDx = get_local_id(0); unsigned int tIDy = get_local_id(1); - unsigned int bIDx = get_group_id(0); - unsigned int bIDy = get_group_id(1)%NUMBER_TARGETS;///numberOfBlocks; + unsigned int bIDx = get_group_id(1); + unsigned int bIDy = get_group_id(2); unsigned char direction = NO_DIRECTION; - unsigned char direction_i = NO_DIRECTION; - unsigned char direction_j = NO_DIRECTION; - // indices of the current characters in both sequences. - int seqIndex1 = tIDx + bIDx * X + blockx * SHARED_X; - int seqIndex2 = tIDy + bIDy * Y + blocky * SHARED_Y; + // Move pointers to current target and sequence + const unsigned int offset = (bIDx * numberOfTargets + bIDy) * (xDivSHARED_X * yDivSHARED_Y); + matrix += offset; + matrix_i += offset; + matrix_j += offset; + globalMaxima += offset; + globalDirection += offset; + // indices of the current characters in both sequences. + int seqIndex1 = tIDx + (bIDx * xDivSHARED_X + blockx) * SHARED_X; + int seqIndex2 = tIDy + (bIDy * yDivSHARED_Y + blocky) * SHARED_Y; - /* the next block is to get the maximum value from surrounding blocks. This maximum values is compared to the - * first element in the shared score matrix s_matrix. - */ - float maxPrev = 0.0f; - if (!tIDx && !tIDy) { - if (blockx && blocky) { - maxPrev = fmax(fmax(globalMaxima->blockMaxima[bIDx][bIDy].value[blockx-1][blocky-1], globalMaxima->blockMaxima[bIDx][bIDy].value[blockx-1][blocky]), globalMaxima->blockMaxima[bIDx][bIDy].value[blockx][blocky-1]); - } - else if (blockx) { - maxPrev = globalMaxima->blockMaxima[bIDx][bIDy].value[blockx-1][blocky]; - } - else if (blocky) { - maxPrev = globalMaxima->blockMaxima[bIDx][bIDy].value[blockx][blocky-1]; - } - } // local scorings variables: - float currentScore,currentScore_i, currentScore_j, m_M, m_I, m_J; + float currentScore, currentScore_m, currentScore_i, currentScore_j, m_M, m_I, m_J; float innerScore = 0.0f; /** * tXM1 and tYM1 are to store the current value of the thread Index. tIDx and tIDy are @@ -337,412 +256,316 @@ __kernel void calculateScoreAffineGap( if (!tIDx) s_seq2[tIDy] = targets[seqIndex2]; - // init matrices - s_matrix[tIDx][tIDy] = 0.0f; - s_matrix_i[tIDx][tIDy] = AFFINE_GAP_INIT; - s_matrix_j[tIDx][tIDy] = AFFINE_GAP_INIT; - s_maxima[tIDx][tIDy] = 0.0f; - - if (tIDx == SHARED_X-1 && ! tIDy){ - s_matrix[SHARED_X][0] = 0.0f; - s_matrix_i[SHARED_X][0] = AFFINE_GAP_INIT; - s_matrix_j[SHARED_X][0] = AFFINE_GAP_INIT; - } - if (tIDy == SHARED_Y-1 && ! tIDx) { - s_matrix[0][SHARED_Y] = 0.0f; - s_matrix_i[0][SHARED_Y] = AFFINE_GAP_INIT; - s_matrix_j[0][SHARED_Y] = AFFINE_GAP_INIT; - } - - /**** sync barrier ****/ - s_matrix[tIDx][tIDy] = 0.0f; barrier(CLK_LOCAL_MEM_FENCE); // initialize outer parts of the matrix: - if (!tIDx || !tIDy) { - if (tIDx == SHARED_X-1) { - s_matrix[tIDx+1][tIDy] = 0.0f; - s_matrix_i[tIDx+1][tIDy] = AFFINE_GAP_INIT; - s_matrix_j[tIDx+1][tIDy] = AFFINE_GAP_INIT; - } - if (tIDy == SHARED_Y-1) { - s_matrix[tIDx][tIDy+1] = 0.0f; - s_matrix_i[tIDx][tIDy+1] = AFFINE_GAP_INIT; - s_matrix_j[tIDx][tIDy+1] = AFFINE_GAP_INIT; - } - if (blockx && !tIDx) { - s_matrix[0][tIDy+1] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy]; - s_matrix_i[0][tIDy+1] = (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy]; - s_matrix_j[0][tIDy+1] = (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy]; - } - if (blocky && !tIDy) { - s_matrix[tIDx+1][0] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1]; - s_matrix_i[tIDx+1][0] = (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1]; - s_matrix_j[tIDx+1][0] = (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1]; - } - if (blockx && blocky && !tIDx && !tIDy){ - s_matrix[0][0] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1]; - s_matrix_i[0][0] = (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1]; - s_matrix_j[0][0] = (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1]; - } + if (!tIDx) { + s_matrix[0][tIDy+1] = blockx ? matrix[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy] : 0.0f; + s_matrix_i[0][tIDy+1] = blockx ? matrix_i[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy] : AFFINE_GAP_INIT; + s_matrix_j[0][tIDy+1] = blockx ? matrix_j[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy] : AFFINE_GAP_INIT; } + if (!tIDy) { + s_matrix[tIDx+1][0] = blocky ? matrix[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx][SHARED_Y-1] : 0.0f; + s_matrix_i[tIDx+1][0] = blocky ? matrix_i[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx][SHARED_Y-1] : AFFINE_GAP_INIT; + s_matrix_j[tIDx+1][0] = blocky ? matrix_j[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx][SHARED_Y-1] : AFFINE_GAP_INIT; + } + if (!tIDx && !tIDy){ + s_matrix[0][0] = blockx && blocky ? matrix[(blockx-1) * yDivSHARED_Y + (blocky-1)].value[SHARED_X-1][SHARED_Y-1] : 0.0f; + s_matrix_i[0][0] = blockx && blocky ? matrix_i[(blockx-1) * yDivSHARED_Y + (blocky-1)].value[SHARED_X-1][SHARED_Y-1] : AFFINE_GAP_INIT; + s_matrix_j[0][0] = blockx && blocky ? matrix_j[(blockx-1) * yDivSHARED_Y + (blocky-1)].value[SHARED_X-1][SHARED_Y-1] : AFFINE_GAP_INIT; + } + // set inner score (aka sequence match/mismatch score): char charS1 = s_seq1[tIDx]; char charS2 = s_seq2[tIDy]; - - innerScore = charS1 == FILL_CHARACTER || charS2 == FILL_CHARACTER ? FILL_SCORE : scoringsMatrix[charS1-characterOffset][charS2-characterOffset]; + + innerScore = charS1 == FILL_CHARACTER || charS2 == FILL_CHARACTER ? FILL_SCORE : scoringsMatrix[charS1-characterOffset][charS2-characterOffset]; // transpose the index ++tIDx; ++tIDy; - // set shared matrix to zero (starting point!) - s_matrix[tIDx][tIDy] = 0.0f; - s_matrix_i[tIDx][tIDy] = AFFINE_GAP_INIT; - s_matrix_j[tIDx][tIDy] = AFFINE_GAP_INIT; + for (int i = 0; i < DIAGONAL - 1; ++i) { + barrier(CLK_LOCAL_MEM_FENCE); + if (i == tXM1 + tYM1) { + m_M = s_matrix[tXM1][tYM1] + innerScore; + m_I = s_matrix_i[tXM1][tYM1] + innerScore; + m_J = s_matrix_j[tXM1][tYM1] + innerScore; + currentScore_m = fmax(fmax(0.0f, m_M), fmax(m_I, m_J)); + s_matrix[tIDx][tIDy] = currentScore_m; // copy score to matrix + + // now do I matrix: + m_M = gapScore + gapExtension + s_matrix[tIDx][tYM1]; // score comes from m matrix (match) + m_I = gapExtension + s_matrix_i[tIDx][tYM1]; // score comes from I matrix (gap in x) + currentScore_i = fmax(m_I, m_M); + currentScore_i = currentScore_i < 0 ? AFFINE_GAP_INIT : currentScore_i; + s_matrix_i[tIDx][tIDy] = currentScore_i; // copy score to matrix + + // now do J matrix: + m_M = gapScore + gapExtension + s_matrix[tXM1][tIDy]; // score comes from m matrix (match) + m_J = gapExtension + s_matrix_j[tXM1][tIDy]; // score comes from J matrix (gap in y) + currentScore_j = fmax(m_J, m_M); + currentScore_j = currentScore_j < 0 ? AFFINE_GAP_INIT : currentScore_j; + s_matrix_j[tIDx][tIDy] = currentScore_j; // copy score to matrix + } + } + currentScore = fmax(currentScore_m, fmax(currentScore_i, currentScore_j)); + if (currentScore > 0) { + if (currentScore == currentScore_m) {// direction from main + direction = A_DIRECTION | MAIN_MATRIX; + } + else if(currentScore == currentScore_i) {// direction from I + direction = B_DIRECTION | I_MATRIX; + } + else if(currentScore == currentScore_j){ // direction from J + direction = C_DIRECTION | J_MATRIX; + } + } - // wait until all elements have been copied to the shared memory block - /**** sync barrier ****/ - barrier(CLK_LOCAL_MEM_FENCE); + // copy end score to the scorings matrix: + matrix[blockx * yDivSHARED_Y + blocky].value[tXM1][tYM1] = currentScore_m; + matrix_i[blockx * yDivSHARED_Y + blocky].value[tXM1][tYM1] = currentScore_i; + matrix_j[blockx * yDivSHARED_Y + blocky].value[tXM1][tYM1] = currentScore_j; + globalDirection[blockx * yDivSHARED_Y + blocky].value[tXM1][tYM1] = direction; + // Find maximum score + __local float s_maxima[SHARED_X * SHARED_Y]; - for (int i=0; i < DIAGONAL; ++i) { - if (i == tXM1+ tYM1) { - currentScore = 0.0f; - m_M = s_matrix[tXM1][tYM1]+innerScore; - m_I = s_matrix_i[tXM1][tYM1]+innerScore; - m_J = s_matrix_j[tXM1][tYM1]+innerScore; - - if (currentScore < m_I) { // score comes from I matrix (gap in x) - currentScore = m_I; - direction = A_DIRECTION | MAIN_MATRIX; - } - if (currentScore < m_J) { // score comes from J matrix (gap in y) - currentScore = m_J; - direction = A_DIRECTION | MAIN_MATRIX; - } - if (currentScore < m_M) { // score comes from m matrix (match) - currentScore = m_M; - direction = A_DIRECTION | MAIN_MATRIX; - } - s_matrix[tIDx][tIDy] = innerScore == FILL_SCORE ? 0.0 : currentScore; // copy score to matrix - - // now do I matrix: - currentScore_i = AFFINE_GAP_INIT; - m_M = gapScore + gapExtension + s_matrix[tIDx][tYM1]; - m_I = gapExtension + s_matrix_i[tIDx][tYM1]; - - if (currentScore_i < m_I) { // score comes from I matrix (gap in x) - currentScore_i = m_I; - direction_i = B_DIRECTION | I_MATRIX; - } - if (currentScore_i < m_M) { // score comes from m matrix (match) - currentScore_i = m_M; - direction_i= B_DIRECTION | I_MATRIX; - } - s_matrix_i[tIDx][tIDy] = currentScore_i < 0 ? AFFINE_GAP_INIT : currentScore_i; // copy score to matrix - - // now do J matrix: - currentScore_j = AFFINE_GAP_INIT; - m_M = gapScore + gapExtension + s_matrix[tXM1][tIDy]; - m_J = gapExtension + s_matrix_j[tXM1][tIDy]; - - if (currentScore_j < m_J) { // score comes from J matrix (gap in y) - currentScore_j = m_J; - direction_j = C_DIRECTION | J_MATRIX; - } - if (currentScore_j < m_M) { // score comes from m matrix (match) - currentScore_j = m_M; - direction_j = C_DIRECTION | J_MATRIX; - } - s_matrix_j[tIDx][tIDy] = currentScore_j < 0 ? AFFINE_GAP_INIT : currentScore_j; // copy score to matrix - - currentScore = fmax(currentScore,fmax(currentScore_i,currentScore_j)); - if (currentScore > 0) { - if (currentScore == s_matrix[tIDx][tIDy]) {// direction from main - direction = direction; - } - else if(currentScore == s_matrix_i[tIDx][tIDy]) {// direction from I - direction = direction_i; - } - else if(currentScore == s_matrix_j[tIDx][tIDy]){ // direction from J - direction = direction_j; - } - } - } + const unsigned int lid = get_local_id(1) * SHARED_X + get_local_id(0); + float m = currentScore; + s_maxima[lid] = m; - else if (i-1 == tXM1 + tYM1 ){ - // use this to find fmax - if (i==1) { - s_maxima[0][0] = fmax(maxPrev, currentScore); - } - else if (!tXM1 && tYM1) { - s_maxima[0][tYM1] = fmax(s_maxima[0][tYM1-1], currentScore); - } - else if (!tYM1 && tXM1) { - s_maxima[tXM1][0] = fmax(s_maxima[tXM1-1][0], currentScore); - } - else if (tXM1 && tYM1 ){ - s_maxima[tXM1][tYM1] = fmax(s_maxima[tXM1-1][tYM1], fmax(s_maxima[tXM1][tYM1-1], currentScore)); - } - } - // wait until all threads have calculated their new score - /**** sync barrier ****/ + for (int stride = SHARED_X * SHARED_Y / 2; stride > 0; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); + if (lid < stride) { + m = fmax(m, s_maxima[lid + stride]); + s_maxima[lid] = m; + } } - // copy end score to the scorings matrix: - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tXM1][tYM1] = s_matrix[tIDx][tIDy]; - (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tXM1][tYM1] = s_matrix_i[tIDx][tIDy]; - (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tXM1][tYM1] = s_matrix_j[tIDx][tIDy]; - (*globalDirection).direction[bIDx][bIDy].localDirection[blockx][blocky].value[tXM1][tYM1] = direction; - if (tIDx==SHARED_X && tIDy==SHARED_Y) - globalMaxima->blockMaxima[bIDx][bIDy].value[blockx][blocky] = fmax(currentScore, fmax(s_maxima[SHARED_X-2][SHARED_Y-1], s_maxima[SHARED_X-1][SHARED_Y-2])); + if (lid == 0) { + /* the next block is to get the maximum value from surrounding blocks. */ + if (blockx) { + m = fmax(m, globalMaxima[(blockx-1) * yDivSHARED_Y + blocky]); + } + if (blocky) { + m = fmax(m, globalMaxima[blockx * yDivSHARED_Y + (blocky-1)]); + } + globalMaxima[blockx * yDivSHARED_Y + blocky] = m; - // wait until all threads have copied their score: - /**** sync barrier ****/ - barrier(CLK_LOCAL_MEM_FENCE); + if (blockx == xDivSHARED_X - 1 && blocky == yDivSHARED_Y - 1) { + if (m >= MINIMUM_SCORE && m >= maxPossibleScore[bIDy * numberOfSequences + bIDx]) { + *isTracebackRequired = 1; + } + } + } } -__kernel void traceback( - __global GlobalMatrix *matrix, - unsigned int x, - unsigned int y, - unsigned int numberOfBlocks, - __global GlobalMaxima *globalMaxima, - __global GlobalDirection *globalDirection, - volatile __global unsigned int *indexIncrement, - __global StartingPoints *startingPoints, - __global float *maxPossibleScore) { +__kernel +__attribute__((reqd_work_group_size(SHARED_X, SHARED_Y, 1))) +void traceback( + const unsigned int numberOfSequences, + const unsigned int numberOfTargets, + const unsigned int xDivSHARED_X, + const unsigned int yDivSHARED_Y, + __global Matrix *matrix, + const unsigned int x, + const unsigned int y, + const __global float *globalMaxima, + __global Direction *globalDirection, + volatile __global unsigned int *indexIncrement, + __global StartingPoint *startingPoints, + const __global float *maxPossibleScore) { - /** + /** * shared memory block for calculations. It requires * extra (+1 in both directions) space to hold * Neighboring cells */ - __local float s_matrix[SHARED_X+1][SHARED_Y+1]; + __local float s_matrix[SHARED_X+1][SHARED_Y+1]; /** * shared memory for storing the maximum value of this alignment. */ - __local float s_maxima[1]; - __local float s_maxPossibleScore[1]; + __local float s_maxima[1]; + __local float s_maxPossibleScore[1]; // calculate indices: - unsigned int yDIVnumSeq = (get_group_id(1)/NUMBER_TARGETS); - unsigned int blockx = x - yDIVnumSeq; - unsigned int blocky = y + yDIVnumSeq; + unsigned int blockx = x - get_group_id(0); + unsigned int blocky = y + get_group_id(0); unsigned int tIDx = get_local_id(0); unsigned int tIDy = get_local_id(1); - unsigned int bIDx = get_group_id(0); - unsigned int bIDy = get_group_id(1)%NUMBER_TARGETS; - - float value = 0.0; + unsigned int bIDx = get_group_id(1); + unsigned int bIDy = get_group_id(2); + + // Move pointers to current target and sequence + const unsigned int offset = (bIDx * numberOfTargets + bIDy) * (xDivSHARED_X * yDivSHARED_Y); + matrix += offset; + globalMaxima += offset; + globalDirection += offset; + + __local bool s_needsProcessing; if (!tIDx && !tIDy) { - s_maxima[0] = globalMaxima->blockMaxima[bIDx][bIDy].value[XdivSHARED_X-1][YdivSHARED_Y-1]; - s_maxPossibleScore[0] = maxPossibleScore[bIDy*NUMBER_SEQUENCES+bIDx]; + s_maxima[0] = globalMaxima[(xDivSHARED_X-1) * yDivSHARED_Y + (yDivSHARED_Y-1)]; + s_maxPossibleScore[0] = maxPossibleScore[bIDy*numberOfSequences+bIDx]; + + s_needsProcessing = false; } barrier(CLK_LOCAL_MEM_FENCE); if (s_maxima[0]>= MINIMUM_SCORE) { // if the maximum score is below threshold, there is nothing to do - s_matrix[tIDx][tIDy] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy]; + s_matrix[tIDx][tIDy] = matrix[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; - unsigned char direction = globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy]; + unsigned char direction = globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; + const bool isStartCandidate = (direction == UPPER_LEFT_DIRECTION && s_matrix[tIDx][tIDy] >= LOWER_LIMIT_SCORE * s_maxima[0] && s_matrix[tIDx][tIDy] >= s_maxPossibleScore[0]); + // Check if there are continuing alignments (from neighbouring blocks) or new starting points. + // Otherwise there is nothing to do. + if (s_matrix[tIDx][tIDy] < 0.0f || isStartCandidate) + s_needsProcessing = true; // wait until all elements have been copied to the shared memory block /**** sync barrier ****/ barrier(CLK_LOCAL_MEM_FENCE); - for (int i=DIAGONAL-1; i >= 0; --i) { + if (!s_needsProcessing) + return; - if ((i == tIDx + tIDy) && direction == UPPER_LEFT_DIRECTION && s_matrix[tIDx][tIDy] >= LOWER_LIMIT_SCORE * s_maxima[0] && s_matrix[tIDx][tIDy] >= s_maxPossibleScore[0]) { - // found starting point! - // reserve index: - unsigned int index = atom_inc(&indexIncrement[0]); - StartingPoint start; - //__global StartingPoint *start = &(startingPoints->startingPoint[index]); - start.sequence = bIDx; - start.target = bIDy; - start.blockX = blockx; - start.blockY = blocky; - start.valueX = tIDx; - start.valueY = tIDy; - start.score = s_matrix[tIDx][tIDy]; - start.maxScore = s_maxima[0]; - start.posScore = s_maxPossibleScore[0]; - startingPoints->startingPoint[index] = start; - // mark this value: - s_matrix[tIDx][tIDy] = as_float(SIGN_BIT_MASK | as_int(s_matrix[tIDx][tIDy])); - - } - - barrier(CLK_LOCAL_MEM_FENCE); + for (int i=DIAGONAL-1; i >= 0; --i) { - if ((i == tIDx + tIDy) && s_matrix[tIDx][tIDy] < 0 && direction == UPPER_LEFT_DIRECTION) { - if (tIDx && tIDy){ - value = s_matrix[tIDx-1][tIDy-1]; - if (value == 0.0f) { - direction = STOP_DIRECTION; - } - else { - s_matrix[tIDx-1][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - - - } - else if (!tIDx && tIDy && blockx) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy-1]; - if (value == 0.0f) { - direction = STOP_DIRECTION; - } - else { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - - } - else if (!tIDx && !tIDy && blockx && blocky) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1]; - if (value == 0.0f) { - direction = STOP_DIRECTION; - } - else { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - + if (i == tIDx + tIDy) { + if (isStartCandidate && s_matrix[tIDx][tIDy] > 0.0f) { // is not a part of another alignment + // found starting point! + // reserve index: + unsigned int index = atom_inc(&indexIncrement[0]); + StartingPoint start; + //__global StartingPoint *start = &(startingPoints->startingPoint[index]); + start.sequence = bIDx; + start.target = bIDy; + start.blockX = blockx; + start.blockY = blocky; + start.valueX = tIDx; + start.valueY = tIDy; + start.score = s_matrix[tIDx][tIDy]; + start.maxScore = s_maxima[0]; + start.posScore = s_maxPossibleScore[0]; + startingPoints[index] = start; + // mark this value: + s_matrix[tIDx][tIDy] = as_float(SIGN_BIT_MASK | as_int(s_matrix[tIDx][tIDy])); } - else if (tIDx && !tIDy && blocky) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx-1][SHARED_Y-1]; - if (value == 0.0f) { - direction = STOP_DIRECTION; - } - else { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx-1][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if ((i == tIDx + tIDy) && s_matrix[tIDx][tIDy] < 0 && direction == UPPER_DIRECTION) { - if (!tIDy) { - if (blocky) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1]; + if (s_matrix[tIDx][tIDy] < 0) { + const int dx = direction == UPPER_DIRECTION ? 0 : -1; + const int dy = direction == LEFT_DIRECTION ? 0 : -1; + int prevx = tIDx + dx; + int prevy = tIDy + dy; + if (prevx >= 0 && prevy >= 0) { + const float value = s_matrix[prevx][prevy]; if (value == 0.0f) { - direction = STOP_DIRECTION; + direction = STOP_DIRECTION; + } else { + s_matrix[prevx][prevy] = as_float(SIGN_BIT_MASK | as_int(value)); } - else { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } else { + int prevBlockx = blockx; + int prevBlocky = blocky; + if (prevx < 0) { + prevBlockx += dx; + prevx = SHARED_X - 1; } - - } - } - else { - value = s_matrix[tIDx][tIDy-1]; - if (value == 0.0f) { - direction = STOP_DIRECTION; - } - else { - s_matrix[tIDx][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - if ((i == tIDx + tIDy) && s_matrix[tIDx][tIDy] < 0 && direction == LEFT_DIRECTION) { - if (!tIDx){ - if (blockx) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy]; - if (value == 0.0f) { - direction = STOP_DIRECTION; + if (prevy < 0) { + prevBlocky += dy; + prevy = SHARED_Y - 1; } - else { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy] = as_float(SIGN_BIT_MASK | as_int(value)); + if (prevBlockx >= 0 && prevBlocky >= 0) { + const float value = matrix[prevBlockx * yDivSHARED_Y + prevBlocky].value[prevx][prevy]; + if (value == 0.0f) { + direction = STOP_DIRECTION; + } else { + matrix[prevBlockx * yDivSHARED_Y + prevBlocky].value[prevx][prevy] = as_float(SIGN_BIT_MASK | as_int(value)); + } } - - } - } - else { - value = s_matrix[tIDx-1][tIDy]; - if (value == 0.0f) { - direction = STOP_DIRECTION; - } - else { - s_matrix[tIDx-1][tIDy] = as_float(SIGN_BIT_MASK | as_int(value)); } - } } - + /**** sync barrier ****/ barrier(CLK_LOCAL_MEM_FENCE); - } // copy end score to the scorings matrix: if (s_matrix[tIDx][tIDy] < 0) { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy] = s_matrix[tIDx][tIDy]; - globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy] = direction; + matrix[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = s_matrix[tIDx][tIDy]; + globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = direction; } - /**** sync barrier ****/ - barrier(CLK_LOCAL_MEM_FENCE); } } -__kernel void tracebackAffineGap( - __global GlobalMatrix *matrix, - __global GlobalMatrix *matrix_i, - __global GlobalMatrix *matrix_j, - unsigned int x, - unsigned int y, - unsigned int numberOfBlocks, - __global GlobalMaxima *globalMaxima, - __global GlobalDirection *globalDirection, - volatile __global unsigned int *indexIncrement, - __global StartingPoints *startingPoints, - __global float *maxPossibleScore) { - - /** +__kernel +__attribute__((reqd_work_group_size(SHARED_X, SHARED_Y, 1))) +void tracebackAffineGap( + const unsigned int numberOfSequences, + const unsigned int numberOfTargets, + const unsigned int xDivSHARED_X, + const unsigned int yDivSHARED_Y, + __global Matrix *matrix, + __global Matrix *matrix_i, + __global Matrix *matrix_j, + const unsigned int x, + const unsigned int y, + const __global float *globalMaxima, + __global Direction *globalDirection, + volatile __global unsigned int *indexIncrement, + __global StartingPoint *startingPoints, + const __global float *maxPossibleScore) { + + /** * shared memory block for calculations. It requires * extra (+1 in both directions) space to hold * Neighboring cells */ - __local float s_matrix[SHARED_X+1][SHARED_Y+1]; - __local float s_matrix_i[SHARED_X+1][SHARED_Y+1]; - __local float s_matrix_j[SHARED_X+1][SHARED_Y+1]; + __local float s_matrix[SHARED_X+1][SHARED_Y+1]; + __local float s_matrix_i[SHARED_X+1][SHARED_Y+1]; + __local float s_matrix_j[SHARED_X+1][SHARED_Y+1]; /** * shared memory for storing the maximum value of this alignment. */ - __local float s_maxima[1]; - __local float s_maxPossibleScore[1]; + __local float s_maxima[1]; + __local float s_maxPossibleScore[1]; // calculate indices: - unsigned int yDIVnumSeq = (get_group_id(1)/NUMBER_TARGETS); - unsigned int blockx = x - yDIVnumSeq; - unsigned int blocky = y + yDIVnumSeq; + unsigned int blockx = x - get_group_id(0); + unsigned int blocky = y + get_group_id(0); unsigned int tIDx = get_local_id(0); unsigned int tIDy = get_local_id(1); - unsigned int bIDx = get_group_id(0); - unsigned int bIDy = get_group_id(1)%NUMBER_TARGETS; - - float value = 0.0; + unsigned int bIDx = get_group_id(1); + unsigned int bIDy = get_group_id(2); + + // Move pointers to current target and sequence + const unsigned int offset = (bIDx * numberOfTargets + bIDy) * (xDivSHARED_X * yDivSHARED_Y); + matrix += offset; + matrix_i += offset; + matrix_j += offset; + globalMaxima += offset; + globalDirection += offset; + + float value = 0.0f; if (!tIDx && !tIDy) { - s_maxima[0] = globalMaxima->blockMaxima[bIDx][bIDy].value[XdivSHARED_X-1][YdivSHARED_Y-1]; - s_maxPossibleScore[0] = maxPossibleScore[bIDy*NUMBER_SEQUENCES+bIDx]; + s_maxima[0] = globalMaxima[(xDivSHARED_X-1) * yDivSHARED_Y + (yDivSHARED_Y-1)]; + s_maxPossibleScore[0] = maxPossibleScore[bIDy*numberOfSequences+bIDx]; } barrier(CLK_LOCAL_MEM_FENCE); if (s_maxima[0]>= MINIMUM_SCORE) { // if the maximum score is below threshold, there is nothing to do - unsigned char direction = DIRECTION_MASK & globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy]; - unsigned char matrix_source = MATRIX_MASK & globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy]; + unsigned char direction = DIRECTION_MASK & globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; + unsigned char matrix_source = MATRIX_MASK & globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; - s_matrix[tIDx][tIDy] = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy]; - s_matrix_i[tIDx][tIDy] = (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy]; - s_matrix_j[tIDx][tIDy] = (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy]; + s_matrix[tIDx][tIDy] = matrix[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; + s_matrix_i[tIDx][tIDy] = matrix_i[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; + s_matrix_j[tIDx][tIDy] = matrix_j[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy]; // wait until all elements have been copied to the shared memory block @@ -756,7 +579,7 @@ __kernel void tracebackAffineGap( // reserve index: unsigned int index = atom_inc(&indexIncrement[0]); StartingPoint start; - //__global StartingPoint *start = &(startingPoints->startingPoint[index]); + //__global StartingPoint *start = &(startingPoints[index]); start.sequence = bIDx; start.target = bIDy; start.blockX = blockx; @@ -766,102 +589,100 @@ __kernel void tracebackAffineGap( start.score = s_matrix[tIDx][tIDy]; start.maxScore = s_maxima[0]; start.posScore = s_maxPossibleScore[0]; - startingPoints->startingPoint[index] = start; - // mark this value: - s_matrix[tIDx][tIDy] = as_float(SIGN_BIT_MASK | as_int(s_matrix[tIDx][tIDy])); - + startingPoints[index] = start; + // mark this value: + s_matrix[tIDx][tIDy] = as_float(SIGN_BIT_MASK | as_int(s_matrix[tIDx][tIDy])); + } - + barrier(CLK_LOCAL_MEM_FENCE); if ((i == tIDx + tIDy) && ( - (s_matrix[tIDx][tIDy] < 0 && matrix_source == MAIN_MATRIX) || - (s_matrix_i[tIDx][tIDy] < 0 && s_matrix_i[tIDx][tIDy] > AFFINE_GAP_INIT && matrix_source == I_MATRIX) || - (s_matrix_j[tIDx][tIDy] < 0 && s_matrix_j[tIDx][tIDy] > AFFINE_GAP_INIT && matrix_source == J_MATRIX) - )) { - // check which matrix to go to: - switch (direction) { - case A_DIRECTION : // M - if (tIDx && tIDy){ - value = s_matrix[tIDx-1][tIDy-1]; - if (value == 0.0f) - direction = STOP_DIRECTION; - else - s_matrix[tIDx-1][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - else if (!tIDx && tIDy && blockx) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy-1]; - if (value == 0.0f) - direction = STOP_DIRECTION; - else - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - else if (!tIDx && !tIDy && blockx && blocky) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1]; - if (value == 0.0f) - direction = STOP_DIRECTION; - else - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky-1].value[SHARED_X-1][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - else if (tIDx && !tIDy && blocky) { - value = (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx-1][SHARED_Y-1]; - if (value == 0.0f) - direction = STOP_DIRECTION; - else - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx-1][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } + (s_matrix[tIDx][tIDy] < 0 && matrix_source == MAIN_MATRIX) || + (s_matrix_i[tIDx][tIDy] < 0 && s_matrix_i[tIDx][tIDy] > AFFINE_GAP_INIT && matrix_source == I_MATRIX) || + (s_matrix_j[tIDx][tIDy] < 0 && s_matrix_j[tIDx][tIDy] > AFFINE_GAP_INIT && matrix_source == J_MATRIX) + )) { + // check which matrix to go to: + switch (direction) { + case A_DIRECTION : // M + if (tIDx && tIDy){ + value = s_matrix[tIDx-1][tIDy-1]; + if (value == 0.0f) + direction = STOP_DIRECTION; + else + s_matrix[tIDx-1][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } + else if (!tIDx && tIDy && blockx) { + value = matrix[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy-1]; + if (value == 0.0f) + direction = STOP_DIRECTION; + else + matrix[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } + else if (!tIDx && !tIDy && blockx && blocky) { + value = matrix[(blockx-1) * yDivSHARED_Y + (blocky-1)].value[SHARED_X-1][SHARED_Y-1]; + if (value == 0.0f) + direction = STOP_DIRECTION; + else + matrix[(blockx-1) * yDivSHARED_Y + (blocky-1)].value[SHARED_X-1][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } + else if (tIDx && !tIDy && blocky) { + value = matrix[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx-1][SHARED_Y-1]; + if (value == 0.0f) + direction = STOP_DIRECTION; + else + matrix[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx-1][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } //direction = tracebackStepLeftUp(blockx, blocky, s_matrix, matrix, direction); - break; - case B_DIRECTION : // I - if (!tIDy) { - if (blocky) { - value = (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1]; - (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx][blocky-1].value[tIDx][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - } - else { - value = s_matrix_i[tIDx][tIDy-1]; - s_matrix_i[tIDx][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); - } - - //direction = tracebackStepUp(blockx, blocky, s_matrix_i, matrix_i, direction); - break; - case C_DIRECTION : // J - if (!tIDx){ - if (blockx) { - value = (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy]; - (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx-1][blocky].value[SHARED_X-1][tIDy] = as_float(SIGN_BIT_MASK | as_int(value)); - } - } - else { - value = s_matrix_j[tIDx-1][tIDy]; - s_matrix_j[tIDx-1][tIDy] = as_float(SIGN_BIT_MASK | as_int(value)); - } - - //direction = tracebackStepLeft(blockx, blocky, s_matrix_j, matrix_j, direction); - break; - } - } + break; + case B_DIRECTION : // I + if (!tIDy) { + if (blocky) { + value = matrix_i[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx][SHARED_Y-1]; + matrix_i[blockx * yDivSHARED_Y + (blocky-1)].value[tIDx][SHARED_Y-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } + } + else { + value = s_matrix_i[tIDx][tIDy-1]; + s_matrix_i[tIDx][tIDy-1] = as_float(SIGN_BIT_MASK | as_int(value)); + } + + //direction = tracebackStepUp(blockx, blocky, s_matrix_i, matrix_i, direction); + break; + case C_DIRECTION : // J + if (!tIDx){ + if (blockx) { + value = matrix_j[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy]; + matrix_j[(blockx-1) * yDivSHARED_Y + blocky].value[SHARED_X-1][tIDy] = as_float(SIGN_BIT_MASK | as_int(value)); + } + } + else { + value = s_matrix_j[tIDx-1][tIDy]; + s_matrix_j[tIDx-1][tIDy] = as_float(SIGN_BIT_MASK | as_int(value)); + } + + //direction = tracebackStepLeft(blockx, blocky, s_matrix_j, matrix_j, direction); + break; + } + } barrier(CLK_LOCAL_MEM_FENCE); } // copy end score to the scorings matrix: if (matrix_source == MAIN_MATRIX) { - (*matrix).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy] = s_matrix[tIDx][tIDy]; - globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy] = direction; + matrix[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = s_matrix[tIDx][tIDy]; + globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = direction; } else if (matrix_source == I_MATRIX) { - (*matrix_i).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy] = s_matrix_i[tIDx][tIDy]; - globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy] = direction; + matrix_i[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = s_matrix_i[tIDx][tIDy]; + globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = direction; } else if (matrix_source == J_MATRIX) { - (*matrix_j).metaMatrix[bIDx][bIDy].matrix[blockx][blocky].value[tIDx][tIDy] = s_matrix_j[tIDx][tIDy]; - globalDirection->direction[bIDx][bIDy].localDirection[blockx][blocky].value[tIDx][tIDy] = direction; + matrix_j[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = s_matrix_j[tIDx][tIDy]; + globalDirection[blockx * yDivSHARED_Y + blocky].value[tIDx][tIDy] = direction; } /**** sync barrier ****/ barrier(CLK_LOCAL_MEM_FENCE); } } - - diff --git a/pyPaSWAS/Core/ocl/default_variable.cl b/pyPaSWAS/Core/ocl/default_variable_cpu.cl similarity index 100% rename from pyPaSWAS/Core/ocl/default_variable.cl rename to pyPaSWAS/Core/ocl/default_variable_cpu.cl diff --git a/pyPaSWAS/Core/ocl/default_variable_gpu.cl b/pyPaSWAS/Core/ocl/default_variable_gpu.cl new file mode 100644 index 0000000..35a4136 --- /dev/null +++ b/pyPaSWAS/Core/ocl/default_variable_gpu.cl @@ -0,0 +1,5 @@ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +/** start of the alphabet, so scoringsmatrix index can be calculated */ +#define characterOffset '${CHAR_OFFSET}' +