Skip to content

Commit

Permalink
Merge pull request #957 from rouault/idwt_53_improvements
Browse files Browse the repository at this point in the history
IDWT 5x3 single-pass lifting and SSE2/AVX2 implementation
  • Loading branch information
rouault authored Jun 26, 2017
2 parents 6026786 + 4fe7620 commit 533fa2f
Show file tree
Hide file tree
Showing 9 changed files with 1,018 additions and 54 deletions.
34 changes: 34 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ language: cpp

matrix:
include:
# OSX
- os: osx
compiler: clang
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1

# Test code style
- os: linux
compiler: clang-3.8
env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_CHECK_STYLE=1 OPJ_CI_SKIP_TESTS=1
Expand All @@ -16,12 +19,31 @@ matrix:
packages:
- clang-3.8
- flip

# Performance test with GCC
- os: linux
compiler: g++
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1 OPJ_CI_PERF_TESTS=1

# Test compilation with AVX2
- os: linux
compiler: clang-3.8
# skip tests since Travis doesn't have AVX2 compatible machines
env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_INSTRUCTION_SETS="-mavx2" OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_SKIP_TESTS=1
addons:
apt:
sources:
- llvm-toolchain-precise-3.8
- ubuntu-toolchain-r-test
packages:
- clang-3.8

# Test multi-threading
- os: linux
compiler: g++
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_NUM_THREADS=2

# Test 32-bit compilation
- os: linux
compiler: g++
env: OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release
Expand All @@ -30,16 +52,22 @@ matrix:
packages:
- gcc-multilib
- g++-multilib

# Profile code (gcc -pg)
- os: linux
compiler: g++
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Debug OPJ_CI_PROFILE=1
addons:
apt:
packages:
- valgrind

# Test under ASAN
- os: linux
compiler: clang
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Debug OPJ_CI_ASAN=1

# Test with CLang 3.8
- os: linux
compiler: clang-3.8
env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_PERF_TESTS=1
Expand All @@ -50,6 +78,8 @@ matrix:
- ubuntu-toolchain-r-test
packages:
- clang-3.8

# Test with mingw 32 bit
- os: linux
compiler: x86_64-w64-mingw32-g++
env: OPJ_CI_CC=x86_64-w64-mingw32-gcc OPJ_CI_CXX=x86_64-w64-mingw32-g++ OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release
Expand All @@ -63,6 +93,8 @@ matrix:
- g++-mingw-w64-i686
- gcc-multilib
- g++-multilib

# Test with mingw 64 bit
- os: linux
compiler: x86_64-w64-mingw32-g++
env: OPJ_CI_CC=x86_64-w64-mingw32-gcc OPJ_CI_CXX=x86_64-w64-mingw32-g++ OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release
Expand All @@ -74,6 +106,8 @@ matrix:
- gcc-mingw-w64-x86-64
- gcc-mingw-w64
- g++-mingw-w64-x86-64

# Test with gcc 4.8
- os: linux
compiler: g++-4.8
env: OPJ_CI_CC=gcc-4.8 OPJ_CI_CXX=g++-4.8 OPJ_CI_ABI_CHECK=1
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ if(BUILD_JPIP_SERVER)
endif()
add_subdirectory(src/lib)
option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF)
option(BUILD_BENCH_DWT "Build bench_dwt utility (development benchmark)" OFF)

#-----------------------------------------------------------------------------
# Build Applications
Expand Down
10 changes: 10 additions & 0 deletions src/lib/openjp2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,13 @@ endif(OPJ_USE_THREAD AND NOT Threads_FOUND)
if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT})
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)

if(BUILD_BENCH_DWT)
add_executable(bench_dwt bench_dwt.c dwt.c opj_malloc.c thread.c)
if(UNIX)
target_link_libraries(bench_dwt m)
endif()
if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
target_link_libraries(bench_dwt ${CMAKE_THREAD_LIBS_INIT})
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
endif(BUILD_BENCH_DWT)
241 changes: 241 additions & 0 deletions src/lib/openjp2/bench_dwt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
/*
* The copyright in this software is being made available under the 2-clauses
* BSD License, included below. This software may be subject to other third
* party and contributor rights, including patent rights, and no such rights
* are granted under this license.
*
* Copyright (c) 2017, IntoPix SA <contact@intopix.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#include "opj_includes.h"

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/times.h>
#endif /* _WIN32 */

OPJ_INT32 getValue(OPJ_UINT32 i)
{
return ((OPJ_INT32)i % 511) - 256;
}

void init_tilec(opj_tcd_tilecomp_t * l_tilec,
OPJ_INT32 x0,
OPJ_INT32 y0,
OPJ_INT32 x1,
OPJ_INT32 y1,
OPJ_UINT32 numresolutions)
{
opj_tcd_resolution_t* l_res;
OPJ_UINT32 resno, l_level_no;
size_t i, nValues;

memset(l_tilec, 0, sizeof(*l_tilec));
l_tilec->x0 = x0;
l_tilec->y0 = y0;
l_tilec->x1 = x1;
l_tilec->y1 = y1;
nValues = (size_t)(l_tilec->x1 - l_tilec->x0) *
(size_t)(l_tilec->y1 - l_tilec->y0);
l_tilec->data = opj_malloc(sizeof(OPJ_INT32) * nValues);
for (i = 0; i < nValues; i++) {
l_tilec->data[i] = getValue(i);
}
l_tilec->numresolutions = numresolutions;
l_tilec->resolutions = opj_calloc(l_tilec->numresolutions,
sizeof(opj_tcd_resolution_t));

l_level_no = l_tilec->numresolutions;
l_res = l_tilec->resolutions;

/* Adapted from opj_tcd_init_tile() */
for (resno = 0; resno < l_tilec->numresolutions; ++resno) {

--l_level_no;

/* border for each resolution level (global) */
l_res->x0 = opj_int_ceildivpow2(l_tilec->x0, (OPJ_INT32)l_level_no);
l_res->y0 = opj_int_ceildivpow2(l_tilec->y0, (OPJ_INT32)l_level_no);
l_res->x1 = opj_int_ceildivpow2(l_tilec->x1, (OPJ_INT32)l_level_no);
l_res->y1 = opj_int_ceildivpow2(l_tilec->y1, (OPJ_INT32)l_level_no);

++l_res;
}
}

void free_tilec(opj_tcd_tilecomp_t * l_tilec)
{
opj_free(l_tilec->data);
opj_free(l_tilec->resolutions);
}

void usage(void)
{
printf(
"bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n");
printf(
" [-offset x y] [-num_threads val]\n");
exit(1);
}


OPJ_FLOAT64 opj_clock(void)
{
#ifdef _WIN32
/* _WIN32: use QueryPerformance (very accurate) */
LARGE_INTEGER freq, t ;
/* freq is the clock speed of the CPU */
QueryPerformanceFrequency(&freq) ;
/* cout << "freq = " << ((double) freq.QuadPart) << endl; */
/* t is the high resolution performance counter (see MSDN) */
QueryPerformanceCounter(& t) ;
return freq.QuadPart ? (t.QuadPart / (OPJ_FLOAT64) freq.QuadPart) : 0 ;
#else
/* Unix or Linux: use resource usage */
struct rusage t;
OPJ_FLOAT64 procTime;
/* (1) Get the rusage data structure at this moment (man getrusage) */
getrusage(0, &t);
/* (2) What is the elapsed time ? - CPU time = User time + System time */
/* (2a) Get the seconds */
procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec);
/* (2b) More precisely! Get the microseconds part ! */
return (procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) *
1e-6) ;
#endif
}

int main(int argc, char** argv)
{
int num_threads = 0;
opj_tcd_tilecomp_t tilec;
opj_thread_pool_t* tp;
OPJ_INT32 i, j, k;
OPJ_BOOL display = OPJ_FALSE;
OPJ_BOOL check = OPJ_FALSE;
OPJ_INT32 size = 16384 - 1;
OPJ_FLOAT64 start, stop;
OPJ_UINT32 offset_x = (size + 1) / 2 - 1;
OPJ_UINT32 offset_y = (size + 1) / 2 - 1;
OPJ_UINT32 num_resolutions = 6;

for (i = 1; i < argc; i++) {
if (strcmp(argv[i], "-display") == 0) {
display = OPJ_TRUE;
check = OPJ_TRUE;
} else if (strcmp(argv[i], "-check") == 0) {
check = OPJ_TRUE;
} else if (strcmp(argv[i], "-size") == 0 && i + 1 < argc) {
size = atoi(argv[i + 1]);
i ++;
} else if (strcmp(argv[i], "-num_threads") == 0 && i + 1 < argc) {
num_threads = atoi(argv[i + 1]);
i ++;
} else if (strcmp(argv[i], "-num_resolutions") == 0 && i + 1 < argc) {
num_resolutions = atoi(argv[i + 1]);
if (num_resolutions == 0 || num_resolutions > 32) {
fprintf(stderr,
"Invalid value for num_resolutions. Should be >= 1 and <= 32\n");
exit(1);
}
i ++;
} else if (strcmp(argv[i], "-offset") == 0 && i + 2 < argc) {
offset_x = atoi(argv[i + 1]);
offset_y = atoi(argv[i + 2]);
i += 2;
} else {
usage();
}
}

tp = opj_thread_pool_create(num_threads);

init_tilec(&tilec, offset_x, offset_y, offset_x + size, offset_y + size,
num_resolutions);

if (display) {
printf("Before\n");
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
printf("%d ", tilec.data[k]);
k ++;
}
printf("\n");
}
}

start = opj_clock();
opj_dwt_decode(tp, &tilec, tilec.numresolutions);
stop = opj_clock();
printf("time for dwt_decode: %.03f s\n", stop - start);

if (display || check) {
if (display) {
printf("After IDWT\n");
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
printf("%d ", tilec.data[k]);
k ++;
}
printf("\n");
}
}

opj_dwt_encode(&tilec);
if (display) {
printf("After FDWT\n");
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
printf("%d ", tilec.data[k]);
k ++;
}
printf("\n");
}
}

if (check) {
size_t idx;
size_t nValues = (size_t)(tilec.x1 - tilec.x0) *
(size_t)(tilec.y1 - tilec.y0);
for (idx = 0; i < nValues; i++) {
if (tilec.data[idx] != getValue(idx)) {
printf("Difference found at idx = %u\n", (OPJ_UINT32)idx);
exit(1);
}
}
}
}

free_tilec(&tilec);

opj_thread_pool_destroy(tp);
return 0;
}
Loading

0 comments on commit 533fa2f

Please sign in to comment.