manodeep · manodeep · Sep 29, 2018 · Mar 22, 2018 · Mar 22, 2018 · Mar 23, 2018
diff --git a/.gitignore b/.gitignore
@@ -28,6 +28,7 @@ wp
 xi
 DDrppi
 wprp
+DDsmu
 bin/*
 include/*
 run_correlations

diff --git a/.travis.yml b/.travis.yml
@@ -33,7 +33,7 @@ matrix:
     #   before_install:
     #     - brew update
     #     - brew tap homebrew/versions && brew reinstall gcc49 --without-multilib
-    #     - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh
+    #     - wget https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh
 
     # - os: osx
     #   compiler: clang
@@ -42,36 +42,35 @@ matrix:
     #     - brew update
     #     - brew outdated xctool || brew upgrade xctool
     #     - brew tap homebrew/versions && brew install clang-omp
-    #     - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh
+    #     - wget https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh
     - os: osx
       osx_image: xcode9
       compiler: clang
       env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=3.6 DOCTEST=FALSE
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
+        - wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
 
 
     - os: osx
       osx_image: xcode8
       compiler: clang
       env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=3.5 DOCTEST=FALSE
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
+        - wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
 
     - os: osx
       osx_image: xcode7.3
       compiler: clang
       env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=2.7 DOCTEST=FALSE
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh
-
+        - wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh
 
     # - os: osx
     #   compiler: gcc
     #   env: COMPILER=gcc-4.8 V='4.8' PYTHON_VERSION=3.5 FAMILY=gcc
     #   before_install:
     #     - brew update && brew tap homebrew/versions && brew install gcc48 --without-multilib
-    #     - wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
+    #     - wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
 
     # - os: linux
     #   dist: trusty
@@ -83,7 +82,7 @@ matrix:
     #       packages: ['clang-3.6','libgsl0-dev']
     #   env: COMPILER=clang-3.6 V=3.6 PYTHON_VERSION=2.7 
     #   before_install:
-    #     - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+    #     - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
 
     # - os: linux
     #   dist: trusty
@@ -95,39 +94,39 @@ matrix:
     #       packages: ['clang-3.6','libgsl0-dev']
     #   env: COMPILER=clang-3.6 V=3.6 PYTHON_VERSION=3.5
     #   before_install:
-    #     - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+    #     - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 
     - os: linux
       dist: trusty
       sudo: required
       compiler: gcc
       env: COMPILER=gcc PYTHON_VERSION=2.7
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+        - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
 
     - os: linux
       dist: trusty
       sudo: required
       compiler: gcc
       env: COMPILER=gcc PYTHON_VERSION=3.4 
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+        - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 
     - os: linux
       dist: trusty
       sudo: required
       compiler: gcc
       env: COMPILER=gcc PYTHON_VERSION=3.5
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+        - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 
     - os: linux
       dist: trusty
       sudo: required
       compiler: gcc
       env: COMPILER=gcc PYTHON_VERSION=3.6
       before_install:
-        - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+        - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 
 install:
   - bash miniconda.sh -b -p $HOME/miniconda

diff --git a/CHANGES.rst b/CHANGES.rst
@@ -10,6 +10,20 @@ New features
 - conda installable package
 - GPU version
 
+2.3.0
+=======
+
+**Breaking Changes**
+--------------------
+
+New features
+------------
+- AVX512F kernels for all pair-counters [#167, #170]
+- Faster code from new optimizations using the minimum separation between pairs of cells [#170]
+
+Bug fixes
+---------
+- Fix segmentation fault in vpf_mocks [#168]
 
 2.3.0
 =======

diff --git a/Corrfunc/utils.py b/Corrfunc/utils.py
@@ -485,7 +485,7 @@ def translate_isa_string_to_enum(isa):
     except NameError:
         if not isinstance(isa, str):
             raise TypeError(msg)
-    valid_isa = ['FALLBACK', 'AVX', 'SSE42', 'FASTEST']
+    valid_isa = ['FALLBACK', 'AVX512F', 'AVX2', 'AVX', 'SSE42', 'FASTEST']
     isa_upper = isa.upper()
     if isa_upper not in valid_isa:
         msg = "Desired instruction set = {0} is not in the list of valid "\

diff --git a/common.mk b/common.mk
@@ -248,10 +248,10 @@ ifeq ($(DO_CHECKS), 1)
   ## done with check for conflicting options
 
   ifeq (icc,$(findstring icc,$(CC)))
-    CFLAGS += -xhost -opt-prefetch -opt-prefetch-distance=16 #-vec-report6
-    ifeq (USE_OMP,$(findstring USE_OMP,$(OPT)))
-      CFLAGS += -openmp
-      CLINK  += -openmp
+    CFLAGS += -xhost -axCORE-AVX512
+    ifeq (USE_OMP,$(findstring USE_OMP,$(OPT))) ## indent with spaces only: a leading tab can be parsed as a recipe line
+      CFLAGS += -qopenmp
+      CLINK  += -qopenmp
     endif ##openmp with icc
   else ## not icc -> gcc or clang follow
 
@@ -353,11 +353,6 @@ ifeq ($(DO_CHECKS), 1)
       endif # USE_OMP
     endif # CC is clang
 
-    # #### common options for gcc and clang
-    # ifeq (USE_AVX,$(findstring USE_AVX,$(OPT)))
-    #   CFLAGS  +=  -mavx
-    # endif
-
     CFLAGS += -funroll-loops
     CFLAGS += -march=native -fno-strict-aliasing
     CFLAGS += -Wformat=2  -Wpacked  -Wnested-externs -Wpointer-arith  -Wredundant-decls  -Wfloat-equal -Wcast-qual
@@ -435,7 +430,6 @@ ifeq ($(DO_CHECKS), 1)
             # python3-config failed; let's try python-config (for Python 2 or 3)
             PYTHON_CONFIG_EXE:="$(PYTHON_SCRIPTS)/python-config"
           endif
-
           $(warning $(ccblue)"PYTHON"$(ccreset) is set to $(ccblue)$(PYTHON)$(ccreset); using $(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset) as $(ccblue)python-config$(ccreset). If this is not correct, please also set $(ccblue)"PYTHON_CONFIG_EXE"$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset))
         endif
 

diff --git a/mocks.options b/mocks.options
@@ -11,7 +11,7 @@ OPT += -DLINK_IN_RA #link_in_dec must be enabled before link_in_ra
 #### Floating point precision to use
 OPT += -DDOUBLE_PREC
 
-#### If input distances are already in co-moving (relevant for DDrppi_mocks and vpf)
+#### If input distances are already co-moving (relevant for DDrppi_mocks, DDsmu_mocks and vpf)
 #OPT += -DCOMOVING_DIST
 
 

diff --git a/mocks/DDrppi_mocks/DDrppi_mocks.c b/mocks/DDrppi_mocks/DDrppi_mocks.c
@@ -219,7 +219,7 @@ int main(int argc, char *argv[])
     /*---Count-pairs--------------------------------------*/
     results_countpairs_mocks results;
     struct config_options options = get_config_options();
-    
+
     /* Pack weights into extra options */
     struct extra_options extra = get_extra_options(weight_method);
     for(int w = 0; w < num_weights; w++){

diff --git a/mocks/DDrppi_mocks/Makefile b/mocks/DDrppi_mocks/Makefile
@@ -11,8 +11,8 @@ LIBNAME := countpairs_rp_pi_mocks
 LIBRARY := lib$(LIBNAME).a
 LIBSRC  := countpairs_rp_pi_mocks.c countpairs_rp_pi_mocks_impl_double.c countpairs_rp_pi_mocks_impl_float.c \
            $(UTILS_DIR)/gridlink_mocks_impl_float.c $(UTILS_DIR)/gridlink_mocks_impl_double.c \
-           $(UTILS_DIR)/utils.c $(UTILS_DIR)/progressbar.c $(UTILS_DIR)/cpu_features.c \
-	   $(UTILS_DIR)/set_cosmo_dist.c $(UTILS_DIR)/cosmology_params.c
+           $(UTILS_DIR)/utils.c $(UTILS_DIR)/progressbar.c $(UTILS_DIR)/cpu_features.c $(UTILS_DIR)/avx512_calls.c \
+	   $(UTILS_DIR)/set_cosmo_dist.c $(UTILS_DIR)/cosmology_params.c 
 LIBRARY_HEADERS := $(LIBNAME).h
 
 TARGET   := DDrppi_mocks
@@ -25,7 +25,7 @@ INCL     := countpairs_rp_pi_mocks_kernels_float.c countpairs_rp_pi_mocks_kernel
             $(UTILS_DIR)/gridlink_mocks_impl_double.h $(UTILS_DIR)/gridlink_mocks_impl_float.h $(UTILS_DIR)/gridlink_mocks_impl.h.src \
             $(UTILS_DIR)/cellarray_mocks_float.h $(UTILS_DIR)/cellarray_mocks_double.h $(UTILS_DIR)/cellarray_mocks.h.src \
 	    $(UTILS_DIR)/set_cosmo_dist.h $(UTILS_DIR)/cosmology_params.h  $(UTILS_DIR)/progressbar.h $(UTILS_DIR)/cpu_features.h \
-	    $(UTILS_DIR)/utils.h $(UTILS_DIR)/function_precision.h $(UTILS_DIR)/avx_calls.h $(UTILS_DIR)/defs.h \
+	    $(UTILS_DIR)/utils.h $(UTILS_DIR)/function_precision.h $(UTILS_DIR)/avx512_calls.h $(UTILS_DIR)/avx_calls.h $(UTILS_DIR)/defs.h \
         $(UTILS_DIR)/weight_functions_double.h $(UTILS_DIR)/weight_functions_float.h $(UTILS_DIR)/weight_functions.h.src \
 		  $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src
 

diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src
@@ -107,15 +107,18 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru
 {
 
     static countpairs_mocks_func_ptr_DOUBLE function = NULL;
-    static isa old_isa=-1;
+    static isa old_isa = (isa) -1;
     if(old_isa == options->instruction_set) {
         return function;
     } 
 
     /* Array of function pointers */
     countpairs_mocks_func_ptr_DOUBLE allfunctions[] = {
+#ifdef __AVX512F__
+      countpairs_rp_pi_mocks_avx512_intrinsics_DOUBLE,
+#endif			 
 #ifdef __AVX__
-        countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE,
+      countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE,
 #endif			 
 #ifdef __SSE4_2__
       countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE,
@@ -125,10 +128,17 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru
 
     const int num_functions = sizeof(allfunctions)/sizeof(void *);
     const int fallback_offset = num_functions - 1;
-#if defined(__AVX__) || defined __SSE4_2__
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__)
     const int highest_isa = instrset_detect();
 #endif    
     int curr_offset = 0;
+
+    /* Check for AVX512F support */    
+    int avx512_offset = fallback_offset;
+#ifdef __AVX512F__
+    avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset;
+    curr_offset++;
+#endif
 
     /* Now check if AVX is supported by the CPU */
     int avx_offset = fallback_offset;
@@ -153,7 +163,7 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru
     /* Check that cpu supports feature */
     if(options->instruction_set >= 0) {
         switch(options->instruction_set) {
-        case(AVX512F):
+	case(AVX512F):function_dispatch=avx512_offset;break;
         case(AVX2):
         case(AVX):function_dispatch=avx_offset;break;
         case(SSE42): function_dispatch=sse_offset;break;
@@ -173,6 +183,8 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru
         // This must be first (AVX/SSE may be aliased to fallback)
         if(function_dispatch == fallback_offset){
             fprintf(stderr,"Using fallback kernel\n");
+	} else if(function_dispatch == avx512_offset){
+	  fprintf(stderr,"Using AVX512 kernel\n");
         } else if(function_dispatch == avx_offset){
             fprintf(stderr,"Using AVX kernel\n");
         } else if(function_dispatch == sse_offset){