Skip to content

Commit

Permalink
Fix CUDA plot issues on Windows (#381)
Browse files Browse the repository at this point in the history
Fix cudaplot issues on Windows
- Fix cudaplot issues on Windows with --disk-128
- Enable 16G disk-hybrid plotting
- Fixes on build files/CI
- Bug fixes in disk-hybrid modes on BufferChain and GpuDownload stream
  • Loading branch information
haorldbchi authored Aug 31, 2023
1 parent 7e7d528 commit 076eba4
Show file tree
Hide file tree
Showing 29 changed files with 592 additions and 294 deletions.
11 changes: 6 additions & 5 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@
"preLaunchTask" : "build_cuda_debug",

"program": "${workspaceFolder}/build/bladebit_cuda",

// "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
// "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
// "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
Expand All @@ -140,8 +140,9 @@
// "-w -z 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",

// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
"-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
"-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",


"windows": {
Expand Down Expand Up @@ -357,8 +358,8 @@

/// Compare
"plotcmp",
"/home/harold/plot/plot-k32-c01-2023-08-09-20-50-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot.ref",
"/home/harold/plot/plot-k32-c01-2023-08-09-21-33-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot",
"/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
"/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",

// "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
Expand Down
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@
],
// "cmake.buildArgs": [],
"cmake.configureSettings": {
"BB_ENABLE_TESTS": "ON"
"BB_ENABLE_TESTS": "ON",
"BB_CUDA_USE_NATIVE": "ON"
},
"C_Cpp.dimInactiveRegions": false,
// "cmake.generator": "Unix Makefiles"
Expand Down
6 changes: 5 additions & 1 deletion Bladebit.cmake
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
add_library(bladebit_core src/plotting/DiskBuffer.h src/plotting/DiskBufferBase.cpp src/plotting/DiskBufferBase.h)
add_library(bladebit_core)
target_link_libraries(bladebit_core PUBLIC bladebit_config)

target_include_directories(bladebit_core PUBLIC
Expand Down Expand Up @@ -294,9 +294,13 @@ set(src_bladebit

src/plotting/DiskQueue.h
src/plotting/DiskQueue.cpp
src/plotting/DiskBuffer.h
src/plotting/DiskBuffer.cpp
src/plotting/DiskBucketBuffer.h
src/plotting/DiskBucketBuffer.cpp
src/plotting/DiskBufferBase.h
src/plotting/DiskBufferBase.cpp

src/util/MPMCQueue.h
src/util/CommandQueue.h
)
Expand Down
7 changes: 3 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_OSX_DEPLOYMENT_TARGET 10.16)

set(CMAKE_CONFIGURATION_TYPES Release Debug)

Expand All @@ -19,7 +18,7 @@ if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()

set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>" CACHE STRING "MSVC Runtime Library")

project(bladebit LANGUAGES C CXX ASM)
Expand Down Expand Up @@ -85,7 +84,7 @@ endif()
# NOTE: These are mostly sandbox test environment, not proper tests
option(BB_ENABLE_TESTS "Enable tests." OFF)
option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF)
option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)
option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." OFF)
Expand Down Expand Up @@ -146,7 +145,7 @@ endif()
include(Config.cmake)

if(NOT ${BB_HARVESTER_ONLY})
if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION))
include(cmake_modules/EmbedVersion.cmake)
endif()

Expand Down
10 changes: 8 additions & 2 deletions Harvester.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
if(NOT ${BB_HARVESTER_STATIC})
add_library(bladebit_harvester SHARED)
add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp)
else()
add_library(bladebit_harvester STATIC)
endif()
Expand Down Expand Up @@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE
cuda/CudaF1.cu
cuda/CudaMatch.cu
cuda/CudaPlotUtil.cu
cuda/GpuQueue.cu

# TODO: Remove this, ought not be needed in harvester
# TODO: Does this have to be here?
cuda/GpuStreams.cu
cuda/GpuDownloadStream.cu
src/plotting/DiskBuffer.cpp
src/plotting/DiskBucketBuffer.cpp
src/plotting/DiskBufferBase.cpp
src/plotting/DiskQueue.cpp
>

$<$<NOT:${have_cuda}>:
Expand Down
27 changes: 18 additions & 9 deletions cmake_modules/EmbedVersion.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,25 @@
if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
message("Embedding local build version")

set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")

set(cmd_ver bash)
set(cmd_shell bash)
set(cmd_ext sh)
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
set(cmd_ver bash.exe)

find_program(bash_path NAMES bash.exe NO_CACHE)

if(${bash_path} MATCHES "-NOTFOUND")
set(cmd_shell powershell)
set(cmd_ext ps1)
else()
set(cmd_shell "${bash_path}")
endif()
endif()

execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)

# Remove trailing whitespace incurred in windows gitbash
string(STRIP "${bb_ver_maj}" bb_ver_maj)
Expand All @@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI})
add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
endif()

set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
1 change: 1 addition & 0 deletions cuda/CudaPlotConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
// #define DBG_BBCU_P2_WRITE_MARKS 1

// #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
// #define DBG_BBCU_KEEP_TEMP_FILES 1


#define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
Expand Down
27 changes: 22 additions & 5 deletions cuda/CudaPlotContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,24 @@ struct CudaK32ParkContext

struct CudaK32HybridMode
{
// For clarity, these are the file names for the disk buffers
// whose disk space will be shared for temp data in both phase 1 and phase 3.
// The name indicates their usage and in which phase.
static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME = "p1y-p3index.tmp";
static constexpr std::string_view META_DISK_BUFFER_FILE_NAME = "p1meta-p3rmap.tmp";
static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";

static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME = META_DISK_BUFFER_FILE_NAME;
static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME = Y_DISK_BUFFER_FILE_NAME;
static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;

DiskQueue* temp1Queue; // Tables Queue
DiskQueue* temp2Queue; // Metadata Queue (could be the same as temp1Queue)

DiskBucketBuffer* metaBuffer; // Enabled in 64G mode
DiskBucketBuffer* unsortedXs; // Unsorted Xs are written to disk (uint64 entries)
DiskBucketBuffer* metaBuffer; // Enabled in < 128G mode
DiskBucketBuffer* yBuffer; // Enabled in < 128G mode
DiskBucketBuffer* unsortedL; // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
DiskBucketBuffer* unsortedR; // Unsorted R pairs in < 128G mode

DiskBuffer* tablesL[7];
DiskBuffer* tablesR[7];
Expand All @@ -58,8 +71,11 @@ struct CudaK32HybridMode

struct
{
DiskBucketBuffer* lpOut;
DiskBucketBuffer* indexOut;
// #NOTE: These buffers share the same file-backed storage
// as other buffers in phase 1.
DiskBucketBuffer* rMapBuffer; // Step 1
DiskBucketBuffer* indexBuffer; // X-step/Step 2
DiskBucketBuffer* lpAndLMapBuffer; // X-step/Step 2 (LP) | Step 3 (LMap)

} phase3;
};
Expand Down Expand Up @@ -142,6 +158,7 @@ struct CudaK32Phase3
GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or, when L table is the first stored table, it is inlined x values
GpuDownloadBuffer lpOut; // Output line points (uint64)
GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
GpuDownloadBuffer parksOut; // Output P7 parks on the last table
uint32* devLTable[2]; // Unpacked L table bucket

uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
Expand All @@ -151,7 +168,7 @@ struct CudaK32Phase3
struct {
GpuUploadBuffer lpIn; // Line points from step 2
GpuUploadBuffer indexIn; // Indices from step 2
GpuDownloadBuffer mapOut; // lTable for next step 1
GpuDownloadBuffer mapOut; // lTable for next step 2
GpuDownloadBuffer parksOut; // Downloads park buffers to host

uint32* hostParkOverrunCount;
Expand Down
2 changes: 1 addition & 1 deletion cuda/CudaPlotPhase2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize();
}

if( cx.cfg.disableDirectDownloads )
if( !cx.downloadDirect )
desc.pinnedAllocator = acx.pinnedAllocator;

CudaK32Phase2& p2 = *cx.phase2;
Expand Down
Loading

0 comments on commit 076eba4

Please sign in to comment.