diff --git a/.vscode/launch.json b/.vscode/launch.json index 36ae6c2f..bceef625 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -131,7 +131,7 @@ "preLaunchTask" : "build_cuda_debug", "program": "${workspaceFolder}/build/bladebit_cuda", - + // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6", // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb", @@ -140,8 +140,9 @@ // "-w -z 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp", // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot", - "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot", - // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot", + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot", + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot", + "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot", "windows": { @@ -357,8 +358,8 @@ /// Compare "plotcmp", - "/home/harold/plot/plot-k32-c01-2023-08-09-20-50-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot.ref", - "/home/harold/plot/plot-k32-c01-2023-08-09-21-33-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot", + "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", + 
"/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", // "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" diff --git a/.vscode/settings.json b/.vscode/settings.json index fafad2e2..6c2da21b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -138,7 +138,8 @@ ], // "cmake.buildArgs": [], "cmake.configureSettings": { - "BB_ENABLE_TESTS": "ON" + "BB_ENABLE_TESTS": "ON", + "BB_CUDA_USE_NATIVE": "ON" }, "C_Cpp.dimInactiveRegions": false, // "cmake.generator": "Unix Makefiles" diff --git a/Bladebit.cmake b/Bladebit.cmake index df45dc3b..9f1166dd 100644 --- a/Bladebit.cmake +++ b/Bladebit.cmake @@ -1,4 +1,4 @@ -add_library(bladebit_core src/plotting/DiskBuffer.h src/plotting/DiskBufferBase.cpp src/plotting/DiskBufferBase.h) +add_library(bladebit_core) target_link_libraries(bladebit_core PUBLIC bladebit_config) target_include_directories(bladebit_core PUBLIC @@ -294,9 +294,13 @@ set(src_bladebit src/plotting/DiskQueue.h src/plotting/DiskQueue.cpp + src/plotting/DiskBuffer.h src/plotting/DiskBuffer.cpp src/plotting/DiskBucketBuffer.h src/plotting/DiskBucketBuffer.cpp + src/plotting/DiskBufferBase.h + src/plotting/DiskBufferBase.cpp + src/util/MPMCQueue.h src/util/CommandQueue.h ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5660cef0..8f72155c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_OSX_DEPLOYMENT_TARGET 10.16) set(CMAKE_CONFIGURATION_TYPES Release Debug) @@ -19,7 +18,7 @@ if(POLICY CMP0091) cmake_policy(SET CMP0091 NEW) endif() -set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.") +set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.") set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" CACHE STRING "MSVC Runtime Library") project(bladebit LANGUAGES C CXX ASM) @@ -85,7 +84,7 @@ endif() # NOTE: These are mostly sandbox test environment, not proper tests option(BB_ENABLE_TESTS "Enable tests." OFF) option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF) -option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON) +option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF) option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF) option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF) option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." 
OFF) @@ -146,7 +145,7 @@ endif() include(Config.cmake) if(NOT ${BB_HARVESTER_ONLY}) - if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION)) + if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION)) include(cmake_modules/EmbedVersion.cmake) endif() diff --git a/Harvester.cmake b/Harvester.cmake index ece2c457..692daa80 100644 --- a/Harvester.cmake +++ b/Harvester.cmake @@ -1,5 +1,5 @@ if(NOT ${BB_HARVESTER_STATIC}) - add_library(bladebit_harvester SHARED) + add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp) else() add_library(bladebit_harvester STATIC) endif() @@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE cuda/CudaF1.cu cuda/CudaMatch.cu cuda/CudaPlotUtil.cu + cuda/GpuQueue.cu - # TODO: Remove this, ought not be needed in harvester + # TODO: Does this have to be here? cuda/GpuStreams.cu + cuda/GpuDownloadStream.cu + src/plotting/DiskBuffer.cpp + src/plotting/DiskBucketBuffer.cpp + src/plotting/DiskBufferBase.cpp + src/plotting/DiskQueue.cpp > $<$: diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake index 6ec042c0..1c346632 100644 --- a/cmake_modules/EmbedVersion.cmake +++ b/cmake_modules/EmbedVersion.cmake @@ -2,18 +2,25 @@ if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded})) message("Embedding local build version") - set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.") - - set(cmd_ver bash) + set(cmd_shell bash) + set(cmd_ext sh) if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") - set(cmd_ver bash.exe) + + find_program(bash_path NAMES bash.exe NO_CACHE) + + if(${bash_path} MATCHES "-NOTFOUND") + set(cmd_shell powershell) + set(cmd_ext ps1) + else() + set(cmd_shell "${bash_path}") + endif() endif() - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE 
bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)

     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}" bb_ver_maj)
@@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI})
     add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
     add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
 endif()
+
+set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
\ No newline at end of file
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
index 8fa77588..b42a5d8a 100644
--- a/cuda/CudaPlotConfig.h
+++ b/cuda/CudaPlotConfig.h
@@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
     // #define DBG_BBCU_P2_WRITE_MARKS 1
     // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+    // #define DBG_BBCU_KEEP_TEMP_FILES 1

 #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index c313b696..4d546480 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -44,11 +44,24 @@ struct CudaK32ParkContext
 struct CudaK32HybridMode
 {
+    // For clarity, these are the file names for the disk buffers
+    // whose disk space will be shared for temp data in both phase 1 and phase 3.
+    // The name indicates their usage and in which phase.
+    static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME      = "p1y-p3index.tmp";
+    static constexpr std::string_view META_DISK_BUFFER_FILE_NAME   = "p1meta-p3rmap.tmp";
+    static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";
+
+    static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME        = META_DISK_BUFFER_FILE_NAME;
+    static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME       = Y_DISK_BUFFER_FILE_NAME;
+    static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;
+
     DiskQueue* temp1Queue;  // Tables Queue
     DiskQueue* temp2Queue;  // Metadata Queue (could be the same as temp1Queue)

-    DiskBucketBuffer* metaBuffer;  // Enabled in 64G mode
-    DiskBucketBuffer* unsortedXs;  // Unsorted Xs are written to disk (uint64 entries)
+    DiskBucketBuffer* metaBuffer;  // Enabled in < 128G mode
+    DiskBucketBuffer* yBuffer;     // Enabled in < 128G mode
+    DiskBucketBuffer* unsortedL;   // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
+    DiskBucketBuffer* unsortedR;   // Unsorted R pairs in < 128G mode

     DiskBuffer* tablesL[7];
     DiskBuffer* tablesR[7];
@@ -58,8 +71,11 @@ struct CudaK32HybridMode

     struct
     {
-        DiskBucketBuffer* lpOut;
-        DiskBucketBuffer* indexOut;
+        // #NOTE: These buffers share the same file-backed storage
+        //        as other buffers in phase 1.
+        DiskBucketBuffer* rMapBuffer;       // Step 1
+        DiskBucketBuffer* indexBuffer;      // X-step/Step 2
+        DiskBucketBuffer* lpAndLMapBuffer;  // X-step/Step 2 (LP) | Step 3 (LMap)
     } phase3;
 };
@@ -142,6 +158,7 @@ struct CudaK32Phase3
     GpuUploadBuffer lMapIn;     // Output map (uint64) from the previous table run.
Or, when L table is the first stored table, it is inlined x values GpuDownloadBuffer lpOut; // Output line points (uint64) GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value) + GpuDownloadBuffer parksOut; // Output P7 parks on the last table uint32* devLTable[2]; // Unpacked L table bucket uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT]; @@ -151,7 +168,7 @@ struct CudaK32Phase3 struct { GpuUploadBuffer lpIn; // Line points from step 2 GpuUploadBuffer indexIn; // Indices from step 2 - GpuDownloadBuffer mapOut; // lTable for next step 1 + GpuDownloadBuffer mapOut; // lTable for next step 2 GpuDownloadBuffer parksOut; // Downloads park buffers to host uint32* hostParkOverrunCount; diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu index 0e1f6480..8d2d5094 100644 --- a/cuda/CudaPlotPhase2.cu +++ b/cuda/CudaPlotPhase2.cu @@ -419,7 +419,7 @@ void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize(); } - if( cx.cfg.disableDirectDownloads ) + if( !cx.downloadDirect ) desc.pinnedAllocator = acx.pinnedAllocator; CudaK32Phase2& p2 = *cx.phase2; diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu index a9b34c62..8fcdfe2a 100644 --- a/cuda/CudaPlotPhase3.cu +++ b/cuda/CudaPlotPhase3.cu @@ -235,6 +235,14 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) } #endif + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + } + + const uint32 compressionLevel = cx.gCfg->compressionLevel; // Special case with the starting table, since it has the values inlined already @@ -250,6 +258,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) CompressInlinedTable( cx ); auto elapsed = TimerEnd( timer ); Log::Line( " Step 1 completed step in %.2lf seconds.", elapsed ); + timer = TimerBegin(); CudaK32PlotPhase3Step3( cx ); @@ -284,7 +293,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 ); cx.table = rTable; - + #if BBCU_DBG_SKIP_PHASE_2 if( rTable < TableId::Table7 ) DbgLoadTablePairs( cx, rTable+1, false ); @@ -378,7 +387,6 @@ void Step1( CudaK32PlotContext& cx ) p3.pairsLoadOffset = 0; LoadBucket( cx, 0 ); - /// /// Process buckets /// @@ -424,7 +432,7 @@ void Step1( CudaK32PlotContext& cx ) s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream ); } - + // Download slice counts cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream(); @@ -459,6 +467,11 @@ void Step1( CudaK32PlotContext& cx ) p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i]; } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + } + // #if _DEBUG // DbgValidateRMap( cx ); // #endif @@ -598,9 +611,15 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i]; } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + // #if _DEBUG -// // DbgValidateIndices( cx ); -// DbgValidateStep2Output( cx ); +// DbgValidateIndices( cx ); +// // DbgValidateStep2Output( cx ); // // DbgDumpSortedLinePoints( cx ); // #endif } @@ -612,27 +631,47 @@ void 
CompressInlinedTable( CudaK32PlotContext& cx ) //----------------------------------------------------------- void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + static_assert( sizeof( LMap ) == sizeof( uint64 ) ); + auto& p3 = *cx.phase3; // Shared allocations - p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); - p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment ); + p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); + p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment ); // Host allocations - p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index - p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs - - if( cx.cfg.hybrid64Mode ) + if( !cx.cfg.hybrid16Mode ) { - Panic( "Unimplemented for 64G mode. Need to offload LMap/Line Points to disk." ); + p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index + p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs } - - if( !acx.dryRun ) + else if( !cx.diskContext->phase3.rMapBuffer ) { - // ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL ); - // ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL ); + const size_t RMAP_SLICE_SIZE = sizeof( RMap ) * P3_PRUNED_SLICE_MAX; + const size_t INDEX_SLICE_SIZE = sizeof( uint32 ) * P3_PRUNED_SLICE_MAX; + const size_t LP_AND_LMAP_SLICE_SIZE = sizeof( uint64 ) * P3_PRUNED_SLICE_MAX; + + const FileFlags TMP2_QUEUE_FILE_FLAGS = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile; + + cx.diskContext->phase3.rMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_RMAP_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.rMapBuffer, "Failed to create R Map disk buffer." ); + + cx.diskContext->phase3.indexBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_INDEX_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, INDEX_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.indexBuffer, "Failed to create index disk buffer." ); + + cx.diskContext->phase3.lpAndLMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.lpAndLMapBuffer, "Failed to create LP/LMap disk buffer." ); } - // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); + + #if _DEBUG + if( !acx.dryRun && !cx.cfg.hybrid128Mode ) + { + ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL ); + } + #endif if( acx.dryRun ) { @@ -704,12 +743,17 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? 
acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; GpuStreamDescriptor uploadDesc = desc; if( cx.cfg.hybrid128Mode ) + { uploadDesc.pinnedAllocator = acx.pinnedAllocator; + if( cx.cfg.hybrid16Mode ) + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& tx = cx.phase3->xTable; tx.devRMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment ); @@ -717,6 +761,12 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) tx.xIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); tx.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); tx.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + tx.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + } } //----------------------------------------------------------- @@ -728,25 +778,30 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; GpuStreamDescriptor uploadDesc = desc; if( cx.cfg.hybrid128Mode ) + { uploadDesc.pinnedAllocator = acx.pinnedAllocator; + if( cx.cfg.hybrid16Mode ) + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& s1 = cx.phase3->step1; const size_t alignment = acx.alignment; s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); - // sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); - // sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - s1.rMapOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); - // sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); s1.rTableMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer ); + } } //----------------------------------------------------------- @@ -758,25 +813,44 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? 
acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; + + GpuStreamDescriptor uploadDesc = desc; + if( cx.cfg.hybrid16Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + } auto& s2 = cx.phase3->step2; const size_t alignment = acx.alignment; s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - // sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - // sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - - s2.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); - // sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); + s2.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT (desc, acx.dryRun ); - // sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - + + + const size_t devParkAllocSize = P3_PARK_7_SIZE * P3_MAX_P7_PARKS_PER_BUCKET; + + GpuStreamDescriptor parksDesc = desc; + parksDesc.sliceCount = 1; + parksDesc.entriesPerSlice = devParkAllocSize; + parksDesc.sliceAlignment = RoundUpToNextBoundaryT( P3_PARK_7_SIZE, sizeof( uint64 ) ); + + s2.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( parksDesc, acx.dryRun ); + s2.devLTable[0] = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); s2.devLTable[1] = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer ); + s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + + s2.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + } } //----------------------------------------------------------- @@ -788,7 +862,12 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? 
acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; + + if( cx.cfg.hybrid16Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + } auto& s3 = cx.phase3->step3; const size_t alignment = acx.alignment; @@ -808,7 +887,6 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex parksDesc.sliceAlignment = RoundUpToNextBoundaryT( DEV_MAX_PARK_SIZE, sizeof( uint64 ) ); s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( parksDesc, acx.dryRun ); - // cx.gpuDownloadStream[0]->CreateDownloadBuffer( devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); if( acx.dryRun ) { @@ -828,11 +906,16 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex s3.devDeltaLinePoints = acx.devAllocator->CAlloc( linePointAllocCount, alignment ); s3.devIndices = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); - // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment ); - // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment ); - s3.devCTable = acx.devAllocator->AllocT( P3_MAX_CTABLE_SIZE, alignment ); s3.devParkOverrunCount = acx.devAllocator->CAlloc( 1 ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s3.lpIn .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + + s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + } } @@ -986,23 +1069,45 @@ void DbgValidateIndices( CudaK32PlotContext& cx ) const uint32* reader = p3.hostIndices; const size_t readerStride = P3_PRUNED_SLICE_MAX * 3; - uint64 entryCount = 0; for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { - for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ ) + if( cx.cfg.hybrid16Mode ) + { + const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket]; + + cx.diskContext->phase3.indexBuffer->OverrideReadSlices( bucket, sizeof( uint32 ), sizeSlices, BBCU_BUCKET_COUNT ); + cx.diskContext->phase3.indexBuffer->ReadNextBucket(); + const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs(); + ASSERT( readBucket.Length() == p3.prunedBucketCounts[(int)cx.table][bucket] ); + + bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() ); + + idxWriter += readBucket.Length(); + entryCount += readBucket.Length(); + } + else { - const uint32 copyCount = s2.prunedBucketSlices[bucket][slice]; + for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ ) + { + const uint32 copyCount = s2.prunedBucketSlices[slice][bucket]; - bbmemcpy_t( idxWriter, reader, copyCount ); + bbmemcpy_t( idxWriter, reader, copyCount ); - idxWriter += copyCount; - entryCount += copyCount; - reader += readerStride; + idxWriter += copyCount; + entryCount += copyCount; + reader += readerStride; + } } } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.indexBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] ); RadixSort256::Sort( pool, indices, idxTmp, entryCount ); diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h index feb93a6f..34909123 100644 --- a/cuda/CudaPlotPhase3Internal.h +++ b/cuda/CudaPlotPhase3Internal.h @@ -37,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) ); #define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT) #define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2) -static constexpr 
size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough -//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1; -//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT; -//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS; - -// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT )); -//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ), - //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE ); -// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT; - -//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT; - -// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ), -// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE ); -// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT; +static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough +static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; +static constexpr size_t P3_PARK_7_SIZE = CalculatePark7Size( BBCU_K ); +static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE ); static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 ); static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu index 0a7bc0f1..3a7a6449 100644 --- a/cuda/CudaPlotPhase3Step2.cu +++ b/cuda/CudaPlotPhase3Step2.cu @@ -369,6 +369,13 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX ); } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + // #if _DEBUG // // if( cx.table > TableId::Table3 ) // { @@ -401,23 +408,26 @@ void WritePark7( CudaK32PlotContext& cx ) auto& p3 = *cx.phase3; auto& s2 = p3.step2; - + // Load initial bucket LoadBucket( cx, 0 ); // Begin park 7 table in plot cx.plotWriter->BeginTable( PlotTable::Table7 ); - constexpr size_t parkSize = CalculatePark7Size( BBCU_K ); + constexpr size_t parkSize = P3_PARK_7_SIZE; constexpr size_t parkFieldCount = parkSize / sizeof( uint64 ); static_assert( parkFieldCount * sizeof( uint64 ) == parkSize ); + GpuDownloadBuffer& parkDownloader = cx.useParkContext ? 
s2.parksOut : s2.lpOut; - GpuDownloadBuffer& parkDownloader = s2.lpOut; - - constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; + constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET; static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize ); + if( cx.useParkContext ) + { + cx.parkContext->parkBufferChain->Reset(); + } // Host stuff constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT; @@ -426,9 +436,10 @@ void WritePark7( CudaK32PlotContext& cx ) const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table]; const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark ); - byte* hostParks = hostAllocator.AllocT( totalParkCount * parkSize ); - byte* hostParkWriter = hostParks; - uint32* hostLastParkEntries = hostAllocator.CAlloc( kEntriesPerPark ); + byte* hostParks = cx.useParkContext ? nullptr : hostAllocator.AllocT( totalParkCount * parkSize ); + byte* hostParksWriter = cx.useParkContext ? nullptr : hostParks; + uint32* hostLastParkEntries = cx.useParkContext ? (uint32*)cx.parkContext->hostRetainedLinePoints : + hostAllocator.CAlloc( kEntriesPerPark ); static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 ); uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark; @@ -478,14 +489,38 @@ void WritePark7( CudaK32PlotContext& cx ) // Download parks & write to plot const size_t downloadSize = parkCount * parkSize; - parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize, + if( cx.useParkContext ) + { + ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() ); + + // Override the park buffer to be used when using a park context + hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket ); + + // Wait for the next park buffer to be available + parkDownloader.HostCallback([&cx]{ + (void)cx.parkContext->parkBufferChain->GetNextBuffer(); + }); + } + + parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize, []( void* parksBuffer, size_t size, void* userData ) { auto& cx = *reinterpret_cast( userData ); cx.plotWriter->WriteTableData( parksBuffer, size ); + + // Release the buffer after the plot writer is done with it. + if( cx.useParkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } + }, &cx, cx.computeStream ); - hostParkWriter += downloadSize; + hostParksWriter += downloadSize; + if( cx.useParkContext ) + hostParksWriter = nullptr; } // Wait for parks to complete downloading @@ -498,9 +533,19 @@ void WritePark7( CudaK32PlotContext& cx ) // Was there a left-over park? 
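The park-context path above pipelines P7 park serialization through a small ring of host buffers: PeekBuffer addresses a bucket's buffer ahead of time, GetNextBuffer blocks until one is free, and the plot writer's callback releases the oldest once its parks hit the plot file. A minimal standalone sketch of that buffer-chain protocol follows; it is an editor's illustration under assumed semantics, not the patch's actual implementation, and all names in it are hypothetical.

    #include <condition_variable>
    #include <cstddef>
    #include <mutex>
    #include <vector>

    // Hypothetical stand-in for the parkBufferChain used in WritePark7 above.
    class BufferChain
    {
    public:
        BufferChain( size_t bufferCount, size_t bufferSize )
            : _buffers( bufferCount, std::vector<std::byte>( bufferSize ) ) {}

        // Address a future bucket's buffer without acquiring it; safe because
        // buffers are handed out round-robin in bucket order.
        std::byte* PeekBuffer( size_t sequence ) { return _buffers[sequence % _buffers.size()].data(); }

        // Acquire the next buffer, blocking while every buffer is in flight
        // (downloaded from the GPU but not yet written to the plot).
        std::byte* GetNextBuffer()
        {
            std::unique_lock lock( _mutex );
            _cv.wait( lock, [this]{ return _inFlight < _buffers.size(); } );
            ++_inFlight;
            return _buffers[_next++ % _buffers.size()].data();
        }

        // Called once the oldest in-flight buffer has been consumed.
        void ReleaseNextBuffer()
        {
            { std::lock_guard lock( _mutex ); --_inFlight; }
            _cv.notify_one();
        }

    private:
        std::vector<std::vector<std::byte>> _buffers;
        std::mutex                          _mutex;
        std::condition_variable             _cv;
        size_t                              _next     = 0;
        size_t                              _inFlight = 0;
    };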
 if( retainedEntryCount > 0 )
 {
+    if( cx.useParkContext )
+        hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
     // Submit last park to plot
-    TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
-    cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+    TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter );
+    cx.plotWriter->WriteTableData( hostParksWriter, parkSize );
+
+    if( cx.useParkContext )
+    {
+        cx.plotWriter->CallBack([&cx](){
+            cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+        });
+    }
 }

 cx.plotWriter->EndTable();
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index 3f12dd05..c8f9337b 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -318,6 +318,11 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     s3.lpIn   .Reset();
     s3.indexIn.Reset();

+    if( cx.cfg.hybrid16Mode )
+    {
+        cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
+    }

 // #if _DEBUG
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index cc9619d3..6b5dc586 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -56,10 +56,10 @@ GPU-based (CUDA) plotter
 [OPTIONS]:
  -h, --help     : Shows this help message and exits.
  -d, --device   : Select the CUDA device index. (default=0)
-
+
  --disk-128     : Enable hybrid disk plotting for 128G system RAM.
                   Requires a --temp1 and --temp2 to be set.
- --disk-64      : Enable hybrid disk plotting for 64G system RAM.
+ --disk-16      : Enable hybrid disk plotting for 16G system RAM.
                   Requires a --temp1 and --temp2 to be set.
 -t1, --temp1    : Temporary directory 1. Used for longer-lived, sequential writes.
 -t2, --temp2    : Temporary directory 2. Used for temporary, short-lived reads and writes.
@@ -80,11 +80,9 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
     {
         if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
             continue;
-        if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
-            continue;
         if( cli.ReadSwitch( cfg.hybrid128Mode, "--disk-128" ) )
             continue;
-        if( cli.ReadSwitch( cfg.hybrid64Mode, "--disk-64" ) )
+        if( cli.ReadSwitch( cfg.hybrid16Mode, "--disk-16" ) )
         {
             cfg.hybrid128Mode = true;
             continue;
         }
@@ -105,6 +103,8 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
             continue;
         if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) )
             continue;
+        // if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
+        //     continue;
         if( cli.ArgMatch( "--help", "-h" ) )
         {
             Log::Line( USAGE );
@@ -144,6 +144,10 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;

     Log::Line( "[Bladebit CUDA Plotter]" );
+    Log::Line( " Host RAM        : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+    Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" );
+    Log::NewLine();
+
     CudaInit( cx );

     CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
@@ -165,11 +169,12 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.plotFence = new Fence();
     cx.parkFence = new Fence();

-    #if __linux__
-        cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+    #if _WIN32
+        // #MAYBE: Add a configurable option to enable direct downloads on Windows?
+ // On windows always default to using intermediate pinned buffers + cx.downloadDirect = false; #else - // #TODO: One windows, check if we have enough memory, if so, default to true. - cx.downloadDirect = true ;//false; + cx.downloadDirect = cfg.disableDirectDownloads ? false : true; #endif // cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO ); @@ -177,9 +182,12 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext ) // cx.plotWriter->EnableDummyMode(); // Need to do allocations for park serialization differently under the following conditions - if( cx.cfg.disableDirectDownloads || cx.cfg.hybrid128Mode ) //cx.cfg.hybrid64Mode ) + if( cx.downloadDirect || cx.cfg.hybrid128Mode ) { - cx.parkContext = new CudaK32ParkContext{}; + cx.parkContext = new CudaK32ParkContext{}; + + if( cx.cfg.hybrid16Mode ) + cx.useParkContext = true; } // Check for hybrid mode @@ -302,17 +310,19 @@ void CudaK32Plotter::Run( const PlotRequest& req ) cx.plotWriter = nullptr; // Delete any temporary files - if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode ) - { - if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer; - if( cx.diskContext->unsortedXs ) delete cx.diskContext->unsortedXs; - - for( TableId t = TableId::Table1; t <= TableId::Table7; t++ ) + #if !(DBG_BBCU_KEEP_TEMP_FILES) + if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode ) { - if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t]; - if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t]; + if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer; + if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL; + + for( TableId t = TableId::Table1; t <= TableId::Table7; t++ ) + { + if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t]; + if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t]; + } } - } + #endif } //----------------------------------------------------------- @@ -490,13 +500,17 @@ void FpTable( CudaK32PlotContext& cx ) if( cx.cfg.hybrid128Mode ) { - if( cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 ) + if( cx.cfg.hybrid16Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 ) { - cx.diskContext->unsortedXs->Swap(); + cx.diskContext->unsortedL->Swap(); } - if( cx.cfg.hybrid64Mode ) + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->yBuffer->Swap(); cx.diskContext->metaBuffer->Swap(); + cx.diskContext->unsortedR->Swap(); + } } cx.yIn .Reset(); @@ -987,6 +1001,9 @@ void FinalizeTable7( CudaK32PlotContext& cx ) { cx.diskContext->tablesL[(int)TableId::Table7]->Swap(); cx.diskContext->tablesR[(int)TableId::Table7]->Swap(); + + if( cx.cfg.hybrid16Mode ) + cx.diskContext->yBuffer->Swap(); } auto elapsed = TimerEnd( timer ); @@ -1126,8 +1143,8 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket ) const uint32* hostY = cx.hostY; const uint32* hostMeta = cx.hostMeta; - const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left; - const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right; + const uint32* hostPairsL = cx.hostTableL; + const uint16* hostPairsR = cx.hostTableR; const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables; const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2; @@ -1198,11 +1215,11 @@ void AllocBuffers( CudaK32PlotContext& cx ) cx.hostTempAllocSize = 0; cx.devAllocSize = 
0;

-    size_t parksPinnedSize = 0;
-
-    // If on <= 64G mode or not using direct downloads,
+    // If on <= 128G mode or not using direct downloads,
     // we need to use a separate buffer for downloading parks, instead of re-using existing ones.
-    const bool allocateParkBuffers = cx.cfg.disableDirectDownloads || cx.cfg.hybrid128Mode; //cx.cfg.hybrid64Mode;
+    const bool allocateParkBuffers = cx.downloadDirect || cx.cfg.hybrid128Mode;
+    size_t parksPinnedSize = 0;

     // Gather the size needed first
     {
@@ -1222,7 +1239,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         acx.devAllocator = &devAllocator;

         AllocateP1Buffers( cx, acx );

-        cx.pinnedAllocSize    = pinnedAllocator  .Size();
         cx.hostTableAllocSize = hostTableAllocator.Size();
         cx.hostTempAllocSize  = hostTempAllocator .Size();
@@ -1235,7 +1251,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         devAllocator = {};
         CudaK32PlotPhase2AllocateBuffers( cx, acx );

-        cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator  .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1248,7 +1263,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         devAllocator = {};
         CudaK32PlotPhase3AllocateBuffers( cx, acx );

-        cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator  .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1257,14 +1271,15 @@ void AllocBuffers( CudaK32PlotContext& cx )
         // May need to allocate extra pinned buffers for park buffers
         if( allocateParkBuffers )
         {
+            pinnedAllocator = {};
             AllocateParkSerializationBuffers( cx, *acx.pinnedAllocator, acx.dryRun );
-            parksPinnedSize = acx.pinnedAllocator->Size();
+            parksPinnedSize = pinnedAllocator.Size();
         }
     }

-    size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
-    size_t totalHostSize   = cx.hostTableAllocSize + totalPinnedSize;
+    const size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
+    const size_t totalHostSize   = cx.hostTableAllocSize + totalPinnedSize;

     Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
         (double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
@@ -1280,40 +1295,40 @@ void AllocBuffers( CudaK32PlotContext& cx )
     Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
         (double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );

-    Log::Line( "Allocating buffers" );
     // Now actually allocate the buffers
+    Log::Line( "Allocating buffers..." );
     CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );

 #if _DEBUG
     cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize );
 #else
-    #if !_WIN32
-    // if( cx.downloadDirect )
+
+    bool allocateHostTablesPinned = cx.downloadDirect;
+    #if _WIN32
+        // On Windows we always force the use of intermediate buffers, so we allocate on the host
+        allocateHostTablesPinned = false;
+    #endif
+
+    Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
+    if( allocateHostTablesPinned )
         CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-    // else
-    // {
-    //     // #TODO: On windows, first check if we have enough shared memory (512G)?
- // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers. - #else + else cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize ); - #endif - // } #endif - //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) ); - cx.hostBufferTemp = nullptr; -#if _DEBUG - if( cx.hostTempAllocSize ) - cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize ); -#endif + #if _DEBUG || _WIN32 + if( cx.hostTempAllocSize ) + cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize ); + #endif + if( cx.hostBufferTemp == nullptr && cx.hostTempAllocSize ) CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) ); CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) ); // Warm start - if( true ) + if( true )// cx.gCfg->warmStart ) { FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer , cx.pinnedAllocSize ); FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize ); @@ -1377,23 +1392,25 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) // Temp allocations are pinned host buffers that can be re-used for other means in different phases. // This is roughly equivalent to temp2 dir during disk plotting. - cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - if( !cx.cfg.hybrid64Mode ) + if( !cx.cfg.hybrid16Mode ) { + cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment ); } else if( !cx.diskContext->metaBuffer ) { + const size_t ySliceSize = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT; const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT; - cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "metadata.tmp", - BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); + cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::Y_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); + FatalIf( !cx.diskContext->yBuffer, "Failed to create y disk buffer." ); + cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::META_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." ); } -Log::Line( "Host Temp @ %llu GiB", (llu)acx.hostTempAllocator->Size() BtoGB ); -Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB ); // Marking tables used to prune back pointers { @@ -1469,83 +1486,97 @@ Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB multiplier = 1; } - cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - // When storing unsorted inlined x's, we don't have enough space in RAM, store i disk instead. 
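As context for the disk-buffer creation that follows: a DiskBucketBuffer holds a bucketCount x bucketCount grid of slices (one slice per destination bucket, per source bucket), so its backing file comes out to roughly bucketCount-squared times the slice size, with each slice padded to the disk queue's block size when direct I/O (FileFlags::NoBuffering) is enabled. A rough sketch of that arithmetic, assuming this layout; the helper names below are illustrative, not from the patch.

    #include <cstddef>

    constexpr size_t RoundUp( size_t value, size_t boundary )
    {
        return ( value + boundary - 1 ) / boundary * boundary;
    }

    // bucketCount buckets, each holding one slice per bucket; slices are
    // padded up to the I/O block size for direct I/O.
    constexpr size_t BucketBufferFileSize( size_t bucketCount, size_t sliceSize, size_t blockSize )
    {
        return bucketCount * bucketCount * RoundUp( sliceSize, blockSize );
    }

    // Example numbers only: 64 buckets of 1 MiB slices on a 4 KiB-block device.
    static_assert( BucketBufferFileSize( 64, 1u << 20, 4096 ) == 64ull * 64 * ( 1u << 20 ) );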
            const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair );

-            cx.diskContext->unsortedXs = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_x.tmp",
-                BBCU_BUCKET_COUNT, xSliceSize, fileMode, FileAccess::ReadWrite, tmp2FileFlags );
-            FatalIf( !cx.diskContext->unsortedXs, "Failed to create unsorted_x.tmp disk buffer." );
+            cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::LPAIRS_DISK_BUFFER_FILE_NAME.data(),
+                BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+            FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." );
+
+            if( cx.cfg.hybrid16Mode )
+            {
+                cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp",
+                    BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+                FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted R disk buffer." );
+            }
+            else
+            {
+                // In 128G mode we can store intermediate pairs on the host
+                cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+                cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+            }
         }
     }
-Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );

     /// Device & Pinned allocations
     {
-        GpuStreamDescriptor directDesc{};
-        directDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
-        directDesc.sliceCount      = BBCU_BUCKET_COUNT;
-        directDesc.sliceAlignment  = alignment;
-        directDesc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
-        directDesc.deviceAllocator = acx.devAllocator;
-        directDesc.pinnedAllocator = nullptr;   // Start in direct mode (no intermediate pinined buffers)
+        GpuStreamDescriptor yDesc{};
+        yDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+        yDesc.sliceCount      = BBCU_BUCKET_COUNT;
+        yDesc.sliceAlignment  = alignment;
+        yDesc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+        yDesc.deviceAllocator = acx.devAllocator;
+        yDesc.pinnedAllocator = nullptr;   // Start in direct mode (no intermediate pinned buffers)

         // In disk-backed mode, we always have pinned buffers,
         // which are the same buffers used to write and read from disk.
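The descriptors configured below all follow one pattern: direct transfers leave pinnedAllocator null, disk-backed buffers borrow the disk queue's pinned, block-aligned staging buffers, and non-direct transfers stage through an intermediate pinned buffer. A hypothetical helper condensing that pattern as a sketch — MakeStreamDescriptor does not exist in the patch, though the field and context names are the ones used here:

    GpuStreamDescriptor MakeStreamDescriptor( CudaK32PlotContext& cx, CudaK32AllocContext& acx, const bool diskBacked )
    {
        GpuStreamDescriptor desc{};
        desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
        desc.sliceCount      = BBCU_BUCKET_COUNT;
        desc.sliceAlignment  = acx.alignment;
        desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
        desc.deviceAllocator = acx.devAllocator;
        desc.pinnedAllocator = nullptr;     // direct mode: DMA straight to/from the destination

        if( diskBacked )
        {
            // Disk-backed buffers are staged through the same pinned buffers
            // used for disk I/O, aligned to the queue's block size.
            desc.pinnedAllocator = acx.pinnedAllocator;
            desc.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
        }
        else if( !cx.downloadDirect )
        {
            // Non-direct transfers stage through an intermediate pinned buffer.
            desc.pinnedAllocator = acx.pinnedAllocator;
        }

        return desc;
    }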
-    GpuStreamDescriptor diskDescTables = directDesc;
-    GpuStreamDescriptor diskDescXPair  = directDesc;
-    GpuStreamDescriptor diskDescMeta   = directDesc;
+    GpuStreamDescriptor descTablePairs       = yDesc;
+    GpuStreamDescriptor descTableSortedPairs = yDesc;
+    GpuStreamDescriptor descXPairs           = yDesc;
+    GpuStreamDescriptor descMeta             = yDesc;

     if( cx.cfg.hybrid128Mode )
     {
         // Temp 1 Queue
-        diskDescTables.pinnedAllocator = acx.pinnedAllocator;
-        diskDescTables.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
+        descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+        descTableSortedPairs.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();

         // Temp 2 Queue
-        diskDescXPair.pinnedAllocator = acx.pinnedAllocator;
-        diskDescXPair.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+        descXPairs.pinnedAllocator = acx.pinnedAllocator;
+        descXPairs.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();

-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
-            diskDescMeta.pinnedAllocator = acx.pinnedAllocator;
-            diskDescMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+            yDesc.pinnedAllocator = acx.pinnedAllocator;
+            yDesc.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+
+            descMeta.pinnedAllocator = acx.pinnedAllocator;
+            descMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+
+            descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+            descTablePairs.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
         }
     }

-    // In direct mode, we don't have any intermediate pinned buffers,
-    // but our destination buffer is already a pinned buffer.
-    if( cx.cfg.disableDirectDownloads )
+    if( !cx.downloadDirect )
     {
-        directDesc.pinnedAllocator = acx.pinnedAllocator;
-
-        // Assign these here too in case we're not in disk-backed mode
-        diskDescTables.pinnedAllocator = acx.pinnedAllocator;
-        diskDescMeta  .pinnedAllocator = acx.pinnedAllocator;
+        // Use intermediate pinned buffer for transfers to non-pinned destinations
+        yDesc.pinnedAllocator                = acx.pinnedAllocator;
+        descTablePairs.pinnedAllocator       = acx.pinnedAllocator;
+        descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+        descXPairs.pinnedAllocator           = acx.pinnedAllocator;
+        descMeta.pinnedAllocator             = acx.pinnedAllocator;
     }

-
     ///
     /// Downloads
     ///
-    cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT( directDesc, acx.dryRun );
-    cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescMeta, acx.dryRun );
+    cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT( yDesc, acx.dryRun );
+    cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descMeta, acx.dryRun );

     {
         // These download buffers share the same backing buffers
         const size_t devMarker    = acx.devAllocator->Size();
         const size_t pinnedMarker = acx.pinnedAllocator->Size();

-        cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( directDesc, acx.dryRun );
-        cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( directDesc, acx.dryRun );
+        cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );
+        cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );

         acx.devAllocator->PopToMarker( devMarker );
         acx.pinnedAllocator->PopToMarker( pinnedMarker );

         // Allocate Pair at the end, to ensure we grab the highest value
-        cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescXPair, acx.dryRun );
+        cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
     }

     {
@@ -1553,35 +1584,35 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
         const size_t devMarker    = acx.devAllocator->Size();
         const size_t pinnedMarker = acx.pinnedAllocator->Size();

-        cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescTables, acx.dryRun );
-        cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescTables, acx.dryRun );
+        cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );
+        cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );

         acx.devAllocator->PopToMarker( devMarker );
         acx.pinnedAllocator->PopToMarker( pinnedMarker );

         // Allocate Pair at the end, to ensure we grab the highest value
-        cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescTables, acx.dryRun );
+        cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
     }

     ///
     /// Uploads
     ///
-    cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT( directDesc, acx.dryRun );
-    cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( diskDescMeta, acx.dryRun );
+    cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT( yDesc, acx.dryRun );
+    cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descMeta, acx.dryRun );

     // These uploaded buffers share the same backing buffers
     {
         const size_t devMarker    = acx.devAllocator->Size();
         const size_t pinnedMarker = acx.pinnedAllocator->Size();

-        cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( directDesc, acx.dryRun );
-        cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( directDesc, acx.dryRun );
+        cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );
+        cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );

         acx.devAllocator->PopToMarker( devMarker );
         acx.pinnedAllocator->PopToMarker( pinnedMarker );

         // Allocate Pair at the end, to ensure we grab the highest value
-        cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( diskDescXPair, acx.dryRun );
+        cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descXPairs, acx.dryRun );
     }

     /// Device-only allocations
@@ -1617,11 +1648,20 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
     /// In disk-backed mode, assign disk buffers to gpu buffers
     if( cx.cfg.hybrid128Mode && !acx.dryRun )
     {
-        cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedXs );
-        cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedXs );
+        cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+        cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL );

-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
+            cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+            cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+            cx.pairsROut.AssignDiskBuffer( cx.diskContext->unsortedR );
+            cx.pairsRIn .AssignDiskBuffer( cx.diskContext->unsortedR );
+
+            cx.yOut.AssignDiskBuffer( cx.diskContext->yBuffer );
+            cx.yIn .AssignDiskBuffer( cx.diskContext->yBuffer );
+
             cx.metaOut.AssignDiskBuffer( cx.diskContext->metaBuffer );
             cx.metaIn .AssignDiskBuffer( cx.diskContext->metaBuffer );
         }
@@ -1639,7 +1679,7 @@ void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinne
     // Get the largest park size
     const size_t maxParkSize = cx.cfg.gCfg->compressionLevel == 0 ?
                                    CalculateParkSize( TableId::Table1 ) :
-                                   GetCompressionInfoForLevel( cx.cfg.gCfg->compressionLevel ).tableParkSize;
+                                   GetLargestCompressedParkSize();

     const size_t parksPerBuffer = CDivT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; // CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kCheckpoint1Interval ) + 1;
     // Need an extra park for left-over entries
diff --git a/cuda/CudaPlotter.h b/cuda/CudaPlotter.h
index 25a40bac..48e95a34 100644
--- a/cuda/CudaPlotter.h
+++ b/cuda/CudaPlotter.h
@@ -15,7 +15,7 @@ struct CudaK32PlotConfig
     // May be necessary on Windows because of shared memory limitations (usually 50% of system memory)
     bool hybrid128Mode = false;  // Enable hybrid disk-offload w/ 128G of RAM.
-    bool hybrid64Mode  = false;  // Enable hybrid disk-offload w/ 64G of RAM.
+    bool hybrid16Mode  = false;  // Enable hybrid disk-offload w/ 16G of RAM.

     const char* temp1Path = nullptr;   // For 128G RAM mode
     const char* temp2Path = nullptr;   // For 64G RAM mode
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
index c1c2e875..3d06973c 100644
--- a/cuda/GpuDownloadStream.cu
+++ b/cuda/GpuDownloadStream.cu
@@ -41,7 +41,7 @@ void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStrea
 void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
 {
-    ASSERT( 0 );
+    Panic( "Unavailable" );
     // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
     // ASSERT( hostBuffer );
     // ASSERT( workStream );
@@ -148,6 +148,8 @@ void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_
         CallHostFunctionOnStream( downloadStream, [this](){
             self->diskBuffer->GetNextWriteBuffer();
         });
+
+        pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 );
     }

     if( !isDirect )
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index 57ece050..63700c9c 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -50,12 +50,21 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
             diskBuffer->ReadNextBucket();

             // Block until the buffer is fully read from disk
-            // #TODO: Also not do this here, but in a disk stream,
+            // #TODO: Also should not do this here, but in a host-to-host background stream,
             //        so that the next I/O read can happen in the background while
             //        the previous upload to disk is happening, if needed.
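// The non-direct path added below stages uploads through an intermediate
// pinned buffer guarded by CUDA events. A minimal sketch of that pattern;
// StageUpload and the buffer/event parameter names are illustrative
// assumptions, not the actual GpuUploadBuffer members, and CudaErrCheck-style
// error handling is elided:

#include <cuda_runtime.h>

static void StageUpload( cudaStream_t uploadStream,
                         void* devBuf, void* pinnedBuf, const void* userBuf, size_t size,
                         cudaEvent_t pinnedEvent, cudaEvent_t deviceEvent, cudaEvent_t readyEvent )
{
    // Wait until the previous upload has drained the pinned staging buffer
    cudaStreamWaitEvent( uploadStream, pinnedEvent, 0 );

    // Stage: unpinned host buffer -> pinned host buffer
    cudaMemcpyAsync( pinnedBuf, userBuf, size, cudaMemcpyHostToHost, uploadStream );

    // Wait for the device buffer to be free, then copy pinned host -> device
    cudaStreamWaitEvent( uploadStream, deviceEvent, 0 );
    cudaMemcpyAsync( devBuf, pinnedBuf, size, cudaMemcpyHostToDevice, uploadStream );

    // Mark the pinned buffer as reusable and the device buffer as ready for the work stream
    cudaEventRecord( pinnedEvent, uploadStream );
    cudaEventRecord( readyEvent, uploadStream );
}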
             (void)diskBuffer->GetNextReadBuffer();
         });
     }
+    else if( !isDirect )
+    {
+        // Copy from unpinned to pinned first
+        // #TODO: This should be done in a different background host-to-host copy stream
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
+        CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
+
+        hostBuffer = self->pinnedBuffer[index];
+    }

     // Ensure the device buffer is ready for use
     CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
@@ -63,6 +72,12 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
     // Upload to the device buffer
     CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );

+    if( !isDirect )
+    {
+        // Signal that the pinned buffer is ready for re-use
+        CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+    }
+
     // Signal work stream that the device buffer is ready to be used
     CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }
@@ -143,89 +158,57 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32
             (void)diskBuffer->GetNextReadBuffer();
         });
     }
-
-    // Upload to device buffer
-    if( !isDirect )
-    {
-        for( uint32 i = 0; i < length; i++ )
-        {
-            ASSERT( *counts );
-            totalBufferSize += *counts * (size_t)elementSize;
-            counts += countStride;
-        }
-
-        // #TODO: These should be done in a copy stream to perform the copies in the background
-        if( diskBuffer )
-        {
-            CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-            CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-        }
-        else
-        {
-            CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToHost, uploadStream ) );
-            CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-            CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], self->pinnedBuffer[index], totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-        }
-    }
     else
     {
         // Perform fragmented uploads
-        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        const auto waitEvent = isDirect ? self->deviceEvents[index] : self->pinnedEvent[index];
+        const auto copyMode  = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
+
+        // Wait on the device or pinned buffer to be ready (depending on whether the copy is direct or not)
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );

-        const byte* src = (byte*)hostBuffer;
-        byte*       dst = (byte*)self->deviceBuffer[index];
+        const byte* src = (byte*)hostBuffer;
+        byte*       dst = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] );
+
+        const uint32* sizes = counts;

         for( uint32 i = 0; i < length; i++ )
         {
-            const size_t size = *counts * (size_t)elementSize;
+            const size_t size = *sizes * (size_t)elementSize;

-            CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, uploadStream ) );
+            CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );

             dst += size;
             src += srcStride;
-            counts += countStride;
+            sizes += countStride;
         }
-    }
-
-    // Signal work stream that the device buffer is ready to be used
-    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
-
-
-
-    ///
-    /// Old pre-disk Impl
-    ///
-    // ASSERT( hostBuffer );
-    // const uint32 index = SynchronizeOutgoingSequence();
-    // auto stream = self->queue->GetStream();
-
-    // // Ensure the device buffer is ready for use
-    // CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
-
-    // // Perform uploads
-    // //size_t deviceCopySize = 0;
-    // const byte* src = (byte*)hostBuffer;
-    // byte* dst = (byte*)self->deviceBuffer[index];
-
-    // for( uint32 i = 0; i < length; i++ )
-    // {
-    //     const size_t size = *counts * (size_t)elementSize;
-    //     //memcpy( dst, src, size );
-    //     CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) );
+        if( !isDirect )
+        {
+            // Set the pinned buffer as the host buffer so that we can do a sequential copy to the device now
+            hostBuffer = self->pinnedBuffer[index];
+        }
     }

-    //     //deviceCopySize += size;
+    // Upload to device buffer if in non-direct mode
+    if( !isDirect )
+    {
+        for( uint32 i = 0; i < length; i++ )
+        {
+            ASSERT( *counts );
+            totalBufferSize += *counts * (size_t)elementSize;
+            counts += countStride;
+        }

-    //     dst += size;
-    //     src += srcStride;
-    //     counts += countStride;
-    // }
+        // #TODO: This should be done in a copy stream to perform the copies in the background
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );

-    // // Copy to device buffer
-    // //CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) );
+        if( !self->diskBuffer )
+            CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+    }

-    // // Signal work stream that the device buffer is ready to be used
-    // CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
+    // Signal work stream that the device buffer is ready to be used
+    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }

 void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length,
diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu
index 2aca03e5..7fb7c5d0 100644
--- a/cuda/chacha8.cu
+++ b/cuda/chacha8.cu
@@ -249,8 +249,11 @@ void GenF1Cuda( CudaK32PlotContext& cx )
     cx.yOut   .Reset();
     cx.metaOut.Reset();

-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
+    {
+        cx.diskContext->yBuffer->Swap();
         cx.diskContext->metaBuffer->Swap();
+    }
 }

 ///
diff --git a/extract-version.ps1 b/extract-version.ps1
new file mode 100644
index 00000000..c26d1c70
--- /dev/null
+++ b/extract-version.ps1
@@ -0,0 +1,60 @@
+# Navigate to the script's directory
+$scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
+Set-Location -Path $scriptPath
+
+# Arguments
+$ver_component = $args[0]  # The user-specified component from the full version
+
+# Read the version from the file
+$version_str       = (Get-Content 'VERSION' | Select-Object -First 1 | Out-String).Trim()
+$bb_version_suffix = (Get-Content 'VERSION' | Select-Object -Last 1 | Out-String).Trim()
+$version_header    = 'src\Version.h'
+
+if ($version_str -eq $bb_version_suffix) {
+    $bb_version_suffix = ""
+}
+
+# Prepend a '-' to the suffix, if necessary
+if (-Not [string]::IsNullOrEmpty($bb_version_suffix) -and $bb_version_suffix[0] -ne '-') {
+    $bb_version_suffix = "-$bb_version_suffix"
+}
+
+# Parse the major, minor, and revision numbers
+$bb_ver_maj, $bb_ver_min, $bb_ver_rev = $version_str -split '\.' | ForEach-Object { $_.Trim() }
+
+# Get the Git commit hash
+$bb_git_commit = $env:GITHUB_SHA
+if ([string]::IsNullOrEmpty($bb_git_commit)) {
+    $bb_git_commit = & git rev-parse HEAD
+}
+
+if ([string]::IsNullOrEmpty($bb_git_commit)) {
+    $bb_git_commit = "unknown"
+}
+
+# Check if the user wants a specific component
+if (-Not [string]::IsNullOrEmpty($ver_component)) {
+    switch ($ver_component) {
+        "major" {
+            Write-Host -NoNewline $bb_ver_maj
+        }
+        "minor" {
+            Write-Host -NoNewline $bb_ver_min
+        }
+        "revision" {
+            Write-Host -NoNewline $bb_ver_rev
+        }
+        "suffix" {
+            Write-Host -NoNewline $bb_version_suffix
+        }
+        "commit" {
+            Write-Host -NoNewline $bb_git_commit
+        }
+        default {
+            Write-Error "Invalid version component '$ver_component'"
+            exit 1
+        }
+    }
+
+    exit 0
+}
+
diff --git a/src/harvesting/HarvesterDummy.cpp b/src/harvesting/HarvesterDummy.cpp
new file mode 100644
index 00000000..e2d8f69e
--- /dev/null
+++ b/src/harvesting/HarvesterDummy.cpp
@@ -0,0 +1 @@
+// Only here to make CMake happy
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index ae568d1c..48beebb6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -477,7 +477,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
         Log::Line( " Benchmark mode : %s", cfg.benchmarkMode ? "enabled" : "disabled" );
         // Log::Line( " Output path    : %s", cfg.outputFolder );
         // Log::Line( "" );
-    
+
     FatalIf( plotter == nullptr, "No plotter type chosen." );
@@ -486,7 +486,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     // Parse plotter-specific CLI
     plotter->ParseCLI( cfg, cli );
-    
+
     // Parse remaining args as output directories
     cfg.outputFolderCount = (uint32)cli.RemainingArgCount();
     FatalIf( cfg.outputFolderCount < 1, "At least one output folder must be specified." );
@@ -498,6 +498,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     while( cli.HasArgs() )
     {
         outPath = cli.Arg();
+        FatalIf( outPath[0] == '-', "Unrecognized argument '%s'.", outPath.c_str() );

         // Add trailing slash?
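// The "Add trailing slash?" step that follows normalizes each output folder
// path before use. A minimal self-contained sketch of the pattern, assuming
// an illustrative helper name (EnsureTrailingSlash is not the actual
// bladebit function):

#include <string>

static std::string EnsureTrailingSlash( std::string path )
{
    // Append a separator only when the path does not already end in one
    if( !path.empty() && path.back() != '/' && path.back() != '\\' )
        path += '/';

    return path;
}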
         const char endChar = outPath.back();
diff --git a/src/plotting/BufferChain.cpp b/src/plotting/BufferChain.cpp
index ca11dd59..43a7e47b 100644
--- a/src/plotting/BufferChain.cpp
+++ b/src/plotting/BufferChain.cpp
@@ -41,8 +41,8 @@ byte* BufferChain::GetNextBuffer()
 {
     const uint32 bufferCount = (uint32)_buffers.Length();

-    ASSERT( _nextBufferToRelease <= _nextBufferToLock );
-    ASSERT( _nextBufferToLock - _nextBufferToRelease <= bufferCount );
+    PanicIf( _nextBufferToRelease > _nextBufferToLock, "" );
+    PanicIf( _nextBufferToLock - _nextBufferToRelease > bufferCount, "" );

     if( _nextBufferToLock >= bufferCount )
     {
@@ -55,12 +55,17 @@ byte* BufferChain::GetNextBuffer()
 void BufferChain::ReleaseNextBuffer()
 {
     PanicIf( _nextBufferToRelease >= _nextBufferToLock, "" );
+    PanicIf( _nextBufferToLock - _nextBufferToRelease > (uint32)_buffers.Length(), "" );
+
     _fence.Signal( ++_nextBufferToRelease );
 }

 void BufferChain::Reset()
 {
-    GetNextBuffer();
+    // Wait for the last buffer to be released
+    _fence.Wait( _nextBufferToLock );
+
+    // Reset state
     _fence.Reset( 0 );
     _nextBufferToRelease = 0;
     _nextBufferToLock    = 0;
diff --git a/src/plotting/BufferChain.h b/src/plotting/BufferChain.h
index dabacdad..edb934a7 100644
--- a/src/plotting/BufferChain.h
+++ b/src/plotting/BufferChain.h
@@ -37,7 +37,7 @@ class BufferChain
 private:
     Fence       _fence;
     Span<byte*> _buffers;
-    IAllocator* _allocator = nullptr;
+    IAllocator* _allocator  = nullptr;
     size_t      _bufferSize = 0;    // Size of each individual buffer
     uint32      _nextBufferToLock    = 0;
    uint32       _nextBufferToRelease = 0;
diff --git a/src/plotting/Compression.cpp b/src/plotting/Compression.cpp
index 59c099d3..bde4313b 100644
--- a/src/plotting/Compression.cpp
+++ b/src/plotting/Compression.cpp
@@ -2,6 +2,7 @@
 #include "plotting/FSETableGenerator.h"
 #include "util/Util.h"
 #include <mutex>
+#include <algorithm>

 // Caches for C and D tables
 static std::atomic<FSE_CTable*> _cTableCache[32] = {};
@@ -140,4 +141,19 @@ uint32 GetCompressedLPBitCount( const uint32 compressionLevel )
     // lpBitSize = lpBitSize * 2 - 1;

     return lpBitSize * 2 - 1;
+}
+
+size_t GetLargestCompressedParkSize()
+{
+    return std::max( {
+        GetCompressionInfoForLevel( 1 ).tableParkSize,
+        GetCompressionInfoForLevel( 2 ).tableParkSize,
+        GetCompressionInfoForLevel( 3 ).tableParkSize,
+        GetCompressionInfoForLevel( 4 ).tableParkSize,
+        GetCompressionInfoForLevel( 5 ).tableParkSize,
+        GetCompressionInfoForLevel( 6 ).tableParkSize,
+        GetCompressionInfoForLevel( 7 ).tableParkSize,
+        GetCompressionInfoForLevel( 8 ).tableParkSize,
+        GetCompressionInfoForLevel( 9 ).tableParkSize }
+    );
 }
\ No newline at end of file
diff --git a/src/plotting/Compression.h b/src/plotting/Compression.h
index c1967ed4..dbb01228 100644
--- a/src/plotting/Compression.h
+++ b/src/plotting/Compression.h
@@ -16,6 +16,7 @@ FSE_CTable* CreateCompressionCTable( const uint32_t compressionLevel, size_t
 FSE_DTable*     CreateCompressionDTable( const uint32_t compressionLevel, size_t* outTableSize = nullptr );
 CompressionInfo GetCompressionInfoForLevel( const uint32_t compressionLevel );
 uint32_t        GetCompressedLPBitCount( const uint32_t compressionLevel );
+size_t          GetLargestCompressedParkSize();

 template<uint32_t level>
 struct CompressionLevelInfo
diff --git a/src/plotting/DiskBucketBuffer.h b/src/plotting/DiskBucketBuffer.h
index 763ad75b..ec50cc8b 100644
--- a/src/plotting/DiskBucketBuffer.h
+++ b/src/plotting/DiskBucketBuffer.h
@@ -87,7 +87,7 @@ class DiskBucketBuffer : public DiskBufferBase
 private:
     size_t _sliceCapacity;          // Maximum size of each slice
-
+    bool   _verticalWrite = false;

     // size_t _writeSliceStride;    // Offset to the start of the next slice when writing
     // size_t _readSliceStride;     // Offset to the start of the next slice when reading (these are swapped between tables).
diff --git a/src/plotting/DiskBufferBase.cpp b/src/plotting/DiskBufferBase.cpp
index 3faed0fe..38a26270 100644
--- a/src/plotting/DiskBufferBase.cpp
+++ b/src/plotting/DiskBufferBase.cpp
@@ -114,10 +114,11 @@ void DiskBufferBase::Swap()

 void* DiskBufferBase::GetNextWriteBuffer()
 {
-    FatalIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() );
+    PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." );
+    PanicIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() );

     void* buf = _writeBuffers[_nextWriteLock % 2];
-    FatalIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() );
+    PanicIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() );

     if( _nextWriteLock++ >= 2 )
         WaitForWriteToComplete( _nextWriteLock-2 );
@@ -127,11 +128,13 @@ void* DiskBufferBase::GetNextWriteBuffer()

 void* DiskBufferBase::PeekReadBufferForBucket( uint32 bucket )
 {
+    PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." );
     return _readBuffers[bucket % 2];
 }

 void* DiskBufferBase::PeekWriteBufferForBucket( const uint32 bucket )
 {
+    PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." );
     return _writeBuffers[bucket % 2];
 }
@@ -150,10 +153,11 @@ void DiskBufferBase::WaitForLastWriteToComplete()

 void* DiskBufferBase::GetNextReadBuffer()
 {
-    FatalIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() );
+    PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." );
+    PanicIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() );

     void* buf = _readBuffers[_nextReadLock % 2];
-    FatalIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() );
+    PanicIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() );

     WaitForReadToComplete( _nextReadLock++ );
     return buf;
diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp
index 625074b6..f275d980 100644
--- a/src/tools/PlotComparer.cpp
+++ b/src/tools/PlotComparer.cpp
@@ -105,12 +105,12 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli )
     // TestTable( refPlot, tgtPlot, TableId::Table7 );
     // TestTable( refPlot, tgtPlot, TableId::Table3 );

-    // TestC3Table( refPlot, tgtPlot );
+    TestC3Table( refPlot, tgtPlot );

     for( TableId table = TableId::Table1; table <= TableId::Table7; table++ )
         TestTable( refPlot, tgtPlot, table );

-    TestC3Table( refPlot, tgtPlot );
+    // TestC3Table( refPlot, tgtPlot );
 }

 //-----------------------------------------------------------
diff --git a/src/util/Util.h b/src/util/Util.h
index 7d38cdde..e4477e84 100644
--- a/src/util/Util.h
+++ b/src/util/Util.h
@@ -68,9 +68,9 @@
 ///
 /// Assorted utility functions
 ///
-void Exit( int code );
-void FatalExit();
-void PanicExit();
+[[noreturn]] void Exit( int code );
+[[noreturn]] void FatalExit();
+[[noreturn]] void PanicExit();

 void FatalErrorMsg( const char* message, ... );
 void PanicErrorMsg( const char* message, ... );
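// The [[noreturn]] annotations added above are standard C++11 attributes
// telling the compiler that control never returns from Exit/FatalExit/
// PanicExit. This enables better dead-code analysis and silences
// "control reaches end of non-void function" warnings on Fatal/Panic
// paths. An illustrative (hypothetical) caller:

#include "util/Util.h"  // declares [[noreturn]] void PanicExit();

static int BucketCountOrDie( const bool ok )
{
    if( ok )
        return 64;

    // Because PanicExit() is [[noreturn]], the compiler knows this path
    // never falls through, so no missing-return warning is emitted.
    PanicExit();
}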