diff --git a/.vscode/launch.json b/.vscode/launch.json index 36ae6c2f..bceef625 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -131,7 +131,7 @@ "preLaunchTask" : "build_cuda_debug", "program": "${workspaceFolder}/build/bladebit_cuda", - + // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6", // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb", @@ -140,8 +140,9 @@ // "-w -z 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp", // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot", - "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot", - // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot", + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot", + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot", + "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot", "windows": { @@ -357,8 +358,8 @@ /// Compare "plotcmp", - "/home/harold/plot/plot-k32-c01-2023-08-09-20-50-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot.ref", - "/home/harold/plot/plot-k32-c01-2023-08-09-21-33-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot", + "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", + 
"/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", // "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" diff --git a/.vscode/settings.json b/.vscode/settings.json index fafad2e2..6c2da21b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -138,7 +138,8 @@ ], // "cmake.buildArgs": [], "cmake.configureSettings": { - "BB_ENABLE_TESTS": "ON" + "BB_ENABLE_TESTS": "ON", + "BB_CUDA_USE_NATIVE": "ON" }, "C_Cpp.dimInactiveRegions": false, // "cmake.generator": "Unix Makefiles" diff --git a/Bladebit.cmake b/Bladebit.cmake index df45dc3b..9f1166dd 100644 --- a/Bladebit.cmake +++ b/Bladebit.cmake @@ -1,4 +1,4 @@ -add_library(bladebit_core src/plotting/DiskBuffer.h src/plotting/DiskBufferBase.cpp src/plotting/DiskBufferBase.h) +add_library(bladebit_core) target_link_libraries(bladebit_core PUBLIC bladebit_config) target_include_directories(bladebit_core PUBLIC @@ -294,9 +294,13 @@ set(src_bladebit src/plotting/DiskQueue.h src/plotting/DiskQueue.cpp + src/plotting/DiskBuffer.h src/plotting/DiskBuffer.cpp src/plotting/DiskBucketBuffer.h src/plotting/DiskBucketBuffer.cpp + src/plotting/DiskBufferBase.h + src/plotting/DiskBufferBase.cpp + src/util/MPMCQueue.h src/util/CommandQueue.h ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5660cef0..8f72155c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_OSX_DEPLOYMENT_TARGET 10.16) set(CMAKE_CONFIGURATION_TYPES Release Debug) @@ -19,7 +18,7 @@ if(POLICY CMP0091) cmake_policy(SET CMP0091 NEW) endif() -set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.") +set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.") set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" CACHE STRING "MSVC Runtime Library") project(bladebit LANGUAGES C CXX ASM) @@ -85,7 +84,7 @@ endif() # NOTE: These are mostly sandbox test environment, not proper tests option(BB_ENABLE_TESTS "Enable tests." OFF) option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF) -option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON) +option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF) option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF) option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF) option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." 
OFF) @@ -146,7 +145,7 @@ endif() include(Config.cmake) if(NOT ${BB_HARVESTER_ONLY}) - if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION)) + if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION)) include(cmake_modules/EmbedVersion.cmake) endif() diff --git a/Harvester.cmake b/Harvester.cmake index ece2c457..692daa80 100644 --- a/Harvester.cmake +++ b/Harvester.cmake @@ -1,5 +1,5 @@ if(NOT ${BB_HARVESTER_STATIC}) - add_library(bladebit_harvester SHARED) + add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp) else() add_library(bladebit_harvester STATIC) endif() @@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE cuda/CudaF1.cu cuda/CudaMatch.cu cuda/CudaPlotUtil.cu + cuda/GpuQueue.cu - # TODO: Remove this, ought not be needed in harvester + # TODO: Does this have to be here? cuda/GpuStreams.cu + cuda/GpuDownloadStream.cu + src/plotting/DiskBuffer.cpp + src/plotting/DiskBucketBuffer.cpp + src/plotting/DiskBufferBase.cpp + src/plotting/DiskQueue.cpp > $<$: diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake index 6ec042c0..1c346632 100644 --- a/cmake_modules/EmbedVersion.cmake +++ b/cmake_modules/EmbedVersion.cmake @@ -2,18 +2,25 @@ if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded})) message("Embedding local build version") - set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.") - - set(cmd_ver bash) + set(cmd_shell bash) + set(cmd_ext sh) if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") - set(cmd_ver bash.exe) + + find_program(bash_path NAMES bash.exe NO_CACHE) + + if(${bash_path} MATCHES "-NOTFOUND") + set(cmd_shell powershell) + set(cmd_ext ps1) + else() + set(cmd_shell "${bash_path}") + endif() endif() - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE 
bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)

     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}" bb_ver_maj)
@@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI})
     add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
     add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
 endif()
+
+set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
\ No newline at end of file
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
index 8fa77588..b42a5d8a 100644
--- a/cuda/CudaPlotConfig.h
+++ b/cuda/CudaPlotConfig.h
@@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
     // #define DBG_BBCU_P2_WRITE_MARKS 1
     // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+    // #define DBG_BBCU_KEEP_TEMP_FILES 1

 #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index c313b696..4d546480 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -44,11 +44,24 @@ struct CudaK32ParkContext
 struct CudaK32HybridMode
 {
+    // For clarity, these are the file names for the disk buffers
+    // whose disk space will be shared for temp data in both phase 1 and phase 3.
+    // The name indicates their usage and in which phase.
+    static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME      = "p1y-p3index.tmp";
+    static constexpr std::string_view META_DISK_BUFFER_FILE_NAME   = "p1meta-p3rmap.tmp";
+    static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";
+
+    static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME        = META_DISK_BUFFER_FILE_NAME;
+    static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME       = Y_DISK_BUFFER_FILE_NAME;
+    static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;
+
     DiskQueue* temp1Queue;  // Tables Queue
     DiskQueue* temp2Queue;  // Metadata Queue (could be the same as temp1Queue)

-    DiskBucketBuffer* metaBuffer;  // Enabled in 64G mode
-    DiskBucketBuffer* unsortedXs;  // Unsorted Xs are written to disk (uint64 entries)
+    DiskBucketBuffer* metaBuffer;  // Enabled in < 128G mode
+    DiskBucketBuffer* yBuffer;     // Enabled in < 128G mode
+    DiskBucketBuffer* unsortedL;   // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
+    DiskBucketBuffer* unsortedR;   // Unsorted R pairs in < 128G mode

     DiskBuffer* tablesL[7];
     DiskBuffer* tablesR[7];
@@ -58,8 +71,11 @@ struct CudaK32HybridMode

     struct
     {
-        DiskBucketBuffer* lpOut;
-        DiskBucketBuffer* indexOut;
+        // #NOTE: These buffers share the same file-backed storage
+        //        as other buffers in phase 1.
+        DiskBucketBuffer* rMapBuffer;       // Step 1
+        DiskBucketBuffer* indexBuffer;      // X-step/Step 2
+        DiskBucketBuffer* lpAndLMapBuffer;  // X-step/Step 2 (LP) | Step 3 (LMap)
     } phase3;
 };
@@ -142,6 +158,7 @@ struct CudaK32Phase3
     GpuUploadBuffer lMapIn;     // Output map (uint64) from the previous table run.
Or, when L table is the first stored table, it is inlined x values GpuDownloadBuffer lpOut; // Output line points (uint64) GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value) + GpuDownloadBuffer parksOut; // Output P7 parks on the last table uint32* devLTable[2]; // Unpacked L table bucket uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT]; @@ -151,7 +168,7 @@ struct CudaK32Phase3 struct { GpuUploadBuffer lpIn; // Line points from step 2 GpuUploadBuffer indexIn; // Indices from step 2 - GpuDownloadBuffer mapOut; // lTable for next step 1 + GpuDownloadBuffer mapOut; // lTable for next step 2 GpuDownloadBuffer parksOut; // Downloads park buffers to host uint32* hostParkOverrunCount; diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu index 0e1f6480..8d2d5094 100644 --- a/cuda/CudaPlotPhase2.cu +++ b/cuda/CudaPlotPhase2.cu @@ -419,7 +419,7 @@ void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize(); } - if( cx.cfg.disableDirectDownloads ) + if( !cx.downloadDirect ) desc.pinnedAllocator = acx.pinnedAllocator; CudaK32Phase2& p2 = *cx.phase2; diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu index a9b34c62..8fcdfe2a 100644 --- a/cuda/CudaPlotPhase3.cu +++ b/cuda/CudaPlotPhase3.cu @@ -235,6 +235,14 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) } #endif + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + } + + const uint32 compressionLevel = cx.gCfg->compressionLevel; // Special case with the starting table, since it has the values inlined already @@ -250,6 +258,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) CompressInlinedTable( cx ); auto elapsed = TimerEnd( timer ); Log::Line( " Step 1 completed step in %.2lf seconds.", elapsed ); + timer = TimerBegin(); CudaK32PlotPhase3Step3( cx ); @@ -284,7 +293,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 ); cx.table = rTable; - + #if BBCU_DBG_SKIP_PHASE_2 if( rTable < TableId::Table7 ) DbgLoadTablePairs( cx, rTable+1, false ); @@ -378,7 +387,6 @@ void Step1( CudaK32PlotContext& cx ) p3.pairsLoadOffset = 0; LoadBucket( cx, 0 ); - /// /// Process buckets /// @@ -424,7 +432,7 @@ void Step1( CudaK32PlotContext& cx ) s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream ); } - + // Download slice counts cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream(); @@ -459,6 +467,11 @@ void Step1( CudaK32PlotContext& cx ) p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i]; } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + } + // #if _DEBUG // DbgValidateRMap( cx ); // #endif @@ -598,9 +611,15 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i]; } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + // #if _DEBUG -// // DbgValidateIndices( cx ); -// DbgValidateStep2Output( cx ); +// DbgValidateIndices( cx ); +// // DbgValidateStep2Output( cx ); // // DbgDumpSortedLinePoints( cx ); // #endif } @@ -612,27 +631,47 @@ void 
CompressInlinedTable( CudaK32PlotContext& cx ) //----------------------------------------------------------- void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + static_assert( sizeof( LMap ) == sizeof( uint64 ) ); + auto& p3 = *cx.phase3; // Shared allocations - p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); - p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment ); + p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); + p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment ); // Host allocations - p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index - p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs - - if( cx.cfg.hybrid64Mode ) + if( !cx.cfg.hybrid16Mode ) { - Panic( "Unimplemented for 64G mode. Need to offload LMap/Line Points to disk." ); + p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index + p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs } - - if( !acx.dryRun ) + else if( !cx.diskContext->phase3.rMapBuffer ) { - // ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL ); - // ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL ); + const size_t RMAP_SLICE_SIZE = sizeof( RMap ) * P3_PRUNED_SLICE_MAX; + const size_t INDEX_SLICE_SIZE = sizeof( uint32 ) * P3_PRUNED_SLICE_MAX; + const size_t LP_AND_LMAP_SLICE_SIZE = sizeof( uint64 ) * P3_PRUNED_SLICE_MAX; + + const FileFlags TMP2_QUEUE_FILE_FLAGS = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile; + + cx.diskContext->phase3.rMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_RMAP_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.rMapBuffer, "Failed to create R Map disk buffer." ); + + cx.diskContext->phase3.indexBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_INDEX_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, INDEX_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.indexBuffer, "Failed to create index disk buffer." ); + + cx.diskContext->phase3.lpAndLMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.lpAndLMapBuffer, "Failed to create LP/LMap disk buffer." ); } - // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); + + #if _DEBUG + if( !acx.dryRun && !cx.cfg.hybrid128Mode ) + { + ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL ); + } + #endif if( acx.dryRun ) { @@ -704,12 +743,17 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? 
acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; GpuStreamDescriptor uploadDesc = desc; if( cx.cfg.hybrid128Mode ) + { uploadDesc.pinnedAllocator = acx.pinnedAllocator; + if( cx.cfg.hybrid16Mode ) + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& tx = cx.phase3->xTable; tx.devRMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment ); @@ -717,6 +761,12 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) tx.xIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); tx.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); tx.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + tx.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + } } //----------------------------------------------------------- @@ -728,25 +778,30 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; GpuStreamDescriptor uploadDesc = desc; if( cx.cfg.hybrid128Mode ) + { uploadDesc.pinnedAllocator = acx.pinnedAllocator; + if( cx.cfg.hybrid16Mode ) + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& s1 = cx.phase3->step1; const size_t alignment = acx.alignment; s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); - // sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); - // sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - s1.rMapOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); - // sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); s1.rTableMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer ); + } } //----------------------------------------------------------- @@ -758,25 +813,44 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? 
acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; + + GpuStreamDescriptor uploadDesc = desc; + if( cx.cfg.hybrid16Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + } auto& s2 = cx.phase3->step2; const size_t alignment = acx.alignment; s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - // sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - // sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - - s2.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); - // sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); + s2.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT (desc, acx.dryRun ); - // sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - + + + const size_t devParkAllocSize = P3_PARK_7_SIZE * P3_MAX_P7_PARKS_PER_BUCKET; + + GpuStreamDescriptor parksDesc = desc; + parksDesc.sliceCount = 1; + parksDesc.entriesPerSlice = devParkAllocSize; + parksDesc.sliceAlignment = RoundUpToNextBoundaryT( P3_PARK_7_SIZE, sizeof( uint64 ) ); + + s2.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( parksDesc, acx.dryRun ); + s2.devLTable[0] = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); s2.devLTable[1] = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer ); + s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + + s2.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + } } //----------------------------------------------------------- @@ -788,7 +862,12 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex desc.sliceAlignment = acx.alignment; desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; desc.deviceAllocator = acx.devAllocator; - desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? 
acx.pinnedAllocator : nullptr; + desc.pinnedAllocator = nullptr; + + if( cx.cfg.hybrid16Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + } auto& s3 = cx.phase3->step3; const size_t alignment = acx.alignment; @@ -808,7 +887,6 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex parksDesc.sliceAlignment = RoundUpToNextBoundaryT( DEV_MAX_PARK_SIZE, sizeof( uint64 ) ); s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( parksDesc, acx.dryRun ); - // cx.gpuDownloadStream[0]->CreateDownloadBuffer( devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); if( acx.dryRun ) { @@ -828,11 +906,16 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex s3.devDeltaLinePoints = acx.devAllocator->CAlloc( linePointAllocCount, alignment ); s3.devIndices = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); - // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment ); - // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment ); - s3.devCTable = acx.devAllocator->AllocT( P3_MAX_CTABLE_SIZE, alignment ); s3.devParkOverrunCount = acx.devAllocator->CAlloc( 1 ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s3.lpIn .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + + s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + } } @@ -986,23 +1069,45 @@ void DbgValidateIndices( CudaK32PlotContext& cx ) const uint32* reader = p3.hostIndices; const size_t readerStride = P3_PRUNED_SLICE_MAX * 3; - uint64 entryCount = 0; for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { - for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ ) + if( cx.cfg.hybrid16Mode ) + { + const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket]; + + cx.diskContext->phase3.indexBuffer->OverrideReadSlices( bucket, sizeof( uint32 ), sizeSlices, BBCU_BUCKET_COUNT ); + cx.diskContext->phase3.indexBuffer->ReadNextBucket(); + const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs(); + ASSERT( readBucket.Length() == p3.prunedBucketCounts[(int)cx.table][bucket] ); + + bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() ); + + idxWriter += readBucket.Length(); + entryCount += readBucket.Length(); + } + else { - const uint32 copyCount = s2.prunedBucketSlices[bucket][slice]; + for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ ) + { + const uint32 copyCount = s2.prunedBucketSlices[slice][bucket]; - bbmemcpy_t( idxWriter, reader, copyCount ); + bbmemcpy_t( idxWriter, reader, copyCount ); - idxWriter += copyCount; - entryCount += copyCount; - reader += readerStride; + idxWriter += copyCount; + entryCount += copyCount; + reader += readerStride; + } } } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.indexBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] ); RadixSort256::Sort( pool, indices, idxTmp, entryCount ); diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h index feb93a6f..34909123 100644 --- a/cuda/CudaPlotPhase3Internal.h +++ b/cuda/CudaPlotPhase3Internal.h @@ -37,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) ); #define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT) #define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2) -static constexpr 
size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough -//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1; -//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT; -//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS; - -// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT )); -//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ), - //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE ); -// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT; - -//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT; - -// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ), -// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE ); -// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT; +static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough +static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; +static constexpr size_t P3_PARK_7_SIZE = CalculatePark7Size( BBCU_K ); +static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE ); static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 ); static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu index 0a7bc0f1..3a7a6449 100644 --- a/cuda/CudaPlotPhase3Step2.cu +++ b/cuda/CudaPlotPhase3Step2.cu @@ -369,6 +369,13 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX ); } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + // #if _DEBUG // // if( cx.table > TableId::Table3 ) // { @@ -401,23 +408,26 @@ void WritePark7( CudaK32PlotContext& cx ) auto& p3 = *cx.phase3; auto& s2 = p3.step2; - + // Load initial bucket LoadBucket( cx, 0 ); // Begin park 7 table in plot cx.plotWriter->BeginTable( PlotTable::Table7 ); - constexpr size_t parkSize = CalculatePark7Size( BBCU_K ); + constexpr size_t parkSize = P3_PARK_7_SIZE; constexpr size_t parkFieldCount = parkSize / sizeof( uint64 ); static_assert( parkFieldCount * sizeof( uint64 ) == parkSize ); + GpuDownloadBuffer& parkDownloader = cx.useParkContext ? 
s2.parksOut : s2.lpOut; - GpuDownloadBuffer& parkDownloader = s2.lpOut; - - constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; + constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET; static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize ); + if( cx.useParkContext ) + { + cx.parkContext->parkBufferChain->Reset(); + } // Host stuff constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT; @@ -426,9 +436,10 @@ void WritePark7( CudaK32PlotContext& cx ) const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table]; const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark ); - byte* hostParks = hostAllocator.AllocT( totalParkCount * parkSize ); - byte* hostParkWriter = hostParks; - uint32* hostLastParkEntries = hostAllocator.CAlloc( kEntriesPerPark ); + byte* hostParks = cx.useParkContext ? nullptr : hostAllocator.AllocT( totalParkCount * parkSize ); + byte* hostParksWriter = cx.useParkContext ? nullptr : hostParks; + uint32* hostLastParkEntries = cx.useParkContext ? (uint32*)cx.parkContext->hostRetainedLinePoints : + hostAllocator.CAlloc( kEntriesPerPark ); static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 ); uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark; @@ -478,14 +489,38 @@ void WritePark7( CudaK32PlotContext& cx ) // Download parks & write to plot const size_t downloadSize = parkCount * parkSize; - parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize, + if( cx.useParkContext ) + { + ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() ); + + // Override the park buffer to be used when using a park context + hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket ); + + // Wait for the next park buffer to be available + parkDownloader.HostCallback([&cx]{ + (void)cx.parkContext->parkBufferChain->GetNextBuffer(); + }); + } + + parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize, []( void* parksBuffer, size_t size, void* userData ) { auto& cx = *reinterpret_cast( userData ); cx.plotWriter->WriteTableData( parksBuffer, size ); + + // Release the buffer after the plot writer is done with it. + if( cx.useParkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } + }, &cx, cx.computeStream ); - hostParkWriter += downloadSize; + hostParksWriter += downloadSize; + if( cx.useParkContext ) + hostParksWriter = nullptr; } // Wait for parks to complete downloading @@ -498,9 +533,19 @@ void WritePark7( CudaK32PlotContext& cx ) // Was there a left-over park? 
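The park-context path above pipelines P7 park serialization through a small ring of host buffers: PeekBuffer addresses a bucket's buffer ahead of time, GetNextBuffer blocks until one is free, and the plot writer's callback releases the oldest once its parks hit the plot file. A minimal standalone sketch of that buffer-chain protocol follows; it is an editor's illustration under assumed semantics, not the patch's actual implementation, and all names in it are hypothetical.

    #include <condition_variable>
    #include <cstddef>
    #include <mutex>
    #include <vector>

    // Hypothetical stand-in for the parkBufferChain used in WritePark7 above.
    class BufferChain
    {
    public:
        BufferChain( size_t bufferCount, size_t bufferSize )
            : _buffers( bufferCount, std::vector<std::byte>( bufferSize ) ) {}

        // Address a future bucket's buffer without acquiring it; safe because
        // buffers are handed out round-robin in bucket order.
        std::byte* PeekBuffer( size_t sequence ) { return _buffers[sequence % _buffers.size()].data(); }

        // Acquire the next buffer, blocking while every buffer is in flight
        // (downloaded from the GPU but not yet written to the plot).
        std::byte* GetNextBuffer()
        {
            std::unique_lock lock( _mutex );
            _cv.wait( lock, [this]{ return _inFlight < _buffers.size(); } );
            ++_inFlight;
            return _buffers[_next++ % _buffers.size()].data();
        }

        // Called once the oldest in-flight buffer has been consumed.
        void ReleaseNextBuffer()
        {
            { std::lock_guard lock( _mutex ); --_inFlight; }
            _cv.notify_one();
        }

    private:
        std::vector<std::vector<std::byte>> _buffers;
        std::mutex                          _mutex;
        std::condition_variable             _cv;
        size_t                              _next     = 0;
        size_t                              _inFlight = 0;
    };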
 if( retainedEntryCount > 0 )
 {
+    if( cx.useParkContext )
+        hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
     // Submit last park to plot
-    TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
-    cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+    TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter );
+    cx.plotWriter->WriteTableData( hostParksWriter, parkSize );
+
+    if( cx.useParkContext )
+    {
+        cx.plotWriter->CallBack([&cx](){
+            cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+        });
+    }
 }

 cx.plotWriter->EndTable();
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index 3f12dd05..c8f9337b 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -318,6 +318,11 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     s3.lpIn   .Reset();
     s3.indexIn.Reset();

+    if( cx.cfg.hybrid16Mode )
+    {
+        cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
+    }

 // #if _DEBUG
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index cc9619d3..6b5dc586 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -56,10 +56,10 @@ GPU-based (CUDA) plotter
 [OPTIONS]:
  -h, --help     : Shows this help message and exits.
  -d, --device   : Select the CUDA device index. (default=0)
-
+
  --disk-128     : Enable hybrid disk plotting for 128G system RAM.
                   Requires a --temp1 and --temp2 to be set.
- --disk-64      : Enable hybrid disk plotting for 64G system RAM.
+ --disk-16      : Enable hybrid disk plotting for 16G system RAM.
                   Requires a --temp1 and --temp2 to be set.
 -t1, --temp1    : Temporary directory 1. Used for longer-lived, sequential writes.
 -t2, --temp2    : Temporary directory 2. Used for temporary, short-lived reads and writes.
@@ -80,11 +80,9 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
     {
         if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
             continue;
-        if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
-            continue;
         if( cli.ReadSwitch( cfg.hybrid128Mode, "--disk-128" ) )
             continue;
-        if( cli.ReadSwitch( cfg.hybrid64Mode, "--disk-64" ) )
+        if( cli.ReadSwitch( cfg.hybrid16Mode, "--disk-16" ) )
         {
             cfg.hybrid128Mode = true;
             continue;
         }
@@ -105,6 +103,8 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
             continue;
         if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) )
             continue;
+        // if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
+        //     continue;
         if( cli.ArgMatch( "--help", "-h" ) )
         {
             Log::Line( USAGE );
@@ -144,6 +144,10 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;

     Log::Line( "[Bladebit CUDA Plotter]" );
+    Log::Line( " Host RAM        : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+    Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" );
+    Log::NewLine();
+
     CudaInit( cx );

     CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
@@ -165,11 +169,12 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.plotFence = new Fence();
     cx.parkFence = new Fence();

-    #if __linux__
-        cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+    #if _WIN32
+        // #MAYBE: Add a configurable option to enable direct downloads on Windows?
+ // On windows always default to using intermediate pinned buffers + cx.downloadDirect = false; #else - // #TODO: One windows, check if we have enough memory, if so, default to true. - cx.downloadDirect = true ;//false; + cx.downloadDirect = cfg.disableDirectDownloads ? false : true; #endif // cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO ); @@ -177,9 +182,12 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext ) // cx.plotWriter->EnableDummyMode(); // Need to do allocations for park serialization differently under the following conditions - if( cx.cfg.disableDirectDownloads || cx.cfg.hybrid128Mode ) //cx.cfg.hybrid64Mode ) + if( cx.downloadDirect || cx.cfg.hybrid128Mode ) { - cx.parkContext = new CudaK32ParkContext{}; + cx.parkContext = new CudaK32ParkContext{}; + + if( cx.cfg.hybrid16Mode ) + cx.useParkContext = true; } // Check for hybrid mode @@ -302,17 +310,19 @@ void CudaK32Plotter::Run( const PlotRequest& req ) cx.plotWriter = nullptr; // Delete any temporary files - if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode ) - { - if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer; - if( cx.diskContext->unsortedXs ) delete cx.diskContext->unsortedXs; - - for( TableId t = TableId::Table1; t <= TableId::Table7; t++ ) + #if !(DBG_BBCU_KEEP_TEMP_FILES) + if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode ) { - if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t]; - if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t]; + if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer; + if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL; + + for( TableId t = TableId::Table1; t <= TableId::Table7; t++ ) + { + if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t]; + if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t]; + } } - } + #endif } //----------------------------------------------------------- @@ -490,13 +500,17 @@ void FpTable( CudaK32PlotContext& cx ) if( cx.cfg.hybrid128Mode ) { - if( cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 ) + if( cx.cfg.hybrid16Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 ) { - cx.diskContext->unsortedXs->Swap(); + cx.diskContext->unsortedL->Swap(); } - if( cx.cfg.hybrid64Mode ) + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->yBuffer->Swap(); cx.diskContext->metaBuffer->Swap(); + cx.diskContext->unsortedR->Swap(); + } } cx.yIn .Reset(); @@ -987,6 +1001,9 @@ void FinalizeTable7( CudaK32PlotContext& cx ) { cx.diskContext->tablesL[(int)TableId::Table7]->Swap(); cx.diskContext->tablesR[(int)TableId::Table7]->Swap(); + + if( cx.cfg.hybrid16Mode ) + cx.diskContext->yBuffer->Swap(); } auto elapsed = TimerEnd( timer ); @@ -1126,8 +1143,8 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket ) const uint32* hostY = cx.hostY; const uint32* hostMeta = cx.hostMeta; - const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left; - const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right; + const uint32* hostPairsL = cx.hostTableL; + const uint16* hostPairsR = cx.hostTableR; const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables; const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2; @@ -1198,11 +1215,11 @@ void AllocBuffers( CudaK32PlotContext& cx ) cx.hostTempAllocSize = 0; cx.devAllocSize = 
0;

-    size_t parksPinnedSize = 0;
-
-    // If on <= 64G mode or not using direct downloads,
+    // If on <= 128G mode or not using direct downloads,
     // we need to use a separate buffer for downloading parks, instead of re-using existing ones.
-    const bool allocateParkBuffers = cx.cfg.disableDirectDownloads || cx.cfg.hybrid128Mode; //cx.cfg.hybrid64Mode;
+    const bool allocateParkBuffers = cx.downloadDirect || cx.cfg.hybrid128Mode;
+    size_t parksPinnedSize = 0;

     // Gather the size needed first
     {
@@ -1222,7 +1239,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         acx.devAllocator = &devAllocator;

         AllocateP1Buffers( cx, acx );

-        cx.pinnedAllocSize    = pinnedAllocator  .Size();
         cx.hostTableAllocSize = hostTableAllocator.Size();
         cx.hostTempAllocSize  = hostTempAllocator .Size();
@@ -1235,7 +1251,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         devAllocator = {};
         CudaK32PlotPhase2AllocateBuffers( cx, acx );

-        cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator  .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1248,7 +1263,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         devAllocator = {};
         CudaK32PlotPhase3AllocateBuffers( cx, acx );

-        cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator  .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1257,14 +1271,15 @@ void AllocBuffers( CudaK32PlotContext& cx )
         // May need to allocate extra pinned buffers for park buffers
         if( allocateParkBuffers )
         {
+            pinnedAllocator = {};
             AllocateParkSerializationBuffers( cx, *acx.pinnedAllocator, acx.dryRun );
-            parksPinnedSize = acx.pinnedAllocator->Size();
+            parksPinnedSize = pinnedAllocator.Size();
         }
     }

-    size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
-    size_t totalHostSize   = cx.hostTableAllocSize + totalPinnedSize;
+    const size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
+    const size_t totalHostSize   = cx.hostTableAllocSize + totalPinnedSize;

     Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
         (double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
@@ -1280,40 +1295,40 @@ void AllocBuffers( CudaK32PlotContext& cx )
     Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
         (double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );

-    Log::Line( "Allocating buffers" );
     // Now actually allocate the buffers
+    Log::Line( "Allocating buffers..." );
     CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );

 #if _DEBUG
     cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize );
 #else
-    #if !_WIN32
-    // if( cx.downloadDirect )
+
+    bool allocateHostTablesPinned = cx.downloadDirect;
+    #if _WIN32
+        // On Windows we always force the use of intermediate buffers, so we allocate on the host
+        allocateHostTablesPinned = false;
+    #endif
+
+    Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
+    if( allocateHostTablesPinned )
         CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-    // else
-    // {
-    //     // #TODO: On windows, first check if we have enough shared memory (512G)?
- // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers. - #else + else cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize ); - #endif - // } #endif - //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) ); - cx.hostBufferTemp = nullptr; -#if _DEBUG - if( cx.hostTempAllocSize ) - cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize ); -#endif + #if _DEBUG || _WIN32 + if( cx.hostTempAllocSize ) + cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize ); + #endif + if( cx.hostBufferTemp == nullptr && cx.hostTempAllocSize ) CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) ); CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) ); // Warm start - if( true ) + if( true )// cx.gCfg->warmStart ) { FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer , cx.pinnedAllocSize ); FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize ); @@ -1377,23 +1392,25 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) // Temp allocations are pinned host buffers that can be re-used for other means in different phases. // This is roughly equivalent to temp2 dir during disk plotting. - cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - if( !cx.cfg.hybrid64Mode ) + if( !cx.cfg.hybrid16Mode ) { + cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment ); } else if( !cx.diskContext->metaBuffer ) { + const size_t ySliceSize = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT; const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT; - cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "metadata.tmp", - BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); + cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::Y_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); + FatalIf( !cx.diskContext->yBuffer, "Failed to create y disk buffer." ); + cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::META_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." ); } -Log::Line( "Host Temp @ %llu GiB", (llu)acx.hostTempAllocator->Size() BtoGB ); -Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB ); // Marking tables used to prune back pointers { @@ -1469,83 +1486,97 @@ Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB multiplier = 1; } - cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - // When storing unsorted inlined x's, we don't have enough space in RAM, store i disk instead. 
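As context for the disk-buffer creation that follows: a DiskBucketBuffer holds a bucketCount x bucketCount grid of slices (one slice per destination bucket, per source bucket), so its backing file comes out to roughly bucketCount-squared times the slice size, with each slice padded to the disk queue's block size when direct I/O (FileFlags::NoBuffering) is enabled. A rough sketch of that arithmetic, assuming this layout; the helper names below are illustrative, not from the patch.

    #include <cstddef>

    constexpr size_t RoundUp( size_t value, size_t boundary )
    {
        return ( value + boundary - 1 ) / boundary * boundary;
    }

    // bucketCount buckets, each holding one slice per bucket; slices are
    // padded up to the I/O block size for direct I/O.
    constexpr size_t BucketBufferFileSize( size_t bucketCount, size_t sliceSize, size_t blockSize )
    {
        return bucketCount * bucketCount * RoundUp( sliceSize, blockSize );
    }

    // Example numbers only: 64 buckets of 1 MiB slices on a 4 KiB-block device.
    static_assert( BucketBufferFileSize( 64, 1u << 20, 4096 ) == 64ull * 64 * ( 1u << 20 ) );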
            const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair );

-            cx.diskContext->unsortedXs = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_x.tmp",
-                BBCU_BUCKET_COUNT, xSliceSize, fileMode, FileAccess::ReadWrite, tmp2FileFlags );
-            FatalIf( !cx.diskContext->unsortedXs, "Failed to create unsorted_x.tmp disk buffer." );
+            cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::LPAIRS_DISK_BUFFER_FILE_NAME.data(),
+                BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+            FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." );
+
+            if( cx.cfg.hybrid16Mode )
+            {
+                cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp",
+                    BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+                FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted R disk buffer." );
+            }
+            else
+            {
+                // In 128G mode we can store intermediate pairs on the host
+                cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+                cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+            }
         }
     }
-Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );

     /// Device & Pinned allocations
     {
-        GpuStreamDescriptor directDesc{};
-        directDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
-        directDesc.sliceCount      = BBCU_BUCKET_COUNT;
-        directDesc.sliceAlignment  = alignment;
-        directDesc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
-        directDesc.deviceAllocator = acx.devAllocator;
-        directDesc.pinnedAllocator = nullptr;   // Start in direct mode (no intermediate pinined buffers)
+        GpuStreamDescriptor yDesc{};
+        yDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+        yDesc.sliceCount      = BBCU_BUCKET_COUNT;
+        yDesc.sliceAlignment  = alignment;
+        yDesc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+        yDesc.deviceAllocator = acx.devAllocator;
+        yDesc.pinnedAllocator = nullptr;   // Start in direct mode (no intermediate pinned buffers)

         // In disk-backed mode, we always have pinned buffers,
         // which are the same buffers used to write and read from disk.
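The descriptors configured below all follow one pattern: direct transfers leave pinnedAllocator null, disk-backed buffers borrow the disk queue's pinned, block-aligned staging buffers, and non-direct transfers stage through an intermediate pinned buffer. A hypothetical helper condensing that pattern as a sketch — MakeStreamDescriptor does not exist in the patch, though the field and context names are the ones used here:

    GpuStreamDescriptor MakeStreamDescriptor( CudaK32PlotContext& cx, CudaK32AllocContext& acx, const bool diskBacked )
    {
        GpuStreamDescriptor desc{};
        desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
        desc.sliceCount      = BBCU_BUCKET_COUNT;
        desc.sliceAlignment  = acx.alignment;
        desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
        desc.deviceAllocator = acx.devAllocator;
        desc.pinnedAllocator = nullptr;     // direct mode: DMA straight to/from the destination

        if( diskBacked )
        {
            // Disk-backed buffers are staged through the same pinned buffers
            // used for disk I/O, aligned to the queue's block size.
            desc.pinnedAllocator = acx.pinnedAllocator;
            desc.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
        }
        else if( !cx.downloadDirect )
        {
            // Non-direct transfers stage through an intermediate pinned buffer.
            desc.pinnedAllocator = acx.pinnedAllocator;
        }

        return desc;
    }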
-    GpuStreamDescriptor diskDescTables = directDesc;
-    GpuStreamDescriptor diskDescXPair  = directDesc;
-    GpuStreamDescriptor diskDescMeta   = directDesc;
+    GpuStreamDescriptor descTablePairs       = yDesc;
+    GpuStreamDescriptor descTableSortedPairs = yDesc;
+    GpuStreamDescriptor descXPairs           = yDesc;
+    GpuStreamDescriptor descMeta             = yDesc;

     if( cx.cfg.hybrid128Mode )
     {
         // Temp 1 Queue
-        diskDescTables.pinnedAllocator = acx.pinnedAllocator;
-        diskDescTables.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
+        descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+        descTableSortedPairs.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();

         // Temp 2 Queue
-        diskDescXPair.pinnedAllocator = acx.pinnedAllocator;
-        diskDescXPair.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+        descXPairs.pinnedAllocator = acx.pinnedAllocator;
+        descXPairs.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();

-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
-            diskDescMeta.pinnedAllocator = acx.pinnedAllocator;
-            diskDescMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+            yDesc.pinnedAllocator = acx.pinnedAllocator;
+            yDesc.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+
+            descMeta.pinnedAllocator = acx.pinnedAllocator;
+            descMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+
+            descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+            descTablePairs.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
         }
     }

-    // In direct mode, we don't have any intermediate pinned buffers,
-    // but our destination buffer is already a pinned buffer.
-    if( cx.cfg.disableDirectDownloads )
+    if( !cx.downloadDirect )
     {
-        directDesc.pinnedAllocator = acx.pinnedAllocator;
-
-        // Assign these here too in case we're not in disk-backed mode
-        diskDescTables.pinnedAllocator = acx.pinnedAllocator;
-        diskDescMeta  .pinnedAllocator = acx.pinnedAllocator;
+        // Use intermediate pinned buffer for transfers to non-pinned destinations
+        yDesc.pinnedAllocator                = acx.pinnedAllocator;
+        descTablePairs.pinnedAllocator       = acx.pinnedAllocator;
+        descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+        descXPairs.pinnedAllocator           = acx.pinnedAllocator;
+        descMeta.pinnedAllocator             = acx.pinnedAllocator;
     }

-
     ///
     /// Downloads
     ///
-    cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT( directDesc, acx.dryRun );
-    cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescMeta, acx.dryRun );
+    cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT( yDesc, acx.dryRun );
+    cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descMeta, acx.dryRun );

     {
         // These download buffers share the same backing buffers
         const size_t devMarker    = acx.devAllocator->Size();
         const size_t pinnedMarker = acx.pinnedAllocator->Size();

-        cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( directDesc, acx.dryRun );
-        cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( directDesc, acx.dryRun );
+        cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );
+        cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );

         acx.devAllocator->PopToMarker( devMarker );
         acx.pinnedAllocator->PopToMarker( pinnedMarker );

         // Allocate Pair at the end, to ensure we grab the highest value
-        cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescXPair, acx.dryRun );
+        cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
     }

     {
@@ -1553,35 +1584,35 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
         const size_t devMarker    = acx.devAllocator->Size();
         const size_t pinnedMarker = acx.pinnedAllocator->Size();

-        cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescTables, acx.dryRun );
-        cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescTables, acx.dryRun );
+        cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );
+        cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );

         acx.devAllocator->PopToMarker( devMarker );
         acx.pinnedAllocator->PopToMarker( pinnedMarker );

         // Allocate Pair at the end, to ensure we grab the highest value
-        cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( diskDescTables, acx.dryRun );
+        cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
     }

     ///
     /// Uploads
     ///
-    cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT( directDesc, acx.dryRun );
-    cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( diskDescMeta, acx.dryRun );
+    cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT( yDesc, acx.dryRun );
+    cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descMeta, acx.dryRun );

     // These uploaded buffers share the same backing buffers
     {
         const size_t devMarker    = acx.devAllocator->Size();
         const size_t pinnedMarker = acx.pinnedAllocator->Size();

-        cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( directDesc, acx.dryRun );
-        cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( directDesc, acx.dryRun );
+        cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );
+        cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );

         acx.devAllocator->PopToMarker( devMarker );
         acx.pinnedAllocator->PopToMarker( pinnedMarker );

         // Allocate Pair at the end, to ensure we grab the highest value
-        cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( diskDescXPair, acx.dryRun );
+        cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descXPairs, acx.dryRun );
     }

     /// Device-only allocations
@@ -1617,11 +1648,20 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
     /// In disk-backed mode, assign disk buffers to gpu buffers
     if( cx.cfg.hybrid128Mode && !acx.dryRun )
     {
-        cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedXs );
-        cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedXs );
+        cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+        cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL );

-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
+            cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+            cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+            cx.pairsROut.AssignDiskBuffer( cx.diskContext->unsortedR );
+            cx.pairsRIn .AssignDiskBuffer( cx.diskContext->unsortedR );
+
+            cx.yOut.AssignDiskBuffer( cx.diskContext->yBuffer );
+            cx.yIn .AssignDiskBuffer( cx.diskContext->yBuffer );
+
             cx.metaOut.AssignDiskBuffer( cx.diskContext->metaBuffer );
             cx.metaIn .AssignDiskBuffer( cx.diskContext->metaBuffer );
         }
@@ -1639,7 +1679,7 @@ void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinne
     // Get the largest park size
     const size_t maxParkSize = cx.cfg.gCfg->compressionLevel == 0 ?
                                    CalculateParkSize( TableId::Table1 ) :
-                                   GetCompressionInfoForLevel( cx.cfg.gCfg->compressionLevel ).tableParkSize;
+                                   GetLargestCompressedParkSize();

     const size_t parksPerBuffer = CDivT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; // CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kCheckpoint1Interval ) + 1;
     // Need an extra park for left-over entries
diff --git a/cuda/CudaPlotter.h b/cuda/CudaPlotter.h
index 25a40bac..48e95a34 100644
--- a/cuda/CudaPlotter.h
+++ b/cuda/CudaPlotter.h
@@ -15,7 +15,7 @@ struct CudaK32PlotConfig
     // May be necessary on Windows because of shared memory limitations (usually 50% of system memory)
     bool hybrid128Mode = false;  // Enable hybrid disk-offload w/ 128G of RAM.
-    bool hybrid64Mode  = false;  // Enable hybrid disk-offload w/ 64G of RAM.
+    bool hybrid16Mode  = false;  // Enable hybrid disk-offload w/ 16G of RAM.

     const char* temp1Path = nullptr;   // For 128G RAM mode
     const char* temp2Path = nullptr;   // For 64G RAM mode
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
index c1c2e875..3d06973c 100644
--- a/cuda/GpuDownloadStream.cu
+++ b/cuda/GpuDownloadStream.cu
@@ -41,7 +41,7 @@ void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStrea
 void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
 {
-    ASSERT( 0 );
+    Panic( "Unavailable" );
     // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
     // ASSERT( hostBuffer );
     // ASSERT( workStream );
@@ -148,6 +148,8 @@ void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_
         CallHostFunctionOnStream( downloadStream, [this](){
             self->diskBuffer->GetNextWriteBuffer();
         });
+
+        pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 );
     }

     if( !isDirect )
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index 57ece050..63700c9c 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -50,12 +50,21 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
             diskBuffer->ReadNextBucket();

             // Block until the buffer is fully read from disk
-            // #TODO: Also not do this here, but in a disk stream,
+            // #TODO: Also should not do this here, but in a host-to-host background stream,
             //        so that the next I/O read can happen in the background while
             //        the previous upload to disk is happening, if needed.
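// The non-direct path added below stages uploads through an intermediate
// pinned buffer guarded by CUDA events. A minimal sketch of that pattern;
// StageUpload and the buffer/event parameter names are illustrative
// assumptions, not the actual GpuUploadBuffer members, and CudaErrCheck-style
// error handling is elided:

#include <cuda_runtime.h>

static void StageUpload( cudaStream_t uploadStream,
                         void* devBuf, void* pinnedBuf, const void* userBuf, size_t size,
                         cudaEvent_t pinnedEvent, cudaEvent_t deviceEvent, cudaEvent_t readyEvent )
{
    // Wait until the previous upload has drained the pinned staging buffer
    cudaStreamWaitEvent( uploadStream, pinnedEvent, 0 );

    // Stage: unpinned host buffer -> pinned host buffer
    cudaMemcpyAsync( pinnedBuf, userBuf, size, cudaMemcpyHostToHost, uploadStream );

    // Wait for the device buffer to be free, then copy pinned host -> device
    cudaStreamWaitEvent( uploadStream, deviceEvent, 0 );
    cudaMemcpyAsync( devBuf, pinnedBuf, size, cudaMemcpyHostToDevice, uploadStream );

    // Mark the pinned buffer as reusable and the device buffer as ready for the work stream
    cudaEventRecord( pinnedEvent, uploadStream );
    cudaEventRecord( readyEvent, uploadStream );
}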
             (void)diskBuffer->GetNextReadBuffer();
         });
     }
+    else if( !isDirect )
+    {
+        // Copy from unpinned to pinned first
+        // #TODO: This should be done in a different background host-to-host copy stream
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
+        CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
+
+        hostBuffer = self->pinnedBuffer[index];
+    }

     // Ensure the device buffer is ready for use
     CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
@@ -63,6 +72,12 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
     // Upload to the device buffer
     CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );

+    if( !isDirect )
+    {
+        // Signal that the pinned buffer is ready for re-use
+        CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+    }
+
     // Signal work stream that the device buffer is ready to be used
     CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }
@@ -143,89 +158,57 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32
             (void)diskBuffer->GetNextReadBuffer();
         });
     }
-
-    // Upload to device buffer
-    if( !isDirect )
-    {
-        for( uint32 i = 0; i < length; i++ )
-        {
-            ASSERT( *counts );
-            totalBufferSize += *counts * (size_t)elementSize;
-            counts += countStride;
-        }
-
-        // #TODO: These should be done in a copy stream to perform the copies in the background
-        if( diskBuffer )
-        {
-            CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-            CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-        }
-        else
-        {
-            CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToHost, uploadStream ) );
-            CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-            CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], self->pinnedBuffer[index], totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-        }
-    }
     else
     {
         // Perform fragmented uploads
-        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        const auto waitEvent = isDirect ? self->deviceEvents[index] : self->pinnedEvent[index];
+        const auto copyMode  = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
+
+        // Wait on the device or pinned buffer to be ready (depending on whether the copy is direct or not)
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );

-        const byte* src = (byte*)hostBuffer;
-        byte*       dst = (byte*)self->deviceBuffer[index];
+        const byte* src = (byte*)hostBuffer;
+        byte*       dst = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] );
+
+        const uint32* sizes = counts;

         for( uint32 i = 0; i < length; i++ )
         {
-            const size_t size = *counts * (size_t)elementSize;
+            const size_t size = *sizes * (size_t)elementSize;

-            CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, uploadStream ) );
+            CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );

             dst += size;
             src += srcStride;
-            counts += countStride;
+            sizes += countStride;
         }
-    }
-
-    // Signal work stream that the device buffer is ready to be used
-    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
-
-
-
-    ///
-    /// Old pre-disk Impl
-    ///
-    // ASSERT( hostBuffer );
-    // const uint32 index = SynchronizeOutgoingSequence();
-    // auto stream = self->queue->GetStream();
-
-    // // Ensure the device buffer is ready for use
-    // CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
-
-    // // Perform uploads
-    // //size_t deviceCopySize = 0;
-    // const byte* src = (byte*)hostBuffer;
-    // byte* dst = (byte*)self->deviceBuffer[index];
-
-    // for( uint32 i = 0; i < length; i++ )
-    // {
-    //     const size_t size = *counts * (size_t)elementSize;
-    //     //memcpy( dst, src, size );
-    //     CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) );
+        if( !isDirect )
+        {
+            // Set the pinned buffer as the host buffer so that we can do a sequential copy to the device now
+            hostBuffer = self->pinnedBuffer[index];
+        }
     }

-    //     //deviceCopySize += size;
+    // Upload to device buffer if in non-direct mode
+    if( !isDirect )
+    {
+        for( uint32 i = 0; i < length; i++ )
+        {
+            ASSERT( *counts );
+            totalBufferSize += *counts * (size_t)elementSize;
+            counts += countStride;
+        }

-    //     dst += size;
-    //     src += srcStride;
-    //     counts += countStride;
-    // }
+        // #TODO: This should be done in a copy stream to perform the copies in the background
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );

-    // // Copy to device buffer
-    // //CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) );
+        if( !self->diskBuffer )
+            CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+    }

-    // // Signal work stream that the device buffer is ready to be used
-    // CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
+    // Signal work stream that the device buffer is ready to be used
+    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }

 void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length,
diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu
index 2aca03e5..7fb7c5d0 100644
--- a/cuda/chacha8.cu
+++ b/cuda/chacha8.cu
@@ -249,8 +249,11 @@ void GenF1Cuda( CudaK32PlotContext& cx )
     cx.yOut   .Reset();
     cx.metaOut.Reset();

-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
+    {
+        cx.diskContext->yBuffer->Swap();
         cx.diskContext->metaBuffer->Swap();
+    }
 }

 ///
diff --git a/extract-version.ps1 b/extract-version.ps1
new file mode 100644
index 00000000..c26d1c70
--- /dev/null
+++ b/extract-version.ps1
@@ -0,0 +1,60 @@
+# Navigate to the script's directory
+$scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
+Set-Location -Path $scriptPath
+
+# Arguments
+$ver_component = $args[0]  # The user-specified component from the full version
+
+# Read the version from the file
+$version_str       = (Get-Content 'VERSION' | Select-Object -First 1 | Out-String).Trim()
+$bb_version_suffix = (Get-Content 'VERSION' | Select-Object -Last 1 | Out-String).Trim()
+$version_header    = 'src\Version.h'
+
+if ($version_str -eq $bb_version_suffix) {
+    $bb_version_suffix = ""
+}
+
+# Prepend a '-' to the suffix, if necessary
+if (-Not [string]::IsNullOrEmpty($bb_version_suffix) -and $bb_version_suffix[0] -ne '-') {
+    $bb_version_suffix = "-$bb_version_suffix"
+}
+
+# Parse the major, minor, and revision numbers
+$bb_ver_maj, $bb_ver_min, $bb_ver_rev = $version_str -split '\.' | ForEach-Object { $_.Trim() }
+
+# Get the Git commit hash
+$bb_git_commit = $env:GITHUB_SHA
+if ([string]::IsNullOrEmpty($bb_git_commit)) {
+    $bb_git_commit = & git rev-parse HEAD
+}
+
+if ([string]::IsNullOrEmpty($bb_git_commit)) {
+    $bb_git_commit = "unknown"
+}
+
+# Check if the user wants a specific component
+if (-Not [string]::IsNullOrEmpty($ver_component)) {
+    switch ($ver_component) {
+        "major" {
+            Write-Host -NoNewline $bb_ver_maj
+        }
+        "minor" {
+            Write-Host -NoNewline $bb_ver_min
+        }
+        "revision" {
+            Write-Host -NoNewline $bb_ver_rev
+        }
+        "suffix" {
+            Write-Host -NoNewline $bb_version_suffix
+        }
+        "commit" {
+            Write-Host -NoNewline $bb_git_commit
+        }
+        default {
+            Write-Error "Invalid version component '$ver_component'"
+            exit 1
+        }
+    }
+
+    exit 0
+}
+
diff --git a/src/harvesting/HarvesterDummy.cpp b/src/harvesting/HarvesterDummy.cpp
new file mode 100644
index 00000000..e2d8f69e
--- /dev/null
+++ b/src/harvesting/HarvesterDummy.cpp
@@ -0,0 +1 @@
+// Only here to make CMake happy
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index ae568d1c..48beebb6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -477,7 +477,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
         Log::Line( " Benchmark mode : %s", cfg.benchmarkMode ? "enabled" : "disabled" );
         // Log::Line( " Output path    : %s", cfg.outputFolder );
         // Log::Line( "" );
-    
+
     FatalIf( plotter == nullptr, "No plotter type chosen." );
@@ -486,7 +486,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     // Parse plotter-specific CLI
     plotter->ParseCLI( cfg, cli );
-    
+
     // Parse remaining args as output directories
     cfg.outputFolderCount = (uint32)cli.RemainingArgCount();
     FatalIf( cfg.outputFolderCount < 1, "At least one output folder must be specified." );
@@ -498,6 +498,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     while( cli.HasArgs() )
     {
         outPath = cli.Arg();
+        FatalIf( outPath[0] == '-', "Unrecognized argument '%s'.", outPath.c_str() );

         // Add trailing slash?
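// The "Add trailing slash?" step that follows normalizes each output folder
// path before use. A minimal self-contained sketch of the pattern, assuming
// an illustrative helper name (EnsureTrailingSlash is not the actual
// bladebit function):

#include <string>

static std::string EnsureTrailingSlash( std::string path )
{
    // Append a separator only when the path does not already end in one
    if( !path.empty() && path.back() != '/' && path.back() != '\\' )
        path += '/';

    return path;
}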
         const char endChar = outPath.back();
diff --git a/src/plotting/BufferChain.cpp b/src/plotting/BufferChain.cpp
index ca11dd59..43a7e47b 100644
--- a/src/plotting/BufferChain.cpp
+++ b/src/plotting/BufferChain.cpp
@@ -41,8 +41,8 @@ byte* BufferChain::GetNextBuffer()
 {
     const uint32 bufferCount = (uint32)_buffers.Length();

-    ASSERT( _nextBufferToRelease <= _nextBufferToLock );
-    ASSERT( _nextBufferToLock - _nextBufferToRelease <= bufferCount );
+    PanicIf( _nextBufferToRelease > _nextBufferToLock, "" );
+    PanicIf( _nextBufferToLock - _nextBufferToRelease > bufferCount, "" );

     if( _nextBufferToLock >= bufferCount )
     {
@@ -55,12 +55,17 @@ byte* BufferChain::GetNextBuffer()
 void BufferChain::ReleaseNextBuffer()
 {
     PanicIf( _nextBufferToRelease >= _nextBufferToLock, "" );
+    PanicIf( _nextBufferToLock - _nextBufferToRelease > (uint32)_buffers.Length(), "" );
+
     _fence.Signal( ++_nextBufferToRelease );
 }

 void BufferChain::Reset()
 {
-    GetNextBuffer();
+    // Wait for the last buffer to be released
+    _fence.Wait( _nextBufferToLock );
+
+    // Reset state
     _fence.Reset( 0 );
     _nextBufferToRelease = 0;
     _nextBufferToLock    = 0;
diff --git a/src/plotting/BufferChain.h b/src/plotting/BufferChain.h
index dabacdad..edb934a7 100644
--- a/src/plotting/BufferChain.h
+++ b/src/plotting/BufferChain.h
@@ -37,7 +37,7 @@ class BufferChain
 private:
     Fence       _fence;
     Span<byte*> _buffers;
-    IAllocator* _allocator = nullptr;
+    IAllocator* _allocator  = nullptr;
     size_t      _bufferSize = 0;    // Size of each individual buffer
     uint32      _nextBufferToLock    = 0;
    uint32       _nextBufferToRelease = 0;
diff --git a/src/plotting/Compression.cpp b/src/plotting/Compression.cpp
index 59c099d3..bde4313b 100644
--- a/src/plotting/Compression.cpp
+++ b/src/plotting/Compression.cpp
@@ -2,6 +2,7 @@
 #include "plotting/FSETableGenerator.h"
 #include "util/Util.h"
 #include <mutex>
+#include <algorithm>

 // Caches for C and D tables
 static std::atomic<FSE_CTable*> _cTableCache[32] = {};
@@ -140,4 +141,19 @@ uint32 GetCompressedLPBitCount( const uint32 compressionLevel )
     // lpBitSize = lpBitSize * 2 - 1;

     return lpBitSize * 2 - 1;
+}
+
+size_t GetLargestCompressedParkSize()
+{
+    return std::max( {
+        GetCompressionInfoForLevel( 1 ).tableParkSize,
+        GetCompressionInfoForLevel( 2 ).tableParkSize,
+        GetCompressionInfoForLevel( 3 ).tableParkSize,
+        GetCompressionInfoForLevel( 4 ).tableParkSize,
+        GetCompressionInfoForLevel( 5 ).tableParkSize,
+        GetCompressionInfoForLevel( 6 ).tableParkSize,
+        GetCompressionInfoForLevel( 7 ).tableParkSize,
+        GetCompressionInfoForLevel( 8 ).tableParkSize,
+        GetCompressionInfoForLevel( 9 ).tableParkSize }
+    );
 }
\ No newline at end of file
diff --git a/src/plotting/Compression.h b/src/plotting/Compression.h
index c1967ed4..dbb01228 100644
--- a/src/plotting/Compression.h
+++ b/src/plotting/Compression.h
@@ -16,6 +16,7 @@ FSE_CTable* CreateCompressionCTable( const uint32_t compressionLevel, size_t
 FSE_DTable*     CreateCompressionDTable( const uint32_t compressionLevel, size_t* outTableSize = nullptr );
 CompressionInfo GetCompressionInfoForLevel( const uint32_t compressionLevel );
 uint32_t        GetCompressedLPBitCount( const uint32_t compressionLevel );
+size_t          GetLargestCompressedParkSize();

 template<uint32_t level>
 struct CompressionLevelInfo
diff --git a/src/plotting/DiskBucketBuffer.h b/src/plotting/DiskBucketBuffer.h
index 763ad75b..ec50cc8b 100644
--- a/src/plotting/DiskBucketBuffer.h
+++ b/src/plotting/DiskBucketBuffer.h
@@ -87,7 +87,7 @@ class DiskBucketBuffer : public DiskBufferBase
 private:
     size_t _sliceCapacity;          // Maximum size of each slice
-
+    bool   _verticalWrite = false;

     // size_t _writeSliceStride;    // Offset to the start of the next slice when writing
     // size_t _readSliceStride;     // Offset to the start of the next slice when reading (these are swapped between tables).
diff --git a/src/plotting/DiskBufferBase.cpp b/src/plotting/DiskBufferBase.cpp
index 3faed0fe..38a26270 100644
--- a/src/plotting/DiskBufferBase.cpp
+++ b/src/plotting/DiskBufferBase.cpp
@@ -114,10 +114,11 @@ void DiskBufferBase::Swap()

 void* DiskBufferBase::GetNextWriteBuffer()
 {
-    FatalIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() );
+    PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." );
+    PanicIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() );

     void* buf = _writeBuffers[_nextWriteLock % 2];
-    FatalIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() );
+    PanicIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() );

     if( _nextWriteLock++ >= 2 )
         WaitForWriteToComplete( _nextWriteLock-2 );
@@ -127,11 +128,13 @@ void* DiskBufferBase::GetNextWriteBuffer()

 void* DiskBufferBase::PeekReadBufferForBucket( uint32 bucket )
 {
+    PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." );
     return _readBuffers[bucket % 2];
 }

 void* DiskBufferBase::PeekWriteBufferForBucket( const uint32 bucket )
 {
+    PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." );
     return _writeBuffers[bucket % 2];
 }
@@ -150,10 +153,11 @@ void DiskBufferBase::WaitForLastWriteToComplete()

 void* DiskBufferBase::GetNextReadBuffer()
 {
-    FatalIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() );
+    PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." );
+    PanicIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() );

     void* buf = _readBuffers[_nextReadLock % 2];
-    FatalIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() );
+    PanicIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() );

     WaitForReadToComplete( _nextReadLock++ );
     return buf;
diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp
index 625074b6..f275d980 100644
--- a/src/tools/PlotComparer.cpp
+++ b/src/tools/PlotComparer.cpp
@@ -105,12 +105,12 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli )
     // TestTable( refPlot, tgtPlot, TableId::Table7 );
     // TestTable( refPlot, tgtPlot, TableId::Table3 );

-    // TestC3Table( refPlot, tgtPlot );
+    TestC3Table( refPlot, tgtPlot );

     for( TableId table = TableId::Table1; table <= TableId::Table7; table++ )
         TestTable( refPlot, tgtPlot, table );

-    TestC3Table( refPlot, tgtPlot );
+    // TestC3Table( refPlot, tgtPlot );
 }

 //-----------------------------------------------------------
diff --git a/src/util/Util.h b/src/util/Util.h
index 7d38cdde..e4477e84 100644
--- a/src/util/Util.h
+++ b/src/util/Util.h
@@ -68,9 +68,9 @@
 ///
 /// Assorted utility functions
 ///
-void Exit( int code );
-void FatalExit();
-void PanicExit();
+[[noreturn]] void Exit( int code );
+[[noreturn]] void FatalExit();
+[[noreturn]] void PanicExit();

 void FatalErrorMsg( const char* message, ... );
 void PanicErrorMsg( const char* message, ... );
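// The [[noreturn]] annotations added above are standard C++11 attributes
// telling the compiler that control never returns from Exit/FatalExit/
// PanicExit. This enables better dead-code analysis and silences
// "control reaches end of non-void function" warnings on Fatal/Panic
// paths. An illustrative (hypothetical) caller:

#include "util/Util.h"  // declares [[noreturn]] void PanicExit();

static int BucketCountOrDie( const bool ok )
{
    if( ok )
        return 64;

    // Because PanicExit() is [[noreturn]], the compiler knows this path
    // never falls through, so no missing-return warning is emitted.
    PanicExit();
}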