diff --git a/CMakeLists.txt b/CMakeLists.txt
index a29ee1f..a69eaa5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -492,7 +492,8 @@ if (WITH_HIP)
   # target_link_libraries(${PROJECT_NAME}-${PROJECT_VERSION} ${CMAKE_BINARY_DIR}/tnn_hip.lib)
   # target_link_libraries(${PROJECT_NAME}-${PROJECT_VERSION} ${CMAKE_BINARY_DIR}/libtnn_hip.a)
 else() 
-  set(TNN_OUTPUT_NAME "tnn-miner-${INTERNAL_TNN_VERSION}_cpu")
+  # set(TNN_OUTPUT_NAME "tnn-miner-${INTERNAL_TNN_VERSION}_cpu")
+  set(TNN_OUTPUT_NAME "tnn-miner-cpu")
   add_executable(${TNN_OUTPUT_NAME} ${SOURCES_CORE} ${SOURCES_NET} ${SOURCES_CRYPTO} ${HEADERS_CRYPTO})
 
   setup_target_libraries(${TNN_OUTPUT_NAME})
@@ -504,4 +505,6 @@ else()
   # Set output name
   set_target_properties(${TNN_OUTPUT_NAME} PROPERTIES OUTPUT_NAME ${TNN_OUTPUT_NAME})
 
-endif()
\ No newline at end of file
+endif()
+
+unset(HIP_LANG CACHE)
\ No newline at end of file
diff --git a/cmake/hip-crypto/hip-crypto.cmake b/cmake/hip-crypto/hip-crypto.cmake
index b753f81..46edfe6 100644
--- a/cmake/hip-crypto/hip-crypto.cmake
+++ b/cmake/hip-crypto/hip-crypto.cmake
@@ -41,13 +41,15 @@ if (WITH_HIP)
   #   DESTINATION ${CMAKE_INSTALL_PREFIX}
   # )
 
-  if (CMAKE_HIP_PLATFORM MATCHES nvidia OR CMAKE_HIP_PLATFORM MATCHES nvcc)
+  if (HIP_PLATFORM MATCHES nvidia OR HIP_PLATFORM MATCHES nvcc)
     # set(TNN_RDC "-rdc=false")
   else()
     set(TNN_RDC "-fno-gpu-rdc")
   endif()
 
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${TNN_RDC}")
+  unset(TNN_RDC CACHE)
+
   list(APPEND SOURCES_CRYPTO
     ${hipSources}
   )
diff --git a/include/hex.h b/include/hex.h
index a2af615..3994022 100644
--- a/include/hex.h
+++ b/include/hex.h
@@ -10,7 +10,6 @@
 // #include <cuda.h>
 // #include <cuda_runtime.h>
 
-// __host__ __forceinline__ CUDA hints
 inline std::string hexStr(const unsigned char *data, int len)
 {
   static const char characters[] = "0123456789abcdef";
@@ -38,7 +37,6 @@ inline std::string hexStr(const unsigned char *data, int len)
 //   return result;
 // }
 
-// __host__ __device__ __forceinline__ CUDA hints
 inline void hexstrToBytes(std::string s, unsigned char *b)
 {
   for (unsigned int i = 0; i < s.length(); i += 2)
diff --git a/include/terminal.h b/include/terminal.h
index 403f0c3..49e184a 100644
--- a/include/terminal.h
+++ b/include/terminal.h
@@ -177,12 +177,10 @@ inline po::options_description get_prog_opts()
     ("test-randomx", "Run Tevador's reference RandomX tests")
   ;
 
-  // po::options_description astrix("Astrix", col_width);
-  // astrix.add_options()
-  //   ("astrix", "Required for mining to Xatum pools on Xelis")
-  //   ("xelis-bench", "Run a benchmark of xelis-hash with 1 thread")
-  //   ("test-xelis", "Run the xelis-hash tests from the official source code")
-  // ;
+  po::options_description astrix("Astrix", col_width);
+  astrix.add_options()
+    ("test-astrix", "Run a basic astrix-hash validation test")
+  ;
 
   po::options_description advanced("Advanced", col_width);
   advanced.add_options()
@@ -203,6 +201,7 @@ inline po::options_description get_prog_opts()
   general.add(spectre);
   general.add(xelis);
   general.add(randomX);
+  general.add(astrix);
   general.add(advanced);
   general.add(debug);
   return general;
diff --git a/include/tnn-common.hpp b/include/tnn-common.hpp
index 17a814a..40be118 100644
--- a/include/tnn-common.hpp
+++ b/include/tnn-common.hpp
@@ -66,6 +66,12 @@ extern int64_t difficultyDev;
 extern double doubleDiff;
 extern double doubleDiffDev;
 
+extern std::string HIP_names[32];
+extern std::vector<std::atomic<int64_t>> HIP_counters;
+extern std::vector<std::vector<int64_t>> HIP_rates5min;
+extern std::vector<std::vector<int64_t>> HIP_rates1min;
+extern std::vector<std::vector<int64_t>> HIP_rates30sec;
+
 extern std::vector<int64_t> rate5min;
 extern std::vector<int64_t> rate1min;
 extern std::vector<int64_t> rate30sec;
diff --git a/src/coins/mine_astrix.cpp b/src/coins/mine_astrix.cpp
index daa57f6..7f5ccab 100644
--- a/src/coins/mine_astrix.cpp
+++ b/src/coins/mine_astrix.cpp
@@ -5,286 +5,283 @@
 
 void mineAstrix(int tid)
 {
-  for(;;) {
-    boost::this_thread::yield();
-  }
-//   int64_t localJobCounter;
-//   int64_t localOurHeight = 0;
-//   int64_t localDevHeight = 0;
+  int64_t localJobCounter;
+  int64_t localOurHeight = 0;
+  int64_t localDevHeight = 0;
 
-//   uint64_t i = 0;
-//   uint64_t i_dev = 0;
+  uint64_t i = 0;
+  uint64_t i_dev = 0;
 
-//   byte powHash[32];
-//   byte work[AstrixHash::INPUT_SIZE] = {0};
-//   byte devWork[AstrixHash::INPUT_SIZE] = {0};
+  byte powHash[32];
+  byte work[AstrixHash::INPUT_SIZE] = {0};
+  byte devWork[AstrixHash::INPUT_SIZE] = {0};
 
-//   AstrixHash::worker *worker = (AstrixHash::worker *)malloc_huge_pages(sizeof(AstrixHash::worker));
-//   AstrixHash::worker *devWorker = (AstrixHash::worker *)malloc_huge_pages(sizeof(AstrixHash::worker));
+  AstrixHash::worker *worker = (AstrixHash::worker *)malloc_huge_pages(sizeof(AstrixHash::worker));
+  AstrixHash::worker *devWorker = (AstrixHash::worker *)malloc_huge_pages(sizeof(AstrixHash::worker));
 
-// waitForJob:
+waitForJob:
 
-//   while (!isConnected)
-//   {
-//     boost::this_thread::sleep_for(boost::chrono::milliseconds(100));
-//   }
+  while (!isConnected)
+  {
+    boost::this_thread::sleep_for(boost::chrono::milliseconds(100));
+  }
 
-//   while (true)
-//   {
-//     try
-//     {
-//       bool assigned = false;
-//       boost::json::value myJob;
-//       boost::json::value myJobDev;
-//       {
-//         std::scoped_lock<boost::mutex> lockGuard(mutex);
-//         myJob = job;
-//         myJobDev = devJob;
-//         localJobCounter = jobCounter;
-//       }
+  while (true)
+  {
+    try
+    {
+      bool assigned = false;
+      boost::json::value myJob;
+      boost::json::value myJobDev;
+      {
+        std::scoped_lock<boost::mutex> lockGuard(mutex);
+        myJob = job;
+        myJobDev = devJob;
+        localJobCounter = jobCounter;
+      }
       
-//       if (!myJob.at("template").is_string()) {
-//         continue;
-//       }
-//       if (ourHeight == 0 && devHeight == 0)
-//         continue;
+      if (!myJob.at("template").is_string()) {
+        continue;
+      }
+      if (ourHeight == 0 && devHeight == 0)
+        continue;
 
-//       if (ourHeight == 0 || localOurHeight != ourHeight)
-//       {
-//         byte *b2 = new byte[AstrixHash::INPUT_SIZE];
-//         switch (protocol)
-//         {
-//         case SPECTRE_SOLO:
-//           hexstrToBytes(std::string(myJob.at("template").as_string()), b2);
-//           break;
-//         case SPECTRE_STRATUM:
-//           hexstrToBytes(std::string(myJob.at("template").as_string()), b2);
-//           break;
-//         }
-//         memcpy(work, b2, AstrixHash::INPUT_SIZE);
-//         AstrixHash::newMatrix(work, worker->matBuffer, *worker);
-//         // AstrixHash::genPrePowHash(b2, *worker);/
-//         // AstrixHash::newMatrix(b2, worker->mat);
-//         delete[] b2;
-//         localOurHeight = ourHeight;
-//         i = 0;
-//       }
+      if (ourHeight == 0 || localOurHeight != ourHeight)
+      {
+        byte *b2 = new byte[AstrixHash::INPUT_SIZE];
+        switch (protocol)
+        {
+        case ASTRIX_SOLO:
+          hexstrToBytes(std::string(myJob.at("template").as_string()), b2);
+          break;
+        case ASTRIX_STRATUM:
+          hexstrToBytes(std::string(myJob.at("template").as_string()), b2);
+          break;
+        }
+        memcpy(work, b2, AstrixHash::INPUT_SIZE);
+        AstrixHash::newMatrix(work, worker->matBuffer, *worker);
+        // AstrixHash::genPrePowHash(b2, *worker);/
+        // AstrixHash::newMatrix(b2, worker->mat);
+        delete[] b2;
+        localOurHeight = ourHeight;
+        i = 0;
+      }
 
-//       if (devConnected && myJobDev.at("template").is_string())
-//       {
-//         if (devHeight == 0 || localDevHeight != devHeight)
-//         {
-//           byte *b2d = new byte[AstrixHash::INPUT_SIZE];
-//           switch (protocol)
-//           {
-//           case SPECTRE_SOLO:
-//             hexstrToBytes(std::string(myJobDev.at("template").as_string()), b2d);
-//             break;
-//           case SPECTRE_STRATUM:
-//             hexstrToBytes(std::string(myJobDev.at("template").as_string()), b2d);
-//             break;
-//           }
-//           memcpy(devWork, b2d, AstrixHash::INPUT_SIZE);
-//           AstrixHash::newMatrix(devWork, devWorker->matBuffer, *devWorker);
-//           // AstrixHash::genPrePowHash(b2d, *devWorker);
-//           // AstrixHash::newMatrix(b2d, devWorker->mat);
-//           delete[] b2d;
-//           localDevHeight = devHeight;
-//           i_dev = 0;
-//         }
-//       }
+      if (devConnected && myJobDev.at("template").is_string())
+      {
+        if (devHeight == 0 || localDevHeight != devHeight)
+        {
+          byte *b2d = new byte[AstrixHash::INPUT_SIZE];
+          switch (protocol)
+          {
+          case ASTRIX_SOLO:
+            hexstrToBytes(std::string(myJobDev.at("template").as_string()), b2d);
+            break;
+          case ASTRIX_STRATUM:
+            hexstrToBytes(std::string(myJobDev.at("template").as_string()), b2d);
+            break;
+          }
+          memcpy(devWork, b2d, AstrixHash::INPUT_SIZE);
+          AstrixHash::newMatrix(devWork, devWorker->matBuffer, *devWorker);
+          // AstrixHash::genPrePowHash(b2d, *devWorker);
+          // AstrixHash::newMatrix(b2d, devWorker->mat);
+          delete[] b2d;
+          localDevHeight = devHeight;
+          i_dev = 0;
+        }
+      }
 
-//       bool devMine = false;
-//       double which;
-//       bool submit = false;
-//       double DIFF = 1;
-//       Num cmpDiff;
+      bool devMine = false;
+      double which;
+      bool submit = false;
+      double DIFF = 1;
+      Num cmpDiff;
 
-//       // printf("end of job application\n");
-//       while (localJobCounter == jobCounter)
-//       {
-//         which = (double)(rand() % 10000);
-//         devMine = (devConnected && devHeight > 0 && which < devFee * 100.0);
-//         DIFF = devMine ? doubleDiffDev : doubleDiff;
-//         if (DIFF == 0)
-//           continue;
+      // printf("end of job application\n");
+      while (localJobCounter == jobCounter)
+      {
+        which = (double)(rand() % 10000);
+        devMine = (devConnected && devHeight > 0 && which < devFee * 100.0);
+        DIFF = devMine ? doubleDiffDev : doubleDiff;
+        if (DIFF == 0)
+          continue;
 
-//         // cmpDiff = ConvertDifficultyToBig(DIFF, SPECTRE_X);
-//         cmpDiff = AstrixHash::diffToTarget(DIFF);
+        // cmpDiff = ConvertDifficultyToBig(DIFF, ASTRIX_X);
+        cmpDiff = AstrixHash::diffToTarget(DIFF);
 
-//         uint64_t *nonce = devMine ? &i_dev : &i;
-//         (*nonce)++;
+        uint64_t *nonce = devMine ? &i_dev : &i;
+        (*nonce)++;
 
-//         // printf("nonce = %llu\n", *nonce);
+        // printf("nonce = %llu\n", *nonce);
 
-//         byte *WORK = (devMine && devConnected) ? &devWork[0] : &work[0];
-//         byte *nonceBytes = &WORK[72];
-//         uint64_t n;
+        byte *WORK = (devMine && devConnected) ? &devWork[0] : &work[0];
+        byte *nonceBytes = &WORK[72];
+        uint64_t n;
         
-//         int enLen = 0;
+        int enLen = 0;
         
-//         boost::json::value &J = devMine ? myJobDev : myJob;
-//         if (!J.as_object().if_contains("extraNonce") || J.at("extraNonce").as_string().size() == 0)
-//           n = ((tid - 1) % (256 * 256)) | ((rand() % 256) << 16) | ((*nonce) << 24);
-//         else {
-//           uint64_t eN = std::stoull(std::string(J.at("extraNonce").as_string().c_str()), NULL, 16);
-//           enLen = std::string(J.at("extraNonce").as_string()).size()/2;
-//           int offset = (64 - enLen*8);
-//           n = ((tid - 1) % (256 * 256)) | (((*nonce) << 16) & ((1ULL << offset)-1)) | (eN << offset);
-//         }
-//         memcpy(nonceBytes, (byte *)&n, 8);
+        boost::json::value &J = devMine ? myJobDev : myJob;
+        if (!J.as_object().if_contains("extraNonce") || J.at("extraNonce").as_string().size() == 0)
+          n = ((tid - 1) % (256 * 256)) | ((rand() % 256) << 16) | ((*nonce) << 24);
+        else {
+          uint64_t eN = std::stoull(std::string(J.at("extraNonce").as_string().c_str()), NULL, 16);
+          enLen = std::string(J.at("extraNonce").as_string()).size()/2;
+          int offset = (64 - enLen*8);
+          n = ((tid - 1) % (256 * 256)) | (((*nonce) << 16) & ((1ULL << offset)-1)) | (eN << offset);
+        }
+        memcpy(nonceBytes, (byte *)&n, 8);
 
-//         // printf("after nonce: %s\n", hexStr(WORK, AstrixHash::INPUT_SIZE).c_str());
+        // printf("after nonce: %s\n", hexStr(WORK, AstrixHash::INPUT_SIZE).c_str());
 
-//         if (localJobCounter != jobCounter) {
-//           // printf("thread %d updating job before hash\n", tid);
-//           break;
-//         }
+        if (localJobCounter != jobCounter) {
+          // printf("thread %d updating job before hash\n", tid);
+          break;
+        }
 
-//         AstrixHash::worker &usedWorker = devMine ? *devWorker : *worker;
-//         AstrixHash::hash(usedWorker, WORK, AstrixHash::INPUT_SIZE, powHash);
+        AstrixHash::worker &usedWorker = devMine ? *devWorker : *worker;
+        AstrixHash::hash(usedWorker, WORK, AstrixHash::INPUT_SIZE, powHash);
 
-//         // if (littleEndian())
-//         // {
-//         //   std::reverse(powHash, powHash + 32);
-//         // }
+        if (littleEndian())
+        {
+          std::reverse(usedWorker.scratchData, usedWorker.scratchData + 32);
+        }
 
-//         counter.fetch_add(1);
-//         submit = (devMine && devConnected) ? !submittingDev : !submitting;
+        counter.fetch_add(1);
+        submit = (devMine && devConnected) ? !submittingDev : !submitting;
 
-//         if (localJobCounter != jobCounter || localOurHeight != ourHeight) {
-//           // printf("thread %d updating job after hash\n", tid);
-//           break;
-//         }
+        if (localJobCounter != jobCounter || localOurHeight != ourHeight) {
+          // printf("thread %d updating job after hash\n", tid);
+          break;
+        }
 
 
-//         if (Num(hexStr(powHash, 32).c_str(), 16) <= cmpDiff)
-//         {
-//           // printf("thread %d entered submission process\n", tid);
-//           if (!submit) {
-//             for(;;) {
-//               submit = (devMine && devConnected) ? !submittingDev : !submitting;
-//               if (submit || localJobCounter != jobCounter || localOurHeight != ourHeight)
-//                 break;
-//               boost::this_thread::yield();
-//             }
-//           }
-//           if (localJobCounter != jobCounter) {
-//             // printf("thread %d updating job after check\n", tid);
-//             break;
-//           }
-//           // if (littleEndian())
-//           // {
-//           //   std::reverse(powHash, powHash + 32);
-//           // }
-//         //   std::string b64 = base64::to_base64(std::string((char *)&WORK[0], XELIS_TEMPLATE_SIZE));
-//           // boost::lock_guard<boost::mutex> lock(mutex);
-//           if (devMine)
-//           {
-//             submittingDev = true;
-//             // std::scoped_lock<boost::mutex> lockGuard(devMutex);
-//             // if (localJobCounter != jobCounter || localDevHeight != devHeight)
-//             // {
-//             //   break;
-//             // }
-//             setcolor(CYAN);
-//             std::cout << "\n(DEV) Thread " << tid << " found a dev share\n" << std::flush;
-//             setcolor(BRIGHT_WHITE);
-//             switch (protocol)
-//             {
-//             case SPECTRE_SOLO:
-//               devShare = {{"block_template", hexStr(&WORK[0], AstrixHash::INPUT_SIZE).c_str()}};
-//               break;
-//             case SPECTRE_STRATUM:
-//               std::vector<char> nonceStr;
-//               // Num(std::to_string((n << enLen*8) >> enLen*8).c_str(),10).print(nonceStr, 16);
-//               Num(std::to_string(n).c_str(),10).print(nonceStr, 16);
-//               devShare = {{{"id", SpectreStratum::submitID},
-//                         {"method", SpectreStratum::submit.method.c_str()},
-//                         {"params", {devWorkerName,                                   // WORKER
-//                                     myJobDev.at("jobId").as_string().c_str(), // JOB ID
-//                                     std::string(nonceStr.data()).c_str()}}}};
+        if (Num(hexStr(usedWorker.scratchData, 32).c_str(), 16) <= cmpDiff)
+        {
+          // printf("thread %d entered submission process\n", tid);
+          if (!submit) {
+            for(;;) {
+              submit = (devMine && devConnected) ? !submittingDev : !submitting;
+              if (submit || localJobCounter != jobCounter || localOurHeight != ourHeight)
+                break;
+              boost::this_thread::yield();
+            }
+          }
+          if (localJobCounter != jobCounter) {
+            // printf("thread %d updating job after check\n", tid);
+            break;
+          }
+          // if (littleEndian())
+          // {
+          //   std::reverse(powHash, powHash + 32);
+          // }
+        //   std::string b64 = base64::to_base64(std::string((char *)&WORK[0], XELIS_TEMPLATE_SIZE));
+          // boost::lock_guard<boost::mutex> lock(mutex);
+          if (devMine)
+          {
+            submittingDev = true;
+            // std::scoped_lock<boost::mutex> lockGuard(devMutex);
+            // if (localJobCounter != jobCounter || localDevHeight != devHeight)
+            // {
+            //   break;
+            // }
+            setcolor(CYAN);
+            std::cout << "\n(DEV) Thread " << tid << " found a dev share\n" << std::flush;
+            setcolor(BRIGHT_WHITE);
+            switch (protocol)
+            {
+            case ASTRIX_SOLO:
+              devShare = {{"block_template", hexStr(&WORK[0], AstrixHash::INPUT_SIZE).c_str()}};
+              break;
+            case ASTRIX_STRATUM:
+              std::vector<char> nonceStr;
+              // Num(std::to_string((n << enLen*8) >> enLen*8).c_str(),10).print(nonceStr, 16);
+              Num(std::to_string(n).c_str(),10).print(nonceStr, 16);
+              devShare = {{{"id", SpectreStratum::submitID},
+                        {"method", SpectreStratum::submit.method.c_str()},
+                        {"params", {devWorkerName,                                   // WORKER
+                                    myJobDev.at("jobId").as_string().c_str(), // JOB ID
+                                    std::string(nonceStr.data()).c_str()}}}};
 
-//               break;
-//             }
-//             data_ready = true;
-//           }
-//           else
-//           {
-//             submitting = true;
-//             // std::scoped_lock<boost::mutex> lockGuard(userMutex);
-//             // if (localJobCounter != jobCounter || localOurHeight != ourHeight)
-//             // {
-//             //   break;
-//             // }
-//             setcolor(BRIGHT_YELLOW);
-//             std::cout << "\nThread " << tid << " found a nonce!\n" << std::flush;
-//             setcolor(BRIGHT_WHITE);
-//             switch (protocol)
-//             {
-//             case SPECTRE_SOLO:
-//               share = {{"block_template", hexStr(&WORK[0], AstrixHash::INPUT_SIZE).c_str()}};
-//               break;
-//             case SPECTRE_STRATUM:
-//               std::vector<char> nonceStr;
-//               // Num(std::to_string((n << enLen*8) >> enLen*8).c_str(),10).print(nonceStr, 16);
-//               Num(std::to_string(n).c_str(),10).print(nonceStr, 16);
-//               share = {{{"id", SpectreStratum::submitID},
-//                         {"method", SpectreStratum::submit.method.c_str()},
-//                         {"params", {workerName,                                   // WORKER
-//                                     myJob.at("jobId").as_string().c_str(), // JOB ID
-//                                     std::string(nonceStr.data()).c_str()}}}};
+              break;
+            }
+            data_ready = true;
+          }
+          else
+          {
+            submitting = true;
+            // std::scoped_lock<boost::mutex> lockGuard(userMutex);
+            // if (localJobCounter != jobCounter || localOurHeight != ourHeight)
+            // {
+            //   break;
+            // }
+            setcolor(BRIGHT_YELLOW);
+            std::cout << "\nThread " << tid << " found a nonce!\n" << std::flush;
+            setcolor(BRIGHT_WHITE);
+            switch (protocol)
+            {
+            case ASTRIX_SOLO:
+              share = {{"block_template", hexStr(&WORK[0], AstrixHash::INPUT_SIZE).c_str()}};
+              break;
+            case ASTRIX_STRATUM:
+              std::vector<char> nonceStr;
+              // Num(std::to_string((n << enLen*8) >> enLen*8).c_str(),10).print(nonceStr, 16);
+              Num(std::to_string(n).c_str(),10).print(nonceStr, 16);
+              share = {{{"id", SpectreStratum::submitID},
+                        {"method", SpectreStratum::submit.method.c_str()},
+                        {"params", {workerName,                                   // WORKER
+                                    myJob.at("jobId").as_string().c_str(), // JOB ID
+                                    std::string(nonceStr.data()).c_str()}}}};
 
-//               // std::cout << "blob: " << hexStr(&WORK[0], AstrixHash::INPUT_SIZE).c_str() << std::endl;
-//               // std::cout << "nonce: " << nonceStr.data() << std::endl;
-//               // std::cout << "extraNonce: " << hexStr(&WORK[AstrixHash::INPUT_SIZE - 48], enLen).c_str() << std::endl;
-//               // std::cout << "hash: " << hexStr(&powHash[0], 32) << std::endl;
-//               // std::vector<char> diffHex;
-//               // cmpDiff.print(diffHex, 16);
-//               // std::cout << "difficulty (LE): " << std::string(diffHex.data()).c_str() << std::endl;
-//               // std::cout << "powValue: " << Num(hexStr(powHash, 32).c_str(), 16) << std::endl;
-//               // std::cout << "target (decimal): " << cmpDiff << std::endl;
+              // std::cout << "blob: " << hexStr(&WORK[0], AstrixHash::INPUT_SIZE).c_str() << std::endl;
+              // std::cout << "nonce: " << nonceStr.data() << std::endl;
+              // std::cout << "extraNonce: " << hexStr(&WORK[AstrixHash::INPUT_SIZE - 48], enLen).c_str() << std::endl;
+              // std::cout << "hash: " << hexStr(&powHash[0], 32) << std::endl;
+              // std::vector<char> diffHex;
+              // cmpDiff.print(diffHex, 16);
+              // std::cout << "difficulty (LE): " << std::string(diffHex.data()).c_str() << std::endl;
+              // std::cout << "powValue: " << Num(hexStr(powHash, 32).c_str(), 16) << std::endl;
+              // std::cout << "target (decimal): " << cmpDiff << std::endl;
 
-//               // printf("blob: %s\n", foundBlob.c_str());
-//               // printf("hash (BE): %s\n", hexStr(&powHash[0], 32).c_str());
-//               // printf("nonce (Full bytes for injection): %s\n", hexStr((byte *)&n, 8).c_str());
+              // printf("blob: %s\n", foundBlob.c_str());
+              // printf("hash (BE): %s\n", hexStr(&powHash[0], 32).c_str());
+              // printf("nonce (Full bytes for injection): %s\n", hexStr((byte *)&n, 8).c_str());
 
-//               break;
-//             }
-//             data_ready = true;
-//           }
-//           // printf("thread %d finished submission process\n", tid);
-//           cv.notify_all();
-//         }
+              break;
+            }
+            data_ready = true;
+          }
+          // printf("thread %d finished submission process\n", tid);
+          cv.notify_all();
+        }
 
-//         if (!isConnected) {
-//           data_ready = true;
-//           cv.notify_all();
-//           break;
-//         }
-//       }
-//       if (!isConnected) {
-//         data_ready = true;
-//         cv.notify_all();
-//         break;
-//       }
-//     }
-//     catch (std::exception& e)
-//     {
-//       setcolor(RED);
-//       std::cerr << "Error in POW Function" << std::endl;
-//       std::cerr << e.what() << std::endl;
-//       setcolor(BRIGHT_WHITE);
+        if (!isConnected) {
+          data_ready = true;
+          cv.notify_all();
+          break;
+        }
+      }
+      if (!isConnected) {
+        data_ready = true;
+        cv.notify_all();
+        break;
+      }
+    }
+    catch (std::exception& e)
+    {
+      setcolor(RED);
+      std::cerr << "Error in POW Function" << std::endl;
+      std::cerr << e.what() << std::endl;
+      setcolor(BRIGHT_WHITE);
 
-//       localJobCounter = -1;
-//       localOurHeight = -1;
-//       localDevHeight = -1;
-//     }
-//     if (!isConnected) {
-//       data_ready = true;
-//       cv.notify_all();
-//       break;
-//     }
-//   }
-//   goto waitForJob;
+      localJobCounter = -1;
+      localOurHeight = -1;
+      localDevHeight = -1;
+    }
+    if (!isConnected) {
+      data_ready = true;
+      cv.notify_all();
+      break;
+    }
+  }
+  goto waitForJob;
 }
diff --git a/src/coins/mine_spectre.cpp b/src/coins/mine_spectre.cpp
index 095a39a..3299782 100644
--- a/src/coins/mine_spectre.cpp
+++ b/src/coins/mine_spectre.cpp
@@ -166,7 +166,7 @@ void mineSpectre(int tid)
         }
 
 
-        if (Num(hexStr(powHash, 32).c_str(), 16) <= cmpDiff)
+        if (Num(hexStr(usedWorker.scratchData, 32).c_str(), 16) <= cmpDiff)
         {
           // printf("thread %d entered submission process\n", tid);
           if (!submit) {
diff --git a/src/core/gpulibs.h b/src/core/gpulibs.h
index f5843cf..2bdb62b 100644
--- a/src/core/gpulibs.h
+++ b/src/core/gpulibs.h
@@ -10,9 +10,8 @@ inline int GPUTest() {
     if (is_hip_supported()) {
       helloTest();
       benchAstrixHip();
+      AstrixHash::test();
     }
   #endif
-
-  AstrixHash::hipCompare();
   return 0;
 }
\ No newline at end of file
diff --git a/src/core/miner.cpp b/src/core/miner.cpp
index 5fd6ab0..d36c4b3 100644
--- a/src/core/miner.cpp
+++ b/src/core/miner.cpp
@@ -58,6 +58,13 @@
 int miningAlgo = DERO_HASH;
 int reportCounter = 0;
 int reportInterval = 3;
+
+std::string HIP_names[32];
+std::vector<std::atomic<int64_t>> HIP_counters(32);
+std::vector<std::vector<int64_t>> HIP_rates5min(32);
+std::vector<std::vector<int64_t>> HIP_rates1min(32);
+std::vector<std::vector<int64_t>> HIP_rates30sec(32);
+
 std::atomic<int64_t> counter = 0;
 std::atomic<int64_t> benchCounter = 0;
 boost::asio::io_context my_context;
@@ -200,6 +207,9 @@ int main(int argc, char **argv)
   // test_cshake256();
 
   GPUTest();
+  #ifdef TNN_HIP
+  return 0;
+  #endif
 
   std::atexit(onExit);
   signal(SIGTERM, sigterm);
@@ -320,7 +330,7 @@ int main(int argc, char **argv)
   {
     #if defined(TNN_ASTRIXHASH)
     symbol = "AIX";
-    protocol = ASTRIX_SOLO;
+    protocol = ASTRIX_STRATUM;
     #else
     setcolor(RED);
     printf(unsupported_astrix);
@@ -392,6 +402,19 @@ int main(int argc, char **argv)
     #endif
   }
 
+  if (vm.count("test-astrix"))
+  {
+    #if defined(TNN_RANDOMX)
+    return AstrixHash::test();
+    #else
+    setcolor(RED);
+    printf(unsupported_astrix);
+    fflush(stdout);
+    setcolor(BRIGHT_WHITE);
+    return 1;
+    #endif
+  }
+
   if (vm.count("xelis-bench"))
   {
     #if defined(TNN_XELISHASH)
@@ -479,7 +502,7 @@ int main(int argc, char **argv)
     }
     if(wallet.find("astrix", 0) != std::string::npos) {
       symbol = "AIX";
-      protocol = ASTRIX_SOLO;
+      protocol = ASTRIX_STRATUM;
     }
     if(wallet.find("ZEPHYR", 0) != std::string::npos) {
       symbol = "ZEPH";
diff --git a/src/core/miner.h b/src/core/miner.h
index 64d1df2..e3a25d9 100644
--- a/src/core/miner.h
+++ b/src/core/miner.h
@@ -68,7 +68,7 @@ std::string defaultHost[] = {
   "monerohash.com",
   "", // verus
   "",
-  "34150" // aix
+  "127.0.0.1" // aix
 };
 
 std::string devPort[] = {
diff --git a/src/core/reporter.cpp b/src/core/reporter.cpp
index 3ea3332..f9d2dd9 100644
--- a/src/core/reporter.cpp
+++ b/src/core/reporter.cpp
@@ -85,6 +85,12 @@ int update_handler(const boost::system::error_code& error)
       case RX0:
         dPrint = difficulty;
         break;
+      case VERUSHASH:
+        dPrint = difficulty;
+        break;
+      case ASTRIX_HASH:
+        dPrint = doubleDiff;
+        break;
     }
 
     std::cout << std::setw(2) << "ACCEPTED " << accepted << std::setw(2) << " | REJECTED " << rejected
diff --git a/src/crypto/astrix-hash/astrix-hash.cpp b/src/crypto/astrix-hash/astrix-hash.cpp
index f640007..ae1f91b 100644
--- a/src/crypto/astrix-hash/astrix-hash.cpp
+++ b/src/crypto/astrix-hash/astrix-hash.cpp
@@ -4,26 +4,34 @@
 #include <hex.h>
 #include <string.h>
 
-extern "C"{
-#include <sha3/sha3.h>
-#include <cshake/cshake.h>
-}
+#include <immintrin.h>
+
+#include <crypto/tiny-keccak/tiny-keccak.h>
 
 namespace AstrixHash
 {
+  constexpr int HALF_MATRIX_SIZE = 64/2;
+  constexpr int QUARTER_MATRIX_SIZE = 64/4;
+
   const char *pwHashDomain = "ProofOfWorkHash";
   const char *heavyHashDomain = "HeavyHash";
 
   const Num trueMax("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 16);
-  const Num maxTarget("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 16);
+  const Num maxTarget("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 16);
   const Num minHash = (Num(1) << 256) / maxTarget;
-  const Num bigKilo(1e3);
+  const Num bigGig(1e9);
+
+  static const uint8_t powP[Plen] = {0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02};
+  static const uint8_t heavyP[Plen] = {0x09, 0x85, 0x24, 0xb2, 0x52, 0x4c, 0xd7, 0x3a, 0x16, 0x42, 0x9f, 0x2f, 0x0e, 0x9b, 0x62, 0x79, 0xee, 0xf8, 0xc7, 0x16, 0x48, 0xff, 0x14, 0x7a, 0x98, 0x64, 0x05, 0x80, 0x4c, 0x5f, 0xa7, 0x11, 0xda, 0xce, 0xee, 0x44, 0xdf, 0xe0, 0x20, 0xe7, 0x69, 0x40, 0xf3, 0x14, 0x2e, 0xd8, 0xc7, 0x72, 0xba, 0x35, 0x89, 0x93, 0x2a, 0xff, 0x00, 0xc1, 0x62, 0xc4, 0x0f, 0x25, 0x40, 0x90, 0x21, 0x5e, 0x48, 0x6a, 0xcf, 0x0d, 0xa6, 0xf9, 0x39, 0x80, 0x0c, 0x3d, 0x2a, 0x79, 0x9f, 0xaa, 0xbc, 0xa0, 0x26, 0xa2, 0xa9, 0xd0, 0x5d, 0xc0, 0x31, 0xf4, 0x3f, 0x8c, 0xc1, 0x54, 0xc3, 0x4c, 0x1f, 0xd3, 0x3d, 0xcc, 0x69, 0xa7, 0x01, 0x7d, 0x6b, 0x6c, 0xe4, 0x93, 0x24, 0x56, 0xd3, 0x5b, 0xc6, 0x2e, 0x44, 0xb0, 0xcd, 0x99, 0x3a, 0x4b, 0xf7, 0x4e, 0xb0, 0xf2, 0x34, 0x54, 0x83, 0x86, 0x4c, 0x77, 0x16, 0x94, 0xbc, 0x36, 0xb0, 0x61, 0xe9, 0x07, 0x07, 0xcc, 0x65, 0x77, 0xb1, 0x1d, 0x8f, 0x7e, 0x39, 0x6d, 0xc4, 0xba, 0x80, 0xdb, 0x8f, 0xea, 0x58, 0xca, 0x34, 0x7b, 0xd3, 0xf2, 0x92, 0xb9, 0x57, 0xb9, 0x81, 0x84, 0x04, 0xc5, 0x76, 0xc7, 0x2e, 0xc2, 0x12, 0x51, 0x67, 0x9f, 0xc3, 0x47, 0x0a, 0x0c, 0x29, 0xb5, 0x9d, 0x39, 0xbb, 0x92, 0x15, 0xc6, 0x9f, 0x2f, 0x31, 0xe0, 0x9a, 0x54, 0x35, 0xda, 0xb9, 0x10, 0x7d, 0x32, 0x19, 0x16};
 
   bool checkPow(Num in, Num diff) {
     if (trueMax >> (diff.to_double()) < in) return false; 
     return true;
   }
 
+  #define make_uchar4(a,b,c,d) \
+    (((uint32_t)(a << 24) | (uint32_t)(b << 16) | (uint16_t)(c << 8) | d))
+
   Num diffToTarget(double diff) {
     // Create a Num object representing the difficulty
 
@@ -38,24 +46,45 @@ namespace AstrixHash
 
     // Calculate the target by dividing maxTarget by difficulty
     Num hv = Num::mul(diff, minHash);
-    Num target = Num::div(hv, bigKilo);
+    Num target = Num::div(hv, bigGig);
 
     return target;
   }
 
-  void heavyHash(byte *hash, matrix &mat, byte *out)
+  static inline void amul4bit(uint32_t packed_vec1[32], uint32_t packed_vec2[32], uint32_t *ret)
+  {
+    unsigned int res = 0;
+
+    // Loop through each 32-bit element (containing four 4-bit values)
+    for (int i = 0; i < QUARTER_MATRIX_SIZE; i++)
+    {
+      // Extract 4-bit values from packed_vec1 and packed_vec2
+      for (int j = 0; j < 8; j++)
+      {
+        uint8_t val1 = (packed_vec1[i] >> (4 * j)) & 0xF; // Extract 4 bits
+        uint8_t val2 = (packed_vec2[i] >> (4 * j)) & 0xF; // Extract 4 bits
+        res += val1 * val2;                               // Perform the dot product
+      }
+    }
+
+    *ret = res;
+  }
+
+  void heavyHash(byte *scratch, matrix &mat, byte *out)
   {
     std::array<uint16_t, 64> v{}, p{};
     for (int i = 0; i < matSize / 2; i++)
     {
-      v[i * 2] = uint16_t(hash[i] >> 4);
-      v[i * 2 + 1] = uint16_t(hash[i] & 0x0f);
+      v[i * 2] = uint16_t(scratch[i] >> 4);
+      v[i * 2 + 1] = uint16_t(scratch[i] & 0x0f);
     }
 
     // build the product array
+    #pragma unroll(4)
     for (int i = 0; i < 64; i++)
     {
       uint16_t s = 0;
+      #pragma unroll(4)
       for (int j = 0; j < 64; j++)
       {
         s += mat[i][j] * v[j];
@@ -66,26 +95,42 @@ namespace AstrixHash
     // calculate the digest
     for (size_t i = 0; i < 32; i++)
     {
-      out[i] = hash[i] ^ (static_cast<uint8_t>(p[i * 2] << 4) | static_cast<uint8_t>(p[i * 2 + 1]));
+      scratch[i] = scratch[i] ^ (static_cast<uint8_t>(p[i * 2] << 4) | static_cast<uint8_t>(p[i * 2 + 1]));
     }
 
     // hash the digest a final time, reverse bytes
+    #pragma unroll(5)
+    for (int i=0; i<4; i++) ((uint64_t *)scratch)[i] = ((uint64_t *)heavyP)[i] ^ ((uint64_t *)scratch)[i];
+
+    #pragma unroll
+    for (int i = 4; i < 25; i++) ((uint64_t *)scratch)[i] = ((uint64_t *)heavyP)[i];
+
+    keccakf(scratch);
+    // std::reverse(scratch, scratch+32);
 
-    cshake256_nil_function_name(out, 32, "HeavyHash", out, 32*8);
-    // std::reverse(out, out+32);
+    // memcpy(out, scratch, 32);
+  }
+
+  static inline void sha3_256_astrix(byte *in, byte *scratch) {
+    scratch[80] = 0x06;
+    scratch[135] = 0x80;
+
+    memcpy(scratch, in, 80);
+
+    keccakf(scratch);
   }
 
   void hash(worker &worker, byte *in, int len, byte *out)
   {
+    memset(worker.scratchData, 0, 200);
     // cshake256("ProofOfWorkHash", in, len, worker.sha3Hash, 32);
     // newMatrix(in, worker.mat, worker);
     memcpy(worker.mat, worker.matBuffer, sizeof(matrix));
-    sha3_256(in, len, worker.sha3Hash);
+    sha3_256_astrix(in, worker.scratchData);
     
-    printf("SHA3 Result: %s\n", hexStr(worker.sha3Hash, 32).c_str());
     // cshake256_nil_function_name(in, len, "ProofOfWorkHash", worker.sha3Hash, 32*8);
     // AstroBWTv3(worker.sha3Hash, 32, worker.astrobwtv3Hash, *worker.astroWorker, false);
-    heavyHash(worker.sha3Hash, worker.mat, out);
+    heavyHash(worker.scratchData, worker.mat, out);
   }
 
   void testWithInput(const char* input, byte *out) {
@@ -119,8 +164,10 @@ namespace AstrixHash
   //   free(aw);
   }
 
-  int hipCompare() {
+  int test() {
     const char* input = "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f40414243444546470000000000000000";
+    const char* expected = "40c377567fbc1d33f564493fb670eca26ba02c8a05c4dc61ab3bf86415006276";
+
     byte in[80];
     memset(in, 0, 80);
 
@@ -133,8 +180,8 @@ namespace AstrixHash
     newMatrix(in, w.matBuffer, w);
     hash(w, in, 80, out);
 
-    printf("CPU Result: %s\n", hexStr(out, 32).c_str());
-    return 0;
+    printf("CPU Astrix Result:\n%s\nWant:\n%s\n", hexStr(w.scratchData, 32).c_str(), expected);
+    return strcmp(hexStr(w.scratchData, 32).c_str(), expected) == 0;
   }
 
   // int test() {
diff --git a/src/crypto/astrix-hash/astrix-hash.h b/src/crypto/astrix-hash/astrix-hash.h
index fb9af40..79012d3 100644
--- a/src/crypto/astrix-hash/astrix-hash.h
+++ b/src/crypto/astrix-hash/astrix-hash.h
@@ -4,6 +4,7 @@
 #include <array>
 #include <num.h>
 #include <bitset>
+#include <string.h>
 
 // NOTES
 
@@ -25,6 +26,7 @@ namespace AstrixHash
   {
     matrix matBuffer;
     matrix mat;
+    uint8_t scratchData[200];
     double copied[matSize][matSize];
     std::bitset<matSize> rowsSelected;
     byte sha3Hash[32];
@@ -152,5 +154,5 @@ namespace AstrixHash
   bool checkPow(Num in, Num diff);
   Num diffToTarget(double diff);
   Num diffToHash(double diff);
-  int hipCompare();
+  int test();
 }
\ No newline at end of file
diff --git a/src/crypto/spectrex/spectrex.cpp b/src/crypto/spectrex/spectrex.cpp
index 3a0d3a8..60c86a4 100644
--- a/src/crypto/spectrex/spectrex.cpp
+++ b/src/crypto/spectrex/spectrex.cpp
@@ -4,9 +4,7 @@
 #include <openssl/sha.h>
 #include <hex.h>
 
-extern "C"{
-#include "cshake/cshake.h"
-}
+#include <crypto/tiny-keccak/tiny-keccak.h>
 
 namespace SpectreX
 {
@@ -23,6 +21,9 @@ namespace SpectreX
     return true;
   }
 
+  static const uint8_t powP[Plen] = {0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02};
+  static const uint8_t heavyP[Plen] = {0x09, 0x85, 0x24, 0xb2, 0x52, 0x4c, 0xd7, 0x3a, 0x16, 0x42, 0x9f, 0x2f, 0x0e, 0x9b, 0x62, 0x79, 0xee, 0xf8, 0xc7, 0x16, 0x48, 0xff, 0x14, 0x7a, 0x98, 0x64, 0x05, 0x80, 0x4c, 0x5f, 0xa7, 0x11, 0xda, 0xce, 0xee, 0x44, 0xdf, 0xe0, 0x20, 0xe7, 0x69, 0x40, 0xf3, 0x14, 0x2e, 0xd8, 0xc7, 0x72, 0xba, 0x35, 0x89, 0x93, 0x2a, 0xff, 0x00, 0xc1, 0x62, 0xc4, 0x0f, 0x25, 0x40, 0x90, 0x21, 0x5e, 0x48, 0x6a, 0xcf, 0x0d, 0xa6, 0xf9, 0x39, 0x80, 0x0c, 0x3d, 0x2a, 0x79, 0x9f, 0xaa, 0xbc, 0xa0, 0x26, 0xa2, 0xa9, 0xd0, 0x5d, 0xc0, 0x31, 0xf4, 0x3f, 0x8c, 0xc1, 0x54, 0xc3, 0x4c, 0x1f, 0xd3, 0x3d, 0xcc, 0x69, 0xa7, 0x01, 0x7d, 0x6b, 0x6c, 0xe4, 0x93, 0x24, 0x56, 0xd3, 0x5b, 0xc6, 0x2e, 0x44, 0xb0, 0xcd, 0x99, 0x3a, 0x4b, 0xf7, 0x4e, 0xb0, 0xf2, 0x34, 0x54, 0x83, 0x86, 0x4c, 0x77, 0x16, 0x94, 0xbc, 0x36, 0xb0, 0x61, 0xe9, 0x07, 0x07, 0xcc, 0x65, 0x77, 0xb1, 0x1d, 0x8f, 0x7e, 0x39, 0x6d, 0xc4, 0xba, 0x80, 0xdb, 0x8f, 0xea, 0x58, 0xca, 0x34, 0x7b, 0xd3, 0xf2, 0x92, 0xb9, 0x57, 0xb9, 0x81, 0x84, 0x04, 0xc5, 0x76, 0xc7, 0x2e, 0xc2, 0x12, 0x51, 0x67, 0x9f, 0xc3, 0x47, 0x0a, 0x0c, 0x29, 0xb5, 0x9d, 0x39, 0xbb, 0x92, 0x15, 0xc6, 0x9f, 0x2f, 0x31, 0xe0, 0x9a, 0x54, 0x35, 0xda, 0xb9, 0x10, 0x7d, 0x32, 0x19, 0x16};
+
   Num diffToTarget(double diff) {
     // Create a Num object representing the difficulty
 
@@ -42,19 +43,21 @@ namespace SpectreX
     return target;
   }
 
-  void heavyHash(byte *hash, matrix &mat, byte *out)
+  void heavyHash(byte *scratch, matrix &mat, byte *out)
   {
     std::array<uint16_t, 64> v{}, p{};
     for (int i = 0; i < matSize / 2; i++)
     {
-      v[i * 2] = uint16_t(hash[i] >> 4);
-      v[i * 2 + 1] = uint16_t(hash[i] & 0x0f);
+      v[i * 2] = uint16_t(scratch[i] >> 4);
+      v[i * 2 + 1] = uint16_t(scratch[i] & 0x0f);
     }
 
     // build the product array
+    #pragma unroll(4)
     for (int i = 0; i < 64; i++)
     {
       uint16_t s = 0;
+      #pragma unroll(4)
       for (int j = 0; j < 64; j++)
       {
         s += mat[i][j] * v[j];
@@ -65,13 +68,20 @@ namespace SpectreX
     // calculate the digest
     for (size_t i = 0; i < 32; i++)
     {
-      out[i] = hash[i] ^ (static_cast<uint8_t>(p[i * 2] << 4) | static_cast<uint8_t>(p[i * 2 + 1]));
+      scratch[i] = scratch[i] ^ (static_cast<uint8_t>(p[i * 2] << 4) | static_cast<uint8_t>(p[i * 2 + 1]));
     }
 
     // hash the digest a final time, reverse bytes
+    #pragma unroll(5)
+    for (int i=0; i<4; i++) ((uint64_t *)scratch)[i] = ((uint64_t *)heavyP)[i] ^ ((uint64_t *)scratch)[i];
+
+    #pragma unroll
+    for (int i = 4; i < 25; i++) ((uint64_t *)scratch)[i] = ((uint64_t *)heavyP)[i];
+
+    keccakf(scratch);
+    std::reverse(scratch, scratch+32);
 
-    cshake256_nil_function_name(out, 32, "HeavyHash", out, 32*8);
-    std::reverse(out, out+32);
+    // memcpy(out, scratch, 32);
   }
 
   void hash(worker &worker, byte *in, int len, byte *out)
@@ -79,10 +89,19 @@ namespace SpectreX
     // cshake256("ProofOfWorkHash", in, len, worker.sha3Hash, 32);
     // newMatrix(in, worker.mat, worker);
     memcpy(worker.mat, worker.matBuffer, sizeof(matrix));
-    cshake256_nil_function_name(in, len, "ProofOfWorkHash", worker.sha3Hash, 32*8);
-    AstroBWTv3(worker.sha3Hash, 32, worker.astrobwtv3Hash, *(worker.astroWorker), false);
+
+    #pragma unroll(5)
+    for (int i=0; i<10; i++) ((uint64_t *)worker.scratchData)[i] = ((uint64_t *)powP)[i] ^ ((uint64_t *)in)[i];
+
+    #pragma unroll
+    for (int i = 10; i < 25; i++) ((uint64_t *)worker.scratchData)[i] = ((uint64_t *)powP)[i];
+
+    keccakf(worker.scratchData);
+
+    AstroBWTv3(worker.scratchData, 32, worker.astrobwtv3Hash, *(worker.astroWorker), false);
     // AstroBWTv3(worker.sha3Hash, 32, worker.astrobwtv3Hash, *worker.astroWorker, false);
-    heavyHash(worker.astrobwtv3Hash, worker.mat, out);
+    memcpy(worker.scratchData, worker.astrobwtv3Hash, 32);
+    heavyHash(worker.scratchData, worker.mat, out);
   }
 
   void testWithInput(const char* input, byte *out) {
@@ -137,21 +156,15 @@ namespace SpectreX
     // std::reverse(out, out+32);
 
     int pieces_failed = 0;
-    const char *pow_expected = "ae63221b94390528bd5a092be6247f7173099978bf6b150031c034ed22b37cea";
-    printf("POW hash: %s\n", hexStr(w->sha3Hash, 32).c_str());
-    printf("WANT    : %s\n\n", pow_expected);
 
     const char *bwt_expected = "271bd27bf393fc8854e4ada0f255cef19c0e86c9b7245088bdafc01318172dc5";
     printf("BWT hash: %s\n", hexStr(w->astrobwtv3Hash, 32).c_str());
     printf("WANT    : %s\n\n", bwt_expected);
 
     const char *heavy_expected = "0b68c38a0d359b9ef74fecfae4b2b0a0ea026fdcee22c1d48bcc824f32050ef5";
-    printf("Heavy hash: %s\n", hexStr(out, 32).c_str());
+    printf("Heavy hash: %s\n", hexStr(w->scratchData, 32).c_str());
     printf("WANT      : %s\n\n", heavy_expected);
 
-    if(memcmp(hexStr(w->sha3Hash, 32).c_str(), pow_expected, 32) != 0) {
-      pieces_failed += 1;
-    }
     if(memcmp(hexStr(w->astrobwtv3Hash, 32).c_str(), bwt_expected, 32) != 0) {
       pieces_failed += 1;
     }
diff --git a/src/crypto/spectrex/spectrex.h b/src/crypto/spectrex/spectrex.h
index 9e885fd..d5f30a7 100644
--- a/src/crypto/spectrex/spectrex.h
+++ b/src/crypto/spectrex/spectrex.h
@@ -27,6 +27,7 @@ namespace SpectreX
     matrix mat;
     double copied[matSize][matSize];
     std::bitset<matSize> rowsSelected;
+    byte scratchData[200];
     byte sha3Hash[32];
     byte astrobwtv3Hash[32];
     workerData *astroWorker;
diff --git a/src/crypto/tiny-keccak/tiny-keccak.h b/src/crypto/tiny-keccak/tiny-keccak.h
new file mode 100644
index 0000000..a4eb823
--- /dev/null
+++ b/src/crypto/tiny-keccak/tiny-keccak.h
@@ -0,0 +1,162 @@
+/** libkeccak-tiny
+ *
+ * A single-file implementation of SHA-3 and SHAKE.
+ *
+ * Implementor: David Leon Gil
+ * License: CC0, attribution kindly requested. Blame taken too,
+ * but not liability.
+ */
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/******** The Keccak-f[1600] permutation ********/
+
+/*** Constants. ***/
+static const uint8_t rho[24] = \
+  { 1,  3,   6, 10, 15, 21,
+    28, 36, 45, 55,  2, 14,
+    27, 41, 56,  8, 25, 43,
+    62, 18, 39, 61, 20, 44};
+static const uint8_t pi[24] = \
+  {10,  7, 11, 17, 18, 3,
+    5, 16,  8, 21, 24, 4,
+   15, 23, 19, 13, 12, 2,
+   20, 14, 22,  9, 6,  1};
+static const uint64_t RC[24] = \
+  {1ULL, 0x8082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
+   0x808bULL, 0x80000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
+   0x8aULL, 0x88ULL, 0x80008009ULL, 0x8000000aULL,
+   0x8000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
+   0x8000000000008002ULL, 0x8000000000000080ULL, 0x800aULL, 0x800000008000000aULL,
+   0x8000000080008081ULL, 0x8000000000008080ULL, 0x80000001ULL, 0x8000000080008008ULL};
+
+/*** Helper macros to unroll the permutation. ***/
+#define rol(x, s) (((x) << s) | ((x) >> (64 - s)))
+#define REPEAT6(e) e e e e e e
+#define REPEAT24(e) REPEAT6(e e e e)
+#define REPEAT5(e) e e e e e
+#define FOR5(v, s, e) \
+  v = 0;            \
+  REPEAT5(e; v += s;)
+
+/*** Keccak-f[1600] ***/
+inline static void keccakf(void* state) {
+  uint64_t* a = (uint64_t*)state;
+  uint64_t b[5] = {0};
+  uint64_t t = 0;
+  uint8_t x, y;
+
+  #pragma unroll
+  for (int i = 0; i < 24; i++) {
+    // Theta
+    FOR5(x, 1,
+         b[x] = 0;
+         FOR5(y, 5,
+              b[x] ^= a[x + y]; ))
+    FOR5(x, 1,
+         FOR5(y, 5,
+              a[y + x] ^= b[(x + 4) % 5] ^ rol(b[(x + 1) % 5], 1); ))
+    // Rho and pi
+    t = a[1];
+    x = 0;
+    REPEAT24(b[0] = a[pi[x]];
+             a[pi[x]] = rol(t, rho[x]);
+             t = b[0];
+             x++; )
+    // Chi
+    FOR5(y,
+       5,
+       FOR5(x, 1,
+            b[x] = a[y + x];)
+       FOR5(x, 1,
+            a[y + x] = b[x] ^ ((~b[(x + 1) % 5]) & b[(x + 2) % 5]); ))
+    // Iota
+    a[0] ^= RC[i];
+  }
+}
+
+/******** The FIPS202-defined functions. ********/
+
+/*** Some helper macros. ***/
+
+#define _(S) do { S } while (0)
+#define FOR(i, ST, L, S) \
+  _(for (size_t i = 0; i < L; i += ST) { S; })
+#define mkapply_ds(NAME, S)                                          \
+  static inline void NAME(uint8_t* dst,                              \
+                          const uint8_t* src,                        \
+                          size_t len) {                              \
+    FOR(i, 1, len, S);                                               \
+  }
+#define mkapply_sd(NAME, S)                                          \
+  static inline void NAME(const uint8_t* src,                        \
+                          uint8_t* dst,                              \
+                          size_t len) {                              \
+    FOR(i, 1, len, S);                                               \
+  }
+
+mkapply_ds(xorin, dst[i] ^= src[i])  // xorin
+mkapply_sd(setout, dst[i] = src[i])  // setout
+
+#define P keccakf
+#define Plen 200
+
+#define foldP(I, L, F) \
+  while (L >= rate) {  \
+    F(a, I, rate);     \
+    P(a);              \
+    I += rate;         \
+    L -= rate;         \
+  }
+
+/** The sponge-based hash construction. **/
+static inline int hash(uint8_t* out, size_t outlen,
+                       const uint8_t* in, size_t inlen,
+                       size_t rate, uint8_t delim) {
+  if ((out == NULL) || ((in == NULL) && inlen != 0) || (rate >= Plen)) {
+    return -1;
+  }
+  uint8_t a[Plen] = {0};
+  // Absorb input.
+  foldP(in, inlen, xorin);
+  // Xor in the DS and pad frame.
+  a[inlen] ^= delim;
+  a[rate - 1] ^= 0x80;
+  // Xor in the last block.
+  xorin(a, in, inlen);
+  // Apply P
+  P(a);
+  // Squeeze output.
+  foldP(out, outlen, setout);
+  setout(a, out, outlen);
+  return 0;
+}
+
+/*** Helper macros to define SHA3 and SHAKE instances. ***/
+#define defshake(bits)                                            \
+  static inline int shake##bits(uint8_t* out, size_t outlen,                    \
+                  const uint8_t* in, size_t inlen) {              \
+    return hash(out, outlen, in, inlen, 200 - (bits / 4), 0x1f);  \
+  }
+#define defsha3(bits)                                             \
+  static inline int sha3_##bits(uint8_t* out, size_t outlen,                    \
+                  const uint8_t* in, size_t inlen) {              \
+    if (outlen > (bits/8)) {                                      \
+      return -1;                                                  \
+    }                                                             \
+    return hash(out, outlen, in, inlen, 200 - (bits / 4), 0x06);  \
+  }
+
+/*** FIPS202 SHAKE VOFs ***/
+defshake(128)
+defshake(256)
+
+/*** FIPS202 SHA3 FOFs ***/
+defsha3(224)
+defsha3(256)
+defsha3(384)
+defsha3(512)
\ No newline at end of file
diff --git a/src/net/astrix/net_astrix_stratum.cpp b/src/net/astrix/net_astrix_stratum.cpp
index 9683b71..c4e6229 100644
--- a/src/net/astrix/net_astrix_stratum.cpp
+++ b/src/net/astrix/net_astrix_stratum.cpp
@@ -111,7 +111,7 @@ int handleAstrixStratumPacket(boost::json::object packet, AstrixStratum::jobCach
   }
   else if (M.compare(AstrixStratum::s_setDifficulty) == 0)
   {
-    // std::cout << boost::json::serialize(packet).c_str() << std::endl;
+    std::cout << boost::json::serialize(packet).c_str() << std::endl;
     double *d = isDev ? &doubleDiffDev : &doubleDiff;
     (*d) = packet.at("params").as_array()[0].get_double();
     if ((*d) < 0.00000000001) (*d) = packet.at("params").as_array()[0].get_uint64();
@@ -526,7 +526,7 @@ void astrix_stratum_session(
         // Consume the data from the buffer after processing it
         response.consume(trans);
 
-        std::cout << "received: " << data << std::endl << std::flush;
+        // std::cout << "received: " << data << std::endl << std::flush;
         // printf("received data\n");
         fflush(stdout);
 
diff --git a/src/tnn_hip/coins/mine_astrix.hip b/src/tnn_hip/coins/mine_astrix.hip
new file mode 100644
index 0000000..ee73524
--- /dev/null
+++ b/src/tnn_hip/coins/mine_astrix.hip
@@ -0,0 +1,353 @@
+#include <tnn_hip/crypto/astrix-hash/astrix-hash.hip.h>
+#include "miners.hip.hpp"
+#include "tnn-hugepages.h"
+#include <stratum/stratum.h>
+#include <hip/hip_runtime.h>
+
+#define GPU_FOR(d, count) for (d = 0; d < count; d++)
+
+void mineAstrix_hip()
+{
+  int d = 0;
+
+  int GPUCount = 0;
+  hipError_t err = hipGetDeviceCount(&GPUCount);
+
+  size_t *blocks, *threads, *batchSizes;
+  blocks = (size_t*)malloc(sizeof(size_t)*GPUCount);
+  threads = (size_t*)malloc(sizeof(size_t)*GPUCount);
+  batchSizes = (size_t*)malloc(sizeof(size_t)*GPUCount);
+
+  GPU_FOR(d, GPUCount) {
+    hipDeviceProp_t deviceProps;
+    hipGetDeviceProperties(&deviceProps, d); // Query device properties
+    int smCount = deviceProps.multiProcessorCount;
+
+    int numBlocksPerSm;
+    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, astrixHash_hip<false>, Astrix_HIP::THREAD_DIM, 0);
+    blocks[d] = numBlocksPerSm*smCount*128;
+    batchSize[d] = blocks*Astrix_HIP::THREAD_DIM;
+  }
+
+  if (err != hipSuccess) {
+    std::cerr << "Failed to get device count: " << hipGetErrorString(err) << std::endl;
+    return;
+  }
+
+  int64_t localJobCounter;
+  int64_t localOurHeight = 0;
+  int64_t localDevHeight = 0;
+
+  uint64_t i = 0;
+  uint64_t i_dev = 0;
+
+  byte powHash[32];
+  byte work[Astrix_HIP::INPUT_SIZE] = {0};
+  byte devWork[Astrix_HIP::INPUT_SIZE] = {0};
+waitForJob:
+
+  while (!isConnected)
+  {
+    boost::this_thread::sleep_for(boost::chrono::milliseconds(100));
+  }
+
+  uint64_t *nonceCache = (uint64_t*)malloc(sizeof(uint64_t)*GPUCount);
+
+  uint16_t *h_nonceCounts = (uint16_t*)malloc(sizeof(uint16_t)*GPUCount);
+  uint64_t *h_nonceBuffers = (uint64_t*)malloc(sizeof(uint64_t)*Astrix_HIP::MAX_NONCES*GPUCount);
+
+  uint16_t *d_nonceCount;
+  uint64_t *d_nonceBuffer;
+
+  GPU_FOR(d, GPUCount) {
+    hipSetDevice(d);
+
+    hipMalloc(&d_nonceCount[d], sizeof(uint16_t)); 
+    hipMalloc(&d_nonceBuffer[d], sizeof(uint64_t)*Astrix_HIP::MAX_NONCES);
+
+    hipError_t err = hipGetLastError();
+    if (err != hipSuccess) {
+      std::cerr << "hipMalloc failed on GPU " << d << ": " << hipGetErrorString(err) << std::endl;
+      return;
+    }
+  }
+
+  while (true)
+  {
+    try
+    {
+      boost::json::value myJob;
+      boost::json::value myJobDev;
+      {
+        std::scoped_lock<boost::mutex> lockGuard(mutex);
+        myJob = job;
+        myJobDev = devJob;
+        localJobCounter = jobCounter;
+      }
+      
+      if (!myJob.at("template").is_string()) {
+        continue;
+      }
+      if (ourHeight == 0 && devHeight == 0)
+        continue;
+
+      if (ourHeight == 0 || localOurHeight != ourHeight)
+      {
+        byte *b2 = new byte[SpectreX::INPUT_SIZE];
+        switch (protocol)
+        {
+        case ASTRIX_SOLO:
+          hexstrToBytes(std::string(myJob.at("template").as_string()), b2);
+          break;
+        case ASTRIX_STRATUM:
+          hexstrToBytes(std::string(myJob.at("template").as_string()), b2);
+          break;
+        }
+        memcpy(work, b2, SpectreX::INPUT_SIZE);
+        delete[] b2;
+
+        GPU_FOR(d, GPUCount) {
+          hipSetDevice(d);
+          Astrix_HIP::newMatrix(work);
+          hipDeviceSynchronize();
+        }
+
+        localOurHeight = ourHeight;
+        i = 0;
+      }
+
+      if (devConnected && myJobDev.at("template").is_string())
+      {
+        if (devHeight == 0 || localDevHeight != devHeight)
+        {
+          byte *b2d = new byte[SpectreX::INPUT_SIZE];
+          switch (protocol)
+          {
+          case ASTRIX_SOLO:
+            hexstrToBytes(std::string(myJobDev.at("template").as_string()), b2d);
+            break;
+          case ASTRIX_STRATUM:
+            hexstrToBytes(std::string(myJobDev.at("template").as_string()), b2d);
+            break;
+          }
+          memcpy(devWork, b2d, SpectreX::INPUT_SIZE);
+          delete[] b2d;
+
+          GPU_FOR(d, GPUCount) {
+            hipSetDevice(d);
+            Astrix_HIP::newMatrix(work, true);
+            hipDeviceSynchronize();
+          }
+
+          localDevHeight = devHeight;
+          i_dev = 0;
+        }
+      }
+
+      bool devMine = false;
+      double which;
+      bool submit = false;
+      double DIFF = 1;
+      Num cmpDiff;
+
+      memset(h_nonceCounts, 0, sizeof(uint16_t)*GPUCount);
+
+      boost::thread_group workers;
+
+      // printf("end of job application\n");
+      GPU_FOR(d, GPUCount) {
+        threads.create_thread([&]() {
+          hipSetDevice(d);
+
+          int kernelIndex = 0;
+          while (localJobCounter == jobCounter)
+          {
+            which = (double)(rand() % 10000);
+            devMine = (devConnected && devHeight > 0 && which < devFee * 100.0);
+            DIFF = devMine ? doubleDiffDev : doubleDiff;
+            if (DIFF == 0)
+              continue;
+
+            // cmpDiff = ConvertDifficultyToBig(DIFF, ASTRIX_X);
+            cmpDiff = SpectreX::diffToTarget(DIFF);
+
+            uint64_t *nonce = devMine ? &i_dev : &i;
+            (*nonce)++;
+
+            // printf("nonce = %llu\n", *nonce);
+
+            byte *WORK = (devMine && devConnected) ? &devWork[0] : &work[0];
+            byte *nonceBytes = &WORK[72];
+            uint64_t n;
+            
+            int enLen = 0;
+            uint64_t nonceMask = -1ULL;
+            
+            boost::json::value &J = devMine ? myJobDev : myJob;
+            if (!J.as_object().if_contains("extraNonce") || J.at("extraNonce").as_string().size() == 0) {
+              n = (rand() % 65536) << 48;
+              nonceMask >>= 16;
+            } else {
+              uint64_t eN = std::stoull(std::string(J.at("extraNonce").as_string().c_str()), NULL, 16);
+              enLen = std::string(J.at("extraNonce").as_string()).size()/2;
+              int offset = (64 - enLen*8);
+              n = (eN << offset);
+              nonceMask >>= enLen*8;
+            }
+            memcpy(nonceBytes, (byte *)&n, 8);
+
+            // printf("after nonce: %s\n", hexStr(WORK, SpectreX::INPUT_SIZE).c_str());
+
+            if (localJobCounter != jobCounter) {
+              // printf("thread %d updating job before hash\n", tid);
+              break;
+            }
+
+            Astrix_HIP::astrixHash_wrapper(blocks[d], nonceMask, n, d_nonceBuffer, d_nonceCount, kernelIndex, batchSizes[d], d, devMine);
+
+            kernelIndex++;
+
+            // if (littleEndian())
+            // {
+            //   std::reverse(powHash, powHash + 32);
+            // }
+
+            HIP_counters[d].fetch_add(batchSizes[d]);
+            submit = (devMine && devConnected) ? !submittingDev : !submitting;
+
+            if (localJobCounter != jobCounter || localOurHeight != ourHeight) {
+              // printf("thread %d updating job after hash\n", tid);
+              break;
+            }
+
+            // if (Num(hexStr(powHash, 32).c_str(), 16) <= cmpDiff)
+            // {
+            //   // printf("thread %d entered submission process\n", tid);
+            //   if (!submit) {
+            //     for(;;) {
+            //       submit = (devMine && devConnected) ? !submittingDev : !submitting;
+            //       if (submit || localJobCounter != jobCounter || localOurHeight != ourHeight)
+            //         break;
+            //       boost::this_thread::yield();
+            //     }
+            //   }
+            //   if (localJobCounter != jobCounter) {
+            //     // printf("thread %d updating job after check\n", tid);
+            //     break;
+            //   }
+            //   // if (littleEndian())
+            //   // {
+            //   //   std::reverse(powHash, powHash + 32);
+            //   // }
+            // //   std::string b64 = base64::to_base64(std::string((char *)&WORK[0], XELIS_TEMPLATE_SIZE));
+            //   // boost::lock_guard<boost::mutex> lock(mutex);
+            //   if (devMine)
+            //   {
+            //     submittingDev = true;
+            //     std::scoped_lock<boost::mutex> lockGuard(devMutex);
+            //     // if (localJobCounter != jobCounter || localDevHeight != devHeight)
+            //     // {
+            //     //   break;
+            //     // }
+            //     setcolor(CYAN);
+            //     std::cout << "\n(DEV) Thread " << tid << " found a dev share\n" << std::flush;
+            //     setcolor(BRIGHT_WHITE);
+            //     switch (protocol)
+            //     {
+            //     case ASTRIX_SOLO:
+            //       devShare = {{"block_template", hexStr(&WORK[0], SpectreX::INPUT_SIZE).c_str()}};
+            //       break;
+            //     case ASTRIX_STRATUM:
+            //       std::vector<char> nonceStr;
+            //       // Num(std::to_string((n << enLen*8) >> enLen*8).c_str(),10).print(nonceStr, 16);
+            //       Num(std::to_string(n).c_str(),10).print(nonceStr, 16);
+            //       devShare = {{{"id", SpectreStratum::submitID},
+            //                 {"method", SpectreStratum::submit.method.c_str()},
+            //                 {"params", {devWorkerName,                                   // WORKER
+            //                             myJobDev.at("jobId").as_string().c_str(), // JOB ID
+            //                             std::string(nonceStr.data()).c_str()}}}};
+
+            //       break;
+            //     }
+            //     data_ready = true;
+            //   }
+            //   else
+            //   {
+            //     submitting = true;
+            //     std::scoped_lock<boost::mutex> lockGuard(userMutex);
+            //     // if (localJobCounter != jobCounter || localOurHeight != ourHeight)
+            //     // {
+            //     //   break;
+            //     // }
+            //     setcolor(BRIGHT_YELLOW);
+            //     std::cout << "\nThread " << tid << " found a nonce!\n" << std::flush;
+            //     setcolor(BRIGHT_WHITE);
+            //     switch (protocol)
+            //     {
+            //     case ASTRIX_SOLO:
+            //       share = {{"block_template", hexStr(&WORK[0], SpectreX::INPUT_SIZE).c_str()}};
+            //       break;
+            //     case ASTRIX_STRATUM:
+            //       std::vector<char> nonceStr;
+            //       // Num(std::to_string((n << enLen*8) >> enLen*8).c_str(),10).print(nonceStr, 16);
+            //       Num(std::to_string(n).c_str(),10).print(nonceStr, 16);
+            //       share = {{{"id", SpectreStratum::submitID},
+            //                 {"method", SpectreStratum::submit.method.c_str()},
+            //                 {"params", {workerName,                                   // WORKER
+            //                             myJob.at("jobId").as_string().c_str(), // JOB ID
+            //                             std::string(nonceStr.data()).c_str()}}}};
+
+            //       // std::cout << "blob: " << hexStr(&WORK[0], SpectreX::INPUT_SIZE).c_str() << std::endl;
+            //       // std::cout << "nonce: " << nonceStr.data() << std::endl;
+            //       // std::cout << "extraNonce: " << hexStr(&WORK[SpectreX::INPUT_SIZE - 48], enLen).c_str() << std::endl;
+            //       // std::cout << "hash: " << hexStr(&powHash[0], 32) << std::endl;
+            //       // std::vector<char> diffHex;
+            //       // cmpDiff.print(diffHex, 16);
+            //       // std::cout << "difficulty (LE): " << std::string(diffHex.data()).c_str() << std::endl;
+            //       // std::cout << "powValue: " << Num(hexStr(powHash, 32).c_str(), 16) << std::endl;
+            //       // std::cout << "target (decimal): " << cmpDiff << std::endl;
+
+            //       // printf("blob: %s\n", foundBlob.c_str());
+            //       // printf("hash (BE): %s\n", hexStr(&powHash[0], 32).c_str());
+            //       // printf("nonce (Full bytes for injection): %s\n", hexStr((byte *)&n, 8).c_str());
+
+            //       break;
+            //     }
+            //     data_ready = true;
+            //   }
+            //   // printf("thread %d finished submission process\n", tid);
+            //   cv.notify_all();
+            // }
+
+            if (!isConnected) {
+              data_ready = true;
+              cv.notify_all();
+              break;
+            }
+          }
+        });
+      }
+
+      workers.join_all();
+    }
+    catch (std::exception& e)
+    {
+      setcolor(RED);
+      std::cerr << "Error in POW Function" << std::endl;
+      std::cerr << e.what() << std::endl;
+      setcolor(BRIGHT_WHITE);
+
+      localJobCounter = -1;
+      localOurHeight = -1;
+      localDevHeight = -1;
+    }
+    if (!isConnected) {
+      data_ready = true;
+      cv.notify_all();
+      break;
+    }
+  }
+  goto waitForJob;
+}
+
+#undef GPU_FOR
\ No newline at end of file
diff --git a/src/tnn_hip/coins/miners.hip.hpp b/src/tnn_hip/coins/miners.hip.hpp
new file mode 100644
index 0000000..a205209
--- /dev/null
+++ b/src/tnn_hip/coins/miners.hip.hpp
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <tnn-common.hpp>
+#include <net.hpp>
+#include <num.h>
+#include <hex.h>
+#include <endian.hpp>
+#include <terminal.h>
+
+using byte = unsigned char;
+
+inline Num ConvertDifficultyToBig(Num d, int algo)
+{
+  switch(algo) {
+    case DERO_HASH:
+      return oneLsh256 / d;
+    case XELIS_HASH:
+      return maxU256 / d;
+    case SPECTRE_X:
+      return oneLsh256 / (d+1);
+    default:
+      return 0;
+  }
+}
+
+inline bool CheckHash(unsigned char *hash, int64_t diff, int algo)
+{
+  if (littleEndian()) std::reverse(hash, hash+32);
+  bool cmp = Num(hexStr(hash, 32).c_str(), 16) <= ConvertDifficultyToBig(diff, algo);
+  if (littleEndian()) std::reverse(hash, hash+32);
+  return (cmp);
+}
+
+inline bool CheckHash(unsigned char *hash, Num diff, int algo)
+{
+  if (littleEndian()) std::reverse(hash, hash+32);
+  bool cmp = Num(hexStr(hash, 32).c_str(), 16) <= diff;
+  if (littleEndian()) std::reverse(hash, hash+32);
+  return (cmp);
+}
+
+inline std::string uint32ToHex(uint32_t value) {
+  std::stringstream ss;
+  ss << std::hex << std::setw(8) << std::setfill('0') << value;
+  return ss.str();
+}
+
+void mineAstrix_hip();
+
+static inline void unsupported() {
+  printf("This coin is not supported on GPUs\n");
+}
+
+typedef void (*mineFunc_hip)();
+const mineFunc_hip POW[] = {
+  unsupported, // 0
+  unsupported, 
+  unsupported, 
+  unsupported,
+  unsupported,
+  unsupported,
+  unsupported, // 5
+  unsupported,
+  unsupported,
+  unsupported,
+  unsupported,
+  unsupported, // 10
+  unsupported,
+  mineAstrix_hip
+};
+
diff --git a/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip b/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip
index b8dd452..b48c5f5 100644
--- a/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip
+++ b/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip
@@ -3,6 +3,7 @@
 
 #include <hip/hip_runtime.h>
 #include "astrix_archdef.h"
+#include "astrix-hash.hip.h"
 
 #include <tnn_hip/crypto/keccak-tiny.hip.inc>
 #include <tnn_hip/crypto/xoshiro256starstar.hip.inc>
@@ -25,16 +26,15 @@ hipDeviceProp_t deviceProps;
 #define RANDOM_LEAN 0
 #define RANDOM_XOSHIRO 1
 
-#define HIP_ASTRIX_MAX_NONCES 640
-
 #define LT_U256(X, Y) (X.number[3] != Y.number[3] ? X.number[3] < Y.number[3] : X.number[2] != Y.number[2] ? X.number[2] < Y.number[2] \
                                                                             : X.number[1] != Y.number[1]   ? X.number[1] < Y.number[1] \
                                                                                                            : X.number[0] < Y.number[0])
-
-__constant__ uint8_t matrix[MATRIX_SIZE][MATRIX_SIZE];
-__constant__ uint8_t hash_header[HASH_HEADER_SIZE];
+__align__(32) __constant__ uint8_t matrix[MATRIX_SIZE][MATRIX_SIZE];
+__align__(32) __constant__ uint8_t matrix_dev[MATRIX_SIZE][MATRIX_SIZE];
+__align__(32) __constant__ uint8_t hash_header[HASH_HEADER_SIZE];
+__align__(32) __constant__ uint8_t hash_header_dev[HASH_HEADER_SIZE];
 __constant__ uint256_t target;
-__constant__ static const uint8_t powP[Plen] = {0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02};
+__align__(32) __constant__ static const uint8_t powP[Plen] = {0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02};
 __align__(32) __constant__ static const uint8_t heavyP[Plen] = {0x09, 0x85, 0x24, 0xb2, 0x52, 0x4c, 0xd7, 0x3a, 0x16, 0x42, 0x9f, 0x2f, 0x0e, 0x9b, 0x62, 0x79, 0xee, 0xf8, 0xc7, 0x16, 0x48, 0xff, 0x14, 0x7a, 0x98, 0x64, 0x05, 0x80, 0x4c, 0x5f, 0xa7, 0x11, 0xda, 0xce, 0xee, 0x44, 0xdf, 0xe0, 0x20, 0xe7, 0x69, 0x40, 0xf3, 0x14, 0x2e, 0xd8, 0xc7, 0x72, 0xba, 0x35, 0x89, 0x93, 0x2a, 0xff, 0x00, 0xc1, 0x62, 0xc4, 0x0f, 0x25, 0x40, 0x90, 0x21, 0x5e, 0x48, 0x6a, 0xcf, 0x0d, 0xa6, 0xf9, 0x39, 0x80, 0x0c, 0x3d, 0x2a, 0x79, 0x9f, 0xaa, 0xbc, 0xa0, 0x26, 0xa2, 0xa9, 0xd0, 0x5d, 0xc0, 0x31, 0xf4, 0x3f, 0x8c, 0xc1, 0x54, 0xc3, 0x4c, 0x1f, 0xd3, 0x3d, 0xcc, 0x69, 0xa7, 0x01, 0x7d, 0x6b, 0x6c, 0xe4, 0x93, 0x24, 0x56, 0xd3, 0x5b, 0xc6, 0x2e, 0x44, 0xb0, 0xcd, 0x99, 0x3a, 0x4b, 0xf7, 0x4e, 0xb0, 0xf2, 0x34, 0x54, 0x83, 0x86, 0x4c, 0x77, 0x16, 0x94, 0xbc, 0x36, 0xb0, 0x61, 0xe9, 0x07, 0x07, 0xcc, 0x65, 0x77, 0xb1, 0x1d, 0x8f, 0x7e, 0x39, 0x6d, 0xc4, 0xba, 0x80, 0xdb, 0x8f, 0xea, 0x58, 0xca, 0x34, 0x7b, 0xd3, 0xf2, 0x92, 0xb9, 0x57, 0xb9, 0x81, 0x84, 0x04, 0xc5, 0x76, 0xc7, 0x2e, 0xc2, 0x12, 0x51, 0x67, 0x9f, 0xc3, 0x47, 0x0a, 0x0c, 0x29, 0xb5, 0x9d, 0x39, 0xbb, 0x92, 0x15, 0xc6, 0x9f, 0x2f, 0x31, 0xe0, 0x9a, 0x54, 0x35, 0xda, 0xb9, 0x10, 0x7d, 0x32, 0x19, 0x16};
 
 __device__ int deviceArch;
@@ -55,10 +55,6 @@ __device__ __forceinline__ void amul4bit(uint32_t packed_vec1[32], uint32_t pack
 {
   // We assume each 32 bits have four values: A0 B0 C0 D0
   unsigned int res = 0;
-#if __CUDA_ARCH__ < 610
-  char4 *a4 = (char4 *)packed_vec1;
-  char4 *b4 = (char4 *)packed_vec2;
-#endif
 #pragma unroll
   for (int i = 0; i < QUARTER_MATRIX_SIZE; i++)
   {
@@ -66,6 +62,8 @@ __device__ __forceinline__ void amul4bit(uint32_t packed_vec1[32], uint32_t pack
 #if (HIP_ARCH >= 1030)
     res = v_dot4_u32_u8(packed_vec1[i], packed_vec2[i], res);
 #else
+    char4 *a4 = (char4 *)packed_vec1;
+    char4 *b4 = (char4 *)packed_vec2;
     res += a4[i].x * b4[i].x;
     res += a4[i].y * b4[i].y;
     res += a4[i].z * b4[i].z;
@@ -75,6 +73,8 @@ __device__ __forceinline__ void amul4bit(uint32_t packed_vec1[32], uint32_t pack
 #if __CUDA_ARCH__ >= 610
     res = __dp4a(packed_vec1[i], packed_vec2[i], res);
 #else
+    char4 *a4 = (char4 *)packed_vec1;
+    char4 *b4 = (char4 *)packed_vec2;
     res += a4[i].x * b4[i].x;
     res += a4[i].y * b4[i].y;
     res += a4[i].z * b4[i].z;
@@ -86,6 +86,7 @@ __device__ __forceinline__ void amul4bit(uint32_t packed_vec1[32], uint32_t pack
   *ret = res;
 }
 
+template <bool isDev>
 __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_fixed, uint64_t *final_nonces, int *nonce_count, int kIndex, size_t batchSize, uint8_t device = 0)
 {
   // assuming header_len is 72
@@ -93,16 +94,26 @@ __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_f
   uint64_t nonce;
 
   nonce = nonceId + kIndex * batchSize;
-  // nonce = (nonce << 4) | device;
-  // nonce = (nonce & nonce_mask) | nonce_fixed;
+  nonce = (nonce << 4) | device;
+  nonce = (nonce & nonce_mask) | nonce_fixed;
+
 
   // header
   __align__(32) uint8_t input[200] = {0};
-  memcpy(input, hash_header, HASH_HEADER_SIZE);
+  input[80] = 0x06;
+  input[135] = 0x80;
+
+  if constexpr (isDev) {
+    memcpy(input, hash_header, HASH_HEADER_SIZE);
+  } else {
+    memcpy(input, hash_header_dev, HASH_HEADER_SIZE);
+  }
+
   // data
   // TODO: check endianity?
   // uint256_t hash_;
-  memcpy(&input[HASH_HEADER_SIZE], (uint8_t *)(&nonce), 8);
+  ((uint64_t*)input)[9] = nonce;
+  // memcpy(&input[HASH_HEADER_SIZE], (uint8_t *)(&nonce), 8);
 
   // if (nonceId == 0 && kIndex == 0) {
   //   printf("input 0:\n");
@@ -114,8 +125,6 @@ __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_f
   // __syncthreads();
 
   // sha3_256_astrix(hash_.hash, input);
-  input[80] = 0x06;
-  input[135] = 0x80;
 
   keccakf(input);
 
@@ -140,11 +149,17 @@ __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_f
         (input[2 * i + 1] & 0x0F));
   }
   uint32_t product1, product2;
+
 #pragma unroll
   for (int rowId = 0; rowId < HALF_MATRIX_SIZE; rowId++)
   {
-    amul4bit((uint32_t *)(matrix[(2 * rowId)]), (uint32_t *)(packed_hash), &product1);
-    amul4bit((uint32_t *)(matrix[(2 * rowId + 1)]), (uint32_t *)(packed_hash), &product2);
+    if constexpr (isDev) {
+      amul4bit((uint32_t *)(matrix_dev[(2 * rowId)]), (uint32_t *)(packed_hash), &product1);
+      amul4bit((uint32_t *)(matrix_dev[(2 * rowId + 1)]), (uint32_t *)(packed_hash), &product2);
+    } else {
+      amul4bit((uint32_t *)(matrix[(2 * rowId)]), (uint32_t *)(packed_hash), &product1);
+      amul4bit((uint32_t *)(matrix[(2 * rowId + 1)]), (uint32_t *)(packed_hash), &product2);
+    }
     product1 >>= 6;
     product1 &= 0xF0;
     product2 >>= 10;
@@ -162,30 +177,12 @@ __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_f
     input[rowId] = input[rowId] ^ ((uint8_t)(product1) | (uint8_t)(product2));
 #endif
   }
-  memset(&input[32], 0, 80-32);
-  // shake32_heavy(heavyP, hash_.hash, input);
 
   #pragma unroll
   for (int i=0; i<4; i++) ((uint64_t *)input)[i] = ((uint64_t *)heavyP)[i] ^ ((uint64_t *)input)[i];
-  // ((ulonglong4 *)input)[0] ^= ((ulonglong4 *)heavyP)[0];
 
   #pragma unroll
-  for (int i = 1; i < 6; i++) ((ulonglong4 *)input)[i] = ((ulonglong4 *)heavyP)[i];
-
-  ((uint64_t *)input)[24] = ((uint64_t *)heavyP)[24];
-  // ((ulonglong4 *)a)[0] = ((ulonglong4 *)initP)[0] ^ ((ulonglong4 *)in)[0];
-  // ((ulonglong4 *)a)[1] = ((ulonglong4 *)initP)[1] ^ ((ulonglong4 *)in)[1];
-  // ((ulonglong2 *)a)[4] = ((ulonglong2 *)initP)[4] ^ ((ulonglong4 *)in)[4];
-
-  // #pragma unroll
-  // for (int i=10; i<25; i++) ((uint64_t *)input)[i] = ((uint64_t *)heavyP)[i];
-  // ((ulonglong2 *)a)[5] = ((ulonglong2 *)initP)[5];
-  
-  // #pragma unroll
-  // for (int i = 3; i < 6; i++) ((ulonglong4 *)a)[i] = ((ulonglong4 *)initP)[i];
-
-  // ((ulonglong2 *)a)[12] = ((ulonglong2 *)initP)[12];
-  // ((uint64_t *)a)[24] = ((uint64_t *)initP)[24];
+  for (int i = 4; i < 25; i++) ((uint64_t *)input)[i] = ((uint64_t *)heavyP)[i];
 
   keccakf(input);
 
@@ -194,13 +191,15 @@ __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_f
   //   hash_.hash[i] = hash_.hash[31 - i];
   //   hash_.hash[31 - i] = temp;
   // }
-
+  // int index = cond ? atomicAdd(nonce_count, 1) : -1;
   if (LT_U256(((uint256_t*)input)[0], target))
   {
     int index = atomicAdd(nonce_count, 1);
-    index = index >= HIP_ASTRIX_MAX_NONCES ? HIP_ASTRIX_MAX_NONCES - 1 : index;
+    index = index >= Astrix_HIP::MAX_NONCES ? Astrix_HIP::MAX_NONCES - 1 : index;
     final_nonces[index] = nonce;
+    // atomicCAS(&((unsigned long long int*)final_nonces)[index], 0, (unsigned long long int)nonce);
   }
+  
 
   // if (nonceId == 0 && kIndex == 0) {
   //   printf("GPU result: ");
@@ -215,7 +214,7 @@ __global__ void astrixHash_hip(const uint64_t nonce_mask, const uint64_t nonce_f
 inline int calculateRank(uint8_t mat[MATRIX_SIZE][MATRIX_SIZE])
 {
   double copied[MATRIX_SIZE][MATRIX_SIZE];
-  bool rowsSelected[MATRIX_SIZE];
+  bool rowsSelected[MATRIX_SIZE] = {false};
   const double epsilon = 1e-9;
 
   for (int i = 0; i < MATRIX_SIZE; i++)
@@ -265,6 +264,82 @@ inline int calculateRank(uint8_t mat[MATRIX_SIZE][MATRIX_SIZE])
   return rank;
 }
 
+namespace Astrix_HIP {
+  void getHashBlocksPerSM(int *numBlocksPerSm, bool isDev) {
+    if (isDev) {
+      hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocksPerSm, astrixHash_hip<true>, Astrix_HIP::THREAD_DIM, 0);
+    } else {
+      hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocksPerSm, astrixHash_hip<false>, Astrix_HIP::THREAD_DIM, 0);
+    }
+  }
+
+  void astrixHash_wrapper(
+    int blocks,
+    const uint64_t nonce_mask, 
+    const uint64_t nonce_fixed, 
+    uint64_t *final_nonces, 
+    int *nonce_count, 
+    int kIndex, 
+    size_t batchSize, 
+    uint8_t device,
+    bool isDev
+  ) {
+    int h_nonce_count = 0; // Initialize on the host
+
+    hipMemcpy(nonce_count, &h_nonce_count, sizeof(int), hipMemcpyHostToDevice);
+
+    if (isDev) {
+      hipLaunchKernelGGL(astrixHash_hip<true>, dim3(blocks), dim3(Astrix_HIP::THREAD_DIM), 0, 0,
+                          nonce_mask, nonce_fixed, final_nonces, nonce_count, kIndex, batchSize, device);
+    } else {
+      hipLaunchKernelGGL(astrixHash_hip<false>, dim3(blocks), dim3(Astrix_HIP::THREAD_DIM), 0, 0,
+                          nonce_mask, nonce_fixed, final_nonces, nonce_count, kIndex, batchSize, device);
+    }
+
+    // Check for errors after the kernel launch
+    hipError_t err = hipGetLastError();
+    if (err != hipSuccess)
+    {
+      printf("Kernel launch failed: %s\n", hipGetErrorString(err));
+    }
+  }
+
+  void newMatrix(uint8_t *in, bool isDev) {
+    // Prepare host-side data for the matrix, hash_header, and target
+    uint8_t h_matrix[MATRIX_SIZE][MATRIX_SIZE]; // Host-side matrix
+
+    // Fill the matrix
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+      memset(h_matrix[i], 0, MATRIX_SIZE);
+    }
+
+    alignas(64) uint64_t s0 = *(uint64_t*)&in[0];
+    alignas(64) uint64_t s1 = *(uint64_t*)&in[8];
+    alignas(64) uint64_t s2 = *(uint64_t*)&in[16];
+    alignas(64) uint64_t s3 = *(uint64_t*)&in[24];
+
+    Xoshiro256PlusPlusHasher hasher(s0, s1, s2, s3);
+                  
+    while (calculateRank(h_matrix) != MATRIX_SIZE)
+    {
+      for (int i = 0; i < MATRIX_SIZE; i++)
+      {
+        for (int j = 0; j < MATRIX_SIZE; j += QUARTER_MATRIX_SIZE)
+        {
+          uint64_t value = hasher.next();
+          for (int k = 0; k < 16; k++)
+          {
+            h_matrix[i][j + k] = uint16_t((value >> (4 * k)) & 0x0f);
+          }
+        }
+      }
+    }
+
+    hipMemcpyToSymbol(isDev ? matrix : matrix_dev, h_matrix, sizeof(h_matrix));
+  }
+
+}
+
 void benchAstrixHip()
 {
   int device;
@@ -272,6 +347,7 @@ void benchAstrixHip()
 
   hipDeviceProp_t deviceProps;
   hipGetDeviceProperties(&deviceProps, device); // Query device properties
+  int smCount = deviceProps.multiProcessorCount;
 
   // Get the architecture version (gcnArch) and store it for device code
   // int arch = deviceProps.gcnArch;
@@ -279,19 +355,19 @@ void benchAstrixHip()
 
   // Retrieve architecture-specific dimensions
   size_t blocks, threads, batchSize;
-  getArchDims(blocks, threads, batchSize);
+  // getArchDims(blocks, threads, batchSize);
 
   // Example input data setup
   uint64_t nonce_mask = 0xFFFFFFFFFFFFFFFF;  // Example nonce mask
   uint64_t nonce_fixed = 0x0000000000000000; // Example nonce fixed value
 
   // Allocate memory for final_nonce on the CPU
-  uint64_t h_final_nonces[HIP_ASTRIX_MAX_NONCES] = {0};
+  uint64_t h_final_nonces[Astrix_HIP::MAX_NONCES] = {0};
 
   // Allocate space for final_nonce on the GPU
   uint64_t *d_final_nonces;
 
-  hipMalloc((void **)&d_final_nonces, sizeof(uint64_t) * HIP_ASTRIX_MAX_NONCES);
+  hipMalloc((void **)&d_final_nonces, sizeof(uint64_t) * Astrix_HIP::MAX_NONCES);
 
   // Timing events
   hipEvent_t start, stop;
@@ -299,7 +375,7 @@ void benchAstrixHip()
   hipEventCreate(&stop);
 
   // Launch the kernel and perform the benchmark
-  size_t numRuns = 1000;
+  size_t numRuns = 500;
 
   int *d_nonce_count;
   int h_nonce_count = 0; // Initialize on the host
@@ -318,6 +394,7 @@ void benchAstrixHip()
   for (int i = 0; i < 4; i++)
   {
     h_target.number[i] = 0xFFFFFFFFFFFFFFFFULL; // Example target
+    // h_target.number[i] = 0; // Example target
   }
 
   // Fill the matrix
@@ -350,15 +427,29 @@ void benchAstrixHip()
   // Transfer the matrix, hash_header, and target data to device constant memory
   hipMemcpyToSymbol(matrix, h_matrix, sizeof(h_matrix));
   hipMemcpyToSymbol(hash_header, h_hash_header, sizeof(h_hash_header));
-  hipMemcpyToSymbol(target, &h_target, sizeof(h_target));
+  hipMemcpyToSymbol(&target, &h_target, sizeof(h_target));
+
+
+  int numBlocksPerSm;
+  hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, astrixHash_hip<false>, Astrix_HIP::THREAD_DIM, 0);
+  blocks = numBlocksPerSm*smCount*128;
+  batchSize = blocks*Astrix_HIP::THREAD_DIM;
+
+  if (batchSize == 0) {
+    blocks = 1;
+    batchSize = Astrix_HIP::THREAD_DIM;
+  }
+
+  printf("batchSize: %llu\nsmCount: %llu\n", batchSize, smCount);
+  fflush(stdout);
 
   // Record start event
   hipEventRecord(start, 0);
 
   for (int i = 0; i < numRuns; ++i)
   {
-    hipMemcpy(d_nonce_count, &h_nonce_count, sizeof(int), hipMemcpyHostToDevice);
-    hipLaunchKernelGGL(astrixHash_hip, dim3(blocks), dim3(threads), 0, 0,
+    hipMemset(d_nonce_count, 0, sizeof(int));
+    hipLaunchKernelGGL(astrixHash_hip<false>, dim3(blocks), dim3(Astrix_HIP::THREAD_DIM), 0, 0,
                        nonce_mask, nonce_fixed, d_final_nonces, d_nonce_count, i, batchSize, 0);
 
     // Check for errors after the kernel launch
@@ -380,7 +471,7 @@ void benchAstrixHip()
 
   // Calculate throughput
   float seconds = milliseconds / 1000.0f;
-  size_t totalComputations = blocks * threads * numRuns;
+  size_t totalComputations = batchSize * numRuns;
   float runsPerSecond = totalComputations / seconds;
 
   printf("Total time for %d runs: %.3f ms\n", numRuns, milliseconds);
diff --git a/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip.h b/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip.h
new file mode 100644
index 0000000..50f7a53
--- /dev/null
+++ b/src/tnn_hip/crypto/astrix-hash/astrix-hash.hip.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <inttypes.h>
+
+namespace Astrix_HIP {
+  constexpr int INPUT_SIZE = 80;
+  constexpr int MAX_NONCES = 640;
+  constexpr int THREAD_DIM = 1024;
+  /**
+   * Update the Astrix matrix data on the currently active GPU
+   * @param  in  A host pointer to the start of the new prePowHash
+   */
+  void newMatrix(uint8_t *in, bool isDev = false);
+
+  /**
+   * Call the astrixHash_hip kernel with launch parameters provided
+   * 
+   * @param  blocks  The amount of blocks to launch the POW kernel with
+   * @param  final_nonces  A device pointer to a buffer for storing all found nonces for this POW round
+   * @param  nonce_count  A device pointer to a modifiable tally of found nonces for this POW round
+   * @param  kIndex  The current run count/index for the current job on the active GPU
+   */
+  void astrixHash_wrapper(
+    int blocks,
+    const uint64_t nonce_mask, 
+    const uint64_t nonce_fixed, 
+    uint64_t *final_nonces, 
+    int *nonce_count, 
+    int kIndex, 
+    size_t batchSize, 
+    uint8_t device = 0,
+    bool isDev = false
+  );
+}
\ No newline at end of file
diff --git a/src/tnn_hip/crypto/astrix-hash/astrix_archdef.h b/src/tnn_hip/crypto/astrix-hash/astrix_archdef.h
index 314f2ab..424c5c2 100644
--- a/src/tnn_hip/crypto/astrix-hash/astrix_archdef.h
+++ b/src/tnn_hip/crypto/astrix-hash/astrix_archdef.h
@@ -31,7 +31,7 @@ using ArchDims = std::tuple<size_t, size_t, size_t>;
 
 // RDNA 3
   // RX 7900+
-    archDim(HIP_ASTRIX_gfx1100, 61440*256, 1024);
+    archDim(HIP_ASTRIX_gfx1100, 61440*192, 1024);
   // RX 7800+
     archDim(HIP_ASTRIX_gfx1101, 38400*128, 256);
   // RX 7700+
@@ -41,7 +41,6 @@ using ArchDims = std::tuple<size_t, size_t, size_t>;
 
 #elif defined(__HIP_PLATFORM_NVIDIA__)
 // NVIDIA platform: Define grid dimensions and shared memory size based on architecture
-  #if defined(__CUDA_ARCH__)
     #if __CUDA_ARCH__ >= 900  // Hopper
       archDim(HIP_ASTRIX, 2048, 256);
 
@@ -57,7 +56,6 @@ using ArchDims = std::tuple<size_t, size_t, size_t>;
     #else  // Older NVIDIA architectures
       archDim(HIP_ASTRIX, 2048, 128);
     #endif
-  #endif
 #else
     #error "Unsupported platform"
 #endif
@@ -65,31 +63,37 @@ using ArchDims = std::tuple<size_t, size_t, size_t>;
 ArchDims defaultDims = {HIP_ASTRIX_BLOCKS, HIP_ASTRIX_THREADS, HIP_ASTRIX_BATCH_SIZE};
 
 // Define a lookup table (map) that holds the block/thread/batch sizes for each architecture
-static inline const std::unordered_map<std::string, ArchDims> archDimsMap = {
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx900),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx906),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1010),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1030),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1031),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1032),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1034),
-  ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1100)
-};
+// #if defined(__HIP_PLATFORM_AMD__)
+// static inline const std::unordered_map<std::string, ArchDims> archDimsMap = {
+//   ARCH_DIM_ENTRY(HIP_ASTRIX, gfx900),
+//   ARCH_DIM_ENTRY(HIP_ASTRIX, gfx906),
+//   ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1010),
+//   ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1030),
+//   // ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1031),
+//   // ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1032),
+//   // ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1034),
+//   ARCH_DIM_ENTRY(HIP_ASTRIX, gfx1100)
+// };
+// #elif defined(__HIP_PLATFORM_NVIDIA__)
+// static inline const std::unordered_map<std::string, ArchDims> archDimsMap = {
+  
+// }
+// #endif
 
 // Function to retrieve architecture dimensions
-static inline void getArchDims(size_t &blocks, size_t &threads, size_t &batchSize) {
-  int device;
-  hipGetDevice(&device);
+// static inline void getArchDims(size_t &blocks, size_t &threads, size_t &batchSize) {
+//   int device;
+//   hipGetDevice(&device);
 
-  hipDeviceProp_t props;
-  hipGetDeviceProperties(&props, device);
+//   hipDeviceProp_t props;
+//   hipGetDeviceProperties(&props, device);
 
-  const char* archName = props.gcnArchName;
+//   const char* archName = props.gcnArchName;
 
-  // Find the architecture in the lookup table
-  auto it = archDimsMap.find(std::string(archName));
-  const ArchDims& dims = (it != archDimsMap.end()) ? it->second : defaultDims;
+//   // Find the architecture in the lookup table
+//   auto it = archDimsMap.find(std::string(archName));
+//   const ArchDims& dims = (it != archDimsMap.end()) ? it->second : defaultDims;
 
-  // Unpack the tuple into blocks, threads, and batchSize
-  std::tie(blocks, threads, batchSize) = dims;
-}
\ No newline at end of file
+//   // Unpack the tuple into blocks, threads, and batchSize
+//   std::tie(blocks, threads, batchSize) = dims;
+// }
\ No newline at end of file
diff --git a/src/tnn_hip/crypto/keccak-tiny.hip.inc b/src/tnn_hip/crypto/keccak-tiny.hip.inc
index 9730f93..990e450 100644
--- a/src/tnn_hip/crypto/keccak-tiny.hip.inc
+++ b/src/tnn_hip/crypto/keccak-tiny.hip.inc
@@ -44,7 +44,7 @@ __device__ static const uint64_t RC[24] = \
   REPEAT5(e; v += s;)
 
 /*** Keccak-f[1600] ***/
-__device__ __forceinline__ static void keccakf(void* state) {
+static inline void keccakf(void* state) {
   uint64_t* a = (uint64_t*)state;
   uint64_t b[5] = {0};
   uint64_t t = 0;