diff --git a/.gitignore b/.gitignore index 565866fd4bcdc..e479c61805a6a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ models/* /vdot /Pipfile +build-info.h arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index e4d37497ee199..42f9c6a000765 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,41 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +# +# Build info header +# + +# Write header template to binary dir to keep source directory clean +file(WRITE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in" "\ +#ifndef BUILD_INFO_H\n\ +#define BUILD_INFO_H\n\ +\n\ +#define BUILD_NUMBER @BUILD_NUMBER@\n\ +#define BUILD_COMMIT \"@BUILD_COMMIT@\"\n\ +\n\ +#endif // BUILD_INFO_H\n\ +") + +# Generate initial build-info.h +include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") + # Add a custom target for build-info.h + add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") + + # Add a custom command to rebuild build-info.h when .git/index changes + add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h" + COMMENT "Generating build details from Git" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index" + VERBATIM + ) +else() + message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") +endif() + # # Compile flags # @@ -288,9 +323,22 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES # TODO: arm msvc? else() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) add_compile_options(-mcpu=native) endif() - # TODO: armv6,7,8 version specific flags + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + # Raspberry Pi 2 + add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Raspberry Pi 3, 4, Zero 2 (32-bit) + add_compile_options(-mfp16-format=ieee -mno-unaligned-access) + endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") message(STATUS "x86 detected") diff --git a/Makefile b/Makefile index 8c9721d743fcd..9f3113e0be5bb 100644 --- a/Makefile +++ b/Makefile @@ -148,19 +148,21 @@ ifdef LLAMA_PERF CXXFLAGS += -DGGML_PERF endif ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. 
+ # Raspberry Pi 3, 4, Zero 2 (64-bit) CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native endif ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, 2, 3 + # Raspberry Pi 1, Zero CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access endif ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 4 + # Raspberry Pi 2 CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations endif ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 4 + # Raspberry Pi 3, 4, Zero 2 (32-bit) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif @@ -192,41 +194,56 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ +libllama.so: llama.o ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h -main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +# +# Examples +# + +main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) +build-info.h: $(wildcard .git/index) scripts/build-info.sh + @scripts/build-info.sh > $@.tmp + @if ! 
cmp -s $@.tmp $@; then \ + mv $@.tmp $@; \ + else \ + rm $@.tmp; \ + fi # # Tests # -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 05deebcd10c79..0376961945ad7 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 19cbab1c38825..2cc1a1477762c 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,5 +1,6 @@ #include #include "ggml.h" +#include "build-info.h" #include #include #include @@ -90,9 +91,10 @@ int main(int argc, char ** argv) { } } - // create the ggml context + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); printf("Starting Test\n"); + // create the ggml context struct ggml_context * ctx; //const int sizex = 4096; //const int sizey = 11008; diff --git a/examples/common.cpp b/examples/common.cpp index 6c712c713db9b..ad7b0bba32f1f 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -1,13 +1,18 @@ #include "common.h" #include +#include #include #include #include #include #include #include -#include + +#if defined(__APPLE__) && defined(__MACH__) +#include +#include +#endif #if defined (_WIN32) #include @@ -25,19 +30,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int #define CP_UTF8 65001 #endif -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - // determine sensible default number of threads. - // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
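// A minimal sketch, assuming it mirrors the fallback in get_num_physical_cores() below
// (illustration only, not part of the patch): when no physical-core count can be read,
// the default becomes hardware_concurrency(), halved once it exceeds 4, or 4 if unknown.
#include <cstdint>
#include <thread>

static int32_t fallback_thread_count() {
    const unsigned int n = std::thread::hardware_concurrency(); // may be 0 when unknown
    // e.g. n = 16 -> 8 threads, n = 4 -> 4 threads, n = 0 -> 4 threads
    return n > 0 ? (n <= 4 ? (int32_t) n : (int32_t) (n / 2)) : 4;
}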
+int32_t get_num_physical_cores() { #ifdef __linux__ std::ifstream cpuinfo("/proc/cpuinfo"); - params.n_threads = std::count(std::istream_iterator(cpuinfo), - std::istream_iterator(), - std::string("processor")); -#endif - if (params.n_threads == 0) { - params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + std::string line; + while (std::getline(cpuinfo, line)) { + std::size_t pos = line.find("cpu cores"); + if (pos != std::string::npos) { + pos = line.find(": ", pos); + if (pos != std::string::npos) { + try { + // Extract the number and return it + return static_cast(std::stoul(line.substr(pos + 2))); + } catch (const std::invalid_argument &) { + // Ignore if we could not parse + } + } + } + } +#elif defined(__APPLE__) && defined(__MACH__) + int32_t num_physical_cores; + size_t len = sizeof(num_physical_cores); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; } +#elif defined(_WIN32) + //TODO: Implement +#endif + unsigned int n_threads = std::thread::hardware_concurrency(); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; +} +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; gpt_params default_params; diff --git a/examples/common.h b/examples/common.h index fce1d42a9da70..627696e30a4f6 100644 --- a/examples/common.h +++ b/examples/common.h @@ -13,11 +13,12 @@ // // CLI argument parsing // +int32_t get_num_physical_cores(); struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = -1; // new tokens to predict + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 88c425d4a1fd1..db73b6b44f07f 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e10de619c9d5f..b3e001476ea95 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include @@ -18,11 +19,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index b2dcc2910f333..c364242fbadb4 100644 --- a/examples/main/CMakeLists.txt +++ 
b/examples/main/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET main) add_executable(${TARGET} main.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 990d0fa023c63..7dc1005123775 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -5,6 +5,7 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -81,11 +82,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { @@ -161,23 +164,22 @@ int main(int argc, char ** argv) { std::vector session_tokens; if (!path_session.empty()) { - fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str()); + fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); - // REVIEW - fopen to check for existing session + // fopen to check for existing session FILE * fp = std::fopen(path_session.c_str(), "rb"); if (fp != NULL) { std::fclose(fp); session_tokens.resize(params.n_ctx); size_t n_token_count_out = 0; - const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out); + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } session_tokens.resize(n_token_count_out); - if (n_session_bytes > 0) { - fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes); - } else { - fprintf(stderr, "%s: could not load session file, will recreate\n", __func__); - } + fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); } else { fprintf(stderr, "%s: session file does not exist, will create\n", __func__); } @@ -214,7 +216,7 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { params.n_keep = (int)embd_inp.size(); } @@ -329,7 +331,7 @@ int main(int argc, char ** argv) { // insert n_left/2 tokens at the start of embd from last_n_tokens embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); - // REVIEW - stop saving session if we run out of context + // stop saving session if we run out of context path_session = ""; //printf("\n---\n"); @@ -355,6 +357,7 @@ int main(int argc, char ** argv) { n_session_consumed++; if (n_session_consumed >= (int) session_tokens.size()) { + ++i; break; } } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 5836df8b27752..61b17b828dd1b 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET perplexity) add_executable(${TARGET} 
perplexity.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 615157e7b68ec..2ca3388355bd0 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -106,11 +107,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 4e6c2c8314661..9a2aa7c6474fb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "build-info.h" #define LLAMA_API_INTERNAL #include "llama.h" @@ -308,6 +309,8 @@ int main(int argc, char ** argv) { return 1; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + // load the model fprintf(stderr, "Loading model\n"); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index fb27d45171c7a..475fc8be885a6 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dd175c690232e..198bd5fcb4cf6 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" +#include "build-info.h" #include #include @@ -50,6 +51,8 @@ int main(int argc, char ** argv) { ftype = (enum llama_ftype)atoi(argv[3]); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + int nthread = argc > 4 ? 
atoi(argv[4]) : 0; const int64_t t_main_start_us = ggml_time_us(); diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index cff79fa1f3e17..08dbe5c2b3edf 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET save-load-state) add_executable(${TARGET} save-load-state.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index f1531ba39eb5e..ea0a984d93816 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -17,6 +18,8 @@ int main(int argc, char ** argv) { return 1; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.n_predict < 0) { params.n_predict = 16; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b9d60db010077..033c7d5c88ff0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,15 +1,44 @@ +#include +#include #include #include +#include + #if defined(GGML_USE_HIPBLAS) +#include "hip/hip_runtime.h" +#include "hipblas/hipblas.h" #include "hip/hip_fp16.h" #else +#include +#include #include #endif -#include -#include "ggml-cuda.h" -typedef uint16_t ggml_fp16_t; -static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +#include "ggml-cuda.h" +#include "ggml.h" + +static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + cudaGetErrorString(err_)); \ + exit(1); \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); #define QK4_0 32 typedef struct { @@ -28,14 +57,14 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b #define QK4_2 16 typedef struct { - __half d; // delta + half d; // delta uint8_t qs[QK4_2 / 2]; // nibbles / quants } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); #define QK5_0 32 typedef struct { - __half d; // delta + half d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; @@ -43,9 +72,9 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QK5_1 32 typedef struct { - __half d; // delta - __half m; // min - uint32_t qh; // 5-th bit of quants + half d; // delta + half m; // min + uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); @@ -166,7 +195,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const uint8_t * pp = x[i].qs; - const uint32_t qh = x[i].qh; + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); for (int l = 0; l < QK5_1; l += 2) { const 
uint8_t vi = pp[l/2]; @@ -201,37 +231,50 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) { } } -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_0; dequantize_block_q4_0<<>>(vx, y); } -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_1; dequantize_block_q4_1<<>>(vx, y); } -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_2; dequantize_block_q4_2<<>>(vx, y); } -void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_0; dequantize_block_q5_0<<>>(vx, y); } -void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_1; dequantize_block_q5_1<<>>(vx, y); } -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK8_0; dequantize_block_q8_0<<>>(vx, y); } -dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { +// TODO: optimize +static __global__ void convert_fp16_to_fp32(const void * vx, float * y) { + const half * x = (const half *) vx; + + const int i = blockIdx.x; + + y[i] = __half2float(x[i]); +} + +static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) { + convert_fp16_to_fp32<<>>(x, y); +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda; @@ -245,6 +288,8 @@ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { return dequantize_row_q5_1_cuda; case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda; + case GGML_TYPE_F16: + return convert_fp16_to_fp32_cuda; default: return nullptr; } @@ -275,7 +320,7 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { +static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -294,7 +339,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { return ptr; } -void ggml_cuda_pool_free(void * ptr, size_t size) { +static void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -309,28 +354,55 @@ void ggml_cuda_pool_free(void * ptr, size_t size) { CUDA_CHECK(cudaFree(ptr)); } -cublasHandle_t g_cublasH = nullptr; -cudaStream_t g_cudaStream = nullptr; -cudaStream_t g_cudaStream2 = nullptr; -cudaEvent_t g_cudaEvent = nullptr; +#define GGML_CUDA_MAX_STREAMS 8 +#define GGML_CUDA_MAX_EVENTS 64 +static cublasHandle_t g_cublasH = nullptr; +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaStream_t 
g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr }; void ggml_init_cublas() { if (g_cublasH == nullptr) { - // create cublas handle, bind a stream - CUBLAS_CHECK(cublasCreate(&g_cublasH)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking)); - CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream)); + // create streams + for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking)); + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking)); + } + // create events + for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { + CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming)); + } - // create additional stream and event for synchronization - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking)); - CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming)); + // create cublas handle + CUBLAS_CHECK(cublasCreate(&g_cublasH)); + CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH)); // configure logging to stdout - // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); } } -cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { +void * ggml_cuda_host_malloc(size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + + return ptr; +} + +void ggml_cuda_host_free(void * ptr) { + CUDA_CHECK(cudaFreeHost(ptr)); +} + +static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { const uint64_t ne0 = src->ne[0]; const uint64_t ne1 = src->ne[1]; const uint64_t nb0 = src->nb[0]; @@ -358,12 +430,293 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, } } -void * ggml_cuda_host_malloc(size_t size) { - void * ptr; - CUDA_CHECK(cudaMallocHost((void **) &ptr, size)); - return ptr; +static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + + size_t x_size, y_size, d_size; + float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); + float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + + float * c_X = d_X + i * x_ne; + float * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + + // copy 
data to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, ne00, + c_Y, ne10, + &beta, c_D, ne01)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); } -void ggml_cuda_host_free(void * ptr) { - CUDA_CHECK(cudaFreeHost(ptr)); +static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + + size_t x_size, y_size, d_size; + half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size); + half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + + half * c_X = d_X + i * x_ne; + half * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + + // copy src0 to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + } + } + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // very slow due to no inlining + tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + } + } + } + + // copy src1 to device + CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, CUDA_R_16F, ne00, + c_Y, CUDA_R_16F, ne10, + &beta, c_D, CUDA_R_32F, ne01, + CUBLAS_COMPUTE_32F_FAST_16F, + CUBLAS_GEMM_DEFAULT)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, 
c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); +} + +static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_type type = src0->type; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + + size_t x_size, y_size, d_size, q_size; + float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); + float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type); + GGML_ASSERT(to_fp32_cuda != nullptr); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS]; + cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS]; + + float * c_X = d_X + i * x_ne; + float * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + char * c_Q = d_Q + i * q_sz; + + // copy src0 and convert to fp32 on device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2)); + to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2)); + + // copy src1 to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); + + // wait for conversion + CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, ne00, + c_Y, ne10, + &beta, c_D, ne01)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); + ggml_cuda_pool_free(d_Q, q_size); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + return true; + } + + return false; +} + +bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + 
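    // Illustrative comparison (hypothetical sizes, not taken from the patch): for a fp16
    // src0 of 4096x4096 (32 MiB) and a fp32 src1 of 4096x512 (8 MiB), the quantized path
    // below would transfer 40 MiB while the fp16 path would transfer 36 MiB, since src1
    // shrinks to 4 MiB once converted to fp16, so the fp16 kernel is chosen despite the
    // CPU-side conversion cost.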
size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cuda_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { + ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cuda_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + else { + return 0; + } } diff --git a/ggml-cuda.h b/ggml-cuda.h index 778995d372fbe..0e740e30908bc 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -2,6 +2,7 @@ #include "hipblas/hipblas.h" #include "hip/hip_runtime.h" #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T @@ -49,49 +50,16 @@ extern "C" { #endif -#define CUDA_CHECK(err) \ - do { \ - cudaError_t err_ = (err); \ - if (err_ != cudaSuccess) { \ - fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ - cudaGetErrorString(err_)); \ - exit(1); \ - } \ - } while (0) - -#define CUBLAS_CHECK(err) \ - do { \ - cublasStatus_t err_ = (err); \ - if (err_ != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) +void ggml_init_cublas(void); -extern cublasHandle_t g_cublasH; -extern cudaStream_t g_cudaStream; -extern cudaStream_t g_cudaStream2; -extern cudaEvent_t g_cudaEvent; +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -void ggml_init_cublas(void); +// TODO: export these with GGML_API void * ggml_cuda_host_malloc(size_t size); void ggml_cuda_host_free(void * ptr); -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); -void ggml_cuda_pool_free(void * ptr, size_t size); - -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t 
stream); -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); - -cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream); - -typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type); - #ifdef __cplusplus } #endif diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl deleted file mode 100644 index a65a79f4d6b58..0000000000000 --- a/ggml-opencl-dequant.cl +++ /dev/null @@ -1,63 +0,0 @@ -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -const char * clblast_dequant = MULTILINE_QUOTE( - -struct block_q4_0 -{ - float d; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_1 -{ - float d; - float m; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - const float m = blocks[i].m; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d);; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -); diff --git a/ggml-opencl.c b/ggml-opencl.c index b748f86b7a37e..4389eca393466 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -3,12 +3,141 @@ #define CL_TARGET_OPENCL_VERSION 110 #include +#include #include #include #include "ggml.h" -#include "ggml-opencl-dequant.cl" +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +const char * clblast_dequant = MULTILINE_QUOTE( + +struct block_q4_0 +{ + float d; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + +struct block_q4_1 +{ + float d; + float m; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + const float m = blocks[i].m; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = (vi & 0xf) * d + m; + result[index + 1] = (vi >> 4) * d + m; +} + +struct block_q4_2 +{ + ushort d; + uchar qs[8]; +}; + +__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { + const uint i = get_global_id(0) / 16; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*16 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + + +struct block_q5_0 +{ + float d; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; + result[index + 1] = (((vi >> 4) | vh1) - 16)*d; +} + +struct block_q5_1 +{ + ushort d; + ushort m; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + const float m = vload_half(0, (__global half*) &blocks[i].m); + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = ((vi & 0xf) | vh0)*d + m; + result[index + 1] = ((vi >> 4) | vh1)*d + m; +} + +struct block_q8_0 +{ + float d; + char qs[32]; +}; + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; +} + +); #define CL_CHECK(err, name) \ do { \ @@ -19,12 +148,26 @@ } \ } while (0) +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + + +typedef struct { + float d; // delta + uint32_t qh; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} cl_block_q5_0; + static cl_platform_id platform; static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program 
program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -97,6 +240,12 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); CL_CHECK(err, "clCreateKernel"); + kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); } static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { @@ -125,6 +274,7 @@ void ggml_cl_sgemm_wrapper( cl_kernel kernel; size_t global = n * k, local, size_qb; bool dequant; + cl_block_q5_0* cl_host_b; switch (btype) { case GGML_TYPE_F32: @@ -146,7 +296,36 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q4_2; local = 8; - size_qb = global * (sizeof(short) + local) / 16; + size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; + break; + case GGML_TYPE_Q5_0: + dequant = true; + kernel = kernel_q5_0; + local = 16; + // For some reason OpenCL seems to be incapable of working with structs of size 22. + // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU... + // TODO Find the reason, fix and remove workaround. + const block_q5_0* b = (const block_q5_0*) host_b; + cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); + for (size_t i = 0; i < global / 32; i++) { + cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d); + memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t)); + memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2); + } + host_b = (const float*) cl_host_b; + size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q5_1: + dequant = true; + kernel = kernel_q5_1; + local = 16; + size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q8_0: + dequant = true; + kernel = kernel_q8_0; + local = 32; + size_qb = global * (sizeof(float) + local) / 32; break; default: fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); @@ -171,12 +350,15 @@ void ggml_cl_sgemm_wrapper( err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); CL_CHECK(err, "clSetKernelArg"); - clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + CL_CHECK(err, "clEnqueueWriteBuffer qb"); } else { - clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + CL_CHECK(err, "clEnqueueWriteBuffer b"); } - clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + CL_CHECK(err, "clEnqueueWriteBuffer a"); if (dequant) { err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); CL_CHECK(err, "clEnqueueNDRangeKernel"); @@ -188,15 +370,20 @@ void ggml_cl_sgemm_wrapper( clReleaseEvent(ev_b); cl_event 
ev_sgemm; - CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); + CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, + (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, + m, n, k, + alpha, + cl_buffer_a, 0, lda, + cl_buffer_b, 0, ldb, + beta, + cl_buffer_c, 0, ldc, + &queue, &ev_sgemm); + + if (status != CLBlastSuccess) { + fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); + abort(); + } cl_event ev_c; clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); @@ -205,4 +392,7 @@ void ggml_cl_sgemm_wrapper( clWaitForEvents(1, &ev_c); clReleaseEvent(ev_sgemm); clReleaseEvent(ev_c); + if (btype == GGML_TYPE_Q5_0) { + free((void*) cl_host_b); + } } diff --git a/ggml.c b/ggml.c index d8882d6446f85..795839abcedcd 100644 --- a/ggml.c +++ b/ggml.c @@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) { #define UNUSED(x) (void)(x) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - #if defined(GGML_USE_ACCELERATE) #include #elif defined(GGML_USE_OPENBLAS) @@ -330,7 +322,7 @@ static ggml_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__wasm_simd128__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) @@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { return GGML_FP32_TO_FP16(x); } +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { + for (size_t i = 0; i < n; i++) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { + size_t i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; i++) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + + // // timing // @@ -1087,7 +1105,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15)); + const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15)); y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); @@ -1911,8 +1929,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in const uint8_t vi = pp[l/2]; // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 
4; const int8_t vi0 = (vi & 0x0F) | vh0; const int8_t vi1 = (vi >> 4) | vh1; @@ -1948,8 +1966,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in const uint8_t vi = pp[l/2]; // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; const uint8_t vi0 = (vi & 0x0F) | vh0; const uint8_t vi1 = (vi >> 4) | vh1; @@ -3180,6 +3198,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s16b = wasm_i8x16_splat(0x10); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit and sub 16 + const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b); + const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -3220,8 +3304,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * for (int j = 0; j < QK8_0/2; j++) { const uint8_t v0 = x0[j]; - const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4; + const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; + const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; 
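            // Note: the 1u literal keeps the shift well defined; j can reach 15 here, and
            // shifting a signed 1 left by 31 bits overflows int, which is undefined behaviour.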
const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16; const int x1_0 = ((v0 >> 4) | x1_0h) - 16; @@ -3311,6 +3395,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv) + summs; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + static bool x = true; + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0lz, qhl); + const v128_t v0hf = wasm_v128_or(v0hz, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -3354,8 +3509,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * for (int j = 0; j < QK8_1/2; j++) { const uint8_t v0 = x0[j]; - const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4; + const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; + const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; const int x0_0 = (v0 & 0x0F) | x0_0h; const int x1_0 = (v0 >> 4) | x1_0h; @@ -4057,6 +4212,27 @@ bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; + + switch (ftype) { + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: 
wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); + + return wtype; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -4167,12 +4343,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } - // initialize cuBLAS - #if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUBLAS) ggml_init_cublas(); - #elif defined(GGML_USE_CLBLAST) +#elif defined(GGML_USE_CLBLAST) ggml_cl_init(); - #endif +#endif is_first_call = false; } @@ -4253,7 +4428,7 @@ void ggml_free(struct ggml_context * ctx) { } size_t ggml_used_mem(const struct ggml_context * ctx) { - return ctx->objects_end->offs + ctx->objects_end->size; + return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { @@ -7943,7 +8118,7 @@ static void ggml_compute_forward_rms_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -7959,12 +8134,9 @@ static bool ggml_compute_forward_mul_mat_use_blas( const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if ( -#if !defined(GGML_USE_CUBLAS) - ggml_is_contiguous(src0) && + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && -#endif - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ return true; @@ -7972,7 +8144,6 @@ static bool ggml_compute_forward_mul_mat_use_blas( return false; } - #endif static void ggml_compute_forward_mul_mat_f32( @@ -7988,7 +8159,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -8045,7 +8216,16 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, 
src1, dst)) { if (params->ith != 0) { return; @@ -8059,43 +8239,13 @@ static void ggml_compute_forward_mul_mat_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { -#if !defined(GGML_USE_CUBLAS) const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); -#endif float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) // zT = y * xT ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, @@ -8112,12 +8262,6 @@ static void ggml_compute_forward_mul_mat_f32( #endif } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -8247,7 +8391,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -8263,37 +8416,8 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { -#if defined(GGML_USE_CUBLAS) - // copy src0 while converting src1 - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); - - // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 
* ne02 + i02); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne11; ++i01) { - for (int64_t i00 = 0; i00 < ne10; ++i00) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)); - } - } - - assert(id*sizeof(ggml_fp16_t) <= params->wsize); - } -#else float * const wdata = params->wdata; { size_t id = 0; @@ -8305,28 +8429,8 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } -#endif -#if defined(GGML_USE_CUBLAS) - const ggml_fp16_t * y = (ggml_fp16_t *) wdata; - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, CUDA_R_16F, ne00, - d_Y, CUDA_R_16F, ne10, - &beta, d_D, CUDA_R_32F, ne01, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -8355,12 +8459,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; @@ -8513,7 +8611,16 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -8527,25 +8634,8 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size, q_size; - float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); - void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); - - const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type); - GGML_ASSERT(dequantize_row_q_cuda != NULL); -#else float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; -#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -8553,14 +8643,7 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CUBLAS) - // copy and dequantize on device - 
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2)); - - dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) const void* x = (char *) src0->data + i03*nb03 + i02*nb02; #else { @@ -8576,24 +8659,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const float * x = wdata; #endif -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); - - // wait for dequantization - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) // zT = y * xT ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, @@ -8611,13 +8677,6 @@ static void ggml_compute_forward_mul_mat_q_f32( } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); - ggml_cuda_pool_free(d_Q, q_size); -#endif //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -11601,18 +11660,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); + } + else +#endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning -#if defined(GGML_USE_CUBLAS) - // with cuBLAS, we need memory for the full 3D / 4D data of src1 - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#else // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); -#endif } else { cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); } @@ -11621,13 +11683,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if 
defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); @@ -12899,8 +12961,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // cast to 16 bins const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; @@ -12929,8 +12991,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // cast to 16 bins const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; diff --git a/ggml.h b/ggml.h index c1c5495c63f44..ef5a048c3b7e4 100644 --- a/ggml.h +++ b/ggml.h @@ -197,6 +197,14 @@ #define GGML_MAX_OPT 4 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + #ifdef __cplusplus extern "C" { #endif @@ -212,6 +220,9 @@ extern "C" { GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); + struct ggml_object; struct ggml_context; @@ -232,6 +243,20 @@ extern "C" { GGML_TYPE_COUNT, }; + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + }; + // available tensor operations: enum ggml_op { GGML_OP_NONE = 0, @@ -385,6 +410,9 @@ extern "C" { GGML_API bool ggml_is_quantized(enum ggml_type type); + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); diff --git a/llama-util.h b/llama-util.h index ca4dd162f59fe..d531588d58019 100644 --- a/llama-util.h +++ b/llama-util.h @@ -243,7 +243,8 @@ struct llama_mmap { #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file *) { + llama_mmap(struct llama_file *, bool prefetch = true) { + (void)prefetch; throw std::string("mmap not supported"); } #endif @@ -382,8 +383,13 @@ struct llama_mlock { #else static constexpr bool SUPPORTED = false; - void raw_lock(const void * addr, size_t size) { + size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const 
void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +401,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +412,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
         }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp
index f8b4c8e46b521..868a58a8b0b93 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -2373,7 +2372,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
@@ -2387,7 +2386,7 @@ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
 }
 
 // Returns the size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
+size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
@@ -2567,6 +2566,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_exp = llama_get_state_size(ctx);
+
+        if (n_state_size_cur != n_state_size_exp) {
+            fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_cur);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size);
+        llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size);
+    }
+
+    return true;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,
@@ -2605,15 +2683,15 @@ int llama_tokenize(
     return res.size();
 }
 
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }
 
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }
 
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
@@ -2625,7 +2703,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
}
 
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
@@ -2694,57 +2772,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>&
llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } - -size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - // TODO leverage mmap - llama_file file(path_session, "rb"); - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (!(magic == 'ggsn' && version == 0)) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return 0; - } - - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); - - // REVIEW - if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); - return 0; - } - - const uint32_t n_token_count = file.read_u32(); - LLAMA_ASSERT(n_token_capacity >= n_token_count); - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - - const size_t n_state_size = file.size - file.tell(); - const size_t n_orig_state_size = llama_get_state_size(ctx); - if (n_state_size != n_orig_state_size) { - fprintf(stderr, "%s : failed to validate state size\n", __func__); - } - std::unique_ptr state_data(new uint8_t[n_state_size]); - file.read_raw(state_data.get(), n_state_size); - return llama_set_state_data(ctx, state_data.get()); -} - -size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - // TODO save temp & swap - llama_file file(path_session, "wb"); - - const size_t n_state_size = llama_get_state_size(ctx); - std::unique_ptr state_data(new uint8_t[n_state_size]); - llama_copy_state_data(ctx, state_data.get()); - - file.write_u32('ggsn'); // magic - file.write_u32(0); // version - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); - - file.write_u32((uint32_t) n_token_count); // REVIEW - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - file.write_raw(state_data.get(), n_state_size); - return n_state_size; // REVIEW -} diff --git a/llama.h b/llama.h index 34a8f5b3ca52c..2f6ce8d831e6c 100644 --- a/llama.h +++ b/llama.h @@ -19,9 +19,11 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 1 -#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex -#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files +#define LLAMA_FILE_VERSION 1 +#define LLAMA_FILE_MAGIC 'ggjt' +#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' +#define LLAMA_SESSION_MAGIC 'ggsn' +#define LLAMA_SESSION_VERSION 0 #ifdef __cplusplus extern "C" { @@ -120,13 +122,13 @@ extern "C" { int n_threads); // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); + LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); // Sets the current rng seed. LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); // Returns the size in bytes of the state (rng, logits, embedding and kv_cache) - LLAMA_API size_t llama_get_state_size(struct llama_context * ctx); + LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. 
@@ -138,8 +140,8 @@ extern "C" { LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); // Save/load session file - LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); // Run the llama inference to obtain the logits and probabilities for the next token. // tokens + n_tokens is the provided batch of new tokens to process @@ -164,9 +166,9 @@ extern "C" { int n_max_tokens, bool add_bos); - LLAMA_API int llama_n_vocab(struct llama_context * ctx); - LLAMA_API int llama_n_ctx (struct llama_context * ctx); - LLAMA_API int llama_n_embd (struct llama_context * ctx); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API int llama_n_embd (const struct llama_context * ctx); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row @@ -180,7 +182,7 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake new file mode 100644 index 0000000000000..fb46ed2b5939e --- /dev/null +++ b/scripts/build-info.cmake @@ -0,0 +1,53 @@ +set(TEMPLATE_FILE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in") +set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") +set(BUILD_NUMBER 0) +set(BUILD_COMMIT "unknown") + +# Look for git +find_package(Git) +if(NOT Git_FOUND) + execute_process( + COMMAND which git + OUTPUT_VARIABLE GIT_EXECUTABLE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT GIT_EXECUTABLE STREQUAL "") + set(Git_FOUND TRUE) + message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") + else() + message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. 
Consider installing Git or ensuring it is in the PATH.") + endif() +endif() + +# Get the commit count and hash +if(Git_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE HEAD + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_HEAD_RESULT + ) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE COUNT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_COUNT_RESULT + ) + if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0) + set(BUILD_COMMIT ${HEAD}) + set(BUILD_NUMBER ${COUNT}) + endif() +endif() + +# Only write the header if it's changed to prevent unnecessary recompilation +if(EXISTS ${HEADER_FILE}) + file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") + list(GET CONTENTS 0 EXISTING) + if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") + configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + endif() +else() + configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) +endif() diff --git a/scripts/build-info.sh b/scripts/build-info.sh new file mode 100755 index 0000000000000..507d7e1537b69 --- /dev/null +++ b/scripts/build-info.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +BUILD_NUMBER="0" +BUILD_COMMIT="unknown" + +REV_LIST=$(git rev-list --count HEAD) +if [ $? -eq 0 ]; then + BUILD_NUMBER=$REV_LIST +fi + +REV_PARSE=$(git rev-parse --short HEAD) +if [ $? -eq 0 ]; then + BUILD_COMMIT=$REV_PARSE +fi + +echo "#ifndef BUILD_INFO_H" +echo "#define BUILD_INFO_H" +echo "" +echo "#define BUILD_NUMBER $BUILD_NUMBER" +echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" +echo "" +echo "#endif // BUILD_INFO_H"
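
Notes on the changes above:

For reference, scripts/build-info.sh (and the CMake path through scripts/build-info.cmake) emits a header of the shape shown below. The build number and commit hash here are placeholders, and the fprintf line is only a sketch of how a program built against build-info.h might report them; it is not taken from the patch itself.

    /* build-info.h (generated; placeholder values) */
    #ifndef BUILD_INFO_H
    #define BUILD_INFO_H

    #define BUILD_NUMBER 500
    #define BUILD_COMMIT "abc1234"

    #endif // BUILD_INFO_H

    /* typical use from a program that includes build-info.h */
    fprintf(stderr, "build: %d (%s)\n", BUILD_NUMBER, BUILD_COMMIT);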
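The llama_load_session_file / llama_save_session_file functions introduced in llama.h now report success as a bool instead of returning a byte count. A rough usage sketch, assuming an already-initialized llama_context * ctx; the file name and the fixed-size token buffer are illustrative only:

    // try to resume an earlier session; fall back to an empty prompt cache
    llama_token tokens[512];
    size_t n_tokens = 0;

    if (!llama_load_session_file(ctx, "session.bin", tokens, 512, &n_tokens)) {
        fprintf(stderr, "no usable session found, starting fresh\n");
        n_tokens = 0;
    }

    // ... evaluate and sample as usual, appending new tokens to the buffer ...

    if (!llama_save_session_file(ctx, "session.bin", tokens, n_tokens)) {
        fprintf(stderr, "failed to save session\n");
    }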
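Replacing "1 <<" with "1u <<" in the ggml.c bit-extraction expressions above is not purely cosmetic: for the Q5 formats the extracted bit index can reach 31, and shifting a signed 32-bit int left by 31 is undefined behavior in C, so the mask is now built from an unsigned literal. A minimal sketch of the reconstruction those loops perform (the helper name is illustrative and not part of the patch):

    // rebuild one 5-bit quant: the low 4 bits come from the packed nibble,
    // the 5th bit is stored separately in the 32-bit qh field
    // (1u keeps the shift well-defined even when bit == 31)
    static inline uint8_t q5_reconstruct(uint8_t nibble, uint32_t qh, int bit) {
        const uint8_t high = (uint8_t) (((qh & (1u << bit)) >> bit) << 4);
        return (uint8_t) ((nibble & 0x0F) | high); // value in 0..31
    }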