This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Structural Overhaul #162

Merged: 40 commits from `dfo/model/bloom` into `main` on Apr 30, 2023. The diff below reflects the first 33 commits.

Commits:
d84aa7f Create a Model trait (danforbes, Apr 15, 2023)
e0713a1 Bloom model (danforbes, Apr 15, 2023)
6bfda75 cargo fmt (danforbes, Apr 16, 2023)
73f59c3 Rename llama-rs to llm-base (danforbes, Apr 16, 2023)
e670c25 Clippy (danforbes, Apr 16, 2023)
c4b4176 Remove redundant associated Model type from Model trait (danforbes, Apr 16, 2023)
1cf305f Remove associated Layer type from Model trait (danforbes, Apr 16, 2023)
0d4dde9 cargo fmt (danforbes, Apr 16, 2023)
849c28d Docs (danforbes, Apr 16, 2023)
54ad890 Tests and examples (danforbes, Apr 16, 2023)
4ba7c1c Layers are private (danforbes, Apr 16, 2023)
dcf85ff Merge branch 'main' of github.com:rustformers/llama-rs into dfo/model… (philpax, Apr 22, 2023)
43ecac1 Merge branch 'main' into dfo/model/bloom (philpax, Apr 25, 2023)
440bd69 Fix build (philpax, Apr 25, 2023)
5658484 refactor: introduce llm(-cli) (philpax, Apr 25, 2023)
bcf5627 Fix model name in LLaMA inference example (danforbes, Apr 26, 2023)
5ac4b79 feat: wire up both bloom/llama to CLI (philpax, Apr 26, 2023)
1601240 Merge branch 'dfo/model/bloom' of github.com:danforbes/llama-rs into … (philpax, Apr 26, 2023)
1761512 Add example for testing BLOOM inference (danforbes, Apr 26, 2023)
8d2d9c6 cargo fmt (danforbes, Apr 26, 2023)
813bdd1 Add launch.json for debugging loading and inference (danforbes, Apr 26, 2023)
c608b4b Merge branch 'main' into dfo/model/bloom (danforbes, Apr 27, 2023)
e19418c Check tensor dimensions when loading (danforbes, Apr 27, 2023)
e35f93b `Model` -> `KnownModel`, `ErasedModel` -> `Model` (danforbes, Apr 27, 2023)
288df7f Merge branch 'main' into dfo/model/bloom (danforbes, Apr 29, 2023)
0aea8f7 Refactor ggml stuff into a single crate (danforbes, Apr 27, 2023)
8594ac8 Use latest upstream ggml with alibi (danforbes, Apr 28, 2023)
a542c98 Improve examples (danforbes, Apr 28, 2023)
16fca15 Latest upstream ggml (danforbes, Apr 28, 2023)
974d2f7 Cleanup README (danforbes, Apr 28, 2023)
1abaa41 Rebase fix (danforbes, Apr 29, 2023)
f994fa8 GPT2/Cerebras loading and inference (danforbes, Apr 26, 2023)
ff99a80 Rebase & remove BLOOM (danforbes, Apr 30, 2023)
454f3a9 GitHub Action should support Git submodules (danforbes, Apr 30, 2023)
e69d487 Fix binary file name in README (danforbes, Apr 30, 2023)
608090b ggml-rs -> ggml (danforbes, Apr 30, 2023)
78db42c Add back BLOOM (danforbes, Apr 30, 2023)
1eb2e11 feat: re-enable BLOOM for now (philpax, Apr 30, 2023)
181d823 refactor: reintroduce ggml-sys and bindgen tool (philpax, Apr 30, 2023)
9314c68 fix: check out submodules for clippy CI (philpax, Apr 30, 2023)
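
Several commits above reshape the model abstraction: `Create a Model trait`, the removal of its associated `Model` and `Layer` types, and the rename `Model` -> `KnownModel`, `ErasedModel` -> `Model`. The PR's real traits live in `llm-base`; the sketch below only illustrates the general pattern those names suggest (a statically-typed trait per architecture behind an object-safe, type-erased trait), and every method signature in it is an assumption, not the crate's actual API:

```rust
/// Sketch only: implemented by each concrete architecture (LLaMA, GPT-2, ...).
/// Method names here are assumptions, not the actual `llm-base` API.
trait KnownModel {
    fn name(&self) -> &'static str;
    fn infer(&self, prompt: &str) -> String;
}

/// Sketch only: the object-safe, type-erased counterpart, so callers can hold
/// a `Box<dyn Model>` without knowing the architecture at compile time.
trait Model {
    fn name(&self) -> &'static str;
    fn infer(&self, prompt: &str) -> String;
}

/// Blanket impl: every statically-known model is usable as an erased `Model`.
impl<M: KnownModel> Model for M {
    fn name(&self) -> &'static str {
        KnownModel::name(self)
    }

    fn infer(&self, prompt: &str) -> String {
        KnownModel::infer(self, prompt)
    }
}
```

This split lets the CLI dispatch over `Box<dyn Model>` at runtime while each architecture crate keeps a concrete, statically-typed implementation.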
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "ggml-rs/ggml"]
path = ggml-rs/ggml
url = git@github.com:ggerganov/ggml.git
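
Because `ggml` is now vendored as a Git submodule, a checkout made with plain `git clone` will leave `ggml-rs/ggml` empty. This is standard Git (not specific to this repository): the submodule can be fetched after the fact with

```shell
git submodule update --init --recursive
```

and the README change later in this PR adds `--recurse-submodules` to the clone instructions for the same reason.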
44 changes: 44 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,44 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "lldb",
"request": "launch",
"name": "Debug example 'gpt2_inference'",
"cargo": {
"args": [
"build",
"--example=gpt2_inference",
"--package=gpt2"
],
"filter": {
"name": "gpt2_inference",
"kind": "example"
}
},
"args": ["${env:HOME}/.ggml-models/cerebras-gpt-13b.bin"],
"cwd": "${workspaceFolder}"
},
{
"type": "lldb",
"request": "launch",
"name": "Debug example 'llama_inference'",
"cargo": {
"args": [
"build",
"--example=llama_inference",
"--package=llama"
],
"filter": {
"name": "llama_inference",
"kind": "example"
}
},
"args": ["${env:HOME}/.ggml-models/gpt4all-7b.bin"],
"cwd": "${workspaceFolder}"
}
]
}
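
Both configurations use `"type": "lldb"`, the debugger type registered by the CodeLLDB extension for VS Code, so they assume that extension is installed. The same example can also be run without the debugger; the command below mirrors the `llama_inference` configuration, with the model path taken directly from it (substitute your own GGML model file):

```shell
cargo run --release --example llama_inference --package llama -- "$HOME/.ggml-models/gpt4all-7b.bin"
```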
99 changes: 57 additions & 42 deletions Cargo.lock


16 changes: 10 additions & 6 deletions Cargo.toml
@@ -1,16 +1,20 @@
 [workspace]
 members = [
-    "ggml-sys",
-    "ggml",
-    "ggml-format",
-    "llama-rs",
-    "llama-cli",
-    "generate-ggml-bindings"
+    # Crates
+    "ggml-rs",
+    "llm-base",
+    "gpt2",
+    "llama",
+    "llm",
+    "llm-cli",
 ]
 resolver = "2"
 
 [workspace.package]
 version = "0.1.0"
 
 [workspace.dependencies]
 bytemuck = "1.13.1"
+log = "0.4"
+rand = "0.8.5"
+serde = { version = "1.0", features = ["derive"] }
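
The `[workspace.package]` and `[workspace.dependencies]` tables let member crates inherit a single version and a single set of dependency pins. As a sketch using Cargo's standard workspace-inheritance syntax (this is not the literal contents of any crate in this PR), a member `Cargo.toml` could opt in like this:

```toml
[package]
name = "llm-base"          # hypothetical member, shown for illustration
version.workspace = true   # inherits 0.1.0 from [workspace.package]
edition = "2021"

[dependencies]
# Versions and features come from [workspace.dependencies] in the root manifest.
serde = { workspace = true }
rand = { workspace = true }
log = { workspace = true }
```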
69 changes: 36 additions & 33 deletions README.md
@@ -1,39 +1,31 @@
 # LLaMA-rs
 
-> Do the LLaMA thing, but now in Rust 🦀🚀🦙
+<!-- markdownlint-disable-file MD026 -->
+This project is a Rust port of
+[llama.cpp](https://github.com/ggerganov/llama.cpp) 🦙🦀🚀

[Review comment from a Collaborator on the wording above: "I'm thinking we'll remove this wording as we grow to accommodate more LLMs. I'll revise the wording on this after this PR lands, so nothing for you to do here - just mentioning it."]

 
-![A llama riding a crab, AI-generated](./doc/resources/logo2.png)
-
-> _Image by [@darthdeus](https://github.com/darthdeus/), using Stable Diffusion_
-
-[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/F1F8DNO5D)
+Just like its C++ counterpart, it is powered by the
+[`ggml`](https://github.com/ggerganov/ggml) tensor library, which allows running
+inference for Facebook's [LLaMA](https://github.com/facebookresearch/llama)
+model on a CPU with good performance using full precision, f16 or 4-bit
+quantized versions of the model.
 
 [![Latest version](https://img.shields.io/crates/v/llama-rs.svg)](https://crates.io/crates/llama_rs)
 ![MIT/Apache2](https://shields.io/badge/license-MIT%2FApache--2.0-blue)
 [![Discord](https://img.shields.io/discord/1085885067601137734)](https://discord.gg/YB9WaXYAWU)
 
-![Gif showcasing language generation using llama-rs](./doc/resources/llama_gif.gif)
-
-**LLaMA-rs** is a Rust port of the
-[llama.cpp](https://github.com/ggerganov/llama.cpp) project. This allows running
-inference for Facebook's [LLaMA](https://github.com/facebookresearch/llama)
-model on a CPU with good performance using full precision, f16 or 4-bit
-quantized versions of the model.
+![A llama riding a crab, AI-generated](./doc/resources/logo2.png)
 
-Just like its C++ counterpart, it is powered by the
-[`ggml`](https://github.com/ggerganov/ggml) tensor library, achieving the same
-performance as the original code.
+> _Image by [@darthdeus](https://github.com/darthdeus/), using Stable Diffusion_
 
 ## Getting started
 
 Make sure you have a Rust 1.65.0 or above and C toolchain[^1] set up.
 
-`llama-rs` is a Rust library, while `llama-cli` is a CLI application that wraps
-`llama-rs` and offers basic inference capabilities.
+`llm-base`, `gpt2`, and `llama` are Rust libraries, while `llm-cli` is a CLI
+application that wraps `gpt2` and `llama` and offers basic inference
+capabilities.
 
-The following instructions explain how to build `llama-cli`.
+The following instructions explain how to build the CLI application.
 
 **NOTE**: For best results, make sure to build and run in release mode.
 Debug builds are going to be very slow.
@@ -43,41 +35,45 @@
 Run
 
 ```shell
-cargo install --git https://github.com/rustformers/llama-rs llama-cli
+cargo install --git https://github.com/rustformers/llama-rs llm-cli
 ```
 
-to install `llama-cli` to your Cargo `bin` directory, which `rustup` is likely to
+to install `llm-cli` to your Cargo `bin` directory, which `rustup` is likely to
 have added to your `PATH`.
 
-It can then be run through `llama-cli`.
+The CLI application can then be run through `llm-cli`.
+
+![Gif showcasing language generation using llama-rs](./doc/resources/llama_gif.gif)
 
 ### Building from repository
 
-Clone the repository, and then build it through
+Clone the repository and then build it with
 
 ```shell
-cargo build --release --bin llama-cli
+git clone --recurse-submodules git@github.com:rustformers/llama-rs.git
+cargo build --release
 ```
 
-The resulting binary will be at `target/release/llama-cli[.exe]`.
+The resulting binary will be at `target/release/llm-cli[.exe]`.
 
 It can also be run directly through Cargo, using
 
 ```shell
-cargo run --release --bin llama-cli -- <ARGS>
+cargo run --release --bin llm-cli -- <ARGS>
 ```
 
 This is useful for development.
 
-### Getting the weights
+### Getting LLaMA weights
 
 In order to run the inference code in `llama-rs`, a copy of the model's weights
 is required.
 
 #### From Hugging Face
 
 Compatible weights - not necessarily the original LLaMA weights - can be found
-on [Hugging Face by searching for GGML](https://huggingface.co/models?search=ggml). At present, LLaMA-architecture models are supported.
+on [Hugging Face by searching for GGML](https://huggingface.co/models?search=ggml).
+At present, LLaMA-architecture models are supported.
 
 #### LLaMA original weights
 
@@ -107,6 +103,13 @@ cargo run -p llama-cli quantize /path/to/your/models/7B/ggml-model-f16.bin /path
 > The [llama.cpp repository](https://github.com/ggerganov/llama.cpp) has
 > additional information on how to obtain and run specific models.
 
+### GPT2
+
+OpenAI's [GPT-2](https://jalammar.github.io/illustrated-gpt2/) architecture is
+also supported. The open-source family of
+[Cerebras](https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/)
+models is built on this architecture.
+
 _Support for other open source models is currently planned. For models where
 weights can be legally distributed, this section will be updated with scripts to
 make the install process as user-friendly as possible. Due to the model's legal
@@ -133,9 +136,9 @@ Some additional things to try:

 ![Gif showcasing alpaca repl mode](./doc/resources/alpaca_repl_screencap.gif)
 
-- Sessions can be loaded (`--load-session`) or saved (`--save-session`) to file. To automatically load
-and save the same session, use `--persist-session`. This can be used to cache prompts to reduce load
-time, too:
+- Sessions can be loaded (`--load-session`) or saved (`--save-session`) to file.
+  To automatically load and save the same session, use `--persist-session`.
+  This can be used to cache prompts to reduce load time, too:
 
 ![Gif showcasing prompt caching](./doc/resources/prompt_caching_screencap.gif)
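
The GPT2 section added above pairs with the `gpt2_inference` example introduced by this PR. A plausible invocation, with the model path borrowed from the `.vscode/launch.json` configuration earlier in the diff (substitute your own GGML-format GPT-2/Cerebras model):

```shell
cargo run --release --example gpt2_inference --package gpt2 -- "$HOME/.ggml-models/cerebras-gpt-13b.bin"
```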
9 changes: 0 additions & 9 deletions generate-ggml-bindings/Cargo.toml

This file was deleted.
