Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BPE tokenizers #62

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ set(mlxdata-src
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/ThreadController.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/ThreadPool.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/Tokenizer.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/BPETokenizer.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/Levenshtein.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/Utils.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/core/audio/Audio.cpp
Expand Down Expand Up @@ -204,7 +205,8 @@ set(mlxdata-src
${CMAKE_CURRENT_LIST_DIR}/mlx/data/op/Squeeze.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/op/Tokenize.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/op/ImageTransform.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/op/RemoveValue.cpp)
${CMAKE_CURRENT_LIST_DIR}/mlx/data/op/RemoveValue.cpp
${CMAKE_CURRENT_LIST_DIR}/mlx/data/op/Replace.cpp)

if(AWSSDK_FOUND)
list(APPEND mlxdata-src
Expand Down
51 changes: 51 additions & 0 deletions mlx/data/Dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "mlx/data/op/ReadFromTAR.h"
#include "mlx/data/op/RemoveValue.h"
#include "mlx/data/op/RenameKey.h"
#include "mlx/data/op/Replace.h"
#include "mlx/data/op/SampleTransform.h"
#include "mlx/data/op/SaveImage.h"
#include "mlx/data/op/Shape.h"
Expand Down Expand Up @@ -633,6 +634,31 @@ T Dataset<T, B>::remove_value_if(
}
}

// Returns a dataset with an op::Replace transform appended. The transform is
// constructed from (key, old, replacement, count); the replacement semantics
// themselves live in op::Replace.
// NOTE(review): unlike the neighbouring transforms (rename_key, tokenize_bpe)
// this member is not const-qualified; making it const would require changing
// the declaration in Dataset.h at the same time.
template <class T, class B>
T Dataset<T, B>::replace(
    const std::string& key,
    const std::string& old,
    const std::string& replacement,
    int count) {
  auto replace_op =
      std::make_shared<op::Replace>(key, old, replacement, count);
  return transform_(replace_op);
}

// Conditional variant of replace(): when `cond` is false the dataset is
// returned unchanged (a new T wrapping the same self_), otherwise an
// op::Replace transform built from (key, old, replacement, count) is appended.
template <class T, class B>
T Dataset<T, B>::replace_if(
    bool cond,
    const std::string& key,
    const std::string& old,
    const std::string& replacement,
    int count) {
  if (!cond) {
    return T(self_);
  }
  auto replace_op =
      std::make_shared<op::Replace>(key, old, replacement, count);
  return transform_(replace_op);
}

template <class T, class B>
T Dataset<T, B>::rename_key(const std::string& ikey, const std::string& okey)
const {
Expand Down Expand Up @@ -824,6 +850,31 @@ T Dataset<T, B>::tokenize_if(
}
}

// Returns a dataset with an op::BPETokenize transform appended. The transform
// is constructed from the input key, the symbol trie, the merge table and the
// output key, all forwarded unchanged.
template <class T, class B>
T Dataset<T, B>::tokenize_bpe(
    const std::string& ikey,
    std::shared_ptr<const core::Trie<char>> symbols,
    std::shared_ptr<const core::BPEMerges> merges,
    const std::string& okey) const {
  auto bpe_op = std::make_shared<op::BPETokenize>(ikey, symbols, merges, okey);
  return transform_(bpe_op);
}

// Conditional variant of tokenize_bpe(): when `cond` is false the dataset is
// returned unchanged (a new T wrapping the same self_), otherwise an
// op::BPETokenize transform built from (ikey, symbols, merges, okey) is
// appended.
template <class T, class B>
T Dataset<T, B>::tokenize_bpe_if(
    bool cond,
    const std::string& ikey,
    std::shared_ptr<const core::Trie<char>> symbols,
    std::shared_ptr<const core::BPEMerges> merges,
    const std::string& okey) const {
  if (!cond) {
    return T(self_);
  }
  auto bpe_op = std::make_shared<op::BPETokenize>(ikey, symbols, merges, okey);
  return transform_(bpe_op);
}

// Implement Stream
template <>
Stream Dataset<Stream, stream::Stream>::transform_(
Expand Down
23 changes: 23 additions & 0 deletions mlx/data/Dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,18 @@ class Dataset {
double value,
double pad) const;

// Appends an op::Replace transform constructed from (key, old, replacement,
// count) and returns the resulting dataset.
// NOTE(review): presumably replaces occurrences of `old` with `replacement`
// in the value stored under `key`, with count = -1 meaning "no limit" --
// confirm against op::Replace.
// NOTE(review): not const-qualified, unlike the surrounding transforms.
T replace(
const std::string& key,
const std::string& old,
const std::string& replacement,
int count = -1);
// Conditional form of replace(): appends the op::Replace transform only when
// `cond` is true; otherwise returns the dataset unchanged.
T replace_if(
bool cond,
const std::string& key,
const std::string& old,
const std::string& replacement,
int count = -1);

T rename_key(const std::string& ikey, const std::string& okey) const;
T rename_key_if(bool cond, const std::string& ikey, const std::string& okey)
const;
Expand Down Expand Up @@ -384,6 +396,17 @@ class Dataset {
bool ignore_unk = false,
const std::vector<double>& trie_key_scores = {},
const std::string& okey = "") const;
// Appends an op::BPETokenize transform constructed from (ikey, symbols,
// merges, okey) and returns the resulting dataset.
// `symbols` is the trie of base symbols and `merges` the BPE merge table.
// NOTE(review): an empty `okey` presumably means "write the result back to
// ikey" -- confirm against op::BPETokenize.
T tokenize_bpe(
const std::string& ikey,
std::shared_ptr<const core::Trie<char>> symbols,
std::shared_ptr<const core::BPEMerges> merges,
const std::string& okey = "") const;
// Conditional form of tokenize_bpe(): appends the op::BPETokenize transform
// only when `cond` is true; otherwise returns the dataset unchanged.
T tokenize_bpe_if(
bool cond,
const std::string& ikey,
std::shared_ptr<const core::Trie<char>> symbols,
std::shared_ptr<const core::BPEMerges> merges,
const std::string& okey = "") const;

protected:
std::shared_ptr<B> self_;
Expand Down
178 changes: 178 additions & 0 deletions mlx/data/core/BPETokenizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
// Copyright © 2024 Apple Inc.

#include <queue>
#include <sstream>

#include "mlx/data/core/BPETokenizer.h"
#include "mlx/data/core/Trie.h"

namespace mlx {
namespace data {
namespace core {

// Registers the merge rule (left, right) -> token.
//
// Both strings are interned in strings_ and the merge table is keyed by views
// into those interned copies, so the keys stay valid for as long as strings_
// owns them. If the same (left, right) pair is added more than once, the
// smallest token seen so far is kept.
void BPEMerges::add(
    const std::string& left,
    const std::string& right,
    int64_t token) {
  // insert() returns the already-stored element when the string is present.
  const std::string_view left_key(*strings_.insert(left).first);
  const std::string_view right_key(*strings_.insert(right).first);

  // operator[] creates the inner table the first time `left` is seen.
  auto& rights = merges_[left_key];
  auto [slot, is_new] = rights.try_emplace(right_key, token);
  if (!is_new) {
    slot->second = std::min(token, slot->second);
  }
}

// Looks up the merge rule for the adjacent pair (left, right).
//
// Returns {true, token} when a rule was registered with add(), and
// {false, 0} otherwise.
std::pair<bool, int64_t> BPEMerges::can_merge(
    std::string_view left,
    std::string_view right) const {
  if (auto by_left = merges_.find(left); by_left != merges_.end()) {
    const auto& rights = by_left->second;
    if (auto hit = rights.find(right); hit != rights.end()) {
      return {true, hit->second};
    }
  }
  return {false, 0};
}

BPETokenizer::BPETokenizer(
std::shared_ptr<const Trie<char>> symbols,
std::shared_ptr<const BPEMerges> merges)
: symbols_(symbols), merges_(merges) {}

// Tokenizes `input` with byte-pair encoding.
//
// The input is first split greedily into the longest base symbols found in
// the symbol trie. Adjacent symbols are then merged repeatedly, always taking
// the mergeable pair with the smallest merge token first (leftmost pair on
// ties), until no registered merge applies.
//
// Returns the token ids of the surviving symbols in input order; an empty
// input yields an empty vector.
// Throws std::runtime_error if some position of the input does not start with
// any symbol of the trie.
std::vector<int64_t> BPETokenizer::tokenize(std::string_view input) const {
  // Node of an index-based doubly linked list laid over `symbols`.
  struct Symbol {
    std::string_view value; // slice of `input` covered by this symbol
    int left; // index of the live neighbor to the left, -1 for the first
    int right; // index of the live neighbor to the right, size() for the last
    int64_t token; // token id, or -1 once merged into the left neighbor
  };

  // Candidate merge of two adjacent symbols, ordered for the priority queue.
  struct Pair {
    std::vector<Symbol>::iterator left;
    std::vector<Symbol>::iterator right;
    int64_t token;
    // Slice covering both symbols as they were when the pair was enqueued.
    // Adjacent symbols are contiguous in `input`, so it starts at left's data.
    std::string_view value;

    Pair(
        std::vector<Symbol>::iterator left,
        std::vector<Symbol>::iterator right,
        int64_t token)
        : left(left),
          right(right),
          token(token),
          value(left->value.data(), left->value.size() + right->value.size()) {}

    // std::priority_queue pops the greatest element, so a pair compares
    // "less" when it should be merged later. This must be a strict weak
    // ordering (never true for equal elements); ties on the token are broken
    // towards the leftmost pair so the result is deterministic.
    bool operator<(const Pair& other) const {
      if (token != other.token) {
        return token > other.token;
      }
      return left > other.left;
    }
  };

  // Transform the input to a sequence of basic symbols that will subsequently
  // be merged.
  std::vector<Symbol> symbols;
  symbols.reserve(input.size());
  for (auto it = input.begin(); it != input.end(); it++) {
    auto [node, length] = symbols_->search_longest_prefix(it, input.end());
    if (length == 0) {
      std::ostringstream msg;
      msg << "BPETokenizer: Unknown symbol '" << *it << "'";
      throw std::runtime_error(msg.str());
    }
    symbols.push_back(Symbol{
        std::string_view(&*it, length),
        static_cast<int>(symbols.size() - 1),
        static_cast<int>(symbols.size() + 1),
        node->id});
    // The loop increment consumes the last character of the symbol.
    it += length - 1;
  }

  // Nothing to merge; this also keeps std::next() below from stepping past
  // end() of an empty vector.
  if (symbols.empty()) {
    return {};
  }

  const int num_symbols = static_cast<int>(symbols.size());
  std::priority_queue<Pair> merge_queue;

  // Initialize the merge queue with every mergeable pair of neighbors.
  for (auto left = symbols.begin(), right = std::next(left);
       right != symbols.end();
       ++left, ++right) {
    auto [can_merge, token] = merges_->can_merge(left->value, right->value);
    if (can_merge) {
      merge_queue.emplace(left, right, token);
    }
  }

  while (!merge_queue.empty()) {
    // top() returns a const reference, so the pair has to be copied out
    // before it is popped.
    const Pair top = merge_queue.top();
    merge_queue.pop();

    // Skip invalidated pairs: one side was already merged away ...
    if (top.left->token < 0 || top.right->token < 0) {
      continue;
    }
    // ... or one side has grown since the pair was enqueued.
    if (top.value.size() != top.left->value.size() + top.right->value.size()) {
      continue;
    }
    // Defensive check that the pair still starts where the left symbol does.
    if (top.value.data() != top.left->value.data()) {
      continue;
    }

    // Yay! Valid pair, let's merge into the left one.
    top.left->token = top.token;
    top.left->value = top.value;

    // Invalidate our neighbor which we just merged into ourselves.
    top.right->token = -1;

    // Adjust the pointers to neighboring symbols
    top.left->right = top.right->right;
    if (top.right->right < num_symbols) {
      symbols[top.right->right].left = top.right->left;
    }

    // Check for a possible merge to the left.
    if (top.left != symbols.begin()) {
      auto neighbor = symbols.begin() + top.left->left;
      auto [can_merge, token] =
          merges_->can_merge(neighbor->value, top.left->value);
      if (can_merge) {
        merge_queue.emplace(neighbor, top.left, token);
      }
    }

    // Do the same to our right.
    if (top.left->right < num_symbols) {
      auto neighbor = symbols.begin() + top.left->right;
      auto [can_merge, token] =
          merges_->can_merge(top.left->value, neighbor->value);
      if (can_merge) {
        merge_queue.emplace(top.left, neighbor, token);
      }
    }
  }

  // Gather the final result in a vector, skipping merged-away symbols.
  std::vector<int64_t> tokens;
  for (const auto& symbol : symbols) {
    if (symbol.token >= 0) {
      tokens.push_back(symbol.token);
    }
  }

  return tokens;
}

} // namespace core
} // namespace data
} // namespace mlx
53 changes: 53 additions & 0 deletions mlx/data/core/BPETokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright © 2024 Apple Inc.

#pragma once

#include <unordered_map>
#include <unordered_set>

#include "mlx/data/core/Trie.h"

namespace mlx {
namespace data {
namespace core {

// Table of BPE merge rules: maps an adjacent (left, right) pair of strings to
// the token produced by merging them.
//
// The table is keyed by string views that point into the strings owned by
// strings_, so the two members must always travel together.
class BPEMerges {
 public:
  BPEMerges() = default;

  // A member-wise copy would leave the copy's views pointing into the
  // strings owned by `other`, which dangle as soon as `other` is destroyed.
  // Rebuild the table instead so that every copy owns its own strings.
  BPEMerges(const BPEMerges& other) {
    for (const auto& [left, rights] : other.merges_) {
      for (const auto& [right, token] : rights) {
        add(std::string(left), std::string(right), token);
      }
    }
  }

  BPEMerges& operator=(const BPEMerges& other) {
    if (this != &other) {
      BPEMerges copy(other);
      // Swapping unordered containers exchanges their elements without
      // relocating them, so the views keep pointing at the swapped strings.
      strings_.swap(copy.strings_);
      merges_.swap(copy.merges_);
    }
    return *this;
  }

  // Moving hands over the stored elements themselves, so the views in
  // merges_ remain valid in the destination object.
  BPEMerges(BPEMerges&&) = default;
  BPEMerges& operator=(BPEMerges&&) = default;

  // Registers the rule (left, right) -> token.
  void add(const std::string& left, const std::string& right, int64_t token);

  // Returns {true, token} if (left, right) is a registered merge and
  // {false, 0} otherwise.
  std::pair<bool, int64_t> can_merge(
      std::string_view left,
      std::string_view right) const;

  // Convenience overload for a contiguous character range split at `middle`:
  // [left, middle) is the left string and [middle, end) the right one.
  // Both iterators are dereferenced to obtain the underlying pointers.
  template <typename iterator_type>
  std::pair<bool, int64_t>
  can_merge(iterator_type left, iterator_type middle, iterator_type end) const {
    // switch to std::string_view(left, middle) when in C++20
    return can_merge(
        std::string_view(&(*left), std::distance(left, middle)),
        std::string_view(&(*middle), std::distance(middle, end)));
  }

 private:
  // Owns every string referenced by the views in merges_.
  std::unordered_set<std::string> strings_;
  // left -> (right -> token); all keys are views into strings_.
  std::unordered_map<
      std::string_view,
      std::unordered_map<std::string_view, int64_t>>
      merges_;
};

// Byte-pair-encoding tokenizer. It splits the input into base symbols looked
// up in a trie and then merges adjacent symbols according to a BPEMerges
// table, preferring merges with smaller tokens.
class BPETokenizer {
public:
// `symbols` is the trie of base symbols and `merges` the merge table; both
// are shared with the caller and must be non-null when tokenize() is called.
BPETokenizer(
std::shared_ptr<const Trie<char>> symbols,
std::shared_ptr<const BPEMerges> merges);

// Returns the token ids for `input`. Throws std::runtime_error when the input
// contains a position that matches no symbol of the trie.
std::vector<int64_t> tokenize(std::string_view input) const;

private:
// Trie of base symbols used for the initial segmentation.
std::shared_ptr<const Trie<char>> symbols_;
// Merge rules applied on top of the initial segmentation.
std::shared_ptr<const BPEMerges> merges_;
};

} // namespace core
} // namespace data
} // namespace mlx
Loading