From 8fcde25ad7b3e4f23077071b0e6e3299884de132 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 15 Aug 2024 11:32:46 +0800 Subject: [PATCH] =?UTF-8?q?feat(xtask):=20=E6=B7=BB=E5=8A=A0=20convert=20?= =?UTF-8?q?=E5=91=BD=E4=BB=A4=EF=BC=8C=E6=94=AF=E6=8C=81=E8=BD=AC=E6=8D=A2?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .cargo/config.toml | 1 + xtask/src/convert.rs | 63 ++++++++++++ xtask/src/convert/mod.rs | 125 ----------------------- xtask/src/filter.rs | 27 +++-- xtask/src/main.rs | 9 +- xtask/src/merge.rs | 24 +++-- xtask/src/show.rs | 5 +- xtask/src/split.rs | 37 +++---- xtask/src/{ => utils}/file_info.rs | 44 +++++++- xtask/src/utils/mod.rs | 111 ++++++++++++++++++++ xtask/src/{ => utils}/name_pattern.rs | 0 xtask/src/{convert => utils}/operator.rs | 16 ++- xtask/src/{convert => utils}/read.rs | 0 xtask/src/{ => utils}/shards.rs | 0 xtask/src/{convert => utils}/write.rs | 33 +++--- 15 files changed, 291 insertions(+), 204 deletions(-) create mode 100644 xtask/src/convert.rs delete mode 100644 xtask/src/convert/mod.rs rename xtask/src/{ => utils}/file_info.rs (78%) create mode 100644 xtask/src/utils/mod.rs rename xtask/src/{ => utils}/name_pattern.rs (100%) rename xtask/src/{convert => utils}/operator.rs (89%) rename xtask/src/{convert => utils}/read.rs (100%) rename xtask/src/{ => utils}/shards.rs (100%) rename xtask/src/{convert => utils}/write.rs (82%) diff --git a/.cargo/config.toml b/.cargo/config.toml index dcfa670..4e7f695 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -5,3 +5,4 @@ show = "xtask show" split = "xtask split" merge = "xtask merge" filter = "xtask filter" +convert = "xtask convert" diff --git a/xtask/src/convert.rs b/xtask/src/convert.rs new file mode 100644 index 0000000..f4e38d3 --- /dev/null +++ b/xtask/src/convert.rs @@ -0,0 +1,63 @@ +use crate::utils::{operate, show_file_info, Operator, OutputConfig, 
Shards}; +use std::path::PathBuf; + +#[derive(Args, Default)] +pub struct ConvertArgs { + /// File to convert + file: PathBuf, + /// Output directory for split shards + #[clap(long, short)] + output_dir: Option, + /// Operations to apply, separated by "->" + #[clap(long)] + ops: String, + /// Max count of tensors per shard + #[clap(long, short = 't')] + max_tensors: Option, + /// Max size in bytes per shard + #[clap(long, short = 's')] + max_bytes: Option, + /// If set, the first shard will not contain any tensor + #[clap(long, short)] + no_tensor_first: bool, +} + +impl ConvertArgs { + pub fn convert(self) { + let Self { + file, + output_dir, + ops, + max_tensors, + max_bytes, + no_tensor_first, + } = self; + + let shards = Shards::from(&*file); + let files = operate( + shards.iter_all(), + ops.split("->").map(|op| { + let op = op.trim(); + if let Some(content) = op.strip_prefix("filter-meta:") { + Operator::filter_meta_key(content) + } else if let Some(content) = op.strip_prefix("filter-tensor:") { + Operator::filter_tensor_name(content) + } else if let Some(content) = op.strip_prefix("cast:") { + Operator::cast(content) + } else { + panic!("Unsupported operation: {}", op) + } + }), + OutputConfig { + dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), + name: shards.name.into(), + shard_max_tensor_count: max_tensors.unwrap_or(usize::MAX), + shard_max_file_size: max_bytes.map_or(Default::default(), |s| s.parse().unwrap()), + shard_no_tensor_first: no_tensor_first, + }, + ) + .unwrap(); + + show_file_info(&files); + } +} diff --git a/xtask/src/convert/mod.rs b/xtask/src/convert/mod.rs deleted file mode 100644 index 5fe49c4..0000000 --- a/xtask/src/convert/mod.rs +++ /dev/null @@ -1,125 +0,0 @@ -mod operator; -mod read; -mod write; - -use crate::file_info::FileInfo; -use ggus::{GGmlType, GGufError, GGufMetaDataValueType, GGufReader}; -use indexmap::IndexMap; -use memmap2::{Mmap, MmapMut}; -use read::read_files; -use std::{ - borrow::Cow, - 
fs::File, - io, - path::PathBuf, - sync::{Arc, LazyLock}, -}; - -pub use operator::Operator; - -pub struct ConvertArgs { - pub input_files: Vec, - pub output_dir: PathBuf, - pub output_name: String, - pub operations: Vec, - pub split_tensor_count: usize, - pub split_file_size: usize, - pub split_no_tensor_first: bool, -} - -#[allow(dead_code)] -#[derive(Debug)] -pub enum ConvertError { - GGuf(GGufError), - Io(io::Error), -} - -impl ConvertArgs { - pub fn convert(self) -> Result, ConvertError> { - let Self { - input_files, - output_dir, - output_name, - operations, - split_tensor_count, - split_file_size, - split_no_tensor_first, - } = self; - - let files = input_files - .into_iter() - .map(|path| File::open(path).and_then(|f| unsafe { Mmap::map(&f) })) - .collect::, _>>() - .map_err(ConvertError::Io)?; - let files = read_files(files.iter().map(|m| &**m)).map_err(ConvertError::GGuf)?; - - let mut content = Content::new(&files); - - for op in operations { - content.apply(op); - } - - content - .write_files( - &output_dir, - &output_name, - split_tensor_count, - split_file_size, - split_no_tensor_first, - ) - .map_err(ConvertError::Io) - } -} - -struct Content<'a> { - alignment: usize, - meta_kvs: IndexMap>, - tensors: IndexMap>, -} - -struct MetaValue<'a> { - ty: GGufMetaDataValueType, - value: Cow<'a, [u8]>, -} - -impl MetaValue<'_> { - #[inline] - fn value_reader(&self) -> GGufReader { - GGufReader::new(&self.value) - } -} - -struct Tensor<'a> { - ty: GGmlType, - shape: Vec, - data: DataPromise<'a>, -} - -#[derive(Clone)] -enum DataPromise<'a> { - Borrowed(&'a [u8]), - Owned(Arc<[u8]>), - Lazy(Arc), -} - -impl ggus::DataFuture for DataPromise<'_> { - #[inline] - fn get(&self) -> &[u8] { - match self { - Self::Borrowed(data) => data, - Self::Owned(data) => data, - Self::Lazy(data) => data.get(), - } - } -} - -trait LazyData { - fn get(&self) -> &[u8]; -} - -impl MmapMut> LazyData for LazyLock { - #[inline] - fn get(&self) -> &[u8] { - self - } -} diff --git 
a/xtask/src/filter.rs b/xtask/src/filter.rs index 6d1b566..35a6edd 100644 --- a/xtask/src/filter.rs +++ b/xtask/src/filter.rs @@ -1,7 +1,4 @@ -use crate::{ - convert::{ConvertArgs, Operator}, - file_info::show_file_info, -}; +use crate::utils::{operate, show_file_info, Operator, OutputConfig}; use std::path::PathBuf; #[derive(Args, Default)] @@ -28,20 +25,22 @@ impl FilterArgs { filter_tensor, } = self; - let files = ConvertArgs { - output_name: file_path.file_stem().unwrap().to_str().unwrap().to_string() + ".part", - input_files: vec![file_path], - output_dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), - operations: vec![ + let files = operate( + [&file_path], + [ Operator::filter_meta_key(filter_meta), Operator::filter_tensor_name(filter_tensor), ], - split_tensor_count: usize::MAX, - split_file_size: usize::MAX, - split_no_tensor_first: false, - } - .convert() + OutputConfig { + dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), + name: file_path.file_stem().unwrap().to_str().unwrap().to_string() + ".part", + shard_max_tensor_count: usize::MAX, + shard_max_file_size: Default::default(), + shard_no_tensor_first: false, + }, + ) .unwrap(); + show_file_info(&files); } } diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 4c52029..1b8c2eb 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -1,11 +1,9 @@ mod convert; -mod file_info; mod filter; mod merge; -mod name_pattern; -mod shards; mod show; mod split; +mod utils; #[macro_use] extern crate clap; @@ -18,6 +16,7 @@ fn main() { Split(args) => args.split(), Merge(args) => args.merge(), Filter(args) => args.filter(), + Convert(args) => args.convert(), } } @@ -35,7 +34,5 @@ enum Commands { Split(split::SplitArgs), Merge(merge::MergeArgs), Filter(filter::FilterArgs), + Convert(convert::ConvertArgs), } - -const YES: &str = "✔️ "; -const ERR: &str = "❌ "; diff --git a/xtask/src/merge.rs b/xtask/src/merge.rs index cbfb7f6..6fd88bf 100644 --- a/xtask/src/merge.rs +++ 
b/xtask/src/merge.rs @@ -1,4 +1,4 @@ -use crate::{convert::ConvertArgs, file_info::show_file_info, shards::Shards}; +use crate::utils::{operate, show_file_info, OutputConfig, Shards}; use std::path::PathBuf; #[derive(Args, Default)] @@ -20,17 +20,19 @@ impl MergeArgs { return; } - let files = ConvertArgs { - input_files: shards.iter_all().collect(), - output_dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), - output_name: shards.name.into(), - operations: Vec::new(), - split_tensor_count: usize::MAX, - split_file_size: usize::MAX, - split_no_tensor_first: false, - } - .convert() + let files = operate( + shards.iter_all(), + [], + OutputConfig { + dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), + name: shards.name.into(), + shard_max_tensor_count: usize::MAX, + shard_max_file_size: Default::default(), + shard_no_tensor_first: false, + }, + ) .unwrap(); + show_file_info(&files); } } diff --git a/xtask/src/show.rs b/xtask/src/show.rs index 693bfff..29a6285 100644 --- a/xtask/src/show.rs +++ b/xtask/src/show.rs @@ -1,4 +1,4 @@ -use crate::{name_pattern::compile_patterns, shards::Shards, ERR, YES}; +use crate::utils::{compile_patterns, Shards}; use ggus::{GGufFileHeader, GGufMetaDataValueType, GGufMetaKV, GGufReadError, GGufReader}; use indexmap::IndexMap; use memmap2::Mmap; @@ -9,6 +9,9 @@ use std::{ path::{Path, PathBuf}, }; +const YES: &str = "✔️ "; +const ERR: &str = "❌ "; + #[derive(Args, Default)] pub struct ShowArgs { /// The file to show diff --git a/xtask/src/split.rs b/xtask/src/split.rs index c02a674..267aaac 100644 --- a/xtask/src/split.rs +++ b/xtask/src/split.rs @@ -1,5 +1,5 @@ -use crate::{convert::ConvertArgs, file_info::show_file_info, shards::Shards}; -use std::{path::PathBuf, str::from_utf8}; +use crate::utils::{operate, show_file_info, OutputConfig, Shards}; +use std::path::PathBuf; #[derive(Args, Default)] pub struct SplitArgs { @@ -35,30 +35,19 @@ impl SplitArgs { return; } - fn parse_size_num(num: &[u8], k: 
usize) -> Option { - from_utf8(num).ok()?.parse().ok().map(|n: usize| n << k) - } - - let files = ConvertArgs { - output_name: shards.name.into(), - input_files: vec![file], - output_dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), - operations: Vec::new(), - split_tensor_count: max_tensors.unwrap_or(usize::MAX), - split_file_size: match max_bytes { - Some(s) => match s.trim().as_bytes() { - [num @ .., b'G'] => parse_size_num(num, 30), - [num @ .., b'M'] => parse_size_num(num, 20), - [num @ .., b'K'] => parse_size_num(num, 10), - num => parse_size_num(num, 0), - } - .unwrap_or_else(|| panic!("Invalid max bytes format: \"{s}\"")), - None => usize::MAX, + let files = operate( + [&file], + [], + OutputConfig { + dir: output_dir.unwrap_or_else(|| std::env::current_dir().unwrap()), + name: shards.name.into(), + shard_max_tensor_count: max_tensors.unwrap_or(usize::MAX), + shard_max_file_size: max_bytes.map_or(Default::default(), |s| s.parse().unwrap()), + shard_no_tensor_first: no_tensor_first, }, - split_no_tensor_first: no_tensor_first, - } - .convert() + ) .unwrap(); + show_file_info(&files); } } diff --git a/xtask/src/file_info.rs b/xtask/src/utils/file_info.rs similarity index 78% rename from xtask/src/file_info.rs rename to xtask/src/utils/file_info.rs index 7fbd40e..8097b2b 100644 --- a/xtask/src/file_info.rs +++ b/xtask/src/utils/file_info.rs @@ -1,4 +1,11 @@ -use std::{cmp::max, fmt, path::PathBuf}; +use std::{ + cmp::max, + fmt, + num::ParseIntError, + path::PathBuf, + str::{from_utf8, FromStr}, + usize, +}; pub struct FileInfo { pub path: PathBuf, @@ -56,7 +63,40 @@ pub fn show_file_info(file_info: &[FileInfo]) { #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] #[repr(transparent)] -struct MemSize(pub usize); +pub(crate) struct MemSize(usize); + +impl Default for MemSize { + #[inline] + fn default() -> Self { + Self(usize::MAX) + } +} + +impl MemSize { + #[inline] + pub const fn nbytes(self) -> usize { + self.0 + } +} + +impl FromStr for 
MemSize { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + #[inline] + fn parse_size_num(num: &[u8], k: usize) -> Result { + from_utf8(num).unwrap().parse().map(|n: usize| n << k) + } + + match s.trim().as_bytes() { + [num @ .., b'G'] => parse_size_num(num, 30), + [num @ .., b'M'] => parse_size_num(num, 20), + [num @ .., b'K'] => parse_size_num(num, 10), + num => parse_size_num(num, 0), + } + .map(Self) + } +} impl fmt::Display for MemSize { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { diff --git a/xtask/src/utils/mod.rs b/xtask/src/utils/mod.rs new file mode 100644 index 0000000..2d3a824 --- /dev/null +++ b/xtask/src/utils/mod.rs @@ -0,0 +1,111 @@ +mod file_info; +mod name_pattern; +mod operator; +mod read; +mod shards; +mod write; + +use file_info::FileInfo; +use ggus::{GGmlType, GGufError, GGufMetaDataValueType, GGufReader}; +use indexmap::IndexMap; +use memmap2::{Mmap, MmapMut}; +use read::read_files; +use std::{ + borrow::Cow, + fs::File, + io, + path::{Path, PathBuf}, + sync::{Arc, LazyLock}, +}; + +pub(crate) use file_info::{show_file_info, MemSize}; +pub(crate) use name_pattern::compile_patterns; +pub(crate) use operator::Operator; +pub(crate) use shards::Shards; + +pub struct OutputConfig { + pub dir: PathBuf, + pub name: String, + pub shard_max_tensor_count: usize, + pub shard_max_file_size: MemSize, + pub shard_no_tensor_first: bool, +} + +#[allow(dead_code)] +#[derive(Debug)] +pub enum ConvertError { + GGuf(GGufError), + Io(io::Error), +} + +pub fn operate>( + input_files: impl IntoIterator, + operations: impl IntoIterator, + out: OutputConfig, +) -> Result, ConvertError> { + let files = input_files + .into_iter() + .map(|path| File::open(path).and_then(|f| unsafe { Mmap::map(&f) })) + .collect::, _>>() + .map_err(ConvertError::Io)?; + let files = read_files(files.iter().map(|m| &**m)).map_err(ConvertError::GGuf)?; + + let mut content = Content::new(&files); + + for op in operations { + content.apply(op); + } + + 
content.write_files(out).map_err(ConvertError::Io) +} + +struct Content<'a> { + alignment: usize, + meta_kvs: IndexMap>, + tensors: IndexMap>, +} + +struct MetaValue<'a> { + ty: GGufMetaDataValueType, + value: Cow<'a, [u8]>, +} + +impl MetaValue<'_> { + #[inline] + fn value_reader(&self) -> GGufReader { + GGufReader::new(&self.value) + } +} + +struct Tensor<'a> { + ty: GGmlType, + shape: Vec, + data: DataPromise<'a>, +} + +#[derive(Clone)] +enum DataPromise<'a> { + Borrowed(&'a [u8]), + Lazy(Arc), +} + +impl ggus::DataFuture for DataPromise<'_> { + #[inline] + fn get(&self) -> &[u8] { + match self { + Self::Borrowed(data) => data, + Self::Lazy(data) => data.get(), + } + } +} + +trait LazyData { + fn get(&self) -> &[u8]; +} + +impl MmapMut> LazyData for LazyLock { + #[inline] + fn get(&self) -> &[u8] { + self + } +} diff --git a/xtask/src/name_pattern.rs b/xtask/src/utils/name_pattern.rs similarity index 100% rename from xtask/src/name_pattern.rs rename to xtask/src/utils/name_pattern.rs diff --git a/xtask/src/convert/operator.rs b/xtask/src/utils/operator.rs similarity index 89% rename from xtask/src/convert/operator.rs rename to xtask/src/utils/operator.rs index 4d7afe0..d1ed4e4 100644 --- a/xtask/src/convert/operator.rs +++ b/xtask/src/utils/operator.rs @@ -1,5 +1,4 @@ -use super::Content; -use crate::{convert::DataPromise, name_pattern::compile_patterns}; +use super::{compile_patterns, Content, DataPromise}; use ggus::{DataFuture, GGmlType}; use half::f16; use memmap2::MmapMut; @@ -11,7 +10,7 @@ use std::{ sync::{Arc, LazyLock}, }; -pub enum Operator { +pub(crate) enum Operator { FilterMetaKey(Regex), FilterTensorName(Regex), Cast(GGmlType), @@ -29,6 +28,15 @@ impl Operator { pub fn filter_tensor_name(p: impl AsRef) -> Self { Self::FilterTensorName(compile_patterns(p.as_ref())) } + + pub fn cast(t: impl AsRef) -> Self { + let t = t.as_ref(); + Self::Cast(match t.to_lowercase().as_str() { + "f16" | "fp16" | "half" => GGmlType::F16, + "f32" | "fp32" | "float" => 
GGmlType::F32, + _ => panic!("unsupported cast type: {t}"), + }) + } } impl Content<'_> { @@ -96,7 +104,7 @@ impl Content<'_> { }) .unwrap_or("reference"); - if layout.split(';').any(|s| s == "transposed") { + if layout.split('+').any(|s| s == "transposed") { return; } diff --git a/xtask/src/convert/read.rs b/xtask/src/utils/read.rs similarity index 100% rename from xtask/src/convert/read.rs rename to xtask/src/utils/read.rs diff --git a/xtask/src/shards.rs b/xtask/src/utils/shards.rs similarity index 100% rename from xtask/src/shards.rs rename to xtask/src/utils/shards.rs diff --git a/xtask/src/convert/write.rs b/xtask/src/utils/write.rs similarity index 82% rename from xtask/src/convert/write.rs rename to xtask/src/utils/write.rs index c100ae1..8cf700e 100644 --- a/xtask/src/convert/write.rs +++ b/xtask/src/utils/write.rs @@ -1,22 +1,21 @@ -use super::Content; -use crate::{file_info::FileInfo, shards::Shards}; +use super::{Content, FileInfo, OutputConfig, Shards}; use ggus::{GGufFileHeader, GGufFileSimulator, GGufFileWriter}; -use std::{fs::File, io, iter::zip, path::Path, thread}; +use std::{fs::File, io, iter::zip, thread}; impl Content<'_> { - pub fn write_files( - self, - output_dir: &Path, - output_name: &str, - split_tensor_count: usize, - split_file_size: usize, - split_no_tensor_first: bool, - ) -> Result, io::Error> { + pub fn write_files(self, out: OutputConfig) -> Result, io::Error> { let Self { alignment, meta_kvs, tensors, } = self; + let OutputConfig { + dir, + name, + shard_max_tensor_count, + shard_max_file_size, + shard_no_tensor_first, + } = out; // 规划分片方案 @@ -29,15 +28,15 @@ impl Content<'_> { let mut shards = vec![vec![]]; for (name, tensor) in tensors { match &mut *shards { - [_] if split_no_tensor_first => { + [_] if shard_no_tensor_first => { simulator = GGufFileSimulator::with_alignment(alignment).finish(); simulator.write_tensor(&name, tensor.ty, &tensor.shape); shards.push(vec![(name, tensor)]); } [.., current] => { 
simulator.write_tensor(&name, tensor.ty, &tensor.shape); - if current.len() < split_tensor_count - && simulator.written_bytes() < split_file_size + if current.len() < shard_max_tensor_count + && simulator.written_bytes() < shard_max_file_size.nbytes() { current.push((name, tensor)); } else { @@ -54,8 +53,8 @@ impl Content<'_> { let meta_kvs = &meta_kvs; let names = Shards { - dir: output_dir, - name: output_name, + dir: &dir, + name: &name, index: 0, count: shards.len(), format: 5, @@ -63,7 +62,7 @@ impl Content<'_> { // 并行写入文件 - std::fs::create_dir_all(output_dir)?; + std::fs::create_dir_all(&dir)?; thread::scope(|s| { zip(shards, names) .enumerate()