Skip to content

Commit

Permalink
make sure to remove chat features from tokenized ds
Browse files (browse the repository at this point in the history)
  • Loading branch information
winglian committed Oct 6, 2024
1 parent 7b24570 commit d0cdac2
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/axolotl/core/datasets/chat.py
Original file line number | Diff line number | Diff line change
@@ -44,9 +44,12 @@ def map_fn(ex):
process_count or os.cpu_count() # type: ignore[assignment]
)
num_proc = min(64, process_or_cpu_count)
features = data.features.keys()
tokenized_data = data.map(
map_fn,
num_proc=num_proc,
keep_in_memory=keep_in_memory,
remove_columns=features,
desc="Tokenizing Chats",
)
super().__init__(tokenized_data.data, *args, **kwargs)

0 comments on commit d0cdac2

Please sign in to comment.