Skip to content

Commit

Permalink
Feature/audio size filter (#184)
Browse files Browse the repository at this point in the history
* add new OP: audio_size_filter

* rename get_image_size to get_file_size
  • Loading branch information
drcege authored Jan 19, 2024
1 parent 8600d83 commit 33d82fe
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 8 deletions.
4 changes: 4 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ process:
min_duration: 0 # the min audio duration of filter range (in seconds)
max_duration: 3600 # the max audio duration of filter range (in seconds)
any_or_all: any # keep this sample when any/all audios meet the filter condition
- audio_size_filter: # Keep data samples whose audios' sizes are within a specified range.
min_size: "0" # the min audio size of filter range
max_size: "1TB" # the max audio size of filter range
any_or_all: any # keep this sample when any/all audios meet the filter condition
- average_line_length_filter: # filter text with the average length of lines out of specific range.
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# yapf: disable
from . import (alphanumeric_filter, audio_duration_filter,
from . import (alphanumeric_filter, audio_duration_filter, audio_size_filter,
average_line_length_filter, character_repetition_filter,
face_area_filter, flagged_words_filter,
image_aspect_ratio_filter, image_shape_filter,
Expand Down
74 changes: 74 additions & 0 deletions data_juicer/ops/filter/audio_size_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import get_file_size, size_to_bytes

from ..base_op import OPERATORS, Filter


@OPERATORS.register_module('audio_size_filter')
class AudioSizeFilter(Filter):
    """Filter that keeps samples whose audio files have an on-disk size
    (given as a string such as "800kb" or "1MB") inside a given range.
    """

    def __init__(self,
                 min_size: str = '0',
                 max_size: str = '1TB',
                 any_or_all: str = 'any',
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param min_size: Lower bound of the accepted audio size, as a
            size string handed to ``size_to_bytes``. Defaults to "0",
            i.e. no lower constraint.
        :param max_size: Upper bound of the accepted audio size, as a
            size string handed to ``size_to_bytes``. Defaults to "1TB",
            used as an approximation of "unlimited".
        :param any_or_all: Strategy applied across all audios of one
            sample. 'any': keep the sample if at least one audio is in
            range. 'all': keep the sample only if every audio is in
            range.
        :param args: extra args
        :param kwargs: extra args
        :raises ValueError: if ``any_or_all`` is neither 'any' nor 'all'.
        """
        super().__init__(*args, **kwargs)

        # Bounds are converted once here and compared against the file
        # sizes collected in compute_stats.
        self.min_size = size_to_bytes(min_size)
        self.max_size = size_to_bytes(max_size)

        supported_strategies = ('any', 'all')
        if any_or_all not in supported_strategies:
            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
                             f'Can only be one of ["any", "all"].')
        self.any = (any_or_all == 'any')

    def compute_stats(self, sample, context=False):
        """
        Record the file size of every audio of the sample in its stats.

        :param sample: the sample to compute stats for; it must already
            hold a ``Fields.stats`` entry.
        :param context: unused by this operator.
        :return: the same sample, with ``StatsKeys.audio_sizes`` filled.
        """
        stats = sample[Fields.stats]

        # A previous pass already stored the sizes: nothing to do.
        if StatsKeys.audio_sizes in stats:
            return sample

        has_audios = self.audio_key in sample and sample[self.audio_key]
        if not has_audios:
            # No audio in this sample: store an empty stat.
            stats[StatsKeys.audio_sizes] = np.array([], dtype=np.float64)
            return sample

        # Only the file sizes are needed, so the audios are never loaded
        # into memory.
        stats[StatsKeys.audio_sizes] = list(
            map(get_file_size, sample[self.audio_key]))
        return sample

    def process(self, sample):
        """
        Decide whether the sample is kept, based on the stored sizes.

        :param sample: a sample whose stats already hold
            ``StatsKeys.audio_sizes``.
        :return: True if the sample has no audio sizes; otherwise
            whether any/all of them (per the configured strategy) lie
            within ``[min_size, max_size]``.
        """
        sizes = sample[Fields.stats][StatsKeys.audio_sizes]
        in_range = np.array(
            [self.min_size <= size <= self.max_size for size in sizes])

        # Samples without audios are always kept.
        if len(in_range) == 0:
            return True

        return in_range.any() if self.any else in_range.all()
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/image_size_filter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import get_image_size, size_to_bytes
from data_juicer.utils.mm_utils import get_file_size, size_to_bytes

from ..base_op import OPERATORS, Filter

Expand Down Expand Up @@ -53,7 +53,7 @@ def compute_stats(self, sample, context=False):

# for size calculation, no need to load images into memory
sample[Fields.stats][StatsKeys.image_sizes] = [
get_image_size(img_path) for img_path in sample[self.image_key]
get_file_size(img_path) for img_path in sample[self.image_key]
]

return sample
Expand Down
1 change: 1 addition & 0 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ class StatsKeysConstant(object):

# audios
audio_duration = 'audio_duration'
audio_sizes = 'audio_sizes'

# multimodal
# image-text
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def pil_to_opencv(pil_image):
return opencv_image


def get_image_size(path, ):
def get_file_size(path):
    """Return the size, in bytes, of the file located at ``path``.

    :param path: path of the file to inspect.
    :return: the file size in bytes as reported by the file system.
    :raises OSError: if the file does not exist or is inaccessible.
    """
    import os

    file_status = os.stat(path)
    return file_status.st_size

Expand Down
3 changes: 2 additions & 1 deletion docs/Operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
|-----------------------------------|:------:|-------------------------------------------------|
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 24 | Edits and transforms samples |
| [ Filter ]( #filter ) | 26 | Filters out low-quality samples |
| [ Filter ]( #filter ) | 27 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 4 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |

Expand Down Expand Up @@ -80,6 +80,7 @@ All the specific operators are listed below, each featured with several capabili
|--------------------------------|------------|--------|----------------------------------------------------------------------------------------------------------------|
| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range |
| audio_duration_filter | Audio | - | Keep data samples whose audios' durations are within a specified range |
| audio_size_filter | Audio | - | Keep data samples whose audios' sizes are within a specified range |
| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range |
| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range |
| face_area_filter | Image | - | Keeps samples containing images with face area ratios within the specified range |
Expand Down
7 changes: 4 additions & 3 deletions docs/Operators_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
|------------------------------------|:--:|---------------|
| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 24 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 26 | 过滤低质量样本 |
| [ Filter ]( #filter ) | 27 | 过滤低质量样本 |
| [ Deduplicator ]( #deduplicator ) | 4 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 |

Expand Down Expand Up @@ -77,15 +77,16 @@ Data-Juicer 中的算子分为以下 5 种类型。
|--------------------------------|------------|--------|------------------------------------|
| alphanumeric_filter | General | en, zh | 保留字母数字比例在指定范围内的样本 |
| audio_duration_filter | Audio | - | 保留样本中包含的音频的时长在指定范围内的样本 |
| audio_size_filter | Audio | - | 保留样本中包含的音频的大小(bytes)在指定范围内的样本 |
| average_line_length_filter | Code | en, zh | 保留平均行长度在指定范围内的样本 |
| character_repetition_filter | General | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 |
| face_area_filter | Image | - | 保留样本中包含的图片的最大脸部区域在指定范围内的样本 |
| flagged_words_filter | General | en, zh | 保留使标记字比率保持在指定阈值以下的样本 |
| image_aspect_ratio_filter | Image | - | 保留样本中包含的图片的宽高比在指定范围内的样本 |
| image_shape_filter | Image | - | 保留样本中包含的图片的形状(即宽和高)在指定范围内的样本 |
| image_size_filter | Image | - | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 |
| image_text_matching_filter | Multimodal | - | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 |
| image_text_similarity_filter | Multimodal | - | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 |
| image_text_matching_filter | Multimodal | - | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 |
| image_text_similarity_filter | Multimodal | - | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 |
| language_id_score_filter | General | en, zh | 保留特定语言的样本,通过预测的置信度得分来判断 |
| maximum_line_length_filter | Code | en, zh | 保留最大行长度在指定范围内的样本 |
| perplexity_filter | General | en, zh | 保留困惑度低于指定阈值的样本 |
Expand Down
132 changes: 132 additions & 0 deletions tests/ops/filter/test_audio_size_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import os
import unittest

from datasets import Dataset

from data_juicer.ops.filter.audio_size_filter import AudioSizeFilter
from data_juicer.utils.constant import Fields


class AudioSizeFilterTest(unittest.TestCase):
    """Unit tests for AudioSizeFilter, run against three audio fixtures."""

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             '..', 'data')
    aud1_path = os.path.join(data_path, 'audio1.wav')  # 970574 / 948K
    aud2_path = os.path.join(data_path, 'audio2.wav')  # 2494872 / 2.4M
    aud3_path = os.path.join(data_path, 'audio3.ogg')  # 597254 / 583K

    @staticmethod
    def _as_samples(*audio_groups):
        # One sample per group, each holding its audio paths under 'audios'.
        return [{'audios': list(group)} for group in audio_groups]

    def _single_audio_samples(self):
        # Three samples, each with exactly one of the fixture audios.
        return self._as_samples([self.aud1_path], [self.aud2_path],
                                [self.aud3_path])

    def _paired_audio_samples(self):
        # Three samples, each with two of the fixture audios.
        return self._as_samples([self.aud1_path, self.aud2_path],
                                [self.aud2_path, self.aud3_path],
                                [self.aud1_path, self.aud3_path])

    def _run_audio_size_filter(self, dataset: Dataset, target_list, op, np=1):
        """Run ``op`` over ``dataset`` with ``np`` processes and compare
        the kept audio columns against ``target_list``."""
        # The operator writes into the stats column, so create it if absent.
        stats_missing = Fields.stats not in dataset.features
        if stats_missing:
            dataset = dataset.add_column(name=Fields.stats,
                                         column=[{}] * dataset.num_rows)
        with_stats = dataset.map(op.compute_stats, num_proc=np)
        kept = with_stats.filter(op.process, num_proc=np)
        kept = kept.select_columns(column_names=[op.audio_key])
        self.assertEqual(kept.to_list(), target_list)

    def test_min_max(self):
        # Only audio1 lies between both bounds.
        expected = self._as_samples([self.aud1_path])
        dataset = Dataset.from_list(self._single_audio_samples())
        op = AudioSizeFilter(min_size="800kb", max_size="1MB")
        self._run_audio_size_filter(dataset, expected, op)

    def test_min(self):
        # Lower bound only: audio3 is too small.
        expected = self._as_samples([self.aud1_path], [self.aud2_path])
        dataset = Dataset.from_list(self._single_audio_samples())
        op = AudioSizeFilter(min_size="900kib")
        self._run_audio_size_filter(dataset, expected, op)

    def test_max(self):
        # Upper bound only: audio2 is too large.
        expected = self._as_samples([self.aud1_path], [self.aud3_path])
        dataset = Dataset.from_list(self._single_audio_samples())
        op = AudioSizeFilter(max_size="2MiB")
        self._run_audio_size_filter(dataset, expected, op)

    def test_any(self):
        # 'any' keeps every sample that contains audio1.
        expected = self._as_samples([self.aud1_path, self.aud2_path],
                                    [self.aud1_path, self.aud3_path])
        dataset = Dataset.from_list(self._paired_audio_samples())
        op = AudioSizeFilter(min_size="800kb", max_size="1MB",
                             any_or_all='any')
        self._run_audio_size_filter(dataset, expected, op)

    def test_all(self):
        # 'all' drops every pair, since each has an out-of-range audio.
        expected = []
        dataset = Dataset.from_list(self._paired_audio_samples())
        op = AudioSizeFilter(min_size="800kb", max_size="1MB",
                             any_or_all='all')
        self._run_audio_size_filter(dataset, expected, op)

    def test_filter_in_parallel(self):
        # Same expectation as test_min_max, but with two worker processes.
        expected = self._as_samples([self.aud1_path])
        dataset = Dataset.from_list(self._single_audio_samples())
        op = AudioSizeFilter(min_size="800kb", max_size="1MB")
        self._run_audio_size_filter(dataset, expected, op, np=2)


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 33d82fe

Please sign in to comment.