Feature/audio size filter (#184)

* add new OP: audio_size_filter
* rename get_image_size to get_file_size
modelscope · Jan 19, 2024 · 33d82fe · 33d82fe
1 parent 8600d83
commit 33d82fe
Show file tree

Hide file tree

Showing 9 changed files with 221 additions and 8 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -118,6 +118,10 @@ process:
       min_duration: 0                                         # the min audio duration of filter range (in seconds)
       max_duration: 3600                                      # the max audio duration of filter range (in seconds)
       any_or_all: any                                         # keep this sample when any/all audios meet the filter condition
+  - audio_size_filter:                                      # Keep data samples whose audios' sizes are within a specified range.
+      min_size: "0"                                           # the min audio size of filter range
+      max_size: "1TB"                                         # the max audio size of filter range
+      any_or_all: any                                         # keep this sample when any/all audios meet the filter condition
   - average_line_length_filter:                             # filter text with the average length of lines out of specific range.
       min_len: 10                                             # the min length of filter range
       max_len: 10000                                          # the max length of filter range

diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
@@ -1,5 +1,5 @@
 # yapf: disable
-from . import (alphanumeric_filter, audio_duration_filter,
+from . import (alphanumeric_filter, audio_duration_filter, audio_size_filter,
                average_line_length_filter, character_repetition_filter,
                face_area_filter, flagged_words_filter,
                image_aspect_ratio_filter, image_shape_filter,

diff --git a/data_juicer/ops/filter/audio_size_filter.py b/data_juicer/ops/filter/audio_size_filter.py
@@ -0,0 +1,74 @@
+import numpy as np
+
+from data_juicer.utils.constant import Fields, StatsKeys
+from data_juicer.utils.mm_utils import get_file_size, size_to_bytes
+
+from ..base_op import OPERATORS, Filter
+
+
+@OPERATORS.register_module('audio_size_filter')
+class AudioSizeFilter(Filter):
+    """Keep data samples whose audio sizes (in bytes/kb/MB/...) are within a
+    specific range.
+    """
+
+    def __init__(self,
+                 min_size: str = '0',
+                 max_size: str = '1TB',
+                 any_or_all: str = 'any',
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param min_size: The min audio size to keep samples. Set to "0" by
+            default for no size constraint.
+        :param max_size: The max audio size to keep samples. Set to "1TB"
+            by default, an approximation of the unlimited case.
+        :param any_or_all: keep this sample with 'any' or 'all' strategy of
+            all audios. 'any': keep this sample if any audios meet the
+            condition. 'all': keep this sample only if all audios meet the
+            condition.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+        super().__init__(*args, **kwargs)
+        self.min_size = size_to_bytes(min_size)
+        self.max_size = size_to_bytes(max_size)
+        if any_or_all not in ['any', 'all']:
+            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
+                             f'Can only be one of ["any", "all"].')
+        self.any = (any_or_all == 'any')
+
+    def compute_stats(self, sample, context=False):
+        # check if it's computed already
+        if StatsKeys.audio_sizes in sample[Fields.stats]:
+            return sample
+
+        # there is no audio in this sample
+        if self.audio_key not in sample or not sample[self.audio_key]:
+            sample[Fields.stats][StatsKeys.audio_sizes] = np.array(
+                [], dtype=np.float64)
+            return sample
+
+        # for size calculation, no need to load audios into memory
+        sample[Fields.stats][StatsKeys.audio_sizes] = [
+            get_file_size(aud_path) for aud_path in sample[self.audio_key]
+        ]
+
+        return sample
+
+    def process(self, sample):
+        audio_sizes = sample[Fields.stats][StatsKeys.audio_sizes]
+        keep_bools = np.array([
+            self.min_size <= audio_size <= self.max_size
+            for audio_size in audio_sizes
+        ])
+        if len(keep_bools) <= 0:
+            return True
+
+        # different strategies
+        if self.any:
+            return keep_bools.any()
+        else:
+            return keep_bools.all()
diff --git a/data_juicer/ops/filter/image_size_filter.py b/data_juicer/ops/filter/image_size_filter.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import get_image_size, size_to_bytes
+from data_juicer.utils.mm_utils import get_file_size, size_to_bytes
 
 from ..base_op import OPERATORS, Filter
 
@@ -53,7 +53,7 @@ def compute_stats(self, sample, context=False):
 
         # for size calculation, no need to load images into memory
         sample[Fields.stats][StatsKeys.image_sizes] = [
-            get_image_size(img_path) for img_path in sample[self.image_key]
+            get_file_size(img_path) for img_path in sample[self.image_key]
         ]
 
         return sample

diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py
@@ -128,6 +128,7 @@ class StatsKeysConstant(object):
 
     # audios
     audio_duration = 'audio_duration'
+    audio_sizes = 'audio_sizes'
 
     # multimodal
     # image-text

diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
@@ -81,7 +81,7 @@ def pil_to_opencv(pil_image):
     return opencv_image
 
 
-def get_image_size(path, ):
+def get_file_size(path):
     import os
     return os.path.getsize(path)
 

diff --git a/docs/Operators.md b/docs/Operators.md
@@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
 | [ Mapper ]( #mapper )             |   24   | Edits and transforms samples                    |
-| [ Filter ]( #filter )             |   26   | Filters out low-quality samples                 |
+| [ Filter ]( #filter )             |   27   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
 
@@ -80,6 +80,7 @@ All the specific operators are listed below, each featured with several capabili
 |--------------------------------|------------|--------|----------------------------------------------------------------------------------------------------------------|
 | alphanumeric_filter            | General    | en, zh | Keeps samples with alphanumeric ratio within the specified range                                               |
 | audio_duration_filter          | Audio      | -      | Keep data samples whose audios' durations are within a specified range                                         |
+| audio_size_filter              | Audio      | -      | Keep data samples whose audios' sizes are within a specified range                                         |
 | average_line_length_filter     | Code       | en, zh | Keeps samples with average line length within the specified range                                              |
 | character_repetition_filter    | General    | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range                               |
 | face_area_filter               | Image      | -      | Keeps samples containing images with face area ratios within the specified range                               |

diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
@@ -10,7 +10,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
 | [ Mapper ]( #mapper )              | 24 | 对数据样本进行编辑和转换  |
-| [ Filter ]( #filter )              | 26 | 过滤低质量样本       |
+| [ Filter ]( #filter )              | 27 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |
 
@@ -77,15 +77,16 @@ Data-Juicer 中的算子分为以下 5 种类型。
 |--------------------------------|------------|--------|------------------------------------|
 | alphanumeric_filter            | General    | en, zh | 保留字母数字比例在指定范围内的样本                  |
 | audio_duration_filter          | Audio      | -      | 保留样本中包含的音频的时长在指定范围内的样本             |
+| audio_size_filter              | Audio      | -      | 保留样本中包含的音频的大小（bytes）在指定范围内的样本             |
 | average_line_length_filter     | Code       | en, zh | 保留平均行长度在指定范围内的样本                   |
 | character_repetition_filter    | General    | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 |
 | face_area_filter               | Image      | -      | 保留样本中包含的图片的最大脸部区域在指定范围内的样本         |
 | flagged_words_filter           | General    | en, zh | 保留使标记字比率保持在指定阈值以下的样本               |
 | image_aspect_ratio_filter      | Image      | -      | 保留样本中包含的图片的宽高比在指定范围内的样本            |
 | image_shape_filter             | Image      | -      | 保留样本中包含的图片的形状（即宽和高）在指定范围内的样本       |
 | image_size_filter              | Image      | -      | 保留样本中包含的图片的大小（bytes）在指定范围内的样本      |
-| image_text_matching_filter     | Multimodal | -      | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本   |
-| image_text_similarity_filter   | Multimodal | -      | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 |
+| image_text_matching_filter     | Multimodal | -      | 保留图像-文本的分类匹配分（基于BLIP模型）在指定范围内的样本   |
+| image_text_similarity_filter   | Multimodal | -      | 保留图像-文本的特征余弦相似度（基于CLIP模型）在指定范围内的样本 |
 | language_id_score_filter       | General    | en, zh | 保留特定语言的样本，通过预测的置信度得分来判断            |
 | maximum_line_length_filter     | Code       | en, zh | 保留最大行长度在指定范围内的样本                   |
 | perplexity_filter              | General    | en, zh | 保留困惑度低于指定阈值的样本                     |

diff --git a/tests/ops/filter/test_audio_size_filter.py b/tests/ops/filter/test_audio_size_filter.py
@@ -0,0 +1,132 @@
+import os
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.audio_size_filter import AudioSizeFilter
+from data_juicer.utils.constant import Fields
+
+
+class AudioSizeFilterTest(unittest.TestCase):
+
+    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                             '..', 'data')
+    aud1_path = os.path.join(data_path, 'audio1.wav')  # 970574 / 948K
+    aud2_path = os.path.join(data_path, 'audio2.wav')  # 2494872 / 2.4M
+    aud3_path = os.path.join(data_path, 'audio3.ogg')  # 597254 / 583K
+
+    def _run_audio_size_filter(self,dataset: Dataset, target_list, op, np=1):
+        if Fields.stats not in dataset.features:
+            dataset = dataset.add_column(name=Fields.stats,
+                                         column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.compute_stats, num_proc=np)
+        dataset = dataset.filter(op.process, num_proc=np)
+        dataset = dataset.select_columns(column_names=[op.audio_key])
+        res_list = dataset.to_list()
+        self.assertEqual(res_list, target_list)
+
+    def test_min_max(self):
+
+        ds_list = [{
+            'audios': [self.aud1_path]
+        }, {
+            'audios': [self.aud2_path]
+        }, {
+            'audios': [self.aud3_path]
+        }]
+        tgt_list = [{
+            'audios': [self.aud1_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = AudioSizeFilter(min_size="800kb", max_size="1MB")
+        self._run_audio_size_filter(dataset, tgt_list, op)
+
+    def test_min(self):
+
+        ds_list = [{
+            'audios': [self.aud1_path]
+        }, {
+            'audios': [self.aud2_path]
+        }, {
+            'audios': [self.aud3_path]
+        }]
+        tgt_list = [{
+            'audios': [self.aud1_path]
+        }, {
+            'audios': [self.aud2_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = AudioSizeFilter(min_size="900kib")
+        self._run_audio_size_filter(dataset, tgt_list, op)
+
+    def test_max(self):
+
+        ds_list = [{
+            'audios': [self.aud1_path]
+        }, {
+            'audios': [self.aud2_path]
+        }, {
+            'audios': [self.aud3_path]
+        }]
+        tgt_list = [{
+            'audios': [self.aud1_path]
+        }, {
+            'audios': [self.aud3_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = AudioSizeFilter(max_size="2MiB")
+        self._run_audio_size_filter(dataset, tgt_list, op)
+
+    def test_any(self):
+
+        ds_list = [{
+            'audios': [self.aud1_path, self.aud2_path]
+        }, {
+            'audios': [self.aud2_path, self.aud3_path]
+        }, {
+            'audios': [self.aud1_path, self.aud3_path]
+        }]
+        tgt_list = [{
+            'audios': [self.aud1_path, self.aud2_path]
+        }, {
+            'audios': [self.aud1_path, self.aud3_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = AudioSizeFilter(min_size="800kb", max_size="1MB",
+                                    any_or_all='any')
+        self._run_audio_size_filter(dataset, tgt_list, op)
+
+    def test_all(self):
+
+        ds_list = [{
+            'audios': [self.aud1_path, self.aud2_path]
+        }, {
+            'audios': [self.aud2_path, self.aud3_path]
+        }, {
+            'audios': [self.aud1_path, self.aud3_path]
+        }]
+        tgt_list = []
+        dataset = Dataset.from_list(ds_list)
+        op = AudioSizeFilter(min_size="800kb", max_size="1MB",
+                                    any_or_all='all')
+        self._run_audio_size_filter(dataset, tgt_list, op)
+
+    def test_filter_in_parallel(self):
+
+        ds_list = [{
+            'audios': [self.aud1_path]
+        }, {
+            'audios': [self.aud2_path]
+        }, {
+            'audios': [self.aud3_path]
+        }]
+        tgt_list = [{
+            'audios': [self.aud1_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = AudioSizeFilter(min_size="800kb", max_size="1MB")
+        self._run_audio_size_filter(dataset, tgt_list, op, np=2)
+
+
+if __name__ == '__main__':
+    unittest.main()