Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
FlameSky-S committed Nov 22, 2021
0 parents commit 8f51408
Show file tree
Hide file tree
Showing 21 changed files with 1,410 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.vscode
tmp
test_files
test.ipynb
test.py
*.egg-info
dist
*.pyc
__pycache__
pretrained
Empty file added README.md
Empty file.
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"
33 changes: 33 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[metadata]
name = MMSA-FET
version = 0.0.7
author = THUIAR
description = A Tool for extracting multimodal features from videos.
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/FlameSky-S/MMSA_FET
project_urls =
Bug Tracker = https://github.com/FlameSky-S/MMSA_FET/issues
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
Operating System :: OS Independent

[options]
package_dir =
= src
packages = find:
python_requires = >=3.6
install_requires =
torch >= 1.7.0
mediapipe >= 0.8.8.1
opencv-contrib-python >= 4.5.4.58
transformers >= 4.4.0
opensmile >= 2.2.0
librosa >= 0.8.1
ffmpeg-python >= 0.2.0
ffmpeg >= 4.2.2
numpy >= 1.20.3

[options.packages.find]
where = src
1 change: 1 addition & 0 deletions src/MSA_FET/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .main import FeatureExtractionTool
29 changes: 29 additions & 0 deletions src/MSA_FET/example_configs/librosa+face_mesh.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"audio": {
"tool": "librosa",
"sample_rate": null,
"args": {
"mfcc": {
"n_mfcc": 20,
"htk": true
},
"rms": {},
"zero_crossing_rate": {},
"spectral_rolloff": {},
"spectral_centroid": {}
}
},
"video": {
"tool": "mediapipe",
"fps": 25,
"args": {
"face_mesh": {
"refine_landmarks": true,
"min_detection_confidence": 0.35,
"min_tracking_confidence": 0.5
},
"visualize": true,
"visualize_dir": "/home/mhs/Projects/MMSA-Feature-Extraction-Tool/tmp/out/"
}
}
}
19 changes: 19 additions & 0 deletions src/MSA_FET/example_configs/opensmile+vggface.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"audio": {
"tool": "opensmile",
"sample_rate": 48000,
"args": {
"feature_set": "ComParE_2016",
"feature_level": "Functionals",
"start": null,
"end": null
}
},
"video": {
"tool": "vggface",
"fps": 25
},
"alignment": {
"tool": "p2fa"
}
}
23 changes: 23 additions & 0 deletions src/MSA_FET/example_configs/wav2vec+holistic.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"audio": {
"tool": "wav2vec",
"sample_rate": 16000,
"pretrained": "facebook/wav2vec2-base-960h"
},
"video": {
"tool": "mediapipe",
"fps": 25,
"args": {
"holistic": {
"model_complexity": 1,
"smooth_landmarks": true,
"enable_segmentation": true,
"smooth_segmentation": true,
"min_detection_confidence": 0.5,
"min_tracking_confidence": 0.5
},
"visualize": true,
"visualize_dir": "/home/mhs/Projects/MMSA-Feature-Extraction-Tool/tmp/out/"
}
}
}
5 changes: 5 additions & 0 deletions src/MSA_FET/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .audio.librosa import librosaExtractor
from .audio.opensmile import opensmileExtractor
from .audio.wave2vec import wav2vec2Extractor
from .video.mediapipe import mediapipeExtractor
from .video.vggface import vggfaceExtractor
Empty file.
45 changes: 45 additions & 0 deletions src/MSA_FET/extractors/audio/librosa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import numpy as np
import librosa
from ..baseExtractor import baseAudioExtractor


class librosaExtractor(baseAudioExtractor):
    """
    Audio feature extractor using librosa.
    Ref: https://librosa.org/doc/latest/index.html
    """

    def __init__(self, config, logger):
        """
        Function:
            Store the config and logger via the base class.
        Parameters:
            config: audio config dict; extract() reads config['args'], a
                mapping of librosa.feature function names to their
                keyword-argument dicts.
            logger: logger-like object providing info() and error().
        """
        try:
            logger.info("Initializing librosa audio feature extractor.")
            super().__init__(config, logger)
        except Exception:
            # Log through the `logger` argument: self.logger is only assigned
            # inside the base-class __init__, so it may not exist yet here.
            logger.error("Failed to initialize librosaExtractor.")
            raise

    def extract(self, file):
        """
        Function:
            Extract features from audio file using librosa.
        Parameters:
            file: path to audio file
        Returns:
            audio_result: numpy array holding every configured feature,
                each transposed and then concatenated along axis 1, in the
                order the features appear in config['args'].
        """
        try:
            y, sr = self.load_audio(file)
            res = {}
            for audio_feature, kwargs in self.config['args'].items():
                method = getattr(librosa.feature, audio_feature)
                try:
                    res[audio_feature] = method(y=y, sr=sr, **kwargs).T
                except TypeError:
                    # Some librosa.feature functions take no `sr` argument;
                    # retry without it. A TypeError with another cause is
                    # raised again by the retry and handled below.
                    res[audio_feature] = method(y=y, **kwargs).T
            # concatenate all features
            audio_result = np.concatenate(list(res.values()), axis=1)
            return audio_result
        except Exception:
            self.logger.error(f"Failed to extract audio features with librosa from {file}.")
            raise
46 changes: 46 additions & 0 deletions src/MSA_FET/extractors/audio/opensmile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from ..baseExtractor import baseAudioExtractor
import opensmile

class opensmileExtractor(baseAudioExtractor):
    """
    Audio feature extractor using openSMILE.
    Ref: https://github.com/audeering/opensmile-python
    """

    def __init__(self, config, logger):
        """
        Function:
            Build the opensmile.Smile extractor described by the config.
        Parameters:
            config: audio config dict; config['args'] must provide
                'feature_set' and 'feature_level' as member names of
                opensmile.FeatureSet / opensmile.FeatureLevel, and may
                provide 'start' and 'end' (used by extract()).
            logger: logger-like object providing info() and error().
        """
        try:
            logger.info("Initializing openSMILE audio feature extractor")
            super().__init__(config, logger)
            self.args = self.config['args']
            # Resolve the enum members by name with getattr instead of
            # eval(): config strings must never be executed as code.
            self.extractor = opensmile.Smile(
                feature_set=getattr(opensmile.FeatureSet, self.args['feature_set']),
                feature_level=getattr(opensmile.FeatureLevel, self.args['feature_level']),
                num_workers=None,
            )
        except Exception:
            # Log through the `logger` argument: self.logger is only assigned
            # inside the base-class __init__, so it may not exist yet here.
            logger.error("Failed to initialize opensmileExtractor.")
            raise

    def extract(self, file):
        """
        Function:
            Extract features from audio file using openSMILE.
        Parameters:
            file: path to audio file
        Returns:
            audio_result: extracted audio features in numpy array, as
                produced by Smile.to_numpy() on the process_signal() result.
        """
        try:
            y, sr = self.load_audio(file)
            # 'start'/'end' are optional; None is what process_signal()
            # itself defaults to when they are not given.
            audio_result = self.extractor.process_signal(
                signal=y,
                sampling_rate=sr,
                start=self.args.get('start'),
                end=self.args.get('end'),
            )
            audio_result = self.extractor.to_numpy(audio_result)
            return audio_result
        except Exception:
            self.logger.error(f"Failed to extract audio features with openSMILE from {file}")
            raise
36 changes: 36 additions & 0 deletions src/MSA_FET/extractors/audio/wave2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from transformers import Wav2Vec2FeatureExtractor
from ..baseExtractor import baseAudioExtractor


class wav2vec2Extractor(baseAudioExtractor):
    """
    Audio feature extractor using Wav2Vec2.
    Ref: https://huggingface.co/transformers/model_doc/wav2vec2.html
    """

    def __init__(self, config, logger):
        """
        Function:
            Load the pretrained Wav2Vec2FeatureExtractor named in the config.
        Parameters:
            config: audio config dict; config['pretrained'] is passed to
                Wav2Vec2FeatureExtractor.from_pretrained().
            logger: logger-like object providing info() and error().
        """
        try:
            logger.info("Initializing Wav2Vec2 audio feature extractor.")
            super().__init__(config, logger)
            self.extractor = Wav2Vec2FeatureExtractor.from_pretrained(config['pretrained'])
        except Exception:
            # Log through the `logger` argument: self.logger is only assigned
            # inside the base-class __init__, so it may not exist yet here.
            logger.error("Failed to initialize Wav2VecExtractor.")
            raise

    def extract(self, file):
        """
        Function:
            Extract features from audio file using wav2vec2 pretrained model.
        Parameters:
            file: path to audio file
        Returns:
            audio_result: the feature extractor's `input_values` as a numpy
                array, transposed.
        """
        try:
            y, sr = self.load_audio(file)
            # NOTE(review): only Wav2Vec2FeatureExtractor is applied here, so
            # the result is its `input_values`, not hidden states from a
            # Wav2Vec2 model - confirm this is the intended feature.
            audio_result = self.extractor(y, sampling_rate=sr, return_tensors="np").input_values.T
            return audio_result
        except Exception:
            self.logger.error(f"Failed to extract audio features with Wav2Vec2 from {file}.")
            raise
49 changes: 49 additions & 0 deletions src/MSA_FET/extractors/baseExtractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import librosa


class baseExtractor:
    """
    Common parent of every feature extractor.

    Keeps the config and logger handed to the constructor and declares the
    `extract` entry point that subclasses are expected to override.
    """

    def __init__(self, config, logger):
        # Both objects are stored as given; nothing is validated or copied.
        self.logger = logger
        self.config = config

    def extract(self, file):
        """
        Extract features from input file.

        This base implementation always raises NotImplementedError.
        """
        message = "extract() not implemented"
        raise NotImplementedError(message)


class baseAudioExtractor(baseExtractor):
    """
    Parent class shared by the audio extractors; adds audio loading.
    """

    def __init__(self, config, logger):
        # No audio-specific state: initialisation is left to baseExtractor.
        super().__init__(config, logger)

    def load_audio(self, file):
        """
        Read `file` with librosa.load, passing config['sample_rate'] as `sr`.

        Returns the (audio, sample rate) pair produced by librosa.load.
        """
        target_rate = self.config['sample_rate']
        waveform, rate = librosa.load(file, sr=target_rate)
        return waveform, rate

# class baseVideoExtractor(baseExtractor):
# """
# Base class for all video extractors.
# """
# def __init__(self, config, logger):
# super().__init__(config, logger)

# def load_images(self, img_dir):
# """
# Load image files using cv2.
# """
# images = []
# for image_path in sorted(glob(osp.join(img_dir, '*.bmp'))):
# name = Path(image_path).stem
# image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
# images.append(image)
# return images
Empty file.
Loading

0 comments on commit 8f51408

Please sign in to comment.