feat: Add WebSocket server with multi-client support #263

Open · wants to merge 37 commits into base: develop

Commits
467997d
Add Test Suite (#237)
juanmc2005 May 25, 2024
43135ae
Bump up version to 0.9.1
juanmc2005 May 25, 2024
1b1bf73
Limit numpy version to < 2.0.0 (#243)
juanmc2005 Jun 28, 2024
ed77f7f
Fix embedding extraction example in README (#241)
hmehdi515 Jul 8, 2024
da080e7
add handler for multiple websocket streams
janaab11 Nov 21, 2024
e5fe109
move server instance to inference handler
janaab11 Nov 21, 2024
12df31b
update serve script to use inference handler
janaab11 Nov 21, 2024
a709d58
debug server run
janaab11 Nov 21, 2024
b9b1780
turn off progress bar for multiple connections
janaab11 Nov 21, 2024
c8127f7
add error handling for inference server
janaab11 Nov 22, 2024
b805575
refactor streaming server for readability
janaab11 Nov 22, 2024
1b4e9af
fix memory leak when clients share Pipeline instance
janaab11 Nov 27, 2024
b897991
reset Pipeline state when client disconnects
janaab11 Nov 27, 2024
d2566f1
test READY message to client after init
janaab11 Nov 27, 2024
f9f2c80
update client to stream audio after READY message
janaab11 Nov 29, 2024
1db7a8f
test CLOSE message to client after disconnect
janaab11 Nov 29, 2024
80ee316
added Dockerfile
janaab11 Dec 20, 2024
fb9fecf
expose custom params for Docker
janaab11 Dec 20, 2024
1975f75
apply styling with black and isort
janaab11 Dec 22, 2024
7ba2f55
refactor StreamingHandler to use LazyModel for resource mgmt
janaab11 Dec 31, 2024
3d3bb45
simplified websocket-server class and improved naming
janaab11 Jan 2, 2025
b2c9293
improved code quality and style
janaab11 Jan 2, 2025
bba43ae
updated Dockerfile for local builds and reduced image size
janaab11 Jan 3, 2025
f95d6ca
refactor close and send methods of WebSocketStreamingServer to separa…
janaab11 Jan 3, 2025
c6a2f92
refactor: improve error handling and reduce redundancy
janaab11 Jan 3, 2025
12f7ba9
refactor: make WebSocketAudioSource a proxy and handle audio decoding…
janaab11 Jan 3, 2025
0f6ac4e
apply styling with black and isort
janaab11 Jan 3, 2025
c88fbd7
correct styling with isort
janaab11 Jan 3, 2025
eb200c2
cleanup: remove deprecated output argument
janaab11 Jan 3, 2025
16dbbc0
fix typo in dockerignore
janaab11 Jan 3, 2025
dd1ff48
fix(websockets): update socket error handling
janaab11 Jan 3, 2025
e1db50d
fix(websockets): improve error logging for edge-cases
janaab11 Jan 3, 2025
d944e6f
fix(websockets): add retry backoff to server
janaab11 Jan 3, 2025
f2c3144
apply styling with black and isort
janaab11 Jan 3, 2025
b6d6bc6
fix(client): manage stop events and handle errors correctly
janaab11 Jan 3, 2025
74bd40b
fix(client): improve error handling and readability
janaab11 Jan 3, 2025
c263239
add documentation for updated websocket server and dockerfile usage
janaab11 Jan 6, 2025
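The commit messages above outline a small control protocol between server and client: a READY message once the pipeline is initialized, audio streamed only after READY, and a CLOSE message on disconnect. Below is a minimal, hypothetical client-side sketch of that handshake using the `websocket-client` library already listed in diart's dependencies. The message names come from the commits, but the payload encoding and control-message format are assumptions, not the PR's actual client implementation.

```python
# Hypothetical sketch of the handshake implied by the commit messages
# (READY after init, audio streamed only after READY, CLOSE on disconnect).
# The exact wire format is an assumption, not taken from this PR.
from websocket import create_connection  # provided by websocket-client


def stream_audio(host: str = "localhost", port: int = 7007, chunks=()):
    ws = create_connection(f"ws://{host}:{port}")
    try:
        # Wait for the server to finish initializing its pipeline
        if ws.recv() != "READY":  # assumed plain-text control message
            raise RuntimeError("Server did not report READY")
        for chunk in chunks:  # chunks: audio payloads pre-encoded as the server expects
            ws.send(chunk)
            print(ws.recv())  # e.g. an RTTM-formatted prediction
    finally:
        ws.close()  # server is expected to follow up with a CLOSE message
```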
19 changes: 19 additions & 0 deletions .dockerignore
@@ -0,0 +1,19 @@
# Development
.git/
.github/
.idea/
__pycache__/

# Data and examples
assets/
example/
expected_outputs/
tests/

# Documentation
docs/

# Build artifacts
*.egg-info/
dist/
build/
35 changes: 35 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,35 @@
name: Pytest

on:
  pull_request:
    branches:
      - main
      - develop

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.10'

      - name: Install apt dependencies
        run: |
          sudo add-apt-repository ppa:savoury1/ffmpeg4
          sudo apt-get update
          sudo apt-get -y install ffmpeg libportaudio2=19.6.0-1.1

      - name: Install pip dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[tests]

      - name: Run tests
        run: |
          pytest
7 changes: 4 additions & 3 deletions .github/workflows/quick-runs.yml
@@ -38,6 +38,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install .
pip install onnxruntime==1.18.0
- name: Crop audio and rttm
run: |
sox audio/ES2002a_long.wav audio/ES2002a.wav trim 00:40 00:30
@@ -50,10 +51,10 @@
rm rttms/ES2002b_long.rttm
- name: Run stream
run: |
diart.stream audio/ES2002a.wav --output trash --no-plot --hf-token ${{ secrets.HUGGINGFACE }}
diart.stream audio/ES2002a.wav --segmentation assets/models/segmentation_uint8.onnx --embedding assets/models/embedding_uint8.onnx --output trash --no-plot
- name: Run benchmark
run: |
diart.benchmark audio --reference rttms --batch-size 4 --hf-token ${{ secrets.HUGGINGFACE }}
diart.benchmark audio --reference rttms --batch-size 4 --segmentation assets/models/segmentation_uint8.onnx --embedding assets/models/embedding_uint8.onnx
- name: Run tuning
run: |
diart.tune audio --reference rttms --batch-size 4 --num-iter 2 --output trash --hf-token ${{ secrets.HUGGINGFACE }}
diart.tune audio --reference rttms --batch-size 4 --num-iter 2 --output trash --segmentation assets/models/segmentation_uint8.onnx --embedding assets/models/embedding_uint8.onnx
75 changes: 75 additions & 0 deletions Dockerfile
@@ -0,0 +1,75 @@
# Use NVIDIA CUDA base image
FROM docker.io/nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# Install sudo, git, wget, gcc, g++, and other essential build tools
RUN apt-get update && \
apt-get install -y sudo git wget build-essential && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install Miniconda
ENV CONDA_DIR=/opt/conda
ENV PATH=$CONDA_DIR/bin:$PATH
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
bash /tmp/miniconda.sh -b -p $CONDA_DIR && \
rm /tmp/miniconda.sh

# Install Python 3.10 using Conda
RUN conda install python=3.10

# Upgrade pip and setuptools to avoid deprecation warnings
RUN pip install --upgrade pip setuptools

# Set Python 3.10 as default by creating a symbolic link
RUN ln -sf /opt/conda/bin/python3.10 /opt/conda/bin/python && \
ln -sf /opt/conda/bin/python3.10 /usr/bin/python

# Verify installations
RUN python --version && \
gcc --version && \
g++ --version && \
pip --version && \
conda --version

# Create app directory and copy files
WORKDIR /diart
COPY . .

# Install diart dependencies
RUN conda install portaudio pysoundfile ffmpeg -c conda-forge
RUN pip install -e .

# Expose the port the app runs on
EXPOSE 7007

# Define environment variable to prevent Python from buffering stdout/stderr
# and writing byte code to file
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Define custom options as env variables with defaults
ENV HOST=0.0.0.0
ENV PORT=7007
ENV SEGMENTATION=pyannote/segmentation-3.0
ENV EMBEDDING=speechbrain/spkrec-resnet-voxceleb
ENV TAU_ACTIVE=0.45
ENV RHO_UPDATE=0.25
ENV DELTA_NEW=0.6
ENV LATENCY=5
ENV MAX_SPEAKERS=3

CMD ["sh", "-c", "python -m diart.console.serve --host ${HOST} --port ${PORT} --segmentation ${SEGMENTATION} --embedding ${EMBEDDING} --tau-active ${TAU_ACTIVE} --rho-update ${RHO_UPDATE} --delta-new ${DELTA_NEW} --latency ${LATENCY} --max-speakers ${MAX_SPEAKERS}"]

# Example run command with environment variables:
# docker run -p 7007:7007 --restart unless-stopped --gpus all \
# -e HF_TOKEN=<token> \
# -e HOST=0.0.0.0 \
# -e PORT=7007 \
# -e SEGMENTATION=pyannote/segmentation-3.0 \
# -e EMBEDDING=speechbrain/spkrec-resnet-voxceleb \
# -e TAU_ACTIVE=0.45 \
# -e RHO_UPDATE=0.25 \
# -e DELTA_NEW=0.6 \
# -e LATENCY=5 \
# -e MAX_SPEAKERS=3 \
# diart-image
70 changes: 57 additions & 13 deletions README.md
@@ -202,6 +202,7 @@ def embedding_loader():
segmentation = SegmentationModel(segmentation_loader)
embedding = EmbeddingModel(embedding_loader)
config = SpeakerDiarizationConfig(
# Set the segmentation model used in the paper
**Review comment (Owner):** This isn't correct. To remove.

segmentation=segmentation,
embedding=embedding,
)
@@ -284,21 +285,27 @@ Obtain overlap-aware speaker embeddings from a microphone stream:
```python
import rx.operators as ops
import diart.operators as dops
from diart.sources import MicrophoneAudioSource
from diart.sources import MicrophoneAudioSource, FileAudioSource
from diart.blocks import SpeakerSegmentation, OverlapAwareSpeakerEmbedding

segmentation = SpeakerSegmentation.from_pretrained("pyannote/segmentation")
embedding = OverlapAwareSpeakerEmbedding.from_pretrained("pyannote/embedding")
mic = MicrophoneAudioSource()

source = MicrophoneAudioSource()
# To take input from file:
# source = FileAudioSource("<filename>", sample_rate=16000)

# Make sure the models have been trained with this sample rate
print(source.sample_rate)

stream = source.stream.pipe(
# Reformat stream to 5s duration and 500ms shift
dops.rearrange_audio_stream(sample_rate=segmentation.model.sample_rate),
dops.rearrange_audio_stream(sample_rate=source.sample_rate),
ops.map(lambda wav: (wav, segmentation(wav))),
ops.starmap(embedding)
).subscribe(on_next=lambda emb: print(emb.shape))

mic.read()
source.read()
```

Output:
@@ -326,20 +333,57 @@ diart.client microphone --host <server-address> --port 7007

See `-h` for more options.

### From the Dockerfile
**Review comment (Owner), suggested change:** replace "### From the Dockerfile" with "### From a Docker container".


You can also run the server in a Docker container. First, build the image:
```shell
docker build -t diart -f Dockerfile .
```

**Review comment (Owner):** `-f Dockerfile` is not needed, as it will pick up the file with that name in the specified directory.

Run the server with default configuration:
```shell
docker run -p 7007:7007 --gpus all -e HF_TOKEN=<token> diart
```

**Review comment (Owner):** We should probably add a note somewhere saying that for GPU usage they need to install nvidia-container-toolkit.

Also, is there a way to pick up the HF token from the huggingface-cli config? That way we avoid passing it directly and keeping it in the terminal history. This is possible when running outside docker, and we shouldn't make it mandatory, as it's an important security feature.

Run with custom configuration:
**Review comment (Owner), suggested change:** replace "Run with custom configuration:" with "Example with a custom configuration:".

```shell
docker run -p 7007:7007 --restart unless-stopped --gpus all \
-e HF_TOKEN=<token> \
-e HOST=0.0.0.0 \
-e PORT=7007 \
-e SEGMENTATION=pyannote/segmentation-3.0 \
-e EMBEDDING=speechbrain/spkrec-resnet-voxceleb \
-e TAU_ACTIVE=0.45 \
-e RHO_UPDATE=0.25 \
-e DELTA_NEW=0.6 \
-e LATENCY=5 \
-e MAX_SPEAKERS=3 \
diart
```

The server can be configured at runtime using these environment variables:
- `HOST`: Server host (default: 0.0.0.0)
- `PORT`: Server port (default: 7007)
- `SEGMENTATION`: Segmentation model (default: pyannote/segmentation)
- `EMBEDDING`: Embedding model (default: pyannote/embedding)
- `TAU_ACTIVE`: Activity threshold (default: 0.5)
- `RHO_UPDATE`: Update threshold (default: 0.3)
- `DELTA_NEW`: New speaker threshold (default: 1.0)
- `LATENCY`: Processing latency in seconds (default: 0.5)
- `MAX_SPEAKERS`: Maximum number of speakers (default: 20)
**Review comment (Owner) on lines +364 to +373:** This should be moved up before the example is given.


### From python

For customized solutions, a server can also be created in python using the `WebSocketAudioSource`:
For customized solutions, a server can also be created in python using `WebSocketStreamingServer`:

```python
from diart import SpeakerDiarization
from diart.sources import WebSocketAudioSource
from diart.inference import StreamingInference
from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.websockets import WebSocketStreamingServer

pipeline = SpeakerDiarization()
source = WebSocketAudioSource(pipeline.config.sample_rate, "localhost", 7007)
inference = StreamingInference(pipeline, source)
inference.attach_hooks(lambda ann_wav: source.send(ann_wav[0].to_rttm()))
prediction = inference()
pipeline_class = SpeakerDiarization
pipeline_config = SpeakerDiarizationConfig(step=0.5, sample_rate=16000)
server = WebSocketStreamingServer(pipeline_class, pipeline_config, host="localhost", port=7007)
server.run()
```

**Review comment (Owner), suggested change:** replace `server = WebSocketStreamingServer(pipeline_class, pipeline_config, host="localhost", port=7007)` with `server = WebSocketStreamingServer(SpeakerDiarization, pipeline_config, host="localhost", port=7007)`. I prefer to put the class name inline here, and to delete the definition of pipeline_class. Also, let's rename pipeline_config to just config, as in all other examples in the readme.
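For reference, here is a hedged sketch combining the example above with the default values from the Dockerfile; it assumes `SpeakerDiarizationConfig` accepts keyword arguments mirroring the CLI flags in the Dockerfile's CMD (`--tau-active` → `tau_active`, and so on), and follows the reviewer's suggestion of passing the pipeline class inline and naming the configuration `config`.

```python
# Hedged sketch: programmatic equivalent of the Docker defaults above,
# assuming SpeakerDiarizationConfig mirrors the CLI flags used in the CMD.
from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.websockets import WebSocketStreamingServer

config = SpeakerDiarizationConfig(
    tau_active=0.45,   # activity threshold
    rho_update=0.25,   # update threshold
    delta_new=0.6,     # new-speaker threshold
    latency=5,         # processing latency in seconds
    max_speakers=3,    # maximum number of speakers
)

# Pipeline class passed inline, as suggested in the review above
server = WebSocketStreamingServer(SpeakerDiarization, config, host="0.0.0.0", port=7007)
server.run()
```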

## 🔬 Powered by research
Binary file added assets/models/embedding_uint8.onnx
Binary file added assets/models/segmentation_uint8.onnx
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
numpy>=1.20.2
matplotlib>=3.3.3
numpy>=1.20.2,<2.0.0
matplotlib>=3.3.3,<3.6.0
rx>=3.2.0
scipy>=1.6.0
sounddevice>=0.4.2
11 changes: 8 additions & 3 deletions setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name=diart
version=0.9.0
version=0.9.1
author=Juan Manuel Coria
description=A python framework to build AI for real-time speech
long_description=file: README.md
@@ -20,8 +20,8 @@ package_dir=
=src
packages=find:
install_requires=
numpy>=1.20.2
matplotlib>=3.3.3
numpy>=1.20.2,<2.0.0
matplotlib>=3.3.3,<3.6.0
rx>=3.2.0
scipy>=1.6.0
sounddevice>=0.4.2
@@ -41,6 +41,11 @@ install_requires=
websocket-client>=0.58.0
rich>=12.5.1

[options.extras_require]
tests=
pytest>=7.4.0,<8.0.0
onnxruntime==1.18.0

[options.packages.find]
where=src
