From 8be9b03e0e8e991f22366161c2dd3b8ab94b8952 Mon Sep 17 00:00:00 2001
From: Ondrej Platek
Date: Sun, 8 Oct 2023 22:32:25 +0200
Subject: [PATCH] spokenwoz recipe skeleton

---
 lhotse/bin/modes/recipes/__init__.py  |   1 +
 lhotse/bin/modes/recipes/spokenwoz.py |  62 +++++
 lhotse/recipes/__init__.py            |   1 +
 lhotse/recipes/spokenwoz.py           | 378 ++++++++++++++++++++++++++
 4 files changed, 442 insertions(+)
 create mode 100644 lhotse/bin/modes/recipes/spokenwoz.py
 create mode 100644 lhotse/recipes/spokenwoz.py

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
index 32a3dc93e..6dd3df1a9 100644
--- a/lhotse/bin/modes/recipes/__init__.py
+++ b/lhotse/bin/modes/recipes/__init__.py
@@ -62,6 +62,7 @@
 from .rir_noise import *
 from .speechcommands import *
 from .spgispeech import *
+from .spokenwoz import *
 from .stcmds import *
 from .switchboard import *
 from .tal_asr import *
diff --git a/lhotse/bin/modes/recipes/spokenwoz.py b/lhotse/bin/modes/recipes/spokenwoz.py
new file mode 100644
index 000000000..ea2df241f
--- /dev/null
+++ b/lhotse/bin/modes/recipes/spokenwoz.py
@@ -0,0 +1,62 @@
+from typing import Sequence
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes import download_spokenwoz, prepare_spokenwoz
+from lhotse.utils import Pathlike
+
+__all__ = ["spokenwoz"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many jobs to use (can give good speed-ups with slow disks).",
+)
+@click.option(
+    "-p",
+    "--dataset-splits",
+    type=str,
+    default=["all"],
+    multiple=True,
+    help="List of dataset splits to prepare. To prepare multiple splits, pass each with `-p`. "
+    "Example: `-p train -p dev -p test`",
+)
+def spokenwoz(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    dataset_splits: Sequence[str],
+    num_jobs: int,
+):
+    """SpokenWOZ data preparation."""
+    prepare_spokenwoz(
+        corpus_dir,
+        output_dir=output_dir,
+        num_jobs=num_jobs,
+        dataset_splits=dataset_splits,
+    )
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "-p",
+    "--dataset-parts",
+    type=str,
+    default=["all"],
+    multiple=True,
+    help="List of dataset parts to download. "
+    "To download multiple parts, pass each with `-p`. "
+    "Example: `-p train_dev -p test`",
+)
+def spokenwoz(
+    target_dir: Pathlike,
+    dataset_parts: Sequence[str],
+):
+    """SpokenWOZ data download."""
+    download_spokenwoz(target_dir, dataset_parts=dataset_parts)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
index 9ae74710c..6e2fd49c7 100644
--- a/lhotse/recipes/__init__.py
+++ b/lhotse/recipes/__init__.py
@@ -63,6 +63,7 @@
 from .rir_noise import download_rir_noise, prepare_rir_noise
 from .speechcommands import download_speechcommands, prepare_speechcommands
 from .spgispeech import download_spgispeech, prepare_spgispeech
+from .spokenwoz import download_spokenwoz, prepare_spokenwoz
 from .stcmds import download_stcmds, prepare_stcmds
 from .switchboard import prepare_switchboard
 from .tedlium import download_tedlium, prepare_tedlium
diff --git a/lhotse/recipes/spokenwoz.py b/lhotse/recipes/spokenwoz.py
new file mode 100644
index 000000000..a5199408b
--- /dev/null
+++ b/lhotse/recipes/spokenwoz.py
@@ -0,0 +1,378 @@
+"""
+# SpokenWOZ dataset
+
+SpokenWOZ is a large-scale multi-domain speech-text dataset
+for spoken task-oriented dialogue (TOD) modeling, consisting of 203k turns,
+5.7k dialogues, and 249 hours of audio from realistic human-to-human spoken conversations.
+
+The data is split into training, dev, and test sets.
+The dataset is distributed under the CC BY-NC 4.0 license.
+
+
+## Why SpokenWOZ?
+
+The majority of existing TOD datasets are constructed via writing or paraphrasing
+by annotators rather than being collected from realistic spoken conversations.
+These written TOD datasets may not be representative of the way people naturally speak
+in real-world conversations, which makes it difficult to train and evaluate models
+that are specifically designed for spoken TOD.
+Additionally, robustness issues, such as ASR noise, cannot be fully explored
+using these written TOD datasets. Unlike existing spoken TOD datasets,
+SpokenWOZ exhibits common spoken characteristics, such as word-by-word processing
+and commonsense knowledge in spoken language.
+SpokenWOZ also includes cross-turn slot detection and reasoning slot detection
+as new challenges to better handle these spoken characteristics.
+
+
+## Data structure
+
+There are 5,700 dialogues ranging from single-domain to multi-domain in SpokenWOZ.
+The test set contains 1k examples.
+Dialogues with MUL in the name refer to multi-domain dialogues.
+Dialogues with SNG refer to single-domain dialogues. Each dialogue consists of a goal,
+multiple user and system utterances, dialogue state, dialogue act, corresponding audio, and ASR transcription.
+
+The file name of the audio is consistent with the id of the dialogue; for example,
+the corresponding audio file for MUL0032 is MUL0032.wav.
+
+The dialogue goal for each dialogue is recorded in the "goal" field.
+The goal holds the domains involved in the dialogue as well as
+the slots involved and their corresponding values.
+
+The dialogue state for each dialogue is recorded in the "metadata" field in every turn,
+in the same format as MultiWOZ 2.1.
+The state has two sections: semi and book. Semi refers to slots from a particular domain.
+Book refers to booking slots for a particular domain. The joint accuracy metric includes ALL slots.
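+
+For illustration, a single turn's "metadata" entry has roughly the following
+shape (a sketch following the MultiWOZ 2.1 convention with placeholder names,
+not copied verbatim from the data):
+
+```
+{
+    "$domain_name": {
+        "book": {
+            "booked": [],
+            "$booking_slot_name": "$value"
+        },
+        "semi": {
+            "$slot_name": "$value"
+        }
+    }
+}
+```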
+
+The dialogue act for each dialogue is recorded in the "dialogue_act" and "span_info" fields in every turn:
+
+```
+{
+    "$dialogue_id": {
+        "log": {
+            "$turn_id": {
+                "dialogue_act": {
+                    "$act_name": [
+                        [
+                            "$slot_name",
+                            "$action_value"
+                        ]
+                    ]
+                },
+                "span_info": [
+                    [
+                        "$act_name",
+                        "$slot_name",
+                        "$action_value",
+                        "$start_character_index",
+                        "$exclusive_end_character_index"
+                    ]
+                ]
+            }
+        }
+    }
+}
+```
+
+The ASR transcription for each dialogue is recorded in the "words" field in every turn
+(the key names below match how the preparation code reads them):
+
+```
+{
+    "$dialogue_id": {
+        "log": {
+            "$turn_id": {
+                "words": [
+                    {
+                        "Word": "$word",
+                        "BeginTime": "$begintime",
+                        "EndTime": "$endtime",
+                        "ChannelId": "$channel",
+                        "WordIndex": "$index"
+                    }
+                ]
+            }
+        }
+    }
+}
+```
+
+
+## Citation
+
+[1] Website https://spokenwoz.github.io/SpokenWOZ-github.io/
+[2] arXiv pre-print
+```
+@article{si2023spokenwoz,
+  title={SpokenWOZ: A Large-Scale Speech-Text Dataset for Spoken Task-Oriented Dialogue in Multiple Domains},
+  author={Si, Shuzheng and Ma, Wentao and Wu, Yuchuan and Dai, Yinpei and Gao, Haoyu and Lin, Ting-En and Li, Hangyu and Yan, Rui and Huang, Fei and Li, Yongbin},
+  journal={arXiv preprint arXiv:2305.13040},
+  year={2023},
+  url={https://arxiv.org/abs/2305.13040}
+}
+```
+
+"""
+import json
+import logging
+import shutil
+import tarfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+from tqdm import tqdm
+
+from lhotse import (
+    RecordingSet,
+    SupervisionSegment,
+    SupervisionSet,
+    fix_manifests,
+    validate_recordings_and_supervisions,
+)
+from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
+from lhotse.supervision import AlignmentItem
+from lhotse.utils import Pathlike, resumable_download, safe_extract
+
+SPOKENWOZ_BASE_URL = (
+    "https://spokenwoz.oss-cn-wulanchabu.aliyuncs.com/{modality}_5700_{part}.tar.gz"
+)
+
+MODALITIES = ("text", "audio")
+# Parts in which the dataset is distributed.
+PARTS = ("test", "train_dev")
+# Splits under which the dataset is used and saved in Lhotse manifests.
+SPLITS = ("test", "dev", "train")
+
+
+def get_spokenwoz_metadata(corpus_dir: Pathlike) -> Dict[str, Any]:
+    """
+    Helper function that loads the metadata which is not explicitly included in the Lhotse manifests.
+    """
+    td = Path(corpus_dir) / "text_5700_train_dev"
+    domains = (
+        "attraction", "hospital", "hotel", "police", "restaurant", "taxi", "train"
+    )
+    return {
+        "ontology.json": json.loads((td / "ontology.json").read_text()),
+        "README.md": (td / "README.md").read_text(),
+        "db": {
+            "data": {
+                domain: json.loads((td / "db" / f"{domain}_db.json").read_text())
+                for domain in domains
+            },
+            "value_set": json.loads((td / "db" / "value_set.json").read_text()),
+        },
+    }
+
+
+def download_spokenwoz(
+    target_dir: Pathlike = ".",
+    dataset_parts: Optional[Union[str, Sequence[str]]] = "all",
+    force_download: Optional[bool] = False,
+) -> Path:
+    """
+    Download and untar the SpokenWOZ dataset.
+
+    :param target_dir: Pathlike, the path of the dir to store the dataset.
+    :param dataset_parts: "all", or a list of distribution parts ("train_dev", "test") to download.
+    :param force_download: bool, if True, download the tars even if they already exist.
+    :return: the path to the directory with the downloaded and extracted data.
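+
+    A minimal usage sketch (the path is hypothetical; assumes the SpokenWOZ
+    download URLs are reachable):
+
+    >>> corpus_dir = download_spokenwoz("data/spokenwoz", dataset_parts=["test"])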
+    """
+    target_dir = Path(target_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    if dataset_parts == "all" or dataset_parts[0] == "all":
+        dataset_parts = PARTS
+    elif isinstance(dataset_parts, str):
+        dataset_parts = [dataset_parts]
+
+    dataset_parts = [
+        (part, modality) for part in dataset_parts for modality in MODALITIES
+    ]
+    for part, modality in tqdm(dataset_parts, desc="Downloading SpokenWOZ parts"):
+        if part not in PARTS:
+            logging.warning(
+                f"Skipping invalid dataset part name: {part} (possible choices: {PARTS})"
+            )
+            continue
+        url = SPOKENWOZ_BASE_URL.format(modality=modality, part=part)
+        tar_name = f"{modality}_5700_{part}.tar.gz"
+        tar_path = target_dir / tar_name
+        part_dir = target_dir / f"{modality}_5700_{part}"
+        completed_detector = part_dir / ".completed"
+        if completed_detector.is_file():
+            logging.info(
+                f"Skipping {modality}-{part} because {completed_detector} exists."
+            )
+            continue
+        resumable_download(url, filename=tar_path, force_download=force_download)
+        shutil.rmtree(part_dir, ignore_errors=True)
+        with tarfile.open(tar_path) as tar:
+            safe_extract(tar, path=target_dir)
+        completed_detector.touch()
+
+    return target_dir
+
+
+def prepare_spokenwoz(
+    corpus_dir: Pathlike,
+    dataset_splits: Union[str, Sequence[str]] = "all",
+    output_dir: Optional[Pathlike] = None,
+    num_jobs: int = 1,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Returns the manifests which consist of the Recordings and Supervisions.
+    Splits whose manifests already exist in ``output_dir`` are skipped.
+
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :param dataset_splits: "all", or a string or sequence of strings selecting
+        the splits to prepare: 'train', 'dev', or 'test'.
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :param num_jobs: the number of parallel workers parsing the data.
+    :return: a dict of ``{split: {'recordings': ..., 'supervisions': ...}}``.
+    """
+    corpus_dir = Path(corpus_dir)
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+
+    manifests = {}
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    if dataset_splits == "all" or dataset_splits[0] == "all":
+        dataset_splits = SPLITS
+    elif isinstance(dataset_splits, str):
+        dataset_splits = [dataset_splits]
+
+    text_dirs = {
+        "train": corpus_dir / "text_5700_train_dev",
+        "dev": corpus_dir / "text_5700_train_dev",
+        "test": corpus_dir / "text_5700_test",
+    }
+    audio_dirs = {
+        "train": corpus_dir / "audio_5700_train_dev",
+        "dev": corpus_dir / "audio_5700_train_dev",
+        "test": corpus_dir / "audio_5700_test",
+    }
+
+    dialogue_ids = {"train": None, "dev": None, "test": None}
+    exclude_dialogue_ids = {"train": None, "dev": None, "test": []}
+    if "train" in dataset_splits or "dev" in dataset_splits:
+        # Despite the .json extension, valListFile.json is not JSON:
+        # it is a plain list of dialogue ids, one per line.
+        dialogue_ids["dev"] = (
+            (corpus_dir / "text_5700_train_dev" / "valListFile.json")
+            .read_text()
+            .splitlines()
+        )
+        train_dev = list(
+            json.loads((text_dirs["train"] / "data.json").read_text()).keys()
+        )
+        dialogue_ids["train"] = list(set(train_dev) - set(dialogue_ids["dev"]))
+        exclude_dialogue_ids["train"] = dialogue_ids["dev"]
+        exclude_dialogue_ids["dev"] = dialogue_ids["train"]
+    if "test" in dataset_splits:
+        dialogue_ids_data = list(
+            json.loads((text_dirs["test"] / "data.json").read_text()).keys()
+        )
+        dialogue_ids_list = (
+            (corpus_dir / "text_5700_test" / "testListFile.json")
+            .read_text()
+            .splitlines()
+        )
+        # Note: the dialogue ids in testListFile.json and data.json differ,
+        # so the assert below would fail; we rely on testListFile.json.
+        # assert dialogue_ids_data == dialogue_ids_list, f"The testListFile.json does not match the data.json: {dialogue_ids_data} != {dialogue_ids_list}"
+        dialogue_ids["test"] = dialogue_ids_list
+
+    for split in tqdm(dataset_splits, desc="Preparing SpokenWOZ splits"):
+        if manifests_exist(part=split, output_dir=output_dir, prefix="spokenwoz"):
+            logging.info(f"SpokenWOZ subset: {split} already prepared - skipping.")
+            continue
+
+        recordings, supervisions = _spokenwoz_manifests(
+            audio_dirs[split],
+            text_dirs[split],
+            dialogue_ids[split],
+            exclude_dialogue_ids[split],
+            num_jobs=num_jobs,
+        )
+
+        if output_dir is not None:
+            supervisions.to_file(
+                output_dir / f"spokenwoz_supervisions_{split}.jsonl.gz"
+            )
+            recordings.to_file(output_dir / f"spokenwoz_recordings_{split}.jsonl.gz")
+        manifests[split] = {"recordings": recordings, "supervisions": supervisions}
+
+    return manifests
+
+
+def _spokenwoz_manifests(
+    audio_dir: Pathlike,
+    text_dir: Pathlike,
+    dialogue_ids: List[str],
+    exclude_dialogue_ids: List[str],
+    num_jobs: int = 1,
+) -> Tuple[RecordingSet, SupervisionSet]:
+    """Lists all the wav recordings in the SpokenWOZ audio directory
+    and loads the data.json file from the text directory
+    (see the 'Data structure' overview in this module's docstring).
+    For each dialogue turn we create a SupervisionSegment:
+    the dialogue_act, span_info, and tag ({user, system}) fields are stored
+    in the segment's custom dictionary, and the text and alignment are filled
+    from the turn's "words" field.
+    The per-turn dialogue state (the "metadata" field) is currently not saved
+    to the SupervisionSegment.
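+
+    For illustration (all values hypothetical), a produced segment looks
+    roughly like::
+
+        SupervisionSegment(
+            id="MUL0032-000",
+            recording_id="MUL0032",
+            start=0.32,
+            duration=2.15,
+            channel=0,
+            text="i am looking for a cheap hotel",
+            language="en",
+            custom={"dialogue_act": {...}, "span_info": [...], "tag": "user"},
+            alignment={"words": [...]},
+        )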
+    """
+
+    audio_dir = Path(audio_dir)
+    wav_files = {
+        wavf.stem: wavf for wavf in audio_dir.iterdir() if wavf.suffix == ".wav"
+    }
+
+    with open(text_dir / "data.json", "r") as f:
+        data = json.load(f)
+
+    missing_dialogues_data = [did for did in dialogue_ids if did not in data]
+    assert (
+        len(missing_dialogues_data) == 0
+    ), f"These dialogues are missing from data.json: {missing_dialogues_data}"
+    missing_audio_files = [did for did in dialogue_ids if did not in wav_files]
+    assert (
+        len(missing_audio_files) == 0
+    ), f"These dialogues have no corresponding recording: {missing_audio_files}"
+
+    exclude_pattern = (
+        "|".join(rf"{did}\.wav" for did in exclude_dialogue_ids)
+        if exclude_dialogue_ids
+        else None
+    )
+    recordings = RecordingSet.from_dir(
+        audio_dir, "*.wav", num_jobs=num_jobs, exclude_pattern=exclude_pattern
+    )
+
+    supervisions = []
+    # Iterate only over the dialogues belonging to this split.
+    for did in dialogue_ids:
+        for i, turn in enumerate(data[did]["log"]):
+            words = turn["words"]
+            word_alignments = [
+                AlignmentItem(
+                    start=w["BeginTime"] / 1000.0,
+                    duration=(w["EndTime"] - w["BeginTime"]) / 1000.0,
+                    symbol=w["Word"],
+                )
+                for w in words
+            ]
+
+            supervisions.append(
+                SupervisionSegment(
+                    id=f"{did}-{i:03d}",
+                    recording_id=did,
+                    start=words[0]["BeginTime"] / 1000.0,
+                    duration=(words[-1]["EndTime"] - words[0]["BeginTime"]) / 1000.0,
+                    channel=0 if turn["tag"] == "user" else 1,  # user: 0, system: 1
+                    text=turn["text"],
+                    language="en",
+                    custom={
+                        "dialogue_act": turn["dialog_act"],
+                        "span_info": turn["span_info"],
+                        "tag": turn["tag"],
+                    },
+                    alignment={"words": word_alignments},
+                )
+            )
+
+    supervisions = SupervisionSet.from_segments(supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
+    validate_recordings_and_supervisions(recordings, supervisions)
+
+    return recordings, supervisions
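
For reference, a minimal end-to-end usage sketch of the recipe added by this patch (paths are hypothetical, and it assumes the SpokenWOZ download URLs are reachable):

```
from lhotse.recipes import download_spokenwoz, prepare_spokenwoz

# Download and extract the "test" distribution part (audio and text tarballs).
corpus_dir = download_spokenwoz("data/spokenwoz", dataset_parts=["test"])

# Build and store the Lhotse manifests for the test split.
manifests = prepare_spokenwoz(
    "data/spokenwoz",
    dataset_splits=["test"],
    output_dir="data/manifests",
)
recordings = manifests["test"]["recordings"]
supervisions = manifests["test"]["supervisions"]
```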