#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os.path as osp
from pathlib import Path
from typing import Any, Callable, ClassVar, List, Optional, Union
from torch import Tensor
from typing_extensions import TypedDict
from aac_datasets.datasets.base import AACDataset
from aac_datasets.datasets.functional.wavcaps import (
WavCapsCard,
_get_audio_subset_dpath,
download_wavcaps_dataset,
load_wavcaps_dataset,
)
from aac_datasets.utils.globals import _get_root, _get_zip_path
# Module-level logger, named after this module.
pylog = logging.getLogger(__name__)
class WavCapsItem(TypedDict):
    """Typed dictionary describing a single WavCaps item.

    The first group of keys is shared with the other datasets of the package;
    the second group is specific to WavCaps. The trailing comments indicate
    which sources (BBC, FSD = FreeSound, SB = SoundBible) provide each
    optional field.
    """

    # Common attributes
    audio: Tensor
    captions: List[str]
    dataset: str
    fname: str
    index: int
    subset: str
    sr: int
    duration: float
    # WavCaps-specific attributes
    # NOTE: "duration" used to be declared a second time in this group; the
    # redundant declaration was removed (same key, same type).
    author: Optional[str]  # FSD and SB
    description: Optional[str]  # BBC, FSD and SB only
    download_link: Optional[str]  # BBC, FSD and SB only
    href: Optional[str]  # FSD and SB only
    id: str
    source: str
    tags: List[str]  # FSD only
class WavCaps(AACDataset[WavCapsItem]):
    r"""Unofficial WavCaps PyTorch dataset.

    WavCaps Paper : https://arxiv.org/pdf/2303.17395.pdf
    HuggingFace source : https://huggingface.co/datasets/cvssp/WavCaps

    This dataset provides 6 training subsets: 4 extracted from different sources,
    plus 2 filtered variants of them:

    - AudioSet strongly labeled ("audioset")
    - BBC Sound Effects ("bbc")
    - FreeSound ("freesound")
    - SoundBible ("soundbible")
    - AudioSet strongly labeled without AudioCaps ("audioset_no_audiocaps")
    - FreeSound without Clotho ("freesound_no_clotho")

    .. warning::
        WavCaps download is experimental ; it requires a lot of disk space and can take very long time to download and extract, so you might expect errors.

    .. code-block:: text
        :caption: Dataset folder tree

        {root}
        └── WavCaps
            ├── Audio
            │   ├── AudioSet_SL
            │   │   └── (108317 flac files, ~64GB)
            │   ├── BBC_Sound_Effects
            │   │   └── (31201 flac files, ~142GB)
            │   ├── FreeSound
            │   │   └── (262300 flac files, ~1.4TB)
            │   └── SoundBible
            │       └── (1232 flac files, ~884MB)
            ├── Zip_files
            │   ├── AudioSet_SL
            │   │   └── (8 zip files, ~76GB)
            │   ├── BBC_Sound_Effects
            │   │   └── (26 zip files, ~562GB)
            │   ├── FreeSound
            │   │   └── (123 zip? files, ~1.4TB)
            │   └── SoundBible
            │       └── (1 zip? files, ~624GB)
            ├── json_files
            │   ├── AudioSet_SL
            │   │   └── as_final.json
            │   ├── BBC_Sound_Effects
            │   │   └── bbc_final.json
            │   ├── FreeSound
            │   │   ├── fsd_final_2s.json
            │   │   └── fsd_final.json
            │   ├── SoundBible
            │   │   └── sb_final.json
            │   └── blacklist
            │       ├── blacklist_exclude_all_ac.json
            │       ├── blacklist_exclude_test_ac.json
            │       └── blacklist_exclude_ubs8k_esc50_vggsound.json
            ├── .gitattributes
            └── README.md
    """

    # Common globals
    CARD: ClassVar[WavCapsCard] = WavCapsCard()

    def __init__(
        self,
        # Common args
        root: Union[str, Path, None] = None,
        subset: str = WavCapsCard.DEFAULT_SUBSET,
        download: bool = False,
        transform: Optional[Callable[[WavCapsItem], Any]] = None,
        verbose: int = 0,
        force_download: bool = False,
        verify_files: bool = False,
        *,
        # WavCaps-specific args
        clean_archives: bool = False,
        hf_cache_dir: Optional[str] = None,
        repo_id: Optional[str] = None,
        revision: Optional[str] = WavCapsCard.DEFAULT_REVISION,
        zip_path: Union[str, Path, None] = None,
    ) -> None:
        """
        :param root: The parent of the dataset root directory.
            The data will be stored in the 'WavCaps' subdirectory.
            If None, the value is resolved by ``_get_root``.
            defaults to None.
        :param subset: The subset of the dataset. Can be one of :attr:`~WavCapsCard.SUBSETS`.
            defaults to :attr:`~WavCapsCard.DEFAULT_SUBSET`.
        :param download: Download the dataset if download=True and if the dataset is not already downloaded.
            defaults to False.
        :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method when argument is an integer.
            defaults to None.
        :param verbose: Verbose level. Can be 0 or 1.
            defaults to 0.
        :param force_download: If True, force re-downloading the files even if they exist on disk.
            defaults to False.
        :param verify_files: If True, check hash value when possible.
            defaults to False.
        :param clean_archives: If True, remove the compressed archives from disk to save space.
            defaults to False.
        :param hf_cache_dir: HuggingFace cache directory. If None, use the global value :data:`~huggingface_hub.constants.HUGGINGFACE_HUB_CACHE`.
            defaults to None.
        :param repo_id: Repository ID on HuggingFace.
            If None, the choice is left to the download function (expected to be "cvssp/WavCaps").
            defaults to None.
        :param revision: The HuggingFace revision tag.
            defaults to :attr:`~WavCapsCard.DEFAULT_REVISION`.
        :param zip_path: Path to the zip executable in shell.
            If None, the value is resolved by ``_get_zip_path``.
            defaults to None.
        :raises ValueError: If ``subset`` is not one of :attr:`~WavCapsCard.SUBSETS`.
        """
        if subset not in WavCapsCard.SUBSETS:
            raise ValueError(
                f"Invalid argument subset={subset} for {WavCapsCard.PRETTY_NAME}. (expected one of {WavCapsCard.SUBSETS})"
            )

        # Resolve the optional paths once so the download and load steps agree.
        root = _get_root(root)
        zip_path = _get_zip_path(zip_path)

        if download:
            download_wavcaps_dataset(
                root=root,
                subset=subset,
                force=force_download,
                verbose=verbose,
                clean_archives=clean_archives,
                hf_cache_dir=hf_cache_dir,
                repo_id=repo_id,
                revision=revision,
                verify_files=verify_files,
                zip_path=zip_path,
            )

        raw_data = load_wavcaps_dataset(
            root=root,
            subset=subset,
            verbose=verbose,
            hf_cache_dir=hf_cache_dir,
            revision=revision,
        )

        # The number of items is taken from the first loaded column.
        size = len(next(iter(raw_data.values())))
        raw_data["dataset"] = [WavCapsCard.NAME] * size
        raw_data["subset"] = [subset] * size
        # Each audio file lives in the directory associated with its own source.
        raw_data["fpath"] = [
            osp.join(
                _get_audio_subset_dpath(root, hf_cache_dir, revision, source),
                fname,
            )
            for source, fname in zip(raw_data["source"], raw_data["fname"])
        ]
        raw_data["index"] = list(range(size))

        super().__init__(
            raw_data=raw_data,
            transform=transform,
            column_names=WavCapsItem.__required_keys__,
            flat_captions=False,
            sr=WavCapsCard.SAMPLE_RATE,
            verbose=verbose,
        )

        self._root = root
        self._subset = subset
        self._download = download
        self._hf_cache_dir = hf_cache_dir
        self._revision = revision

        # Columns registered here are bound to loader callables rather than
        # stored in raw_data.
        self.add_online_columns(
            {
                "audio": WavCaps._load_audio,
                "audio_metadata": WavCaps._load_audio_metadata,
                "duration": WavCaps._load_duration,
                "num_channels": WavCaps._load_num_channels,
                "num_frames": WavCaps._load_num_frames,
                "sr": WavCaps._load_sr,
            }
        )

    # Properties
    @property
    def download(self) -> bool:
        """The ``download`` flag given at construction."""
        return self._download

    @property
    def root(self) -> str:
        """The root directory as resolved by ``_get_root`` at construction."""
        return self._root

    @property
    def sr(self) -> int:
        """The value stored in ``self._sr``."""
        return self._sr  # type: ignore

    @property
    def subset(self) -> str:
        """The subset name given at construction."""
        return self._subset