aac_datasets.datasets.functional.wavcaps module

class WavCapsCard[source]

Bases: DatasetCard

ANNOTATIONS_CREATORS : Tuple[str, ...] = ('machine-generated',)
CAPTIONS_PER_AUDIO : Dict[Literal['audioset', 'bbc', 'freesound', 'soundbible', 'audioset_no_audiocaps_v1', 'freesound_no_clotho_v2'], int] = {'audioset': 1, 'audioset_no_audiocaps_v1': 1, 'bbc': 1, 'freesound': 1, 'freesound_no_clotho_v2': 1, 'soundbible': 1}
CITATION : str = '\n @article{mei2023WavCaps,\n title = {Wav{C}aps: A {ChatGPT}-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research},\n author = {Xinhao Mei and Chutong Meng and Haohe Liu and Qiuqiang Kong and Tom Ko and Chengqi Zhao and Mark D. Plumbley and Yuexian Zou and Wenwu Wang},\n year = 2023,\n journal = {arXiv preprint arXiv:2303.17395},\n url = {https://arxiv.org/pdf/2303.17395.pdf}\n }\n '
DEFAULT_REVISION : str = '85a0c21e26fa7696a5a74ce54fada99a9b43c6de'
DEFAULT_SUBSET : Literal['audioset', 'bbc', 'freesound', 'soundbible', 'audioset_no_audiocaps_v1', 'freesound_no_clotho_v2'] = 'audioset_no_audiocaps_v1'
DESCRIPTION : str = 'WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research.'
EXPECTED_SIZES : Dict[Literal['AudioSet_SL', 'BBC_Sound_Effects', 'FreeSound', 'SoundBible'], int] = {'AudioSet_SL': 108317, 'BBC_Sound_Effects': 31201, 'FreeSound': 262300, 'SoundBible': 1320}
HOMEPAGE = 'https://huggingface.co/datasets/cvssp/WavCaps'
LANGUAGE : Tuple[str, ...] = ('en',)
LANGUAGE_DETAILS : Tuple[str, ...] = ('en-US',)
NAME : str = 'wavcaps'
PRETTY_NAME : str = 'WavCaps'
REPO_ID : str = 'cvssp/WavCaps'
SAMPLE_RATE : int = 32000
SIZE_CATEGORIES : Tuple[str, ...] = ('100K<n<1M',)
SOURCES : Tuple[Literal['AudioSet_SL', 'BBC_Sound_Effects', 'FreeSound', 'SoundBible'], ...] = ('AudioSet_SL', 'BBC_Sound_Effects', 'FreeSound', 'SoundBible')
SUBSETS : Tuple[Literal['audioset', 'bbc', 'freesound', 'soundbible', 'audioset_no_audiocaps_v1', 'freesound_no_clotho_v2'], ...] = ('audioset', 'bbc', 'freesound', 'soundbible', 'audioset_no_audiocaps_v1', 'freesound_no_clotho_v2')
TASK_CATEGORIES : Tuple[str, ...] = ('audio-to-text', 'text-to-audio')
download_wavcaps_dataset(
root: str | Path | None = None,
subset: 'audioset' | 'bbc' | 'freesound' | 'soundbible' | 'audioset_no_audiocaps_v1' | 'freesound_no_clotho_v2' = 'audioset_no_audiocaps_v1',
force: bool = False,
verbose: int = 0,
verify_files: bool = False,
*,
clean_archives: bool = False,
hf_cache_dir: str | None = None,
repo_id: str | None = None,
revision: str | None = None,
zip_path: str | Path | None = None,
) None[source]

Prepare WavCaps data.

Parameters:
root: str | Path | None = None

Dataset root directory. defaults to “.”.

subset: 'audioset' | 'bbc' | 'freesound' | 'soundbible' | 'audioset_no_audiocaps_v1' | 'freesound_no_clotho_v2' = 'audioset_no_audiocaps_v1'

The subset of WavCaps to use. Can be one of SUBSETS. defaults to “audioset_no_audiocaps_v1”.

force: bool = False

If True, force all files to be downloaded again. defaults to False.

verbose: int = 0

Verbose level. defaults to 0.

verify_files: bool = False

If True, check that all already-downloaded files are valid. defaults to False.

clean_archives: bool = False

If True, remove the compressed archives from disk to save space. defaults to False.

hf_cache_dir: str | None = None

Optional override for HuggingFace cache directory path. defaults to None.

repo_id: str | None = None

Repository ID on HuggingFace. defaults to “cvssp/WavCaps”.

revision: str | None = None

Optional override for revision commit/name for HuggingFace repository. defaults to None.

zip_path: str | Path | None = None

Path to the zip executable used in the shell. defaults to “zip”.

download_wavcaps_datasets(
root: str | Path | None = None,
subsets: 'audioset' | 'bbc' | 'freesound' | 'soundbible' | 'audioset_no_audiocaps_v1' | 'freesound_no_clotho_v2' | Iterable['audioset' | 'bbc' | 'freesound' | 'soundbible' | 'audioset_no_audiocaps_v1' | 'freesound_no_clotho_v2'] = 'audioset_no_audiocaps_v1',
force: bool = False,
verbose: int = 0,
*,
clean_archives: bool = False,
hf_cache_dir: str | None = None,
repo_id: str | None = None,
revision: str | None = None,
verify_files: bool = False,
zip_path: str | Path | None = None,
) None[source]

Function helper to download a list of subsets. See download_wavcaps_dataset() for details.

load_wavcaps_dataset(
root: str | Path | None = None,
subset: 'audioset' | 'bbc' | 'freesound' | 'soundbible' | 'audioset_no_audiocaps_v1' | 'freesound_no_clotho_v2' = 'audioset_no_audiocaps_v1',
verbose: int = 0,
*,
hf_cache_dir: str | None = None,
revision: str | None = None,
) dict[str, list[Any]][source]

Load WavCaps metadata.

Parameters:
root: str | Path | None = None

Dataset root directory. defaults to “.”.

subset: 'audioset' | 'bbc' | 'freesound' | 'soundbible' | 'audioset_no_audiocaps_v1' | 'freesound_no_clotho_v2' = 'audioset_no_audiocaps_v1'

The subset of WavCaps to use. Can be one of SUBSETS. defaults to “audioset_no_audiocaps_v1”.

verbose: int = 0

Verbose level. defaults to 0.

hf_cache_dir: str | None = None

Optional override for HuggingFace cache directory path. defaults to None.

revision: str | None = None

Optional override for revision commit/name for HuggingFace repository. defaults to None.

Returns:

A dictionary of lists, one list per metadata field.