aac_datasets.datasets.clotho module

class Clotho(
root: str | Path | None = None,
subset: 'dev' | 'val' | 'eval' = 'dev',
download: bool = False,
transform: Callable[[ClothoItem], Any] | None = None,
verbose: int = 0,
force_download: bool = False,
verify_files: bool = False,
*,
clean_archives: bool = True,
flat_captions: bool = False,
version: 'v1' | 'v2' | 'v2.1' = ClothoCard.DEFAULT_VERSION,
)[source]
class Clotho(
root: str | Path | None = None,
*,
subset: 'dcase_aac_test',
download: bool = False,
transform: Callable[[ClothoItem], Any] | None = None,
verbose: int = 0,
force_download: bool = False,
verify_files: bool = False,
clean_archives: bool = True,
flat_captions: bool = False,
version: 'v1' | 'v2' | 'v2.1' = ClothoCard.DEFAULT_VERSION,
)
class Clotho(
root: str | Path | None = None,
*,
subset: 'dcase_aac_analysis',
download: bool = False,
transform: Callable[[ClothoItem], Any] | None = None,
verbose: int = 0,
force_download: bool = False,
verify_files: bool = False,
clean_archives: bool = True,
flat_captions: bool = False,
version: 'v1' | 'v2' | 'v2.1' = ClothoCard.DEFAULT_VERSION,
)
class Clotho(
root: str | Path | None = None,
*,
subset: 'dcase_t2a_audio',
download: bool = False,
transform: Callable[[ClothoItem], Any] | None = None,
verbose: int = 0,
force_download: bool = False,
verify_files: bool = False,
clean_archives: bool = True,
flat_captions: bool = False,
version: 'v1' | 'v2' | 'v2.1' = ClothoCard.DEFAULT_VERSION,
)
class Clotho(
root: str | Path | None = None,
*,
subset: 'dcase_t2a_captions',
download: bool = False,
transform: Callable[[ClothoItem], Any] | None = None,
verbose: int = 0,
force_download: bool = False,
verify_files: bool = False,
clean_archives: bool = True,
flat_captions: bool = False,
version: 'v1' | 'v2' | 'v2.1' = ClothoCard.DEFAULT_VERSION,
)

Bases: Generic[T_ClothoItem], AACDataset[T_ClothoItem]

Unofficial Clotho PyTorch dataset.

Available subsets are ‘dev’, ‘val’, ‘eval’, ‘dcase_aac_test’, ‘dcase_aac_analysis’, ‘dcase_t2a_audio’ and ‘dcase_t2a_captions’.

Audio clips are waveforms of 15 to 30 seconds, sampled at 44100 Hz. The target is a list of 5 different caption strings, each describing the audio sample. Captions contain at most 20 words.

Clotho V1 Paper: https://arxiv.org/pdf/1910.09387.pdf

Dataset folder tree for version ‘v2.1’, with all subsets:
{root}
└── CLOTHO_v2.1
    ├── archives
    │   └── (5 7z files, ~8.9GB)
    ├── clotho_audio_files
    │   ├── clotho_analysis
    │   │    └── (8360 wav files, ~19GB)
    │   ├── development
    │   │    └── (3839 wav files, ~7.1GB)
    │   ├── evaluation
    │   │    └── (1045 wav files, ~2.0GB)
    │   ├── test
    │   │    └── (1043 wav files, ~2.0GB)
    │   ├── test_retrieval_audio
    │   │    └── (1000 wav files, ~2.0GB)
    │   └── validation
    │        └── (1045 wav files, ~2.0GB)
    └── clotho_csv_files
        ├── clotho_captions_development.csv
        ├── clotho_captions_evaluation.csv
        ├── clotho_captions_validation.csv
        ├── clotho_metadata_development.csv
        ├── clotho_metadata_evaluation.csv
        ├── clotho_metadata_test.csv
        ├── clotho_metadata_validation.csv
        ├── retrieval_audio_metadata.csv
        └── retrieval_captions.csv
CARD : ClassVar[ClothoCard] = <aac_datasets.datasets.functional.clotho.ClothoCard object>
INVALID_SOUND_ID : ClassVar[str] = 'Not found'
INVALID_START_END_SAMPLES : ClassVar[str] = ''
property download : bool
property root : str
property sr : int
property subset : 'dev' | 'val' | 'eval' | 'dcase_aac_test' | 'dcase_aac_analysis' | 'dcase_t2a_audio' | 'dcase_t2a_captions'
property version : 'v1' | 'v2' | 'v2.1'
class ClothoDCASEAACAnalysisItem[source]

Bases: TypedDict

Class representing a single Clotho item from the ‘dcase_aac_analysis’ subset (audio only, no captions or source metadata).

audio : Tensor
dataset : str
duration : float
fname : str
index : int
sr : int
subset : Literal['dev', 'val', 'eval', 'dcase_aac_test', 'dcase_aac_analysis', 'dcase_t2a_audio', 'dcase_t2a_captions']
class ClothoDCASEAACTestItem[source]

Bases: TypedDict

Class representing a single Clotho item from the ‘dcase_aac_test’ subset (audio and partial metadata, no captions).

audio : Tensor
dataset : str
duration : float
fname : str
index : int
license : str
manufacturer : str
sr : int
start_end_samples : str
subset : Literal['dev', 'val', 'eval', 'dcase_aac_test', 'dcase_aac_analysis', 'dcase_t2a_audio', 'dcase_t2a_captions']
class ClothoDCASET2AAudioItem[source]

Bases: TypedDict

Class representing a single Clotho item from the ‘dcase_t2a_audio’ subset (audio and metadata, no captions).

audio : Tensor
dataset : str
duration : float
fname : str
index : int
keywords : List[str]
license : str
manufacturer : str
sound_id : str
sr : int
start_end_samples : str
subset : Literal['dev', 'val', 'eval', 'dcase_aac_test', 'dcase_aac_analysis', 'dcase_t2a_audio', 'dcase_t2a_captions']
class ClothoDCASET2ACaptionsItem[source]

Bases: TypedDict

Class representing a single Clotho item from the ‘dcase_t2a_captions’ subset (captions only, no audio).

captions : List[str]
dataset : str
index : int
subset : Literal['dev', 'val', 'eval', 'dcase_aac_test', 'dcase_aac_analysis', 'dcase_t2a_audio', 'dcase_t2a_captions']
class ClothoDevValEvalItem[source]

Bases: TypedDict

Class representing a single Clotho item from the ‘dev’, ‘val’ or ‘eval’ subsets (audio, captions and full metadata).

audio : Tensor
captions : List[str]
dataset : str
duration : float
fname : str
index : int
keywords : List[str]
license : str
manufacturer : str
sound_id : str
sr : int
start_end_samples : str
subset : Literal['dev', 'val', 'eval', 'dcase_aac_test', 'dcase_aac_analysis', 'dcase_t2a_audio', 'dcase_t2a_captions']
class ClothoItem[source]

Bases: TypedDict

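Class representing a single Clotho item from any subset. Fields marked NotRequired are not guaranteed to be present; the subset-specific item classes list different sets of fields.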

audio : NotRequired[Tensor]
captions : NotRequired[List[str]]
dataset : str
duration : NotRequired[float]
fname : NotRequired[str]
index : int
keywords : NotRequired[List[str]]
license : NotRequired[str]
manufacturer : NotRequired[str]
sound_id : NotRequired[str]
sr : NotRequired[int]
start_end_samples : NotRequired[str]
subset : Literal['dev', 'val', 'eval', 'dcase_aac_test', 'dcase_aac_analysis', 'dcase_t2a_audio', 'dcase_t2a_captions']