aac_datasets.datasets.clotho module

class Clotho(root: str | ~pathlib.Path | None = None, subset: ~typing.Literal['dev', 'val', 'eval'] = 'dev', download: bool = False, transform: ~typing.Callable[[~aac_datasets.datasets.clotho.ClothoItem], ~typing.Any] | None = None, verbose: int = 0, force_download: bool = False, verify_files: bool = False, *, clean_archives: bool = True, flat_captions: bool = False, version: typing_extensions.Literal[v1, v2, v2.1] = ClothoCard.DEFAULT_VERSION)[source]
class Clotho(root: str | ~pathlib.Path | None = None, *, subset: ~typing.Literal['dcase_aac_test'], download: bool = False, transform: ~typing.Callable[[~aac_datasets.datasets.clotho.ClothoItem], ~typing.Any] | None = None, verbose: int = 0, force_download: bool = False, verify_files: bool = False, clean_archives: bool = True, flat_captions: bool = False, version: typing_extensions.Literal[v1, v2, v2.1] = ClothoCard.DEFAULT_VERSION)
class Clotho(root: str | ~pathlib.Path | None = None, *, subset: ~typing.Literal['dcase_aac_analysis'], download: bool = False, transform: ~typing.Callable[[~aac_datasets.datasets.clotho.ClothoItem], ~typing.Any] | None = None, verbose: int = 0, force_download: bool = False, verify_files: bool = False, clean_archives: bool = True, flat_captions: bool = False, version: typing_extensions.Literal[v1, v2, v2.1] = ClothoCard.DEFAULT_VERSION)
class Clotho(root: str | ~pathlib.Path | None = None, *, subset: ~typing.Literal['dcase_t2a_audio'], download: bool = False, transform: ~typing.Callable[[~aac_datasets.datasets.clotho.ClothoItem], ~typing.Any] | None = None, verbose: int = 0, force_download: bool = False, verify_files: bool = False, clean_archives: bool = True, flat_captions: bool = False, version: typing_extensions.Literal[v1, v2, v2.1] = ClothoCard.DEFAULT_VERSION)
class Clotho(root: str | ~pathlib.Path | None = None, *, subset: ~typing.Literal['dcase_t2a_captions'], download: bool = False, transform: ~typing.Callable[[~aac_datasets.datasets.clotho.ClothoItem], ~typing.Any] | None = None, verbose: int = 0, force_download: bool = False, verify_files: bool = False, clean_archives: bool = True, flat_captions: bool = False, version: typing_extensions.Literal[v1, v2, v2.1] = ClothoCard.DEFAULT_VERSION)

Bases: Generic[T_ClothoItem], AACDataset[T_ClothoItem]

Unofficial Clotho PyTorch dataset.

Subsets available are ‘dev’, ‘val’, ‘eval’, ‘dcase_aac_test’, ‘dcase_aac_analysis’, ‘dcase_t2a_audio’ and ‘dcase_t2a_captions’.

Audio clips are waveforms of 15 to 30 seconds, sampled at 44100 Hz. The target is a list of 5 different caption strings describing an audio sample. The maximum number of words in a caption is 20.

Clotho V1 Paper: https://arxiv.org/pdf/1910.09387.pdf

Dataset folder tree for version ‘v2.1’, with all subsets:
{root}
└── CLOTHO_v2.1
    ├── archives
    │   └── (5 7z files, ~8.9GB)
    ├── clotho_audio_files
    │   ├── clotho_analysis
    │   │    └── (8360 wav files, ~19GB)
    │   ├── development
    │   │    └── (3839 wav files, ~7.1GB)
    │   ├── evaluation
    │   │    └── (1045 wav files, ~2.0GB)
    │   ├── test
    │   │    └── (1043 wav files, ~2.0GB)
    │   ├── test_retrieval_audio
    │   │    └── (1000 wav files, ~2.0GB)
    │   └── validation
    │        └── (1045 wav files, ~2.0GB)
    └── clotho_csv_files
        ├── clotho_captions_development.csv
        ├── clotho_captions_evaluation.csv
        ├── clotho_captions_validation.csv
        ├── clotho_metadata_development.csv
        ├── clotho_metadata_evaluation.csv
        ├── clotho_metadata_test.csv
        ├── clotho_metadata_validation.csv
        ├── retrieval_audio_metadata.csv
        └── retrieval_captions.csv
CARD: ClassVar[ClothoCard] = <aac_datasets.datasets.functional.clotho.ClothoCard object>
INVALID_SOUND_ID: ClassVar[str] = 'Not found'
INVALID_START_END_SAMPLES: ClassVar[str] = ''
property download: bool
property root: str
property sr: int
property subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]
property version: typing_extensions.Literal[v1, v2, v2.1]
class ClothoDCASEAACAnalysisItem[source]

Bases: TypedDict

Class representing a single Clotho item.

audio: Tensor
dataset: str
duration: float
fname: str
index: int
sr: int
subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]
class ClothoDCASEAACTestItem[source]

Bases: TypedDict

Class representing a single Clotho item.

audio: Tensor
dataset: str
duration: float
fname: str
index: int
license: str
manufacturer: str
sr: int
start_end_samples: str
subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]
class ClothoDCASET2AAudioItem[source]

Bases: TypedDict

Class representing a single Clotho item.

audio: Tensor
dataset: str
duration: float
fname: str
index: int
keywords: List[str]
license: str
manufacturer: str
sound_id: str
sr: int
start_end_samples: str
subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]
class ClothoDCASET2ACaptionsItem[source]

Bases: TypedDict

Class representing a single Clotho item.

captions: List[str]
dataset: str
index: int
subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]
class ClothoDevValEvalItem[source]

Bases: TypedDict

Class representing a single Clotho item.

audio: Tensor
captions: List[str]
dataset: str
duration: float
fname: str
index: int
keywords: List[str]
license: str
manufacturer: str
sound_id: str
sr: int
start_end_samples: str
subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]
class ClothoItem[source]

Bases: TypedDict

Class representing a single Clotho item.

audio: typing_extensions.NotRequired[Tensor]
captions: typing_extensions.NotRequired[List[str]]
dataset: str
duration: typing_extensions.NotRequired[float]
fname: typing_extensions.NotRequired[str]
index: int
keywords: typing_extensions.NotRequired[List[str]]
license: typing_extensions.NotRequired[str]
manufacturer: typing_extensions.NotRequired[str]
sound_id: typing_extensions.NotRequired[str]
sr: typing_extensions.NotRequired[int]
start_end_samples: typing_extensions.NotRequired[str]
subset: typing_extensions.Literal[dev, val, eval, dcase_aac_test, dcase_aac_analysis, dcase_t2a_audio, dcase_t2a_captions]