Datasets:
| file (string) | audio (audio) | label (class label, 31 classes) | is_unknown (bool) | speaker_id (string) | utterance_id (int8, range 0–11) |
---|---|---|---|---|---|
"bed/4a294341_nohash_0.wav" | 20
(bed) | true | "4a294341" | 0 |
|
"bed/43f57297_nohash_0.wav" | 20
(bed) | true | "43f57297" | 0 |
|
"bed/f9af823e_nohash_1.wav" | 20
(bed) | true | "f9af823e" | 1 |
|
"bed/9ff2d2f4_nohash_0.wav" | 20
(bed) | true | "9ff2d2f4" | 0 |
|
"bed/651d108f_nohash_0.wav" | 20
(bed) | true | "651d108f" | 0 |
|
"bed/810c99be_nohash_0.wav" | 20
(bed) | true | "810c99be" | 0 |
|
"bed/6a1908f8_nohash_0.wav" | 20
(bed) | true | "6a1908f8" | 0 |
|
"bed/179a61b7_nohash_0.wav" | 20
(bed) | true | "179a61b7" | 0 |
|
"bed/edd8bfe3_nohash_0.wav" | 20
(bed) | true | "edd8bfe3" | 0 |
|
"bed/215699ff_nohash_1.wav" | 20
(bed) | true | "215699ff" | 1 |
|
"bed/14872d06_nohash_0.wav" | 20
(bed) | true | "14872d06" | 0 |
|
"bed/d85270c1_nohash_1.wav" | 20
(bed) | true | "d85270c1" | 1 |
|
"bed/18a8f03f_nohash_0.wav" | 20
(bed) | true | "18a8f03f" | 0 |
|
"bed/47565088_nohash_2.wav" | 20
(bed) | true | "47565088" | 2 |
|
"bed/62581901_nohash_1.wav" | 20
(bed) | true | "62581901" | 1 |
|
"bed/11860c84_nohash_0.wav" | 20
(bed) | true | "11860c84" | 0 |
|
"bed/62ef962d_nohash_0.wav" | 20
(bed) | true | "62ef962d" | 0 |
|
"bed/92b2bf59_nohash_2.wav" | 20
(bed) | true | "92b2bf59" | 2 |
|
"bed/cd911ace_nohash_0.wav" | 20
(bed) | true | "cd911ace" | 0 |
|
"bed/5d9bb361_nohash_0.wav" | 20
(bed) | true | "5d9bb361" | 0 |
|
"bed/f638a812_nohash_1.wav" | 20
(bed) | true | "f638a812" | 1 |
|
"bed/f21893dc_nohash_1.wav" | 20
(bed) | true | "f21893dc" | 1 |
|
"bed/42e3f068_nohash_0.wav" | 20
(bed) | true | "42e3f068" | 0 |
|
"bed/d94eb94f_nohash_0.wav" | 20
(bed) | true | "d94eb94f" | 0 |
|
"bed/716757ce_nohash_0.wav" | 20
(bed) | true | "716757ce" | 0 |
|
"bed/2d92f18b_nohash_1.wav" | 20
(bed) | true | "2d92f18b" | 1 |
|
"bed/b12bef84_nohash_0.wav" | 20
(bed) | true | "b12bef84" | 0 |
|
"bed/46a153d8_nohash_0.wav" | 20
(bed) | true | "46a153d8" | 0 |
|
"bed/02746d24_nohash_0.wav" | 20
(bed) | true | "02746d24" | 0 |
|
"bed/b3bb4dd6_nohash_1.wav" | 20
(bed) | true | "b3bb4dd6" | 1 |
|
"bed/b00c4c53_nohash_0.wav" | 20
(bed) | true | "b00c4c53" | 0 |
|
"bed/1365dd89_nohash_0.wav" | 20
(bed) | true | "1365dd89" | 0 |
|
"bed/902258bb_nohash_1.wav" | 20
(bed) | true | "902258bb" | 1 |
|
"bed/03c96658_nohash_0.wav" | 20
(bed) | true | "03c96658" | 0 |
|
"bed/0b77ee66_nohash_0.wav" | 20
(bed) | true | "0b77ee66" | 0 |
|
"bed/4407ba92_nohash_1.wav" | 20
(bed) | true | "4407ba92" | 1 |
|
"bed/7f17667c_nohash_1.wav" | 20
(bed) | true | "7f17667c" | 1 |
|
"bed/637c702a_nohash_0.wav" | 20
(bed) | true | "637c702a" | 0 |
|
"bed/3f339c33_nohash_0.wav" | 20
(bed) | true | "3f339c33" | 0 |
|
"bed/da76aa58_nohash_0.wav" | 20
(bed) | true | "da76aa58" | 0 |
|
"bed/035de8fe_nohash_0.wav" | 20
(bed) | true | "035de8fe" | 0 |
|
"bed/15c371c7_nohash_0.wav" | 20
(bed) | true | "15c371c7" | 0 |
|
"bed/0e5193e6_nohash_0.wav" | 20
(bed) | true | "0e5193e6" | 0 |
|
"bed/e5d2e09d_nohash_0.wav" | 20
(bed) | true | "e5d2e09d" | 0 |
|
"bed/0f7dc557_nohash_0.wav" | 20
(bed) | true | "0f7dc557" | 0 |
|
"bed/d8a5ace5_nohash_1.wav" | 20
(bed) | true | "d8a5ace5" | 1 |
|
"bed/9d050657_nohash_0.wav" | 20
(bed) | true | "9d050657" | 0 |
|
"bed/30f31e42_nohash_0.wav" | 20
(bed) | true | "30f31e42" | 0 |
|
"bed/df6bd83f_nohash_0.wav" | 20
(bed) | true | "df6bd83f" | 0 |
|
"bed/db7c95b0_nohash_1.wav" | 20
(bed) | true | "db7c95b0" | 1 |
|
"bed/44b5720d_nohash_1.wav" | 20
(bed) | true | "44b5720d" | 1 |
|
"bed/a2b16113_nohash_2.wav" | 20
(bed) | true | "a2b16113" | 2 |
|
"bed/493392c6_nohash_0.wav" | 20
(bed) | true | "493392c6" | 0 |
|
"bed/7ff4fc72_nohash_0.wav" | 20
(bed) | true | "7ff4fc72" | 0 |
|
"bed/ab71c9a7_nohash_0.wav" | 20
(bed) | true | "ab71c9a7" | 0 |
|
"bed/919d3c0e_nohash_0.wav" | 20
(bed) | true | "919d3c0e" | 0 |
|
"bed/f0edc767_nohash_0.wav" | 20
(bed) | true | "f0edc767" | 0 |
|
"bed/f804cbb3_nohash_0.wav" | 20
(bed) | true | "f804cbb3" | 0 |
|
"bed/30065f33_nohash_0.wav" | 20
(bed) | true | "30065f33" | 0 |
|
"bed/4c13fe25_nohash_1.wav" | 20
(bed) | true | "4c13fe25" | 1 |
|
"bed/9ce7a419_nohash_0.wav" | 20
(bed) | true | "9ce7a419" | 0 |
|
"bed/479e64cc_nohash_0.wav" | 20
(bed) | true | "479e64cc" | 0 |
|
"bed/ffbb695d_nohash_0.wav" | 20
(bed) | true | "ffbb695d" | 0 |
|
"bed/7c75a504_nohash_0.wav" | 20
(bed) | true | "7c75a504" | 0 |
|
"bed/51995cea_nohash_0.wav" | 20
(bed) | true | "51995cea" | 0 |
|
"bed/5705a0f9_nohash_1.wav" | 20
(bed) | true | "5705a0f9" | 1 |
|
"bed/784e281a_nohash_0.wav" | 20
(bed) | true | "784e281a" | 0 |
|
"bed/708a9569_nohash_0.wav" | 20
(bed) | true | "708a9569" | 0 |
|
"bed/1c3f50ad_nohash_0.wav" | 20
(bed) | true | "1c3f50ad" | 0 |
|
"bed/0ff728b5_nohash_0.wav" | 20
(bed) | true | "0ff728b5" | 0 |
|
"bed/8a0457c9_nohash_1.wav" | 20
(bed) | true | "8a0457c9" | 1 |
|
"bed/86648261_nohash_0.wav" | 20
(bed) | true | "86648261" | 0 |
|
"bed/d2f4f431_nohash_0.wav" | 20
(bed) | true | "d2f4f431" | 0 |
|
"bed/34e8c726_nohash_0.wav" | 20
(bed) | true | "34e8c726" | 0 |
|
"bed/32efce64_nohash_0.wav" | 20
(bed) | true | "32efce64" | 0 |
|
"bed/9d8ac38b_nohash_0.wav" | 20
(bed) | true | "9d8ac38b" | 0 |
|
"bed/4b25f620_nohash_0.wav" | 20
(bed) | true | "4b25f620" | 0 |
|
"bed/6366f61a_nohash_0.wav" | 20
(bed) | true | "6366f61a" | 0 |
|
"bed/9b027ecf_nohash_1.wav" | 20
(bed) | true | "9b027ecf" | 1 |
|
"bed/c8db14a8_nohash_0.wav" | 20
(bed) | true | "c8db14a8" | 0 |
|
"bed/94e6864f_nohash_0.wav" | 20
(bed) | true | "94e6864f" | 0 |
|
"bed/9ab86dd0_nohash_0.wav" | 20
(bed) | true | "9ab86dd0" | 0 |
|
"bed/46114b4e_nohash_0.wav" | 20
(bed) | true | "46114b4e" | 0 |
|
"bed/2bdbe5f7_nohash_0.wav" | 20
(bed) | true | "2bdbe5f7" | 0 |
|
"bed/f9bdf10e_nohash_1.wav" | 20
(bed) | true | "f9bdf10e" | 1 |
|
"bed/8ea22de7_nohash_0.wav" | 20
(bed) | true | "8ea22de7" | 0 |
|
"bed/85b877b5_nohash_0.wav" | 20
(bed) | true | "85b877b5" | 0 |
|
"bed/9efe5140_nohash_0.wav" | 20
(bed) | true | "9efe5140" | 0 |
|
"bed/8830e17f_nohash_0.wav" | 20
(bed) | true | "8830e17f" | 0 |
|
"bed/d78858d9_nohash_0.wav" | 20
(bed) | true | "d78858d9" | 0 |
|
"bed/ad89eb1e_nohash_0.wav" | 20
(bed) | true | "ad89eb1e" | 0 |
|
"bed/39c13eed_nohash_0.wav" | 20
(bed) | true | "39c13eed" | 0 |
|
"bed/1625acd8_nohash_0.wav" | 20
(bed) | true | "1625acd8" | 0 |
|
"bed/62ef962d_nohash_1.wav" | 20
(bed) | true | "62ef962d" | 1 |
|
"bed/0a7c2a8d_nohash_0.wav" | 20
(bed) | true | "0a7c2a8d" | 0 |
|
"bed/a8cb6dda_nohash_0.wav" | 20
(bed) | true | "a8cb6dda" | 0 |
|
"bed/da76aa58_nohash_1.wav" | 20
(bed) | true | "da76aa58" | 1 |
|
"bed/28612180_nohash_0.wav" | 20
(bed) | true | "28612180" | 0 |
|
"bed/6094340e_nohash_0.wav" | 20
(bed) | true | "6094340e" | 0 |
|
"bed/305776dd_nohash_0.wav" | 20
(bed) | true | "305776dd" | 0 |
Dataset Card for SpeechCommands
Dataset Summary
This is a set of one-second .wav audio files, each containing a single spoken English word or background noise. These words are from a small set of commands, and are spoken by a variety of different speakers. This data set is designed to help train simple machine learning models. It is covered in more detail at https://arxiv.org/abs/1804.03209.
Version 0.01 of the data set (configuration `"v0.01"`) was released on August 3rd 2017 and contains 64,727 audio files.
Version 0.02 of the data set (configuration `"v0.02"`) was released on April 11th 2018 and contains 105,829 audio files.
Supported Tasks and Leaderboards
- `keyword-spotting`: the dataset can be used to train and evaluate keyword spotting systems. The task is to detect preregistered keywords by classifying utterances into a predefined set of words. The task is usually performed on-device for fast response time. Thus, accuracy, model size, and inference time are all crucial.
Languages
The language data in SpeechCommands is in English (BCP-47 `en`).
Dataset Structure
Data Instances
Example of a core word (`"label"` is a word, `"is_unknown"` is `False`):
{
"file": "no/7846fd85_nohash_0.wav",
"audio": {
"path": "no/7846fd85_nohash_0.wav",
"array": array([ -0.00021362, -0.00027466, -0.00036621, ..., 0.00079346,
0.00091553, 0.00079346]),
"sampling_rate": 16000
},
"label": 1, # "no"
"is_unknown": False,
"speaker_id": "7846fd85",
"utterance_id": 0
}
Example of an auxiliary word (`"label"` is a word, `"is_unknown"` is `True`):
{
"file": "tree/8b775397_nohash_0.wav",
"audio": {
"path": "tree/8b775397_nohash_0.wav",
"array": array([ -0.00854492, -0.01339722, -0.02026367, ..., 0.00274658,
0.00335693, 0.0005188]),
"sampling_rate": 16000
},
"label": 28, # "tree"
"is_unknown": True,
"speaker_id": "8b775397",
"utterance_id": 0
}
Example of background noise (`_silence_`) class:
{
"file": "_silence_/doing_the_dishes.wav",
"audio": {
"path": "_silence_/doing_the_dishes.wav",
"array": array([ 0. , 0. , 0. , ..., -0.00592041,
-0.00405884, -0.00253296]),
"sampling_rate": 16000
},
"label": 30, # "_silence_"
"is_unknown": False,
"speaker_id": "None",
"utterance_id": 0 # doesn't make sense here
}
Data Fields
- `file`: relative audio filename inside the original archive.
- `audio`: dictionary containing a relative audio filename, a decoded audio array, and the sampling rate. Note that when accessing the audio column: `dataset[0]["audio"]` the audio is automatically decoded and resampled to `dataset.features["audio"].sampling_rate`. Decoding and resampling of a large number of audios might take a significant amount of time. Thus, it is important to first query the sample index before the `"audio"` column, i.e. `dataset[0]["audio"]` should always be preferred over `dataset["audio"][0]`.
- `label`: either the word pronounced in an audio sample or the background noise (`_silence_`) class. Note that it's an integer value corresponding to the class name.
- `is_unknown`: whether a word is auxiliary. Equals `False` if a word is a core word or `_silence_`, `True` if a word is an auxiliary word.
- `speaker_id`: unique id of a speaker. Equals `None` if the label is `_silence_`.
- `utterance_id`: incremental id of a word utterance within the same speaker.
Data Splits
The dataset has two versions (= configurations): `"v0.01"` and `"v0.02"`. `"v0.02"` contains more words (see section Source Data for more details).
| | train | validation | test |
|---|---|---|---|
| v0.01 | 51093 | 6799 | 3081 |
| v0.02 | 84848 | 9982 | 4890 |
Note that in the train and validation sets, examples of the `_silence_` class are longer than 1 second.
You can use the following code to sample 1-second examples from the longer ones:
def sample_noise(example):
    """Crop a `_silence_` example to a random one-second slice.

    Use this function to extract random 1 sec slices of each _silence_ utterance,
    e.g. inside `torch.utils.data.Dataset.__getitem__()`.

    Args:
        example: mapping with the keys "label", "speech" (sequence of audio
            samples) and "sample_rate" (number of samples in one second).
            NOTE(review): these keys and the string label differ from the
            fields documented in the Data Fields section ("audio"["array"],
            "audio"["sampling_rate"], integer "label") - confirm the example
            has been remapped to this layout before calling.

    Returns:
        The same `example` object. "speech" is replaced by a one-second slice
        when the label is "_silence_" and the clip is longer than one second;
        otherwise the example is returned unchanged.
    """
    from random import randint

    if example["label"] == "_silence_":
        sample_rate = example["sample_rate"]
        # `randint` includes both endpoints, so the largest valid start offset
        # is exactly `len - sample_rate` (the window that ends on the last sample).
        max_offset = len(example["speech"]) - sample_rate
        # Clips that are one second long or shorter have nothing to crop;
        # skipping them also avoids `randint` raising on an empty range.
        if max_offset > 0:
            random_offset = randint(0, max_offset)
            example["speech"] = example["speech"][random_offset : random_offset + sample_rate]
    return example
Dataset Creation
Curation Rationale
The primary goal of the dataset is to provide a way to build and test small models that can detect a single word from a set of target words and differentiate it from background noise or unrelated speech with as few false positives as possible.
Source Data
Initial Data Collection and Normalization
The audio files were collected using crowdsourcing, see aiyprojects.withgoogle.com/open_speech_recording for some of the open source audio collection code that was used. The goal was to gather examples of people speaking single-word commands, rather than conversational sentences, so they were prompted for individual words over the course of a five minute session.
In version 0.01 thirty different words were recorded: "Yes", "No", "Up", "Down", "Left", "Right", "On", "Off", "Stop", "Go", "Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Bed", "Bird", "Cat", "Dog", "Happy", "House", "Marvin", "Sheila", "Tree", "Wow".
In version 0.02 more words were added: "Backward", "Forward", "Follow", "Learn", "Visual".
In both versions, ten of them are used as commands by convention: "Yes", "No", "Up", "Down", "Left",
"Right", "On", "Off", "Stop", "Go". Other words are considered to be auxiliary (in the current implementation
this is marked by a `True` value of the `"is_unknown"` feature). Their function is to teach a model to distinguish core words
from unrecognized ones.
The `_silence_` label contains a set of longer audio clips that are either recordings or
a mathematical simulation of noise.
Who are the source language producers?
The audio files were collected using crowdsourcing.
Annotations
Annotation process
Labels are the list of words prepared in advance. Speakers were prompted for individual words over the course of a five minute session.
Who are the annotators?
[More Information Needed]
Personal and Sensitive Information
The dataset consists of recordings of people who have donated their voice online. You agree to not attempt to determine the identity of speakers in this dataset.
Considerations for Using the Data
Social Impact of Dataset
[More Information Needed]
Discussion of Biases
[More Information Needed]
Other Known Limitations
[More Information Needed]
Additional Information
Dataset Curators
[More Information Needed]
Licensing Information
Creative Commons BY 4.0 License ([CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/legalcode)).
Citation Information
@article{speechcommandsv2,
author = { {Warden}, P.},
title = "{Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition}",
journal = {ArXiv e-prints},
archivePrefix = "arXiv",
eprint = {1804.03209},
primaryClass = "cs.CL",
keywords = {Computer Science - Computation and Language, Computer Science - Human-Computer Interaction},
year = 2018,
month = apr,
url = {https://arxiv.org/abs/1804.03209},
}
Contributions
Thanks to @polinaeterna for adding this dataset.
- Downloads last month
- 1,577