athena.data

data

Package Contents

Classes

SpeechRecognitionDatasetBuilder SpeechRecognitionDatasetBuilder
SpeechRecognitionDatasetKaldiIOBuilder SpeechRecognitionDatasetKaldiIOBuilder
SpeechSynthesisDatasetBuilder SpeechSynthesisDatasetBuilder
SpeechDatasetBuilder SpeechDatasetBuilder
SpeechDatasetKaldiIOBuilder SpeechDatasetKaldiIOBuilder
SpeakerRecognitionDatasetBuilder SpeakerRecognitionDatasetBuilder
SpeakerVerificationDatasetBuilder SpeakerVerificationDatasetBuilder
LanguageDatasetBuilder LanguageDatasetBuilder
FeatureNormalizer Feature Normalizer
TextFeaturizer The main text featurizer interface
SentencePieceFeaturizer TODO: docstring
class athena.data.SpeechRecognitionDatasetBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

SpeechRecognitionDatasetBuilder

default_config
num_class

@property

Returns:the max_index of the vocabulary + 1
Return type:int
speaker_list

@property

Returns:the speaker list
Return type:list
audio_featurizer_func

return the audio_featurizer function

sample_type

@property

Returns:sample_type of the dataset:
{
    "input": tf.float32,
    "input_length": tf.int32,
    "output_length": tf.int32,
    "output": tf.int32,
}
Return type:dict
sample_shape

@property

Returns:sample_shape of the dataset:
{
    "input": tf.TensorShape([None, dim, nc]),
    "input_length": tf.TensorShape([]),
    "output_length": tf.TensorShape([]),
    "output": tf.TensorShape([None]),
}
Return type:dict
sample_signature

@property

Returns:sample_signature of the dataset:
{
    "input": tf.TensorSpec(shape=(None, None, dim, nc), dtype=tf.float32),
    "input_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
}
Return type:dict
reload_config(self, config)

reload the config

preprocess_data(self, file_path)

generate a list of tuples (wav_filename, wav_length_ms, transcript, speaker).

load_csv(self, file_path)

load csv file

__getitem__(self, index)

get a sample

Parameters:index (int) – index of the entries
Returns:sample:
{
    "input": feat,
    "input_length": feat_length,
    "output_length": label_length,
    "output": label,
}
Return type:dict
__len__(self)

return the number of data samples

filter_sample_by_unk(self)

filter samples which contain unk

filter_sample_by_input_length(self)

filter samples by input length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, transcripts, speed, speaker)
Return type:entries
filter_sample_by_output_length(self)

filter samples by output length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, transcripts, speed, speaker)
Return type:entries
compute_cmvn_if_necessary(self, is_necessary=True)

compute cmvn file

class athena.data.SpeechRecognitionDatasetKaldiIOBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

SpeechRecognitionDatasetKaldiIOBuilder

default_config
num_class

return the max_index of the vocabulary + 1

speaker_list

return the speaker list

audio_featurizer_func

return the audio_featurizer function

sample_type
sample_shape
sample_signature
reload_config(self, config)

reload the config

preprocess_data(self, file_dir, apply_sort_filter=True)

Generate a list of tuples (feat_key, speaker).

load_scps(self, file_dir)

load kaldi-format feats.scp, labels.scp and utt2spk (optional)

__getitem__(self, index)
__len__(self)

return the number of data samples

filter_sample_by_unk(self)

filter samples which contain unk

filter_sample_by_input_length(self)

filter samples by input length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, transcripts, speed, speaker)
Return type:entries
filter_sample_by_output_length(self)

filter samples by output length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, transcripts, speed, speaker)
Return type:entries
compute_cmvn_if_necessary(self, is_necessary=True)

compute cmvn file

class athena.data.SpeechSynthesisDatasetBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

SpeechSynthesisDatasetBuilder

default_config
num_class

@property

Returns:the max_index of the vocabulary
Return type:int
speaker_list

return the speaker list

audio_featurizer_func

return the audio_featurizer function

feat_dim

return the number of feature dims

sample_type

@property

Returns:sample_type of the dataset:
{
    "input": tf.int32,
    "input_length": tf.int32,
    "output_length": tf.int32,
    "output": tf.float32,
    "speaker": tf.int32
}
Return type:dict
sample_shape

@property

Returns:sample_shape of the dataset:
{
    "input": tf.TensorShape([None]),
    "input_length": tf.TensorShape([]),
    "output_length": tf.TensorShape([]),
    "output": tf.TensorShape([None, feature_dim]),
    "speaker": tf.TensorShape([])
}
Return type:dict
sample_signature

@property

Returns:sample_signature of the dataset:
{
    "input": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    "input_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output": tf.TensorSpec(shape=(None, None, feature_dim),
                            dtype=tf.float32),
    "speaker": tf.TensorSpec(shape=(None), dtype=tf.int32)
}
Return type:dict
reload_config(self, config)

reload the config

preprocess_data(self, file_path)

generate a list of tuples (wav_filename, wav_length_ms, transcript, speaker).

load_csv(self, file_path)

load csv file

__getitem__(self, index)

get a sample

Parameters:index (int) – index of the entries
Returns:sample:
{
    "input": text,
    "input_length": text_length,
    "output_length": audio_feat_length,
    "output": audio_feat,
    "speaker": self.speakers_dict[speaker]
}
Return type:dict
__len__(self)

return the number of data samples

filter_sample_by_unk(self)

filter samples which contain unk

filter_sample_by_input_length(self)

filter samples by input length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, transcript, speaker)
Return type:entries
filter_sample_by_output_length(self)

filter samples by output length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, transcripts, speaker)
Return type:entries
compute_cmvn_if_necessary(self, is_necessary=True)

compute cmvn file

class athena.data.SpeechDatasetBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

SpeechDatasetBuilder

default_config
num_class

@property

Returns:the target dim
Return type:int
speaker_list

return the speaker list

audio_featurizer_func

return the audio_featurizer function

sample_type

@property

Returns:sample_type of the dataset:
{
    "input": tf.float32,
    "input_length": tf.int32,
    "output": tf.float32,
    "output_length": tf.int32,
}
Return type:dict
sample_shape

@property

Returns:sample_shape of the dataset:
{
    "input": tf.TensorShape(
    [None, self.audio_featurizer.dim, self.audio_featurizer.num_channels]
    ),
    "input_length": tf.TensorShape([]),
    "output": tf.TensorShape([None, None]),
    "output_length": tf.TensorShape([]),
}
Return type:dict
sample_signature

@property

Returns:sample_signature of the dataset:
{
    "input": tf.TensorSpec(
        shape=(None, None, None, None), dtype=tf.float32
    ),
    "input_length": tf.TensorSpec(shape=([None]), dtype=tf.int32),
    "output": tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
    "output_length": tf.TensorSpec(shape=([None]), dtype=tf.int32),
}
Return type:dict
reload_config(self, config)

reload the config

preprocess_data(self, file_path)

generate a list of tuples (wav_filename, wav_length_ms, speaker).

load_csv(self, file_path)

load csv file

__getitem__(self, index)

get a sample

Parameters:index (int) – index of the entries
Returns:sample:
{
    "input": input_data,
    "input_length": input_data.shape[0],
    "output": output_data,
    "output_length": output_data.shape[0],
}
Return type:dict
__len__(self)

return the number of data samples

filter_sample_by_input_length(self)

filter samples by input length

The length of filtered samples will be in [min_length, max_length)

Parameters:
  • self.hparams.input_length_range – the allowed range, given as [min_len, max_len]
  • min_len – the minimal length(ms)
  • max_len – the maximal length(ms)
Returns:

a filtered list of tuples (wav_filename, wav_len, speaker)

Return type:

entries

compute_cmvn_if_necessary(self, is_necessary=True)

compute cmvn file

class athena.data.SpeechDatasetKaldiIOBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

SpeechDatasetKaldiIOBuilder

default_config
num_class

return the max_index of the vocabulary

speaker_list

return the speaker list

audio_featurizer_func

return the audio_featurizer function

sample_type
sample_shape
sample_signature
reload_config(self, config)

reload the config

preprocess_data(self, file_dir, apply_sort_filter=True)

generate a list of tuples (feat_key, speaker).

load_scps(self, file_dir)

load kaldi-format feats.scp and utt2spk (optional)

__getitem__(self, index)
__len__(self)

return the number of data samples

filter_sample_by_input_length(self)

filter samples by input length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_len, speaker)
Return type:entries
compute_cmvn_if_necessary(self, is_necessary=True)

compute cmvn file

class athena.data.SpeakerRecognitionDatasetBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

SpeakerRecognitionDatasetBuilder

default_config
num_class

@property

Returns:the number of speakers
Return type:int
sample_type

@property

Returns:sample_type of the dataset:
{
    "input": tf.float32,
    "input_length": tf.int32,
    "output_length": tf.int32,
    "output": tf.int32
}
Return type:dict
sample_shape

@property

Returns:sample_shape of the dataset:
{
    "input": tf.TensorShape([None, dim, nc]),
    "input_length": tf.TensorShape([]),
    "output_length": tf.TensorShape([]),
    "output": tf.TensorShape([None])
}
Return type:dict
sample_signature

@property

Returns:sample_signature of the dataset:
{
    "input": tf.TensorSpec(shape=(None, None, dim, nc), dtype=tf.float32),
    "input_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
}
Return type:dict
reload_config(self, config)

reload the config

preprocess_data(self, data_csv_path)

generate a list of tuples (wav_filename, wav_length_ms, speaker_id, speaker_name).

cut_features(self, feature)

cut acoustic features

load_csv(self, data_csv_path)

load csv file

__getitem__(self, index)

get a sample

Parameters:index (int) – index of the entries
Returns:sample:
{
    "input": feat,
    "input_length": feat_length,
    "output_length": 1,
    "output": spkid
}
Return type:dict
__len__(self)

return the number of data samples

filter_sample_by_input_length(self)

filter samples by input length

The length of filtered samples will be in [min_length, max_length)

Returns:a filtered list of tuples (wav_filename, wav_length_ms, speaker_id, speaker_name)
Return type:entries
compute_cmvn_if_necessary(self, is_necessary=True)

compute cmvn file

class athena.data.SpeakerVerificationDatasetBuilder(config=None)

Bases: athena.data.datasets.speaker_recognition.SpeakerRecognitionDatasetBuilder

SpeakerVerificationDatasetBuilder

sample_type

@property

Returns:sample_type of the dataset:
{
    "input_a": tf.float32,
    "input_b": tf.float32,
    "output": tf.int32
}
Return type:dict
sample_shape

@property

Returns:sample_shape of the dataset:
{
    "input_a": tf.TensorShape([None, dim, nc]),
    "input_b": tf.TensorShape([None, dim, nc]),
    "output": tf.TensorShape([None])
}
Return type:dict
sample_signature

@property

Returns:sample_signature of the dataset:
{
    "input_a": tf.TensorSpec(shape=(None, None, dim, nc), dtype=tf.float32),
    "input_b": tf.TensorSpec(shape=(None, None, dim, nc), dtype=tf.float32),
    "output": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
}
Return type:dict
preprocess_data(self, data_csv_path)

generate a list of tuples (wav_filename_a, speaker_a, wav_filename_b, speaker_b, label).

__getitem__(self, index)

get a sample

Parameters:index (int) – index of the entries
Returns:sample:
{
    "input_a": feat_a,
    "input_b": feat_b,
    "output": [label]
}
Return type:dict
class athena.data.LanguageDatasetBuilder(config=None)

Bases: athena.data.datasets.base.BaseDatasetBuilder

LanguageDatasetBuilder

default_config
num_class

@property

Returns:the max_index of the vocabulary
Return type:int
input_vocab_size

@property

Returns:the input vocab size
Return type:int
sample_type

@property

Returns:sample_type of the dataset:
{
    "input": tf.int32,
    "input_length": tf.int32,
    "output": tf.int32,
    "output_length": tf.int32,
}
Return type:dict
sample_shape

@property

Returns:sample_shape of the dataset:
{
    "input": tf.TensorShape([None]),
    "input_length": tf.TensorShape([]),
    "output": tf.TensorShape([None]),
    "output_length": tf.TensorShape([]),
}
Return type:dict
sample_signature

@property

Returns:sample_signature of the dataset:
{
    "input": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    "input_length": tf.TensorSpec(shape=([None]), dtype=tf.int32),
    "output": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    "output_length": tf.TensorSpec(shape=([None]), dtype=tf.int32),
}
Return type:dict
load_csv(self, file_path)

load csv file

__getitem__(self, index)

get a sample

Parameters:index (int) – index of the entries
Returns:sample:
{
    "input": input_labels,
    "input_length": input_length,
    "output": output_labels,
    "output_length": output_length,
}
Return type:dict
__len__(self)

return the number of data samples

class athena.data.FeatureNormalizer(cmvn_file=None)

Feature Normalizer

__call__(self, feat_date, speaker, reverse=False)
apply_cmvn(self, feat_data, speaker, reverse=False)

TODO: docstring

compute_cmvn(self, entries, speakers, featurizer, feature_dim, num_cmvn_workers=1)

Compute cmvn for filtered entries

compute_cmvn_kaldiio(self, entries, speakers, kaldi_io_feats, feature_dim)

Compute cmvn for filtered entries using kaldi-format data

load_cmvn(self)

TODO: docstring

save_cmvn(self)

TODO: docstring

class athena.data.TextFeaturizer(config=None)

The main text featurizer interface

supported_model
default_config
model_type

the model type

unk_index

return the unk index

load_model(self, model_file)

load model

delete_punct(self, tokens)

delete punctuation tokens

__len__(self)
encode(self, texts)

Convert a sentence to a list of ids, with special tokens added.

decode(self, sequences)

Convert a list of ids to a sentence

class athena.data.SentencePieceFeaturizer(spm_file)

TODO: docstring

load_model(self, model_file)

load model

__len__(self)
encode(self, sentence)

Convert a sentence to a list of ids by sentence piece model

decode(self, ids)

Convert a list of ids to a sentence