-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathtinysr.h
168 lines (135 loc) · 4.7 KB
/
tinysr.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// TinySR
// Written by Peter Schmidt-Nielsen ([email protected]) starting in 2014.
// The entire project is placed in the public domain.
#ifndef TinySR_H
#define TinySR_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
// Analysis window constants.
// NOTE(review): presumably all three are sample counts at the recognizer's
// internal rate -- confirm against the implementation file.
// FFT_LENGTH is larger than FRAME_LENGTH, so presumably each frame is padded
// before being transformed -- TODO confirm.
#define FFT_LENGTH 512
#define FRAME_LENGTH 400
#define SHIFT_INTERVAL 160
// Type of the input samples taken by tinysr_feed_input() and
// tinysr_recognize(): signed 16-bit integers.
typedef int16_t samp_t;
// Utterance handling mode; stored in the utterance_mode field of tinysr_ctx_t.
// NOTE(review): the behavioural difference between the two modes is
// implemented outside this header -- confirm before relying on it.
typedef enum {
    TINYSR_MODE_ONE_SHOT = 0,
    TINYSR_MODE_FREE_RUNNING = 1
} tinysr_mode_t;
// Node of a generic doubly linked list: each node carries an untyped datum
// pointer plus links to both of its neighbours (prev and next).
// NOTE(review): whether the list code ever frees datum is not visible in this
// header -- confirm ownership in the implementation file.
typedef struct _list_node_t {
void* datum;
struct _list_node_t* prev;
struct _list_node_t* next;
} list_node_t;
// List header: a length field plus pointers to the head and tail nodes.
// Simply set each field to zero initially to make an empty list.
// Thus, list_t l = {0}; suffices for initialization.
typedef struct {
int length;
list_node_t* head;
list_node_t* tail;
} list_t;
// List operations; their implementations live outside this header.
// NOTE(review): presumably list_append_back() links datum in after the current
// tail, and list_pop_front() unlinks the head node and returns its datum --
// confirm, including what list_pop_front() returns for an empty list.
void list_append_back(list_t* list, void* datum);
void* list_pop_front(list_t* list);
// TinySR context, and associated functions.
// Holds the caller-visible configuration followed by the recognizer's private
// state. Get one with tinysr_allocate_context() and release it with
// tinysr_free_context().
typedef struct {
// Public configuration:
// The sample rate you are feeding the recognizer.
// It is safe to change this as frequently as you want.
int input_sample_rate;
// One of the tinysr_mode_t values.
tinysr_mode_t utterance_mode;
// NOTE(review): presumably a boolean flag that enables downmixing of the
// input -- confirm exact semantics in the implementation file.
int do_downmix;
// Private:
// NOTE(review): the groupings below are inferred from field names only --
// confirm each against the implementation file.
// Presumably resampling state.
int processed_samples;
float resampling_prev_raw_sample;
float resampling_time_delta;
// Presumably state of an offset-compensation filter.
float offset_comp_prev_in;
float offset_comp_prev_out;
// Presumably the sample buffer consumed by tinysr_process_frame(), plus scratch space.
float* input_buffer;
int input_buffer_next;
int input_buffer_samps;
float* temp_buffer;
// Feature vector list.
list_t fv_list;
long long next_fv_number;
list_node_t* current_fv;
list_node_t* utterance_start;
// Presumably utterance-detection state.
float noise_floor_estimate;
float excitement;
float boredom;
int utterance_state;
list_t utterance_list;
// Presumably the loaded vocabulary and the queue of pending results.
list_t recog_entry_list;
char** word_names;
list_t results_list;
} tinysr_ctx_t;
// One feature vector: a sequence number, a log-energy value, 13 cepstrum
// values and a noise-floor value.
// NOTE(review): presumably one of these is produced per processed frame and
// `number` is taken from the context's next_fv_number -- confirm.
typedef struct {
long long number;
float log_energy;
float cepstrum[13];
float noise_floor;
} feature_vector_t;
// An utterance: a pointer to feature vectors together with a length.
// NOTE(review): presumably `length` is the number of elements that
// `feature_vectors` points at -- confirm. Ownership of the array is not
// visible in this header.
typedef struct {
int length;
feature_vector_t* feature_vectors;
} utterance_t;
// Gaussian model over 13-element cepstrum vectors.
// For an input 13-column cepstrum vector x, with m = cepstrum_mean and
// P = cepstrum_inverse_covariance (a 13x13 matrix stored flat), the
// log-likelihood of x matching this model is:
//   log_likelihood_offset - 0.5 * (x - m)^T * P * (x - m)
// Covariance matrices are symmetric, so there is no row-major versus
// column-major ordering issue to worry about for P.
typedef struct {
    float log_likelihood_offset;
    float cepstrum_mean[13];
    float cepstrum_inverse_covariance[13 * 13];
} gaussian_t;
// One recognition entry.
// NOTE(review): field meanings are inferred from names only -- presumably
// model_template points at model_template_length gaussian_t elements, and
// ll_offset / ll_slope adjust the raw score -- confirm in the implementation.
typedef struct {
    int index;
    char* name;
    float ll_offset;
    float ll_slope;
    int model_template_length;
    gaussian_t* model_template;
} recog_entry_t;
// One recognition result: a word index paired with a score.
// NOTE(review): presumably these are the two values tinysr_get_result() hands
// back through its word_index and score pointers -- confirm.
typedef struct {
int word_index;
float score;
} result_t;
// === Public API ===
// Call to get/free a context.
// NOTE(review): what tinysr_allocate_context() returns on allocation failure,
// and the initial values of the public configuration fields, are not visible
// in this header -- confirm in the implementation file.
tinysr_ctx_t* tinysr_allocate_context(void);
void tinysr_free_context(tinysr_ctx_t* ctx);
// Convenience function call, equivalent to tinysr_feed_input(), but then runs
// tinysr_detect_utterances() and tinysr_recognize_utterances(), and then returns
// the number of pending recognition results.
// Pending results can then be fetched one at a time with tinysr_get_result().
// NOTE(review): length is presumably the number of samp_t elements in
// samples -- confirm.
int tinysr_recognize(tinysr_ctx_t* ctx, samp_t* samples, int length);
// Call to pass input samples.
// NOTE(review): length is presumably the number of samp_t elements in
// samples -- confirm.
void tinysr_feed_input(tinysr_ctx_t* ctx, samp_t* samples, int length);
// Call to trigger utterance detection.
void tinysr_detect_utterances(tinysr_ctx_t* ctx);
// Call to trigger recognition on detected utterances.
void tinysr_recognize_utterances(tinysr_ctx_t* ctx);
// Call to get one recognition result.
// Returns 1 if a result was gotten, 0 otherwise.
// It's safe to set either or both pointers to NULL.
// NOTE(review): presumably the result is written through whichever of
// word_index and score is non-NULL, and is removed from the pending results --
// confirm in the implementation file.
int tinysr_get_result(tinysr_ctx_t* ctx, int* word_index, float* score);
// Add some recognition entries.
// Call this to add a word to the vocabulary of the given context.
// path names the model file to load.
// NOTE(review): the meaning of the int return value (count loaded versus error
// code) is not documented in this header -- confirm before checking it.
int tinysr_load_model(tinysr_ctx_t* ctx, const char* path);
// Read and write CSV files containing an utterance.
// The write function returns non-zero on error, but doesn't print anything.
// NOTE(review): the read function's failure behaviour (presumably a NULL
// return) and who frees the returned utterance_t are not visible in this
// header -- confirm in the implementation file.
int write_feature_vector_csv(const char* path, utterance_t* utterance);
utterance_t* read_feature_vector_csv(const char* path);
// === Private functions ===
// Initiates a feature extraction run on the contents of input_buffer.
// This function is called automatically by tinysr_feed_input whenever the
// buffer is full, so you should never have to call it yourself.
void tinysr_process_frame(tinysr_ctx_t* ctx);
// NOTE(review): presumably runs recognition on a single utterance and queues
// the outcome on the context -- confirm.
void tinysr_recognize_utterance(tinysr_ctx_t* ctx, utterance_t* utterance);
// NOTE(review): presumably evaluates the log-likelihood formula documented on
// gaussian_t for the given feature vector -- confirm.
float gaussian_log_likelihood(gaussian_t* gauss, feature_vector_t* fv);
// NOTE(review): presumably scores utterance against match's model template;
// the direction of the score (higher or lower is better) is not visible
// here -- confirm.
float compute_dynamic_time_warping(recog_entry_t* match, utterance_t* utterance);
// These functions are used internally for the FFT computation.
// tinysr_fft_dit takes separate real and imaginary input and output arrays
// together with a length and a stride.
// NOTE(review): "dit" presumably means decimation-in-time, and tinysr_abs_fft
// presumably replaces array with the magnitude of its transform in place --
// confirm both, along with any constraints on length, in the implementation file.
void tinysr_fft_dit(float* in_real, float* in_imag, int length, int stride, float* out_real, float* out_imag);
void tinysr_abs_fft(float* array, int length);
#ifdef __cplusplus
}
#endif
#endif