00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include <stdio.h>
00038 #include <string.h>
00039 #include <math.h>
00040 #include <stdlib.h>
00041 #include <assert.h>
00042 #ifdef _WIN32_WCE
00043 #include <windows.h>
00044 #else
00045 #include <time.h>
00046 #endif
00047
00048 #ifdef HAVE_CONFIG_H
00049 #include <config.h>
00050 #endif
00051
00052 #include "prim_type.h"
00053 #include "byteorder.h"
00054 #include "fixpoint.h"
00055 #include "fe_internal.h"
00056 #include "genrand.h"
00057 #include "err.h"
00058 #include "cmd_ln.h"
00059 #include "ckd_alloc.h"
00060 #include "fe_warp.h"
00061
00062 static const arg_t fe_args[] = {
00063 waveform_to_cepstral_command_line_macro(),
00064 { NULL, 0, NULL, NULL }
00065 };
00066
00067 int
00068 fe_parse_general_params(cmd_ln_t *config, fe_t * fe)
00069 {
00070 int j;
00071
00072 fe->config = config;
00073 fe->sampling_rate = cmd_ln_float32_r(config, "-samprate");
00074 fe->frame_rate = (int16)cmd_ln_int32_r(config, "-frate");
00075 if (cmd_ln_boolean_r(config, "-dither")) {
00076 fe->dither = 1;
00077 fe->seed = cmd_ln_int32_r(config, "-seed");
00078 }
00079 #ifdef WORDS_BIGENDIAN
00080 fe->swap = strcmp("big", cmd_ln_str_r(config, "-input_endian")) == 0 ? 0 : 1;
00081 #else
00082 fe->swap = strcmp("little", cmd_ln_str_r(config, "-input_endian")) == 0 ? 0 : 1;
00083 #endif
00084 fe->window_length = cmd_ln_float32_r(config, "-wlen");
00085 fe->pre_emphasis_alpha = cmd_ln_float32_r(config, "-alpha");
00086
00087 fe->num_cepstra = (uint8)cmd_ln_int32_r(config, "-ncep");
00088 fe->fft_size = (int16)cmd_ln_int32_r(config, "-nfft");
00089
00090
00091 for (j = fe->fft_size, fe->fft_order = 0; j > 1; j >>= 1, fe->fft_order++) {
00092 if (((j % 2) != 0) || (fe->fft_size <= 0)) {
00093 E_ERROR("fft: number of points must be a power of 2 (is %d)\n",
00094 fe->fft_size);
00095 return -1;
00096 }
00097 }
00098
00099 if (fe->fft_size < (int)(fe->window_length * fe->sampling_rate)) {
00100 E_ERROR("FFT: Number of points must be greater or equal to frame size (%d samples)\n",
00101 (int)(fe->window_length * fe->sampling_rate));
00102 return -1;
00103 }
00104
00105 fe->remove_dc = cmd_ln_boolean_r(config, "-remove_dc");
00106
00107 if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "dct"))
00108 fe->transform = DCT_II;
00109 else if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "legacy"))
00110 fe->transform = LEGACY_DCT;
00111 else if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "htk"))
00112 fe->transform = DCT_HTK;
00113 else {
00114 E_ERROR("Invalid transform type (values are 'dct', 'legacy', 'htk')\n");
00115 return -1;
00116 }
00117
00118 if (cmd_ln_boolean_r(config, "-logspec"))
00119 fe->log_spec = RAW_LOG_SPEC;
00120 if (cmd_ln_boolean_r(config, "-smoothspec"))
00121 fe->log_spec = SMOOTH_LOG_SPEC;
00122
00123 return 0;
00124 }
00125
00126 static int
00127 fe_parse_melfb_params(cmd_ln_t *config, fe_t *fe, melfb_t * mel)
00128 {
00129 mel->sampling_rate = fe->sampling_rate;
00130 mel->fft_size = fe->fft_size;
00131 mel->num_cepstra = fe->num_cepstra;
00132 mel->num_filters = cmd_ln_int32_r(config, "-nfilt");
00133
00134 if (fe->log_spec)
00135 fe->feature_dimension = mel->num_filters;
00136 else
00137 fe->feature_dimension = fe->num_cepstra;
00138
00139 mel->upper_filt_freq = cmd_ln_float32_r(config, "-upperf");
00140 mel->lower_filt_freq = cmd_ln_float32_r(config, "-lowerf");
00141
00142 mel->doublewide = cmd_ln_boolean_r(config, "-doublebw");
00143
00144 mel->warp_type = cmd_ln_str_r(config, "-warp_type");
00145 mel->warp_params = cmd_ln_str_r(config, "-warp_params");
00146 mel->lifter_val = cmd_ln_int32_r(config, "-lifter");
00147
00148 mel->unit_area = cmd_ln_boolean_r(config, "-unit_area");
00149 mel->round_filters = cmd_ln_boolean_r(config, "-round_filters");
00150
00151 if (fe_warp_set(mel, mel->warp_type) != FE_SUCCESS) {
00152 E_ERROR("Failed to initialize the warping function.\n");
00153 return -1;
00154 }
00155 fe_warp_set_parameters(mel, mel->warp_params, mel->sampling_rate);
00156 return 0;
00157 }
00158
00159 void
00160 fe_print_current(fe_t const *fe)
00161 {
00162 E_INFO("Current FE Parameters:\n");
00163 E_INFO("\tSampling Rate: %f\n", fe->sampling_rate);
00164 E_INFO("\tFrame Size: %d\n", fe->frame_size);
00165 E_INFO("\tFrame Shift: %d\n", fe->frame_shift);
00166 E_INFO("\tFFT Size: %d\n", fe->fft_size);
00167 E_INFO("\tLower Frequency: %g\n",
00168 fe->mel_fb->lower_filt_freq);
00169 E_INFO("\tUpper Frequency: %g\n",
00170 fe->mel_fb->upper_filt_freq);
00171 E_INFO("\tNumber of filters: %d\n", fe->mel_fb->num_filters);
00172 E_INFO("\tNumber of Overflow Samps: %d\n", fe->num_overflow_samps);
00173 E_INFO("\tStart Utt Status: %d\n", fe->start_flag);
00174 E_INFO("Will %sremove DC offset at frame level\n",
00175 fe->remove_dc ? "" : "not ");
00176 if (fe->dither) {
00177 E_INFO("Will add dither to audio\n");
00178 E_INFO("Dither seeded with %d\n", fe->seed);
00179 }
00180 else {
00181 E_INFO("Will not add dither to audio\n");
00182 }
00183 if (fe->mel_fb->lifter_val) {
00184 E_INFO("Will apply sine-curve liftering, period %d\n",
00185 fe->mel_fb->lifter_val);
00186 }
00187 E_INFO("Will %snormalize filters to unit area\n",
00188 fe->mel_fb->unit_area ? "" : "not ");
00189 E_INFO("Will %sround filter frequencies to DFT points\n",
00190 fe->mel_fb->round_filters ? "" : "not ");
00191 E_INFO("Will %suse double bandwidth in mel filter\n",
00192 fe->mel_fb->doublewide ? "" : "not ");
00193 }
00194
00195 fe_t *
00196 fe_init_auto()
00197 {
00198 return fe_init_auto_r(cmd_ln_retain(cmd_ln_get()));
00199 }
00200
00201 fe_t *
00202 fe_init_auto_r(cmd_ln_t *config)
00203 {
00204 fe_t *fe;
00205
00206 fe = ckd_calloc(1, sizeof(*fe));
00207 fe->refcount = 1;
00208
00209
00210 if (fe_parse_general_params(config, fe) < 0) {
00211 fe_free(fe);
00212 return NULL;
00213 }
00214
00215
00216
00217
00218
00219 fe->frame_shift = (int32) (fe->sampling_rate / fe->frame_rate + 0.5);
00220 fe->frame_size = (int32) (fe->window_length * fe->sampling_rate + 0.5);
00221 fe->prior = 0;
00222 fe->frame_counter = 0;
00223
00224 if (fe->frame_size > (fe->fft_size)) {
00225 E_WARN
00226 ("Number of FFT points has to be a power of 2 higher than %d\n",
00227 (fe->frame_size));
00228 fe_free(fe);
00229 return (NULL);
00230 }
00231
00232 if (fe->dither)
00233 fe_init_dither(fe->seed);
00234
00235
00236 fe->overflow_samps = ckd_calloc(fe->frame_size, sizeof(int16));
00237 fe->hamming_window = ckd_calloc(fe->frame_size/2, sizeof(window_t));
00238
00239
00240 fe_create_hamming(fe->hamming_window, fe->frame_size);
00241
00242
00243 fe->mel_fb = ckd_calloc(1, sizeof(*fe->mel_fb));
00244
00245
00246 fe_parse_melfb_params(config, fe, fe->mel_fb);
00247 fe_build_melfilters(fe->mel_fb);
00248 fe_compute_melcosine(fe->mel_fb);
00249
00250
00251
00252 fe->spch = ckd_calloc(fe->frame_size, sizeof(*fe->spch));
00253 fe->frame = ckd_calloc(fe->fft_size, sizeof(*fe->frame));
00254 fe->spec = ckd_calloc(fe->fft_size, sizeof(*fe->spec));
00255 fe->mfspec = ckd_calloc(fe->mel_fb->num_filters, sizeof(*fe->mfspec));
00256
00257
00258 fe->ccc = ckd_calloc(fe->fft_size / 4, sizeof(*fe->ccc));
00259 fe->sss = ckd_calloc(fe->fft_size / 4, sizeof(*fe->sss));
00260 fe_create_twiddle(fe);
00261
00262 if (cmd_ln_boolean_r(config, "-verbose")) {
00263 fe_print_current(fe);
00264 }
00265
00266
00267
00268 fe_start_utt(fe);
00269 return fe;
00270 }
00271
00272 arg_t const *
00273 fe_get_args(void)
00274 {
00275 return fe_args;
00276 }
00277
00278 cmd_ln_t *
00279 fe_get_config(fe_t *fe)
00280 {
00281 return fe->config;
00282 }
00283
00284 void
00285 fe_init_dither(int32 seed)
00286 {
00287 if (seed < 0) {
00288 E_INFO("You are using the internal mechanism to generate the seed.\n");
00289 #ifdef _WIN32_WCE
00290 s3_rand_seed(GetTickCount());
00291 #else
00292 s3_rand_seed((long) time(0));
00293 #endif
00294 }
00295 else {
00296 E_INFO("You are using %d as the seed.\n", seed);
00297 s3_rand_seed(seed);
00298 }
00299 }
00300
00301 int32
00302 fe_start_utt(fe_t * fe)
00303 {
00304 fe->num_overflow_samps = 0;
00305 memset(fe->overflow_samps, 0, fe->frame_size * sizeof(int16));
00306 fe->start_flag = 1;
00307 fe->prior = 0;
00308 return 0;
00309 }
00310
00311 int
00312 fe_get_output_size(fe_t *fe)
00313 {
00314 return (int)fe->feature_dimension;
00315 }
00316
00317 void
00318 fe_get_input_size(fe_t *fe, int *out_frame_shift,
00319 int *out_frame_size)
00320 {
00321 if (out_frame_shift)
00322 *out_frame_shift = fe->frame_shift;
00323 if (out_frame_size)
00324 *out_frame_size = fe->frame_size;
00325 }
00326
00327 int32
00328 fe_process_frame(fe_t * fe, int16 const *spch, int32 nsamps, mfcc_t * fr_cep)
00329 {
00330 fe_read_frame(fe, spch, nsamps);
00331 return fe_write_frame(fe, fr_cep);
00332 }
00333
00334 int
00335 fe_process_frames(fe_t *fe,
00336 int16 const **inout_spch,
00337 size_t *inout_nsamps,
00338 mfcc_t **buf_cep,
00339 int32 *inout_nframes)
00340 {
00341 int32 frame_count;
00342 int outidx, i, n, n_overflow, orig_n_overflow;
00343 int16 const *orig_spch;
00344
00345
00346
00347 if (buf_cep == NULL) {
00348 if (*inout_nsamps + fe->num_overflow_samps < (size_t)fe->frame_size)
00349 *inout_nframes = 0;
00350 else
00351 *inout_nframes = 1
00352 + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
00353 / fe->frame_shift);
00354 return *inout_nframes;
00355 }
00356
00357
00358 if (*inout_nsamps + fe->num_overflow_samps < (size_t)fe->frame_size) {
00359 if (*inout_nsamps > 0) {
00360
00361 memcpy(fe->overflow_samps + fe->num_overflow_samps,
00362 *inout_spch, *inout_nsamps * (sizeof(int16)));
00363 fe->num_overflow_samps += *inout_nsamps;
00364
00365 *inout_spch += *inout_nsamps;
00366 *inout_nsamps = 0;
00367 }
00368
00369 *inout_nframes = 0;
00370 return 0;
00371 }
00372
00373
00374 if (*inout_nframes < 1) {
00375 *inout_nframes = 0;
00376 return 0;
00377 }
00378
00379
00380 orig_spch = *inout_spch;
00381 orig_n_overflow = fe->num_overflow_samps;
00382
00383 frame_count = 1
00384 + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
00385 / fe->frame_shift);
00386
00387 if (frame_count > *inout_nframes)
00388 frame_count = *inout_nframes;
00389
00390 outidx = 0;
00391
00392
00393 if (fe->num_overflow_samps) {
00394 int offset = fe->frame_size - fe->num_overflow_samps;
00395
00396
00397 memcpy(fe->overflow_samps + fe->num_overflow_samps,
00398 *inout_spch, offset * sizeof(**inout_spch));
00399 fe_read_frame(fe, fe->overflow_samps, fe->frame_size);
00400 assert(outidx < frame_count);
00401 if ((n = fe_write_frame(fe, buf_cep[outidx])) < 0)
00402 return -1;
00403 outidx += n;
00404
00405 *inout_spch += offset;
00406 *inout_nsamps -= offset;
00407 fe->num_overflow_samps -= fe->frame_shift;
00408 }
00409 else {
00410 fe_read_frame(fe, *inout_spch, fe->frame_size);
00411 assert(outidx < frame_count);
00412 if ((n = fe_write_frame(fe, buf_cep[outidx])) < 0)
00413 return -1;
00414 outidx += n;
00415
00416 *inout_spch += fe->frame_size;
00417 *inout_nsamps -= fe->frame_size;
00418 }
00419
00420
00421 for (i = 1; i < frame_count; ++i) {
00422 assert(*inout_nsamps >= (size_t)fe->frame_shift);
00423
00424 fe_shift_frame(fe, *inout_spch, fe->frame_shift);
00425 assert(outidx < frame_count);
00426 if ((n = fe_write_frame(fe, buf_cep[outidx])) < 0)
00427 return -1;
00428 outidx += n;
00429
00430 *inout_spch += fe->frame_shift;
00431 *inout_nsamps -= fe->frame_shift;
00432
00433 if (fe->num_overflow_samps > 0)
00434 fe->num_overflow_samps -= fe->frame_shift;
00435 }
00436
00437
00438 if (fe->num_overflow_samps <= 0) {
00439
00440 n_overflow = *inout_nsamps;
00441 if (n_overflow > fe->frame_shift)
00442 n_overflow = fe->frame_shift;
00443 fe->num_overflow_samps = fe->frame_size - fe->frame_shift;
00444
00445 if (fe->num_overflow_samps > *inout_spch - orig_spch)
00446 fe->num_overflow_samps = *inout_spch - orig_spch;
00447 fe->num_overflow_samps += n_overflow;
00448 if (fe->num_overflow_samps > 0) {
00449 memcpy(fe->overflow_samps,
00450 *inout_spch - (fe->frame_size - fe->frame_shift),
00451 fe->num_overflow_samps * sizeof(**inout_spch));
00452
00453 *inout_spch += n_overflow;
00454 *inout_nsamps -= n_overflow;
00455 }
00456 }
00457 else {
00458
00459
00460 memmove(fe->overflow_samps,
00461 fe->overflow_samps + orig_n_overflow - fe->num_overflow_samps,
00462 fe->num_overflow_samps * sizeof(*fe->overflow_samps));
00463
00464 n_overflow = *inout_spch - orig_spch + *inout_nsamps;
00465 if (n_overflow > fe->frame_size - fe->num_overflow_samps)
00466 n_overflow = fe->frame_size - fe->num_overflow_samps;
00467 memcpy(fe->overflow_samps + fe->num_overflow_samps,
00468 orig_spch, n_overflow * sizeof(*orig_spch));
00469 fe->num_overflow_samps += n_overflow;
00470
00471 if (n_overflow > *inout_spch - orig_spch) {
00472 n_overflow -= (*inout_spch - orig_spch);
00473 *inout_spch += n_overflow;
00474 *inout_nsamps -= n_overflow;
00475 }
00476 }
00477
00478
00479 *inout_nframes = outidx;
00480 return 0;
00481 }
00482
00483 int
00484 fe_process_utt(fe_t * fe, int16 const * spch, size_t nsamps,
00485 mfcc_t *** cep_block, int32 * nframes)
00486 {
00487 mfcc_t **cep;
00488 int rv;
00489
00490
00491 fe_process_frames(fe, NULL, &nsamps, NULL, nframes);
00492
00493 if (*nframes)
00494 cep = (mfcc_t **)ckd_calloc_2d(*nframes, fe->feature_dimension, sizeof(**cep));
00495 else
00496 cep = (mfcc_t **)ckd_calloc_2d(1, fe->feature_dimension, sizeof(**cep));
00497
00498 rv = fe_process_frames(fe, &spch, &nsamps, cep, nframes);
00499 *cep_block = cep;
00500
00501 return rv;
00502 }
00503
00504
00505 int32
00506 fe_end_utt(fe_t * fe, mfcc_t * cepvector, int32 * nframes)
00507 {
00508
00509 if (fe->num_overflow_samps > 0) {
00510 fe_read_frame(fe, fe->overflow_samps, fe->num_overflow_samps);
00511 *nframes = fe_write_frame(fe, cepvector);
00512 }
00513 else {
00514 *nframes = 0;
00515 }
00516
00517
00518 fe->num_overflow_samps = 0;
00519 fe->start_flag = 0;
00520
00521 return 0;
00522 }
00523
00524 fe_t *
00525 fe_retain(fe_t *fe)
00526 {
00527 ++fe->refcount;
00528 return fe;
00529 }
00530
00531 int
00532 fe_free(fe_t * fe)
00533 {
00534 if (fe == NULL)
00535 return 0;
00536 if (--fe->refcount > 0)
00537 return fe->refcount;
00538
00539
00540 if (fe->mel_fb) {
00541 if (fe->mel_fb->mel_cosine)
00542 fe_free_2d((void *) fe->mel_fb->mel_cosine);
00543 ckd_free(fe->mel_fb->lifter);
00544 ckd_free(fe->mel_fb->spec_start);
00545 ckd_free(fe->mel_fb->filt_start);
00546 ckd_free(fe->mel_fb->filt_width);
00547 ckd_free(fe->mel_fb->filt_coeffs);
00548 ckd_free(fe->mel_fb);
00549 }
00550 ckd_free(fe->spch);
00551 ckd_free(fe->frame);
00552 ckd_free(fe->ccc);
00553 ckd_free(fe->sss);
00554 ckd_free(fe->spec);
00555 ckd_free(fe->mfspec);
00556 ckd_free(fe->overflow_samps);
00557 ckd_free(fe->hamming_window);
00558 cmd_ln_free_r(fe->config);
00559 ckd_free(fe);
00560
00561 return 0;
00562 }
00563
00567 int32
00568 fe_mfcc_to_float(fe_t * fe,
00569 mfcc_t ** input, float32 ** output, int32 nframes)
00570 {
00571 int32 i;
00572
00573 #ifndef FIXED_POINT
00574 if ((void *) input == (void *) output)
00575 return nframes * fe->feature_dimension;
00576 #endif
00577 for (i = 0; i < nframes * fe->feature_dimension; ++i)
00578 output[0][i] = MFCC2FLOAT(input[0][i]);
00579
00580 return i;
00581 }
00582
00586 int32
00587 fe_float_to_mfcc(fe_t * fe,
00588 float32 ** input, mfcc_t ** output, int32 nframes)
00589 {
00590 int32 i;
00591
00592 #ifndef FIXED_POINT
00593 if ((void *) input == (void *) output)
00594 return nframes * fe->feature_dimension;
00595 #endif
00596 for (i = 0; i < nframes * fe->feature_dimension; ++i)
00597 output[0][i] = FLOAT2MFCC(input[0][i]);
00598
00599 return i;
00600 }
00601
00602 int32
00603 fe_logspec_to_mfcc(fe_t * fe, const mfcc_t * fr_spec, mfcc_t * fr_cep)
00604 {
00605 #ifdef FIXED_POINT
00606 fe_spec2cep(fe, fr_spec, fr_cep);
00607 #else
00608 powspec_t *powspec;
00609 int32 i;
00610
00611 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
00612 for (i = 0; i < fe->mel_fb->num_filters; ++i)
00613 powspec[i] = (powspec_t) fr_spec[i];
00614 fe_spec2cep(fe, powspec, fr_cep);
00615 ckd_free(powspec);
00616 #endif
00617 return 0;
00618 }
00619
00620 int32
00621 fe_logspec_dct2(fe_t * fe, const mfcc_t * fr_spec, mfcc_t * fr_cep)
00622 {
00623 #ifdef FIXED_POINT
00624 fe_dct2(fe, fr_spec, fr_cep, 0);
00625 #else
00626 powspec_t *powspec;
00627 int32 i;
00628
00629 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
00630 for (i = 0; i < fe->mel_fb->num_filters; ++i)
00631 powspec[i] = (powspec_t) fr_spec[i];
00632 fe_dct2(fe, powspec, fr_cep, 0);
00633 ckd_free(powspec);
00634 #endif
00635 return 0;
00636 }
00637
00638 int32
00639 fe_mfcc_dct3(fe_t * fe, const mfcc_t * fr_cep, mfcc_t * fr_spec)
00640 {
00641 #ifdef FIXED_POINT
00642 fe_dct3(fe, fr_cep, fr_spec);
00643 #else
00644 powspec_t *powspec;
00645 int32 i;
00646
00647 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
00648 fe_dct3(fe, fr_cep, powspec);
00649 for (i = 0; i < fe->mel_fb->num_filters; ++i)
00650 fr_spec[i] = (mfcc_t) powspec[i];
00651 ckd_free(powspec);
00652 #endif
00653 return 0;
00654 }