00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154 #include <stdio.h>
00155 #include <stdlib.h>
00156 #include <string.h>
00157 #include <assert.h>
00158 #include <math.h>
00159
00160 #ifdef HAVE_CONFIG_H
00161 #include <config.h>
00162 #endif
00163
00164 #ifdef _MSC_VER
00165 #pragma warning (disable: 4305)
00166 #endif
00167
00168 #include "prim_type.h"
00169 #include "ad.h"
00170 #include "cont_ad.h"
00171 #include "err.h"
00172
00173
00174 #ifndef _ABS
00175 #define _ABS(x) ((x) >= 0 ? (x) : -(x))
00176 #endif
00177
00178
00179
00180
00181 #define CONT_AD_ADFRMSIZE 256
00182
00183 #define CONT_AD_POWHISTSIZE 98
00184
00185
00186 #define CONT_AD_CALIB_FRAMES (CONT_AD_POWHISTSIZE * 2)
00187
00188 #define CONT_AD_THRESH_UPDATE 100
00189
00190
00191 #define CONT_AD_ADAPT_RATE 0.2
00192
00193 #define CONT_AD_SPS 16000
00194
00195 #define CONT_AD_DEFAULT_NOISE 30
00196 #define CONT_AD_DELTA_SIL 10
00197 #define CONT_AD_DELTA_SPEECH 17
00198 #define CONT_AD_MIN_NOISE 2
00199 #define CONT_AD_MAX_NOISE 70
00200
00201 #define CONT_AD_HIST_INERTIA 3
00202
00203 #define CONT_AD_WINSIZE 21
00204
00205
00206 #define CONT_AD_SPEECH_ONSET 9
00207
00208
00209
00210
00211
00212
00213 #define CONT_AD_SIL_ONSET 18
00214
00215
00216
00217
00218
00219
00220 #define CONT_AD_LEADER 5
00221
00222
00223
00224 #define CONT_AD_TRAILER 10
00225
00226
00227
00228
00229
00230 void
00231 cont_ad_powhist_dump(FILE * fp, cont_ad_t * r)
00232 {
00233 int32 i, j;
00234
00235 fprintf(fp, "PowHist:\n");
00236 for (i = 0, j = 0; i < CONT_AD_POWHISTSIZE; i++) {
00237 if (r->pow_hist[i] > 0) {
00238 fprintf(fp, "\t%3d %6d\n", i, r->pow_hist[i]);
00239 j = i;
00240 }
00241 }
00242
00243 fprintf(fp, "PH[%7.2f]:",
00244 (double) (r->tot_frm * r->spf) / (double) (r->sps));
00245 for (i = 0; i <= j; i++)
00246 fprintf(fp, " %2d", r->pow_hist[i]);
00247 fprintf(fp, "\n");
00248
00249 fflush(fp);
00250 }
00251
00252
00253
00254
00255
00256
00257 int32
00258 cont_ad_frame_pow(int16 * buf, int32 * prev, int32 spf)
00259 {
00260 double sumsq, v;
00261 int32 i;
00262 int32 p;
00263
00264 sumsq = 0.0;
00265 p = *prev;
00266 for (i = 0; i < spf; i++) {
00267
00268 v = (double) (buf[i] - p);
00269 sumsq += v * v;
00270 p = buf[i];
00271 }
00272 *prev = p;
00273
00274 if (sumsq < spf)
00275 sumsq = spf;
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298 i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5);
00299 if (i < 0)
00300 i = 0;
00301 assert(i < 97);
00302
00303 return (i);
00304 }
00305
00306
00307
00308
00309
00310
00311 static void
00312 compute_frame_pow(cont_ad_t * r, int32 frm)
00313 {
00314 int32 i;
00315
00316 i = cont_ad_frame_pow(r->adbuf + (frm * r->spf), &(r->prev_sample),
00317 r->spf);
00318
00319 r->frm_pow[frm] = (char) i;
00320 (r->pow_hist[i])++;
00321 r->thresh_update--;
00322 }
00323
00324
00325
00326
00327
00328
00329
00330 static void
00331 decay_hist(cont_ad_t * r)
00332 {
00333 int32 i;
00334
00335 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
00336 r->pow_hist[i] -= (r->pow_hist[i] >> CONT_AD_HIST_INERTIA);
00337 }
00338
00339
00340
00341
00342
00343 static int32
00344 find_thresh(cont_ad_t * r)
00345 {
00346 int32 i, j, max, th;
00347 int32 old_noise_level, old_thresh_sil, old_thresh_speech;
00348
00349 if (!r->auto_thresh)
00350 return 0;
00351
00352
00353
00354
00355
00356
00357 for (i = r->min_noise;
00358 (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++);
00359 if (i > r->max_noise)
00360 return -1;
00361
00362
00363
00364
00365
00366 max = 0;
00367 for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i + 20); j++) {
00368 if (max < r->pow_hist[j]) {
00369 max = r->pow_hist[j];
00370 th = j;
00371 }
00372 }
00373
00374
00375 old_noise_level = r->noise_level;
00376 old_thresh_sil = r->thresh_sil;
00377 old_thresh_speech = r->thresh_speech;
00378
00379 r->noise_level =
00380 (int32) (r->noise_level +
00381 r->adapt_rate * (th - r->noise_level) + 0.5);
00382
00383
00384 r->thresh_sil = r->noise_level + r->delta_sil;
00385 r->thresh_speech = r->noise_level + r->delta_speech;
00386
00387 if (r->logfp) {
00388 fprintf(r->logfp,
00389 "%7.2fs %8df: NoisePeak: %d, Noiselevel: %d -> %d, Th-Sil: %d -> %d, Th-Sp: %d -> %d\n",
00390 (double) (r->tot_frm * r->spf) / (double) (r->sps),
00391 r->tot_frm, th, old_noise_level, r->noise_level,
00392 old_thresh_sil, r->thresh_sil, old_thresh_speech,
00393 r->thresh_speech);
00394
00395 cont_ad_powhist_dump(r->logfp, r);
00396
00397 fflush(r->logfp);
00398 }
00399
00400
00401
00402
00403
00404
00405 return 0;
00406 }
00407
00408
00409
00410
00411
00412 static void
00413 sil2speech_transition(cont_ad_t *r, int frm)
00414 {
00415 spseg_t *seg;
00416
00417
00418 seg = malloc(sizeof(*seg));
00419
00420 seg->startfrm = r->win_startfrm - r->leader;
00421 if (seg->startfrm < 0)
00422 seg->startfrm += CONT_AD_ADFRMSIZE;
00423 seg->nfrm = r->leader + r->winsize;
00424 seg->next = NULL;
00425
00426 if (!r->spseg_head)
00427 r->spseg_head = seg;
00428 else
00429 r->spseg_tail->next = seg;
00430 r->spseg_tail = seg;
00431
00432 r->tail_state = CONT_AD_STATE_SPEECH;
00433
00434 if (r->logfp) {
00435 int32 n;
00436
00437
00438 n = frm - seg->startfrm;
00439 if (n < 0)
00440 n += CONT_AD_ADFRMSIZE;
00441 n = r->tot_frm - n - 1;
00442
00443 fprintf(r->logfp,
00444 "%7.2fs %8d[%3d]f: Sil -> Sp detect; seg start: %7.2fs %8d\n",
00445 (double) (r->tot_frm *
00446 r->spf) /
00447 (double) (r->sps),
00448 r->tot_frm, frm,
00449 (double) (n * r->spf) / (double) (r->sps), n);
00450 }
00451
00452
00453 r->win_validfrm = 1;
00454 r->win_startfrm = frm;
00455
00456
00457 r->n_other = (r->frm_pow[frm] <= r->thresh_sil) ? 1 : 0;
00458 }
00459
00460
00461
00462
00463 static void
00464 speech2sil_transition(cont_ad_t *r, int frm)
00465 {
00466 int f;
00467
00468
00469 r->spseg_tail->nfrm += r->trailer;
00470
00471 r->tail_state = CONT_AD_STATE_SIL;
00472
00473 if (r->logfp) {
00474 int32 n;
00475
00476
00477 n = r->spseg_tail->startfrm + r->spseg_tail->nfrm - 1;
00478 if (n >= CONT_AD_ADFRMSIZE)
00479 n -= CONT_AD_ADFRMSIZE;
00480 n = frm - n;
00481 if (n < 0)
00482 n += CONT_AD_ADFRMSIZE;
00483 n = r->tot_frm - n;
00484
00485 fprintf(r->logfp,
00486 "%7.2fs %8d[%3d]f: Sp -> Sil detect; seg end: %7.2fs %8d\n",
00487 (double) (r->tot_frm * r->spf) /
00488 (double) (r->sps), r->tot_frm, frm,
00489 (double) (n * r->spf) / (double) (r->sps), n);
00490 }
00491
00492
00493 r->win_validfrm -= (r->trailer + r->leader - 1);
00494 r->win_startfrm += (r->trailer + r->leader - 1);
00495 if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
00496 r->win_startfrm -= CONT_AD_ADFRMSIZE;
00497
00498
00499 r->n_other = 0;
00500 for (f = r->win_startfrm;;) {
00501 if (r->frm_pow[f] >= r->thresh_speech)
00502 r->n_other++;
00503
00504 if (f == frm)
00505 break;
00506
00507 f++;
00508 if (f >= CONT_AD_ADFRMSIZE)
00509 f = 0;
00510 }
00511 }
00512
00513
00514
00515
00516
00517
00518
00519
00520 static void
00521 boundary_detect(cont_ad_t * r, int32 frm)
00522 {
00523 assert(r->n_other >= 0);
00524
00525 r->win_validfrm++;
00526 if (r->tail_state == CONT_AD_STATE_SIL) {
00527 if (r->frm_pow[frm] >= r->thresh_speech)
00528 r->n_other++;
00529 }
00530 else {
00531 if (r->frm_pow[frm] <= r->thresh_sil)
00532 r->n_other++;
00533 }
00534
00535 if (r->logfp) {
00536 fprintf(r->logfp,
00537 "%7.2fs %8d[%3d]f: P: %2d, N: %2d, T+: %2d, T-: %2d, #O: %2d, %s\n",
00538 (double) (r->tot_frm * r->spf) / (double) (r->sps),
00539 r->tot_frm, frm, r->frm_pow[frm], r->noise_level,
00540 r->thresh_speech, r->thresh_sil, r->n_other,
00541 (r->tail_state == CONT_AD_STATE_SIL) ? "--" : "Sp");
00542 }
00543
00544 if (r->win_validfrm < r->winsize)
00545 return;
00546 assert(r->win_validfrm == r->winsize);
00547
00548 if (r->tail_state == CONT_AD_STATE_SIL) {
00549 if (r->n_frm >= r->winsize + r->leader
00550 && r->n_other >= r->speech_onset) {
00551 sil2speech_transition(r, frm);
00552 }
00553 }
00554 else {
00555 if (r->n_other >= r->sil_onset) {
00556 speech2sil_transition(r, frm);
00557 }
00558 else {
00559
00560 r->spseg_tail->nfrm++;
00561 }
00562 }
00563
00564
00565
00566
00567
00568
00569 if (r->tail_state == CONT_AD_STATE_SIL) {
00570 if (r->frm_pow[r->win_startfrm] >= r->thresh_speech) {
00571 if (r->n_other > 0)
00572 r->n_other--;
00573 }
00574 }
00575 else {
00576 if (r->frm_pow[r->win_startfrm] <= r->thresh_sil) {
00577 if (r->n_other > 0)
00578 r->n_other--;
00579 }
00580 }
00581 r->win_validfrm--;
00582 r->win_startfrm++;
00583 if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
00584 r->win_startfrm = 0;
00585
00586 if (r->logfp)
00587 fflush(r->logfp);
00588 }
00589
00590
00591 static int32
00592 max_siglvl(cont_ad_t * r, int32 startfrm, int32 nfrm)
00593 {
00594 int32 siglvl, i, f;
00595
00596 siglvl = 0;
00597 if (nfrm > 0) {
00598 for (i = 0, f = startfrm; i < nfrm; i++, f++) {
00599 if (f >= CONT_AD_ADFRMSIZE)
00600 f -= CONT_AD_ADFRMSIZE;
00601 if (r->frm_pow[f] > siglvl)
00602 siglvl = r->frm_pow[f];
00603 }
00604 }
00605 return siglvl;
00606 }
00607
00608
#if 0
/*
 * Unimplemented placeholder, excluded from compilation by the enclosing
 * "#if 0".
 */
void
get_audio_data(cont_ad_t * r, int16 * buf, int32 max)
{
}
#endif
00619
00620
00621 static void
00622 cont_ad_read_log(cont_ad_t * r, int32 retval)
00623 {
00624 spseg_t *seg;
00625
00626 fprintf(r->logfp, "return from cont_ad_read() -> %d:\n", retval);
00627 fprintf(r->logfp, "\tstate: %d\n", r->state);
00628 fprintf(r->logfp, "\tread_ts: %d (%.2fs)\n",
00629 r->read_ts, (float32) r->read_ts / (float32) r->sps);
00630 fprintf(r->logfp, "\tseglen: %d (%.2fs)\n",
00631 r->seglen, (float32) r->seglen / (float32) r->sps);
00632 fprintf(r->logfp, "\tsiglvl: %d\n", r->siglvl);
00633 fprintf(r->logfp, "\theadfrm: %d\n", r->headfrm);
00634 fprintf(r->logfp, "\tn_frm: %d\n", r->n_frm);
00635 fprintf(r->logfp, "\tn_sample: %d\n", r->n_sample);
00636 fprintf(r->logfp, "\twin_startfrm: %d\n", r->win_startfrm);
00637 fprintf(r->logfp, "\twin_validfrm: %d\n", r->win_validfrm);
00638 fprintf(r->logfp, "\tnoise_level: %d\n", r->noise_level);
00639 fprintf(r->logfp, "\tthresh_sil: %d\n", r->thresh_sil);
00640 fprintf(r->logfp, "\tthresh_speech: %d\n", r->thresh_speech);
00641 fprintf(r->logfp, "\tn_other: %d\n", r->n_other);
00642 fprintf(r->logfp, "\ttail_state: %d\n", r->tail_state);
00643 fprintf(r->logfp, "\ttot_frm: %d\n", r->tot_frm);
00644
00645 fprintf(r->logfp, "\tspseg:");
00646 for (seg = r->spseg_head; seg; seg = seg->next)
00647 fprintf(r->logfp, " %d[%d]", seg->startfrm, seg->nfrm);
00648 fprintf(r->logfp, "\n");
00649
00650 fflush(r->logfp);
00651 }
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662 static int32
00663 buf_copy(cont_ad_t * r, int32 sf, int32 nf, int16 * buf)
00664 {
00665 int32 f, l;
00666
00667 assert((sf >= 0) && (sf < CONT_AD_ADFRMSIZE));
00668 assert(nf >= 0);
00669
00670 if (sf + nf > CONT_AD_ADFRMSIZE) {
00671
00672 f = CONT_AD_ADFRMSIZE - sf;
00673 l = (f * r->spf);
00674 memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
00675
00676 if (r->logfp) {
00677 fprintf(r->logfp,
00678 "return %d speech frames [%d..%d]; %d samples\n",
00679 f, sf, sf + f - 1, l);
00680 }
00681
00682 buf += l;
00683 sf = 0;
00684 nf -= f;
00685 }
00686
00687 if (nf > 0) {
00688 l = (nf * r->spf);
00689 memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
00690
00691 if (r->logfp) {
00692 fprintf(r->logfp,
00693 "return %d speech frames [%d..%d]; %d samples\n",
00694 nf, sf, sf + nf - 1, l);
00695 }
00696 }
00697
00698 if ((sf + nf) >= CONT_AD_ADFRMSIZE) {
00699 assert((sf + nf) == CONT_AD_ADFRMSIZE);
00700 return 0;
00701 }
00702 else
00703 return (sf + nf);
00704 }
00705
00706 int32
00707 cont_ad_buffer_space(cont_ad_t *r)
00708 {
00709 return r->adbufsize - r->n_sample;
00710 }
00711
00712
00713
00714
00715 static int32
00716 cont_ad_read_internal(cont_ad_t *r, int16 *buf, int32 max)
00717 {
00718 int32 head, tail, len, l;
00719
00720
00721
00722
00723
00724
00725 head = r->headfrm * r->spf;
00726 tail = head + r->n_sample;
00727 len = r->n_sample - (r->n_frm * r->spf);
00728 assert((len >= 0) && (len < r->spf));
00729
00730 if ((tail < r->adbufsize) && (!r->eof)) {
00731 if (r->adfunc) {
00732 if ((l =
00733 (*(r->adfunc)) (r->ad, r->adbuf + tail,
00734 r->adbufsize - tail)) < 0) {
00735 r->eof = 1;
00736 l = 0;
00737 }
00738 }
00739 else {
00740 l = r->adbufsize - tail;
00741 if (l > max) {
00742 l = max;
00743 max = 0;
00744 }
00745 else {
00746 max -= l;
00747 }
00748 memcpy(r->adbuf + tail, buf, l * sizeof(int16));
00749 buf += l;
00750 }
00751 if ((l > 0) && r->rawfp) {
00752 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
00753 fflush(r->rawfp);
00754 }
00755
00756 tail += l;
00757 len += l;
00758 r->n_sample += l;
00759 }
00760 if ((tail >= r->adbufsize) && (!r->eof)) {
00761 tail -= r->adbufsize;
00762 if (tail < head) {
00763 if (r->adfunc) {
00764 if ((l =
00765 (*(r->adfunc)) (r->ad,
00766 r->adbuf + tail, head - tail)) < 0) {
00767 r->eof = 1;
00768 l = 0;
00769 }
00770 }
00771 else {
00772 l = head - tail;
00773 if (l > max)
00774 l = max;
00775 memcpy(r->adbuf + tail, buf, l * sizeof(int16));
00776 }
00777 if ((l > 0) && r->rawfp) {
00778 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
00779 fflush(r->rawfp);
00780 }
00781
00782 tail += l;
00783 len += l;
00784 r->n_sample += l;
00785 }
00786 }
00787
00788 return len;
00789 }
00790
00791
00792
00793
00794 int32
00795 cont_ad_classify(cont_ad_t *r, int32 len)
00796 {
00797 int32 tailfrm;
00798
00799 tailfrm = (r->headfrm + r->n_frm);
00800 if (tailfrm >= CONT_AD_ADFRMSIZE)
00801 tailfrm -= CONT_AD_ADFRMSIZE;
00802
00803 for (; len >= r->spf; len -= r->spf) {
00804 compute_frame_pow(r, tailfrm);
00805 r->n_frm++;
00806 r->tot_frm++;
00807
00808
00809
00810
00811
00812 boundary_detect(r, tailfrm);
00813
00814 if (++tailfrm >= CONT_AD_ADFRMSIZE)
00815 tailfrm = 0;
00816
00817
00818 if (r->thresh_update <= 0) {
00819 int32 i, f;
00820 find_thresh(r);
00821 decay_hist(r);
00822 r->thresh_update = CONT_AD_THRESH_UPDATE;
00823
00824 #if 1
00825
00826
00827
00828
00829 r->n_other = 0;
00830 if (r->tail_state == CONT_AD_STATE_SIL) {
00831 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
00832 if (r->frm_pow[f] >= r->thresh_speech)
00833 r->n_other++;
00834
00835 f++;
00836 if (f >= CONT_AD_ADFRMSIZE)
00837 f = 0;
00838 }
00839 }
00840 else {
00841 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
00842 if (r->frm_pow[f] <= r->thresh_sil)
00843 r->n_other++;
00844
00845 f++;
00846 if (f >= CONT_AD_ADFRMSIZE)
00847 f = 0;
00848 }
00849 }
00850 #endif
00851 }
00852 }
00853
00854 return r->tail_state;
00855 }
00856
00857
00858
00859
00860
00861
00862 int32
00863 cont_ad_read(cont_ad_t * r, int16 * buf, int32 max)
00864 {
00865 int32 flen, len, retval, newstate;
00866 spseg_t *seg;
00867
00868 if ((r == NULL) || (buf == NULL))
00869 return -1;
00870
00871 if (max < r->spf) {
00872 E_ERROR
00873 ("cont_ad_read requires buffer of at least %d samples\n",
00874 r->spf);
00875 return -1;
00876 }
00877
00878 if (r->logfp) {
00879 fprintf(r->logfp, "cont_ad_read(,, %d)\n", max);
00880 fflush(r->logfp);
00881 }
00882
00883
00884 len = cont_ad_read_internal(r, buf, max);
00885
00886
00887 cont_ad_classify(r, len);
00888
00889
00890
00891
00892 if (r->eof) {
00893 if (r->tail_state == CONT_AD_STATE_SPEECH) {
00894
00895
00896
00897
00898 assert(r->spseg_tail != NULL);
00899
00900
00901 assert((r->win_validfrm >= 0)
00902 && (r->win_validfrm < r->winsize));
00903 r->spseg_tail->nfrm += r->win_validfrm;
00904
00905 r->tail_state = CONT_AD_STATE_SIL;
00906 }
00907
00908 r->win_startfrm += r->win_validfrm;
00909 if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
00910 r->win_startfrm -= CONT_AD_ADFRMSIZE;
00911 r->win_validfrm = 0;
00912 r->n_other = 0;
00913 }
00914
00915
00916
00917
00918
00919
00920
00921 seg = r->spseg_head;
00922
00923 if ((seg == NULL) || (r->headfrm != seg->startfrm)) {
00924
00925
00926
00927
00928 if (seg == NULL) {
00929 assert(r->tail_state == CONT_AD_STATE_SIL);
00930
00931 flen =
00932 (r->eof) ? r->n_frm : r->n_frm - (r->winsize +
00933 r->leader - 1);
00934 if (flen < 0)
00935 flen = 0;
00936 }
00937 else {
00938 flen = seg->startfrm - r->headfrm;
00939 if (flen < 0)
00940 flen += CONT_AD_ADFRMSIZE;
00941 }
00942
00943 if (r->rawmode) {
00944
00945 int32 f = max / r->spf;
00946 if (flen > f)
00947 flen = f;
00948 }
00949
00950 newstate = CONT_AD_STATE_SIL;
00951 }
00952 else {
00953 flen = max / r->spf;
00954 if (flen > seg->nfrm)
00955 flen = seg->nfrm;
00956
00957 newstate = CONT_AD_STATE_SPEECH;
00958 }
00959
00960 len = flen * r->spf;
00961
00962 r->siglvl = max_siglvl(r, r->headfrm, flen);
00963
00964 if ((newstate == CONT_AD_STATE_SIL) && (!r->rawmode)) {
00965
00966 r->headfrm += flen;
00967 if (r->headfrm >= CONT_AD_ADFRMSIZE)
00968 r->headfrm -= CONT_AD_ADFRMSIZE;
00969
00970 retval = 0;
00971 }
00972 else {
00973
00974 r->headfrm = buf_copy(r, r->headfrm, flen, buf);
00975
00976 retval = len;
00977 }
00978
00979 r->n_frm -= flen;
00980 r->n_sample -= len;
00981 assert((r->n_frm >= 0) && (r->n_sample >= 0));
00982 assert(r->win_validfrm <= r->n_frm);
00983
00984 if (r->state == newstate)
00985 r->seglen += len;
00986 else
00987 r->seglen = len;
00988 r->state = newstate;
00989
00990 if (newstate == CONT_AD_STATE_SPEECH) {
00991 seg->startfrm = r->headfrm;
00992 assert(seg->startfrm >= 0);
00993 seg->nfrm -= flen;
00994
00995
00996 if ((seg->nfrm == 0)
00997 && (seg->next || (r->tail_state == CONT_AD_STATE_SIL))) {
00998 r->spseg_head = seg->next;
00999 if (seg->next == NULL)
01000 r->spseg_tail = NULL;
01001 free(seg);
01002 }
01003 }
01004
01005
01006 r->read_ts = (r->tot_frm - r->n_frm) * r->spf;
01007
01008 if (retval == 0)
01009 retval = (r->eof && (r->spseg_head == NULL)) ? -1 : 0;
01010
01011 if (r->logfp)
01012 cont_ad_read_log(r, retval);
01013
01014 return retval;
01015 }
01016
01017
01018
01019
01020
01021 int32
01022 cont_ad_calib(cont_ad_t * r)
01023 {
01024 int32 i, s, k, len, tailfrm;
01025
01026 if (r == NULL)
01027 return -1;
01028
01029
01030 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
01031 r->pow_hist[i] = 0;
01032 tailfrm = r->headfrm + r->n_frm;
01033 if (tailfrm >= CONT_AD_ADFRMSIZE)
01034 tailfrm -= CONT_AD_ADFRMSIZE;
01035 s = (tailfrm * r->spf);
01036
01037 for (r->n_calib_frame = 0;
01038 r->n_calib_frame < CONT_AD_CALIB_FRAMES;
01039 ++r->n_calib_frame) {
01040 len = r->spf;
01041 while (len > 0) {
01042
01043 if ((k = (*(r->adfunc)) (r->ad, r->adbuf + s, len)) < 0)
01044 return -1;
01045 len -= k;
01046 s += k;
01047 }
01048 s -= r->spf;
01049
01050 compute_frame_pow(r, tailfrm);
01051 }
01052
01053 r->thresh_update = CONT_AD_THRESH_UPDATE;
01054 return find_thresh(r);
01055 }
01056
01057 int32
01058 cont_ad_calib_size(cont_ad_t *r)
01059 {
01060 return r->spf * CONT_AD_CALIB_FRAMES;
01061 }
01062
01063 int32
01064 cont_ad_calib_loop(cont_ad_t * r, int16 * buf, int32 max)
01065 {
01066 int32 i, s, len, tailfrm;
01067
01068 if (r->n_calib_frame == CONT_AD_CALIB_FRAMES) {
01069
01070
01071 r->n_calib_frame = 0;
01072
01073 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
01074 r->pow_hist[i] = 0;
01075 }
01076
01077 tailfrm = r->headfrm + r->n_frm;
01078 if (tailfrm >= CONT_AD_ADFRMSIZE)
01079 tailfrm -= CONT_AD_ADFRMSIZE;
01080 s = (tailfrm * r->spf);
01081
01082 len = r->spf;
01083 for (; r->n_calib_frame < CONT_AD_CALIB_FRAMES;
01084 ++r->n_calib_frame) {
01085 if (max < len)
01086 return 1;
01087 memcpy(r->adbuf + s, buf, len * sizeof(int16));
01088 max -= len;
01089 buf += len;
01090 compute_frame_pow(r, tailfrm);
01091 }
01092
01093 r->thresh_update = CONT_AD_THRESH_UPDATE;
01094 return find_thresh(r);
01095 }
01096
01097
01098
01099 int32
01100 cont_ad_set_thresh(cont_ad_t * r, int32 sil, int32 speech)
01101 {
01102 if (r == NULL)
01103 return -1;
01104
01105 if ((sil < 0) || (speech < 0)) {
01106 fprintf(stderr,
01107 "cont_ad_set_thresh: invalid threshold arguments: %d, %d\n",
01108 sil, speech);
01109 return -1;
01110 }
01111 r->delta_sil = (3 * sil) / 2;
01112 r->delta_speech = (3 * speech) / 2;
01113
01114 return 0;
01115 }
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125 int32
01126 cont_ad_set_params(cont_ad_t * r, int32 delta_sil,
01127 int32 delta_speech, int32 min_noise,
01128 int32 max_noise, int32 winsize,
01129 int32 speech_onset, int32 sil_onset, int32 leader,
01130 int32 trailer, float32 adapt_rate)
01131 {
01132 if ((delta_sil < 0) || (delta_speech < 0) || (min_noise < 0)
01133 || (max_noise < 0)) {
01134 E_ERROR("threshold arguments: "
01135 "%d, %d, %d, %d must all be >=0\n", delta_sil,
01136 delta_speech, min_noise, max_noise);
01137 return -1;
01138 }
01139
01140 if ((speech_onset > winsize) || (speech_onset <= 0)
01141 || (winsize <= 0)) {
01142 E_ERROR
01143 ("speech_onset, %d, must be <= winsize, %d, and both >0\n",
01144 speech_onset, winsize);
01145 return -1;
01146 }
01147
01148 if ((sil_onset > winsize) || (sil_onset <= 0) || (winsize <= 0)) {
01149 E_ERROR
01150 ("sil_onset, %d, must be <= winsize, %d, and both >0\n",
01151 sil_onset, winsize);
01152 return -1;
01153 }
01154
01155 if (((leader + trailer) > winsize) || (leader <= 0)
01156 || (trailer <= 0)) {
01157 E_ERROR
01158 ("leader, %d, plus trailer, %d, must be <= winsize, %d, and both >0\n",
01159 leader, trailer, winsize);
01160 return -1;
01161 }
01162
01163 if ((adapt_rate < 0.0) || (adapt_rate > 1.0)) {
01164 E_ERROR("adapt_rate, %e; must be in range 0..1\n", adapt_rate);
01165 return -1;
01166 }
01167
01168 if (r == NULL)
01169 return -1;
01170
01171 r->delta_sil = delta_sil;
01172 r->delta_speech = delta_speech;
01173 r->min_noise = min_noise;
01174 r->max_noise = max_noise;
01175
01176 r->winsize = winsize;
01177 r->speech_onset = speech_onset;
01178 r->sil_onset = sil_onset;
01179 r->leader = leader;
01180 r->trailer = trailer;
01181
01182 r->adapt_rate = adapt_rate;
01183
01184 if (r->win_validfrm >= r->winsize)
01185 r->win_validfrm = r->winsize - 1;
01186
01187 return 0;
01188 }
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198 int32
01199 cont_ad_get_params(cont_ad_t * r, int32 * delta_sil,
01200 int32 * delta_speech, int32 * min_noise,
01201 int32 * max_noise, int32 * winsize,
01202 int32 * speech_onset, int32 * sil_onset,
01203 int32 * leader, int32 * trailer, float32 * adapt_rate)
01204 {
01205 if (!delta_sil || !delta_speech || !min_noise || !max_noise
01206 || !winsize || !speech_onset || !sil_onset || !leader
01207 || !trailer || !adapt_rate) {
01208 fprintf(stderr, "cont_ad_get_params: some param slots are NULL\n");
01209 return (-1);
01210 }
01211
01212 if (r == NULL)
01213 return -1;
01214
01215 *delta_sil = r->delta_sil;
01216 *delta_speech = r->delta_speech;
01217 *min_noise = r->min_noise;
01218 *max_noise = r->max_noise;
01219
01220 *winsize = r->winsize;
01221 *speech_onset = r->speech_onset;
01222 *sil_onset = r->sil_onset;
01223 *leader = r->leader;
01224 *trailer = r->trailer;
01225
01226 *adapt_rate = r->adapt_rate;
01227
01228 return 0;
01229 }
01230
01231
01232
01233
01234
01235 int32
01236 cont_ad_reset(cont_ad_t * r)
01237 {
01238 spseg_t *seg;
01239
01240 if (r == NULL)
01241 return -1;
01242
01243 while (r->spseg_head) {
01244 seg = r->spseg_head;
01245 r->spseg_head = seg->next;
01246 free(seg);
01247 }
01248 r->spseg_tail = NULL;
01249
01250 r->headfrm = 0;
01251 r->n_frm = 0;
01252 r->n_sample = 0;
01253 r->win_startfrm = 0;
01254 r->win_validfrm = 0;
01255 r->n_other = 0;
01256
01257 r->tail_state = CONT_AD_STATE_SIL;
01258
01259 return 0;
01260 }
01261
01262
01263 int32
01264 cont_ad_close(cont_ad_t * cont)
01265 {
01266 if (cont == NULL)
01267 return -1;
01268
01269 cont_ad_reset(cont);
01270
01271 free(cont->adbuf);
01272 free(cont->pow_hist);
01273 free(cont->frm_pow);
01274 free(cont);
01275
01276 return 0;
01277 }
01278
01279
01280 int32
01281 cont_ad_detach(cont_ad_t * c)
01282 {
01283 if (c == NULL)
01284 return -1;
01285
01286 c->ad = NULL;
01287 c->adfunc = NULL;
01288 return 0;
01289 }
01290
01291
01292 int32
01293 cont_ad_attach(cont_ad_t * c, ad_rec_t * a,
01294 int32(*func) (ad_rec_t *, int16 *, int32))
01295 {
01296 if (c == NULL)
01297 return -1;
01298
01299 c->ad = a;
01300 c->adfunc = func;
01301 c->eof = 0;
01302
01303 return 0;
01304 }
01305
01306
01307 int32
01308 cont_set_thresh(cont_ad_t * r, int32 silence, int32 speech)
01309 {
01310 int32 i, f;
01311
01312 r->thresh_speech = speech;
01313 r->thresh_sil = silence;
01314
01315
01316 r->n_other = 0;
01317 if (r->tail_state == CONT_AD_STATE_SIL) {
01318 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
01319 if (r->frm_pow[f] >= r->thresh_speech)
01320 r->n_other++;
01321
01322 f++;
01323 if (f >= CONT_AD_ADFRMSIZE)
01324 f = 0;
01325 }
01326 }
01327 else if (r->tail_state == CONT_AD_STATE_SPEECH) {
01328 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
01329 if (r->frm_pow[f] <= r->thresh_sil)
01330 r->n_other++;
01331
01332 f++;
01333 if (f >= CONT_AD_ADFRMSIZE)
01334 f = 0;
01335 }
01336 }
01337
01338 return 0;
01339 }
01340
01341
01342
01343
01344
01345 int32
01346 cont_ad_set_rawfp(cont_ad_t * r, FILE * fp)
01347 {
01348 if (r == NULL)
01349 return -1;
01350
01351 r->rawfp = fp;
01352 return 0;
01353 }
01354
01355
01356
01357
01358
01359 int32
01360 cont_ad_set_logfp(cont_ad_t * r, FILE * fp)
01361 {
01362 if (r == NULL)
01363 return -1;
01364
01365 r->logfp = fp;
01366 return 0;
01367 }
01368
01369
01370
01371
01372
01373 cont_ad_t *
01374 cont_ad_init(ad_rec_t * a, int32(*func) (ad_rec_t *, int16 *, int32))
01375 {
01376 cont_ad_t *r;
01377
01378 if ((r = malloc(sizeof(*r))) == NULL) {
01379 E_ERROR_SYSTEM("allocation of cont_ad_t failed");
01380 return NULL;
01381 }
01382
01383 r->ad = a;
01384 r->adfunc = func;
01385 r->eof = 0;
01386 r->rawmode = 0;
01387
01388 if (a != NULL)
01389 r->sps = a->sps;
01390 else
01391 r->sps = CONT_AD_SPS;
01392
01393
01394 r->spf = (r->sps * 256) / CONT_AD_SPS;
01395 r->adbufsize = CONT_AD_ADFRMSIZE * r->spf;
01396
01397 if ((r->adbuf = malloc(r->adbufsize * sizeof(*r->adbuf))) == NULL) {
01398 E_ERROR_SYSTEM("allocation of audio buffer failed");
01399 free(r);
01400 return NULL;
01401 }
01402 if ((r->pow_hist =
01403 calloc(CONT_AD_POWHISTSIZE, sizeof(*r->pow_hist))) == NULL) {
01404 E_ERROR_SYSTEM("allocation of power history buffer failed");
01405 free(r->adbuf);
01406 free(r);
01407 return NULL;
01408 }
01409 if ((r->frm_pow =
01410 calloc(CONT_AD_ADFRMSIZE, sizeof(*r->frm_pow))) == NULL) {
01411 E_ERROR_SYSTEM("allocation of frame power buffer failed");
01412 free(r->pow_hist);
01413 free(r->adbuf);
01414 free(r);
01415 return NULL;
01416 }
01417
01418 r->state = CONT_AD_STATE_SIL;
01419 r->read_ts = 0;
01420 r->seglen = 0;
01421 r->siglvl = 0;
01422 r->prev_sample = 0;
01423 r->tot_frm = 0;
01424 r->noise_level = CONT_AD_DEFAULT_NOISE;
01425
01426 r->auto_thresh = 1;
01427 r->delta_sil = CONT_AD_DELTA_SIL;
01428 r->delta_speech = CONT_AD_DELTA_SPEECH;
01429 r->min_noise = CONT_AD_MIN_NOISE;
01430 r->max_noise = CONT_AD_MAX_NOISE;
01431 r->winsize = CONT_AD_WINSIZE;
01432 r->speech_onset = CONT_AD_SPEECH_ONSET;
01433 r->sil_onset = CONT_AD_SIL_ONSET;
01434 r->leader = CONT_AD_LEADER;
01435 r->trailer = CONT_AD_TRAILER;
01436
01437 r->thresh_sil = r->noise_level + r->delta_sil;
01438 r->thresh_speech = r->noise_level + r->delta_speech;
01439 r->thresh_update = CONT_AD_THRESH_UPDATE;
01440 r->adapt_rate = CONT_AD_ADAPT_RATE;
01441
01442 r->tail_state = CONT_AD_STATE_SIL;
01443
01444 r->spseg_head = NULL;
01445 r->spseg_tail = NULL;
01446
01447 r->rawfp = NULL;
01448 r->logfp = NULL;
01449
01450 r->n_calib_frame = 0;
01451
01452 cont_ad_reset(r);
01453
01454 return r;
01455 }
01456
01457
01458 cont_ad_t *
01459 cont_ad_init_rawmode(ad_rec_t * a,
01460 int32(*func) (ad_rec_t *, int16 *, int32))
01461 {
01462 cont_ad_t *r;
01463
01464 r = cont_ad_init(a, func);
01465 r->rawmode = 1;
01466
01467 return r;
01468 }