• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/sphinx_adtools/sphinx_pitch.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2008 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cmd_ln.h>
#include <yin.h>
#include <ckd_alloc.h>
#include <byteorder.h>
#include <strfuncs.h>
#include <err.h>
#include <pio.h>
00048 
00049 #ifndef WORDS_BIGENDIAN
00050 #define WORDS_BIGENDIAN 0
00051 #endif
00052 
00053 static arg_t defn[] = {
00054   { "-i",
00055     ARG_STRING,
00056     NULL,
00057     "Single audio input file" },
00058 
00059   { "-o",
00060     ARG_STRING,
00061     NULL,
00062     "Single text output file (standard output will be used if not given)" },
00063   
00064   { "-c",
00065     ARG_STRING,
00066     NULL,
00067     "Control file for batch processing" },
00068   
00069   { "-nskip",
00070     ARG_INT32,
00071     "0",
00072     "If a control file was specified, the number of utterances to skip at the head of the file" },
00073   
00074   { "-runlen",
00075     ARG_INT32,
00076     "-1",
00077     "If a control file was specified, the number of utterances to process (see -nskip too)" },
00078   
00079   { "-di",
00080     ARG_STRING,
00081     NULL,
00082     "Input directory, input file names are relative to this, if defined" },
00083   
00084   { "-ei",
00085     ARG_STRING,
00086     NULL,
00087     "Input extension to be applied to all input files" },
00088   
00089   { "-do",
00090     ARG_STRING,
00091     NULL,
00092     "Output directory, output files are relative to this" },
00093   
00094   { "-eo",
00095     ARG_STRING,
00096     NULL,
00097     "Output extension to be applied to all output files" },
00098   
00099   { "-nist",
00100     ARG_BOOLEAN,
00101     "no",
00102     "Defines input format as NIST sphere" },
00103   
00104   { "-raw",
00105     ARG_BOOLEAN,
00106     "no",
00107     "Defines input format as raw binary data" },
00108   
00109   { "-mswav",
00110     ARG_BOOLEAN,
00111     "no",
00112     "Defines input format as Microsoft Wav (RIFF)" },
00113 
00114   { "-samprate",
00115     ARG_INT32,
00116     "0",
00117     "Sampling rate of audio data (will be determined automatically if 0)" },
00118 
00119   { "-input_endian",
00120     ARG_STRING,
00121     NULL,
00122     "Endianness of audio data (will be determined automatically if not given)" },
00123 
00124   { "-fshift",
00125     ARG_FLOAT32,
00126     "0.01",
00127     "Frame shift: number of seconds between each analysis frame." },
00128 
00129   { "-flen",
00130     ARG_FLOAT32,
00131     "0.025",
00132     "Number of seconds in each analysis frame (needs to be greater than twice the longest period you wish to detect - to detect down to 80Hz you need a frame length of 2.0/80 = 0.025)." },
00133 
00134   { "-smooth_window",
00135     ARG_INT32,
00136     "2",
00137     "Number of frames on either side of the current frame to use for smoothing." },
00138 
00139   { "-voice_thresh",
00140     ARG_FLOAT32,
00141     "0.1",
00142     "Threshold of normalized difference under which to search for the fundamental period." },
00143 
00144   { "-search_range",
00145     ARG_FLOAT32,
00146     "0.2",
00147     "Fraction of the best local estimate to use as a search range for smoothing." },
00148 
00149   { NULL, 0, NULL, NULL }
00150 };
00151 
00152 static int extract_pitch(const char *in, const char *out);
00153 static int run_control_file(const char *ctl);
00154 
00155 int
00156 main(int argc, char *argv[])
00157 {
00158     cmd_ln_parse(defn, argc, argv, TRUE);
00159 
00160     /* Run a control file if requested. */
00161     if (cmd_ln_str("-c")) {
00162         if (run_control_file(cmd_ln_str("-c")) < 0)
00163             return 1;
00164     }
00165     else {
00166         if (extract_pitch(cmd_ln_str("-i"), cmd_ln_str("-o")) < 0)
00167             return 1;
00168     }
00169 
00170     cmd_ln_free();
00171     return 0;
00172 }
00173 
00174 static int
00175 guess_file_type(char const *file, FILE *infh)
00176 {
00177     char header[4];
00178 
00179     fseek(infh, 0, SEEK_SET);
00180     if (fread(header, 1, 4, infh) != 4) {
00181         E_ERROR_SYSTEM("Failed to read 4 byte header");
00182         return -1;
00183     }
00184     if (0 == memcmp(header, "RIFF", 4)) {
00185         E_INFO("%s appears to be a WAV file\n", file);
00186         cmd_ln_set_boolean("-mswav", TRUE);
00187         cmd_ln_set_boolean("-nist", FALSE);
00188         cmd_ln_set_boolean("-raw", FALSE);
00189     }
00190     else if (0 == memcmp(header, "NIST", 4)) {
00191         E_INFO("%s appears to be a NIST SPHERE file\n", file);
00192         cmd_ln_set_boolean("-mswav", FALSE);
00193         cmd_ln_set_boolean("-nist", TRUE);
00194         cmd_ln_set_boolean("-raw", FALSE);
00195     }
00196     else {
00197         E_INFO("%s appears to be raw data\n", file);
00198         cmd_ln_set_boolean("-mswav", FALSE);
00199         cmd_ln_set_boolean("-nist", FALSE);
00200         cmd_ln_set_boolean("-raw", TRUE);
00201     }
00202     fseek(infh, 0, SEEK_SET);
00203     return 0;
00204 }
00205 
/*
 * Read nmemb items of size bytes each from stream into ptr.  On a short
 * read, log the failure and jump to the error_out label, which the calling
 * function must define.
 *
 * Wrapped in do { } while (0) so the macro behaves as one statement (safe
 * in an unbraced if/else); every argument is parenthesized.  size and nmemb
 * are expected to be small integer constants, so their product fits the %d
 * conversion.
 */
#define TRY_FREAD(ptr, size, nmemb, stream)                             \
    do {                                                                \
        if (fread((ptr), (size), (nmemb), (stream))                     \
            != (size_t)(nmemb)) {                                       \
            E_ERROR_SYSTEM("Failed to read %d bytes",                   \
                           (int)((size) * (nmemb)));                    \
            goto error_out;                                             \
        }                                                               \
    } while (0)
00211 
00212 static int
00213 read_riff_header(FILE *infh)
00214 {
00215     char id[4];
00216     int32 intval, header_len;
00217     int16 shortval;
00218 
00219     /* RIFF files are little-endian by definition. */
00220     cmd_ln_set_str("-input_endian", "little");
00221 
00222     /* Read in all the header chunks and etcetera. */
00223     TRY_FREAD(id, 1, 4, infh);
00224     /* Total file length (we don't care) */
00225     TRY_FREAD(&intval, 4, 1, infh);
00226     /* 'WAVE' */
00227     TRY_FREAD(id, 1, 4, infh);
00228     if (0 != memcmp(id, "WAVE", 4)) {
00229         E_ERROR("This is not a WAVE file\n");
00230         goto error_out;
00231     }
00232     /* 'fmt ' */
00233     TRY_FREAD(id, 1, 4, infh);
00234     if (0 != memcmp(id, "fmt ", 4)) {
00235         E_ERROR("Format chunk missing\n");
00236         goto error_out;
00237     }
00238     /* Length of 'fmt ' chunk */
00239     TRY_FREAD(&intval, 4, 1, infh);
00240     if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
00241     header_len = intval;
00242 
00243     /* Data format. */
00244     TRY_FREAD(&shortval, 2, 1, infh);
00245     if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
00246     if (shortval != 1) { /* PCM */
00247         E_ERROR("WAVE file is not in PCM format\n");
00248         goto error_out;
00249     }
00250 
00251     /* Number of channels. */
00252     TRY_FREAD(&shortval, 2, 1, infh);
00253     if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
00254     if (shortval != 1) { /* PCM */
00255         E_ERROR("WAVE file is not single channel\n");
00256         goto error_out;
00257     }
00258 
00259     /* Sampling rate (finally!) */
00260     TRY_FREAD(&intval, 4, 1, infh);
00261     if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
00262     if (cmd_ln_int32("-samprate") == 0)
00263         cmd_ln_set_int32("-samprate", intval);
00264     else if (cmd_ln_int32("-samprate") != intval) {
00265         E_WARN("WAVE file sampling rate %d != -samprate %d\n",
00266                intval, cmd_ln_int32("-samprate"));
00267     }
00268 
00269     /* Average bytes per second (we don't care) */
00270     TRY_FREAD(&intval, 4, 1, infh);
00271 
00272     /* Block alignment (we don't care) */
00273     TRY_FREAD(&shortval, 2, 1, infh);
00274 
00275     /* Bits per sample (must be 16) */
00276     TRY_FREAD(&shortval, 2, 1, infh);
00277     if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
00278     if (shortval != 16) {
00279         E_ERROR("WAVE file is not 16-bit\n");
00280         goto error_out;
00281     }
00282 
00283     /* Any extra parameters. */
00284     if (header_len > 16)
00285         fseek(infh, header_len - 16, SEEK_CUR);
00286 
00287     /* Now skip to the 'data' chunk. */
00288     while (1) {
00289         TRY_FREAD(id, 1, 4, infh);
00290         if (0 == memcmp(id, "data", 4)) {
00291             /* Total number of bytes of data (we don't care). */
00292             TRY_FREAD(&intval, 4, 1, infh);
00293             break;
00294         }
00295         else {
00296             /* Some other stuff... */
00297             /* Number of bytes of ... whatever */
00298             TRY_FREAD(&intval, 4, 1, infh);
00299             if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
00300             fseek(infh, intval, SEEK_CUR);
00301         }
00302     }
00303 
00304     /* We are ready to rumble. */
00305     return 0;
00306 error_out:
00307     return -1;
00308 }
00309 
00310 static int
00311 read_nist_header(FILE *infh)
00312 {
00313     char hdr[1024];
00314     char *line, *c;
00315 
00316     TRY_FREAD(hdr, 1, 1024, infh);
00317     hdr[1023] = '\0';
00318 
00319     /* Roughly parse it to find the sampling rate and byte order
00320      * (don't bother with other stuff) */
00321     if ((line = strstr(hdr, "sample_rate")) == NULL) {
00322         E_ERROR("No sampling rate in NIST header!\n");
00323         goto error_out;
00324     }
00325     c = strchr(line, '\n');
00326     if (c) *c = '\0';
00327     c = strrchr(line, ' ');
00328     if (c == NULL) {
00329         E_ERROR("Could not find sampling rate!\n");
00330         goto error_out;
00331     }
00332     ++c;
00333     if (cmd_ln_int32("-samprate") == 0)
00334         cmd_ln_set_int32("-samprate", atoi(c));
00335     else if (cmd_ln_int32("-samprate") != atoi(c)) {
00336         E_WARN("NIST file sampling rate %d != -samprate %d\n",
00337                atoi(c), cmd_ln_int32("-samprate"));
00338     }
00339 
00340     if (line + strlen(line) < hdr + 1023)
00341         line[strlen(line)] = ' ';
00342     if ((line = strstr(hdr, "sample_byte_format")) == NULL) {
00343         E_ERROR("No sample byte format in NIST header!\n");
00344         goto error_out;
00345     }
00346     c = strchr(line, '\n');
00347     if (c) *c = '\0';
00348     c = strrchr(line, ' ');
00349     if (c == NULL) {
00350         E_ERROR("Could not find sample byte order!\n");
00351         goto error_out;
00352     }
00353     ++c;
00354     if (0 == memcmp(c, "01", 2)) {
00355         cmd_ln_set_str("-input_endian", "little");
00356     }
00357     else if (0 == memcmp(c, "10", 2)) {
00358         cmd_ln_set_str("-input_endian", "big");
00359     }
00360     else {
00361         E_ERROR("Unknown byte order %s\n", c);
00362         goto error_out;
00363     }
00364 
00365     /* We are ready to rumble. */
00366     return 0;
00367 error_out:
00368     return -1;
00369 }
00370 
00371 static int
00372 extract_pitch(const char *in, const char *out)
00373 {
00374     FILE *infh = NULL, *outfh = NULL;
00375     size_t flen, fshift, nsamps;
00376     int16 *buf = NULL;
00377     yin_t *yin = NULL;
00378     uint16 period, bestdiff;
00379     int32 sps;
00380 
00381     if (out) {
00382         if ((outfh = fopen(out, "w")) == NULL) {
00383             E_ERROR_SYSTEM("Failed to open %s for writing", outfh);
00384             goto error_out;
00385         }
00386     }
00387     else {
00388         outfh = stdout;
00389     }
00390     if ((infh = fopen(in, "rb")) == NULL) {
00391         E_ERROR_SYSTEM("Failed to open %s for reading", infh);
00392         goto error_out;
00393     }
00394 
00395     /* If we weren't told what the file type is, weakly try to
00396      * determine it (actually it's pretty obvious) */
00397     if (!(cmd_ln_boolean("-raw")
00398           || cmd_ln_boolean("-mswav")
00399           || cmd_ln_boolean("-nist"))) {
00400         if (guess_file_type(in, infh) < 0)
00401             goto error_out;
00402     }
00403     
00404     /* Grab the sampling rate and byte order from the header and also
00405      * make sure this is 16-bit linear PCM. */
00406     if (cmd_ln_boolean("-mswav")) {
00407         if (read_riff_header(infh) < 0)
00408             goto error_out;
00409     }
00410     else if (cmd_ln_boolean("-nist")) {
00411         if (read_nist_header(infh) < 0)
00412             goto error_out;
00413     }
00414     else if (cmd_ln_boolean("-raw")) {
00415         /* Just use some defaults for sampling rate and endian. */
00416         if (cmd_ln_str("-input_endian") == NULL) {
00417             if (WORDS_BIGENDIAN)
00418                 cmd_ln_set_str("-input_endian", "big");
00419             else
00420                 cmd_ln_set_str("-input_endian", "little");
00421         }
00422         if (cmd_ln_int32("-samprate") == 0)
00423             cmd_ln_set_int32("-samprate", 16000);
00424     }
00425 
00426     /* Now read frames and write pitch estimates. */
00427     sps = cmd_ln_int32("-samprate");
00428     flen = (size_t)(0.5 + sps * cmd_ln_float32("-flen"));
00429     fshift = (size_t)(0.5 + sps * cmd_ln_float32("-fshift"));
00430     yin = yin_init(flen, cmd_ln_float32("-voice_thresh"),
00431                    cmd_ln_float32("-search_range"),
00432                    cmd_ln_int32("-smooth_window"));
00433     if (yin == NULL) {
00434         E_ERROR("Failed to initialize YIN\n");
00435         goto error_out;
00436     }
00437     buf = ckd_calloc(flen, sizeof(*buf));
00438     /* Read the first full frame of data. */
00439     if (fread(buf, sizeof(*buf), flen, infh) != flen) {
00440         /* Fail silently, which is probably okay. */
00441     }
00442     yin_start(yin);
00443     nsamps = 0;
00444     while (!feof(infh)) {
00445         /* Process a frame of data. */
00446         yin_write(yin, buf);
00447         if (yin_read(yin, &period, &bestdiff)) {
00448             fprintf(outfh, "%.3f %.2f %.2f\n",
00449                     /* Time point. */
00450                     (double)nsamps/sps,
00451                     /* "Probability" of voicing. */
00452                     bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
00453                     /* Pitch (possibly bogus) */
00454                     period == 0 ? sps : (double)sps / period);
00455             nsamps += fshift;
00456         }
00457         /* Shift it back and get the next frame's overlap. */
00458         memmove(buf, buf + fshift, (flen - fshift) * sizeof(*buf));
00459         if (fread(buf + flen - fshift, sizeof(*buf), fshift, infh) != fshift) {
00460             /* Fail silently (FIXME: really?) */
00461         }
00462     }
00463     yin_end(yin);
00464     /* Process trailing frames of data. */
00465     while (yin_read(yin, &period, &bestdiff)) {
00466             fprintf(outfh, "%.3f %.2f %.2f\n",
00467                     /* Time point. */
00468                     (double)nsamps/sps,
00469                     /* "Probability" of voicing. */
00470                     bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
00471                     /* Pitch (possibly bogus) */
00472                     period == 0 ? sps : (double)sps / period);
00473     }
00474 
00475     if (yin)
00476         yin_free(yin);
00477     ckd_free(buf);
00478     fclose(infh);
00479     if (outfh != stdout)
00480         fclose(outfh);
00481     return 0;
00482 
00483 error_out:
00484     yin_free(yin);
00485     ckd_free(buf);
00486     if (infh) fclose(infh);
00487     if (outfh && outfh != stdout) fclose(outfh);
00488     return -1;
00489 }
00490 
00491 static int
00492 run_control_file(const char *ctl)
00493 {
00494     FILE *ctlfh;
00495     char *line;
00496     char *di, *dout, *ei, *eio;
00497     size_t len;
00498     int rv, guess_type, guess_sps, guess_endian;
00499     int32 skip, runlen;
00500 
00501     skip = cmd_ln_int32("-nskip");
00502     runlen = cmd_ln_int32("-runlen");
00503 
00504     /* Whether to guess file types */
00505     guess_type = !(cmd_ln_boolean("-raw")
00506                    || cmd_ln_boolean("-mswav")
00507                    || cmd_ln_boolean("-nist"));
00508     /* Whether to guess sampling rate */
00509     guess_sps = (cmd_ln_int32("-samprate") == 0);
00510     /* Whether to guess endian */
00511     guess_endian = (cmd_ln_str("-input_endian") == NULL);
00512 
00513     if ((ctlfh = fopen(ctl, "r")) == NULL) {
00514         E_ERROR_SYSTEM("Failed to open control file %s", ctl);
00515         return -1;
00516     }
00517     if (cmd_ln_str("-di"))
00518         di = string_join(cmd_ln_str("-di"), "/", NULL);
00519     else
00520         di = ckd_salloc("");
00521     if (cmd_ln_str("-do"))
00522         dout = string_join(cmd_ln_str("-do"), "/", NULL);
00523     else
00524         dout = ckd_salloc("");
00525     if (cmd_ln_str("-ei"))
00526         ei = string_join(".", cmd_ln_str("-ei"), NULL);
00527     else
00528         ei = ckd_salloc("");
00529     if (cmd_ln_str("-eo"))
00530         eio = string_join(".", cmd_ln_str("-eo"), NULL);
00531     else
00532         eio = ckd_salloc("");
00533     rv = 0;
00534     while ((line = fread_line(ctlfh, &len)) != NULL) {
00535         char *infile, *outfile;
00536 
00537         if (skip-- > 0) {
00538             ckd_free(line);
00539             continue;
00540         }
00541         if (runlen == 0) {
00542             ckd_free(line);
00543             break;
00544         }
00545         --runlen;
00546 
00547         if (line[len-1] == '\n')
00548             line[len-1] = '\0';
00549 
00550         infile = string_join(di, line, ei, NULL);
00551         outfile = string_join(dout, line, eio, NULL);
00552 
00553         /* Reset various guessed information */
00554         if (guess_type) {
00555             cmd_ln_set_boolean("-nist", FALSE);
00556             cmd_ln_set_boolean("-mswav", FALSE);
00557             cmd_ln_set_boolean("-raw", FALSE);
00558         }
00559         if (guess_sps)
00560             cmd_ln_set_int32("-samprate", 0);
00561         if (guess_endian)
00562             cmd_ln_set_str("-input_endian", NULL);
00563 
00564         rv = extract_pitch(infile, outfile);
00565 
00566         ckd_free(infile);
00567         ckd_free(outfile);
00568         ckd_free(line);
00569 
00570         if (rv != 0)
00571             break;
00572     }
00573     ckd_free(di);
00574     ckd_free(dout);
00575     ckd_free(ei);
00576     ckd_free(eio);
00577     fclose(ctlfh);
00578     return rv;
00579 }

Generated on Tue Aug 17 2010 for SphinxBase by  doxygen 1.7.1