00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include <string.h>
00039
00040 #include "pio.h"
00041 #include "strfuncs.h"
00042 #include "dict.h"
00043
00044
00045 #define DELIM " \t\n"
00046 #define DEFAULT_NUM_PHONE (MAX_S3CIPID+1)
00047
00048 #if WIN32
00049 #define snprintf sprintf_s
00050 #endif
00051
00052 extern const char *const cmu6_lts_phone_table[];
00053
00054 static s3cipid_t
00055 dict_ciphone_id(dict_t * d, const char *str)
00056 {
00057 if (d->nocase)
00058 return bin_mdef_ciphone_id_nocase(d->mdef, str);
00059 else
00060 return bin_mdef_ciphone_id(d->mdef, str);
00061 }
00062
00063
00064 const char *
00065 dict_ciphone_str(dict_t * d, s3wid_t wid, int32 pos)
00066 {
00067 assert(d != NULL);
00068 assert((wid >= 0) && (wid < d->n_word));
00069 assert((pos >= 0) && (pos < d->word[wid].pronlen));
00070
00071 return bin_mdef_ciphone_str(d->mdef, d->word[wid].ciphone[pos]);
00072 }
00073
00074
00075 s3wid_t
00076 dict_add_word(dict_t * d, char const *word, s3cipid_t const * p, int32 np)
00077 {
00078 int32 len;
00079 dictword_t *wordp;
00080 s3wid_t newwid;
00081 char *wword;
00082
00083 if (d->n_word >= d->max_words) {
00084 E_INFO("Reallocating to %d KiB for word entries\n",
00085 (d->max_words + S3DICT_INC_SZ) * sizeof(dictword_t) / 1024);
00086 d->word =
00087 (dictword_t *) ckd_realloc(d->word,
00088 (d->max_words +
00089 S3DICT_INC_SZ) * sizeof(dictword_t));
00090 d->max_words = d->max_words + S3DICT_INC_SZ;
00091 return BAD_S3WID;
00092 }
00093
00094 wordp = d->word + d->n_word;
00095 wordp->word = (char *) ckd_salloc(word);
00096
00097
00098 if (hash_table_enter_int32(d->ht, wordp->word, d->n_word) != d->n_word) {
00099 ckd_free(wordp->word);
00100 wordp->word = NULL;
00101 return BAD_S3WID;
00102 }
00103
00104
00105 if (p && (np > 0)) {
00106 wordp->ciphone = (s3cipid_t *) ckd_malloc(np * sizeof(s3cipid_t));
00107 memcpy(wordp->ciphone, p, np * sizeof(s3cipid_t));
00108 wordp->pronlen = np;
00109 }
00110 else {
00111 wordp->ciphone = NULL;
00112 wordp->pronlen = 0;
00113 }
00114 wordp->alt = BAD_S3WID;
00115 wordp->basewid = d->n_word;
00116
00117
00118 wword = ckd_salloc(word);
00119 if ((len = dict_word2basestr(wword)) > 0) {
00120 int32 w;
00121
00122
00123 if (hash_table_lookup_int32(d->ht, wword, &w) < 0) {
00124 E_ERROR("Missing base word for: %s\n", word);
00125 ckd_free(wword);
00126 ckd_free(wordp->word);
00127 wordp->word = NULL;
00128 return BAD_S3WID;
00129 }
00130
00131
00132 wordp->basewid = w;
00133 wordp->alt = d->word[w].alt;
00134 d->word[w].alt = d->n_word;
00135 }
00136 ckd_free(wword);
00137
00138 newwid = d->n_word++;
00139
00140 return newwid;
00141 }
00142
00143
00144 static int32
00145 dict_read(FILE * fp, dict_t * d)
00146 {
00147 lineiter_t *li;
00148 char **wptr;
00149 s3cipid_t *p;
00150 int32 lineno, nwd;
00151 s3wid_t w;
00152 int32 i, maxwd;
00153 size_t stralloc, phnalloc;
00154
00155 maxwd = 512;
00156 p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p));
00157 wptr = (char **) ckd_calloc(maxwd, sizeof(char *));
00158
00159 lineno = 0;
00160 stralloc = phnalloc = 0;
00161 for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
00162 lineno++;
00163 if (0 == strncmp(li->buf, "##", 2)
00164 || 0 == strncmp(li->buf, ";;", 2))
00165 continue;
00166
00167 if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) {
00168
00169 nwd = str2words(li->buf, NULL, 0);
00170 assert(nwd > maxwd);
00171 maxwd = nwd;
00172 p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p));
00173 wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr));
00174 }
00175
00176 if (nwd == 0)
00177 continue;
00178
00179 if (nwd == 1) {
00180 E_ERROR("Line %d: No pronunciation for word %s; ignored\n",
00181 lineno, wptr[0]);
00182 continue;
00183 }
00184
00185
00186
00187 for (i = 1; i < nwd; i++) {
00188 p[i - 1] = dict_ciphone_id(d, wptr[i]);
00189 if (NOT_S3CIPID(p[i - 1])) {
00190 E_ERROR("Line %d: Bad ciphone: %s; word %s ignored\n",
00191 lineno, wptr[i], wptr[0]);
00192 break;
00193 }
00194 }
00195
00196 if (i == nwd) {
00197 w = dict_add_word(d, wptr[0], p, nwd - 1);
00198 if (NOT_S3WID(w))
00199 E_ERROR
00200 ("Line %d: dict_add_word (%s) failed (duplicate?); ignored\n",
00201 lineno, wptr[0]);
00202 stralloc += strlen(d->word[w].word);
00203 phnalloc += d->word[w].pronlen * sizeof(s3cipid_t);
00204 }
00205 }
00206 E_INFO("Allocated %d KiB for strings, %d KiB for phones\n",
00207 (int)stralloc / 1024, (int)phnalloc / 1024);
00208 ckd_free(p);
00209 ckd_free(wptr);
00210
00211 return 0;
00212 }
00213
00214 int
00215 dict_write(dict_t *dict, char const *filename, char const *format)
00216 {
00217 FILE *fh;
00218 int i;
00219
00220 if ((fh = fopen(filename, "w")) == NULL) {
00221 E_ERROR_SYSTEM("Failed to open %s", filename);
00222 return -1;
00223 }
00224 for (i = 0; i < dict->n_word; ++i) {
00225 char *phones;
00226 int j, phlen;
00227 if (!dict_real_word(dict, i))
00228 continue;
00229 for (phlen = j = 0; j < dict_pronlen(dict, i); ++j)
00230 phlen += strlen(dict_ciphone_str(dict, i, j)) + 1;
00231 phones = ckd_calloc(1, phlen);
00232 for (j = 0; j < dict_pronlen(dict, i); ++j) {
00233 strcat(phones, dict_ciphone_str(dict, i, j));
00234 if (j != dict_pronlen(dict, i) - 1)
00235 strcat(phones, " ");
00236 }
00237 fprintf(fh, "%-30s %s\n", dict_wordstr(dict, i), phones);
00238 ckd_free(phones);
00239 }
00240 fclose(fh);
00241 return 0;
00242 }
00243
00244
00245 dict_t *
00246 dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
00247 {
00248 FILE *fp, *fp2;
00249 int32 n;
00250 lineiter_t *li;
00251 dict_t *d;
00252 s3cipid_t sil;
00253 char const *dictfile = cmd_ln_str_r(config, "-dict");
00254 char const *fillerfile = cmd_ln_str_r(config, "-fdict");
00255
00256
00257
00258
00259
00260
00261 if ((fp = fopen(dictfile, "r")) == NULL)
00262 E_FATAL_SYSTEM("fopen(%s,r) failed\n", dictfile);
00263 n = 0;
00264 for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
00265 if (li->buf[0] != '#')
00266 n++;
00267 }
00268 rewind(fp);
00269
00270 fp2 = NULL;
00271 if (fillerfile) {
00272 if ((fp2 = fopen(fillerfile, "r")) == NULL)
00273 E_FATAL_SYSTEM("fopen(%s,r) failed\n", fillerfile);
00274
00275 for (li = lineiter_start(fp2); li; li = lineiter_next(li)) {
00276 if (li->buf[0] != '#')
00277 n++;
00278 }
00279 rewind(fp2);
00280 }
00281
00282
00283
00284
00285
00286 d = (dict_t *) ckd_calloc(1, sizeof(dict_t));
00287 d->refcnt = 1;
00288 d->max_words =
00289 (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
00290 if (n >= MAX_S3WID)
00291 E_FATAL("#Words in dictionaries (%d) exceeds limit (%d)\n", n,
00292 MAX_S3WID);
00293
00294 E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n",
00295 d->max_words, sizeof(dictword_t),
00296 d->max_words * sizeof(dictword_t) / 1024);
00297 d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t));
00298 d->n_word = 0;
00299 d->mdef = bin_mdef_retain(mdef);
00300
00301
00302 d->nocase = cmd_ln_boolean_r(config, "-dictcase");
00303 d->ht = hash_table_new(d->max_words, d->nocase);
00304
00305
00306 E_INFO("Reading main dictionary: %s\n", dictfile);
00307 dict_read(fp, d);
00308 fclose(fp);
00309 E_INFO("%d words read\n", d->n_word);
00310
00311
00312 d->filler_start = d->n_word;
00313 if (fillerfile) {
00314 E_INFO("Reading filler dictionary: %s\n", fillerfile);
00315 dict_read(fp2, d);
00316 fclose(fp2);
00317 E_INFO("%d words read\n", d->n_word - d->filler_start);
00318 }
00319 sil = bin_mdef_silphone(mdef);
00320 if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) {
00321 dict_add_word(d, S3_START_WORD, &sil, 1);
00322 }
00323 if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) {
00324 dict_add_word(d, S3_FINISH_WORD, &sil, 1);
00325 }
00326 if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) {
00327 dict_add_word(d, S3_SILENCE_WORD, &sil, 1);
00328 }
00329
00330 d->filler_end = d->n_word - 1;
00331
00332
00333 d->startwid = dict_wordid(d, S3_START_WORD);
00334 d->finishwid = dict_wordid(d, S3_FINISH_WORD);
00335 d->silwid = dict_wordid(d, S3_SILENCE_WORD);
00336
00337 if ((d->filler_start > d->filler_end)
00338 || (!dict_filler_word(d, d->silwid)))
00339 E_FATAL("%s must occur (only) in filler dictionary\n",
00340 S3_SILENCE_WORD);
00341
00342
00343
00344 return d;
00345 }
00346
00347
00348 s3wid_t
00349 dict_wordid(dict_t * d, const char *word)
00350 {
00351 int32 w;
00352
00353 assert(d);
00354 assert(word);
00355
00356 if (hash_table_lookup_int32(d->ht, word, &w) < 0)
00357 return (BAD_S3WID);
00358 return w;
00359 }
00360
00361
00362 int
00363 dict_filler_word(dict_t * d, s3wid_t w)
00364 {
00365 assert(d);
00366 assert((w >= 0) && (w < d->n_word));
00367
00368 w = dict_basewid(d, w);
00369 if ((w == d->startwid) || (w == d->finishwid))
00370 return 0;
00371 if ((w >= d->filler_start) && (w <= d->filler_end))
00372 return 1;
00373 return 0;
00374 }
00375
00376 int
00377 dict_real_word(dict_t * d, s3wid_t w)
00378 {
00379 assert(d);
00380 assert((w >= 0) && (w < d->n_word));
00381
00382 w = dict_basewid(d, w);
00383 if ((w == d->startwid) || (w == d->finishwid))
00384 return 0;
00385 if ((w >= d->filler_start) && (w <= d->filler_end))
00386 return 0;
00387 return 1;
00388 }
00389
00390
/**
 * Strip a trailing "(...)" suffix from a word, in place.
 *
 * If word ends in ')' and contains a '(' at an index greater than 0
 * before it, the string is cut at the last such '(' and the length of the
 * remaining base string is returned.  Otherwise the string is left
 * untouched.
 *
 * @param word NUL-terminated, writable word string.
 * @return Length of the base string (> 0) if a suffix was stripped,
 *         -1 otherwise (including for an empty string).
 */
int32
dict_word2basestr(char *word)
{
    int32 i, len;

    len = strlen(word);

    /* An empty string has no last character to inspect. */
    if (len == 0 || word[len - 1] != ')')
        return -1;

    /* Scan backwards for '('; index 0 is excluded so the base is non-empty. */
    for (i = len - 2; (i > 0) && (word[i] != '('); --i)
        ;

    if (i <= 0)
        return -1;

    /* The word has the form <base>(...): cut at the left parenthesis. */
    word[i] = '\0';
    return i;
}
00409
00410 dict_t *
00411 dict_retain(dict_t *d)
00412 {
00413 ++d->refcnt;
00414 return d;
00415 }
00416
00417 int
00418 dict_free(dict_t * d)
00419 {
00420 int i;
00421 dictword_t *word;
00422
00423 if (d == NULL)
00424 return 0;
00425 if (--d->refcnt > 0)
00426 return d->refcnt;
00427
00428
00429 for (i = 0; i < d->n_word; i++) {
00430 word = (dictword_t *) & (d->word[i]);
00431 if (word->word)
00432 ckd_free((void *) word->word);
00433 if (word->ciphone)
00434 ckd_free((void *) word->ciphone);
00435 }
00436
00437 if (d->word)
00438 ckd_free((void *) d->word);
00439 if (d->ht)
00440 hash_table_free(d->ht);
00441 bin_mdef_free(d->mdef);
00442 ckd_free((void *) d);
00443
00444 return 0;
00445 }
00446
00447 void
00448 dict_report(dict_t * d)
00449 {
00450 E_INFO_NOFN("Initialization of dict_t, report:\n");
00451 E_INFO_NOFN("Max word: %d\n", d->max_words);
00452 E_INFO_NOFN("No of word: %d\n", d->n_word);
00453 E_INFO_NOFN("\n");
00454 }