/*!
 * Function to read results from the ASR.
 *
 * \param ah     ASR handle; ah->private_info holds the pocketsphinx_t state.
 * \param xmlstr Receives a switch_mprintf()-built XML result document when a
 *               recognition result is pending; left untouched otherwise.
 * \param flags  Not referenced by this implementation.
 * \return SWITCH_STATUS_SUCCESS when a result was produced, or when nothing
 *         was pending and no barge-in was flagged; SWITCH_STATUS_BREAK when
 *         only a barge-in was flagged.
 */
static switch_status_t pocketsphinx_asr_get_results(switch_asr_handle_t *ah, char **xmlstr, switch_asr_flag_t *flags)
{
    pocketsphinx_t *ps = (pocketsphinx_t *) ah->private_info;
    switch_status_t status = SWITCH_STATUS_SUCCESS;
    int32_t log_prob;

    /* A pending barge-in is consumed here and reported as BREAK unless a
     * result is also available (the result path below wins). */
    if (switch_test_flag(ps, PSFLAG_BARGE)) {
        switch_clear_flag_locked(ps, PSFLAG_BARGE);
        status = SWITCH_STATUS_BREAK;
    }

    /* Nothing recognized yet: report whatever the barge check decided. */
    if (!switch_test_flag(ps, PSFLAG_HAS_TEXT)) {
        return status;
    }

    switch_mutex_lock(ps->flag_mutex);
    switch_clear_flag(ps, PSFLAG_HAS_TEXT);

    /* Rescale the decoder's probability score and clamp it at zero. */
    log_prob = ps_get_prob(ps->ps, &ps->uttid);
    ps->confidence = (log_prob + 20000) / 200;
    if (ps->confidence < 0) {
        ps->confidence = 0;
    }

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Recognized: %s, Confidence: %d\n", ps->hyp, ps->confidence);
    switch_mutex_unlock(ps->flag_mutex);

    *xmlstr = switch_mprintf("<?xml version=\"1.0\"?>\n"
                             "<result grammar=\"%s\">\n"
                             " <interpretation grammar=\"%s\" confidence=\"%d\">\n"
                             " <input mode=\"speech\">%s</input>\n"
                             " </interpretation>\n"
                             "</result>\n",
                             ps->grammar, ps->grammar, ps->confidence, ps->hyp);

    /* With auto-resume set, immediately open the next utterance. */
    if (switch_test_flag(ps, SWITCH_ASR_FLAG_AUTO_RESUME)) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Auto Resuming\n");
        switch_set_flag(ps, PSFLAG_READY);
        ps_start_utt(ps->ps, NULL);
    }

    return SWITCH_STATUS_SUCCESS;
}
/**
 * Decode the "goforward" test recording three ways and check every result
 * against an expected hypothesis.
 *
 * The recording is decoded (1) in one shot with ps_decode_raw(), (2) in
 * 2048-sample chunks with ps_process_raw(), and (3) as pre-computed cepstral
 * frames with ps_process_cep(). After the last pass the word segmentation and
 * timing statistics are printed.
 *
 * @param config   Decoder configuration handed to ps_init(); it is released
 *                 with cmd_ln_free_r() before this function returns.
 * @param sname    Label printed in front of every hypothesis line.
 * @param expected Hypothesis string each decoding pass must produce.
 * @return 0 when the end of the function is reached. Failed checks are
 *         handled by the TEST_ASSERT / TEST_EQUAL macros (defined elsewhere).
 */
int
ps_decoder_test(cmd_ln_t *config, char const *sname, char const *expected)
{
    ps_decoder_t *ps;
    mfcc_t **cepbuf;
    FILE *rawfh;
    int16 *buf;
    int16 const *bptr;
    size_t nread;
    size_t nsamps;
    long flen;
    int32 nfr, i, score, prob;
    char const *hyp;
    char const *uttid;
    double n_speech, n_cpu, n_wall;
    ps_seg_t *seg;

    TEST_ASSERT(ps = ps_init(config));

    /* Test it first with pocketsphinx_decode_raw() */
    TEST_ASSERT(rawfh = fopen(DATADIR "/goforward.raw", "rb"));
    ps_decode_raw(ps, rawfh, "goforward", -1);
    hyp = ps_get_hyp(ps, &score, &uttid);
    prob = ps_get_prob(ps, &uttid);
    /* A NULL hypothesis must fail the test instead of reaching printf/strcmp. */
    TEST_ASSERT(hyp);
    printf("%s (%s): %s (%d, %d)\n", sname, uttid, hyp, score, prob);
    TEST_EQUAL(0, strcmp(hyp, expected));
    TEST_ASSERT(prob <= 0);
    ps_get_utt_time(ps, &n_speech, &n_cpu, &n_wall);
    printf("%.2f seconds speech, %.2f seconds CPU, %.2f seconds wall\n",
           n_speech, n_cpu, n_wall);
    printf("%.2f xRT (CPU), %.2f xRT (elapsed)\n",
           n_cpu / n_speech, n_wall / n_speech);

    /* Test it with ps_process_raw() */
    clearerr(rawfh);
    fseek(rawfh, 0, SEEK_SET);
    TEST_EQUAL(0, ps_start_utt(ps, NULL));
    nsamps = 2048;
    buf = ckd_calloc(nsamps, sizeof(*buf));
    /* Also stop on a stream error: feof() alone never becomes true then. */
    while (!feof(rawfh) && !ferror(rawfh)) {
        nread = fread(buf, sizeof(*buf), nsamps, rawfh);
        ps_process_raw(ps, buf, nread, FALSE, FALSE);
    }
    TEST_EQUAL(0, ferror(rawfh));
    TEST_EQUAL(0, ps_end_utt(ps));
    hyp = ps_get_hyp(ps, &score, &uttid);
    prob = ps_get_prob(ps, &uttid);
    TEST_ASSERT(hyp);
    printf("%s (%s): %s (%d, %d)\n", sname, uttid, hyp, score, prob);
    TEST_EQUAL(0, strcmp(uttid, "000000000"));
    TEST_EQUAL(0, strcmp(hyp, expected));
    ps_get_utt_time(ps, &n_speech, &n_cpu, &n_wall);
    printf("%.2f seconds speech, %.2f seconds CPU, %.2f seconds wall\n",
           n_speech, n_cpu, n_wall);
    printf("%.2f xRT (CPU), %.2f xRT (elapsed)\n",
           n_cpu / n_speech, n_wall / n_speech);

    /* Now read the whole file and produce an MFCC buffer. */
    clearerr(rawfh);
    fseek(rawfh, 0, SEEK_END);
    flen = ftell(rawfh);
    /* ftell() reports failure as -1, which must not be turned into a size. */
    TEST_ASSERT(flen >= 0);
    nsamps = (size_t) flen / sizeof(*buf);
    fseek(rawfh, 0, SEEK_SET);
    bptr = buf = ckd_realloc(buf, nsamps * sizeof(*buf));
    TEST_EQUAL(nsamps, fread(buf, sizeof(*buf), nsamps, rawfh));
    /* First call passes no output buffer; nfr is then used to size cepbuf. */
    fe_process_frames(ps->acmod->fe, &bptr, &nsamps, NULL, &nfr);
    cepbuf = ckd_calloc_2d(nfr + 1, fe_get_output_size(ps->acmod->fe),
                           sizeof(**cepbuf));
    fe_start_utt(ps->acmod->fe);
    fe_process_frames(ps->acmod->fe, &bptr, &nsamps, cepbuf, &nfr);
    fe_end_utt(ps->acmod->fe, cepbuf[nfr], &i);

    /* Decode it with process_cep() */
    TEST_EQUAL(0, ps_start_utt(ps, NULL));
    for (i = 0; i < nfr; ++i) {
        ps_process_cep(ps, cepbuf + i, 1, FALSE, FALSE);
    }
    TEST_EQUAL(0, ps_end_utt(ps));
    hyp = ps_get_hyp(ps, &score, &uttid);
    prob = ps_get_prob(ps, &uttid);
    TEST_ASSERT(hyp);
    printf("%s (%s): %s (%d, %d)\n", sname, uttid, hyp, score, prob);
    TEST_EQUAL(0, strcmp(uttid, "000000001"));
    TEST_EQUAL(0, strcmp(hyp, expected));
    TEST_ASSERT(prob <= 0);

    /* Print the word segmentation of the last pass. */
    for (seg = ps_seg_iter(ps, &score); seg; seg = ps_seg_next(seg)) {
        char const *word;
        int sf, ef;
        int32 post, lscr, ascr, lback;

        word = ps_seg_word(seg);
        ps_seg_frames(seg, &sf, &ef);
        post = ps_seg_prob(seg, &ascr, &lscr, &lback);
        printf("%s (%d:%d) P(w|o) = %f ascr = %d lscr = %d lback = %d\n",
               word, sf, ef, logmath_exp(ps_get_logmath(ps), post),
               ascr, lscr, lback);
        TEST_ASSERT(post <= 2); // Due to numerical errors with float it sometimes could go out of 0
    }

    ps_get_utt_time(ps, &n_speech, &n_cpu, &n_wall);
    printf("%.2f seconds speech, %.2f seconds CPU, %.2f seconds wall\n",
           n_speech, n_cpu, n_wall);
    printf("%.2f xRT (CPU), %.2f xRT (elapsed)\n",
           n_cpu / n_speech, n_wall / n_speech);
    ps_get_all_time(ps, &n_speech, &n_cpu, &n_wall);
    printf("TOTAL: %.2f seconds speech, %.2f seconds CPU, %.2f seconds wall\n",
           n_speech, n_cpu, n_wall);
    printf("TOTAL: %.2f xRT (CPU), %.2f xRT (elapsed)\n",
           n_cpu / n_speech, n_wall / n_speech);

    fclose(rawfh);
    ps_free(ps);
    cmd_ln_free_r(config);
    ckd_free_2d(cepbuf);
    ckd_free(buf);
    return 0;
}
/*!
 * Function to feed audio to the ASR.
 *
 * \param ah    ASR handle; ah->private_info holds the pocketsphinx_t state and
 *              ah->memory_pool is used to copy an accepted hypothesis.
 * \param data  Audio buffer; it is passed on as 16-bit samples.
 * \param len   Size of data in bytes (len / 2 samples are consumed).
 * \param flags Not referenced by this implementation.
 * \return SWITCH_STATUS_BREAK if the handle is flagged closed,
 *         SWITCH_STATUS_FALSE if ps_process_raw() reports a negative result,
 *         SWITCH_STATUS_SUCCESS otherwise.
 */
static switch_status_t pocketsphinx_asr_feed(switch_asr_handle_t *ah, void *data, unsigned int len, switch_asr_flag_t *flags)
{
    pocketsphinx_t *ps = (pocketsphinx_t *) ah->private_info;
    const unsigned int sample_count = len / 2;
    int accepting;
    int rv = 0;

    if (switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED)) {
        return SWITCH_STATUS_BREAK;
    }

    /* Audio is only consumed while ready and no outcome is pending. */
    accepting = !switch_test_flag(ps, PSFLAG_NOMATCH)
        && !switch_test_flag(ps, PSFLAG_NOINPUT)
        && !switch_test_flag(ps, PSFLAG_HAS_TEXT)
        && switch_test_flag(ps, PSFLAG_READY);

    if (!accepting) {
        if (switch_test_flag(ps, PSFLAG_NOINPUT_TIMEOUT)) {
            /* never heard anything */
            switch_clear_flag_locked(ps, PSFLAG_READY);
        }
        return SWITCH_STATUS_SUCCESS;
    }

    if (stop_detect(ps, (int16_t *) data, sample_count)) {
        char const *text;

        switch_mutex_lock(ps->flag_mutex);

        text = ps_get_hyp(ps->ps, &ps->score, &ps->uttid);
        if (text != NULL && !zstr(text)) {
            /* A partial hypothesis exists: close the utterance, then ask again. */
            ps_end_utt(ps->ps);
            switch_clear_flag(ps, PSFLAG_READY);

            text = ps_get_hyp(ps->ps, &ps->score, &ps->uttid);
            if (text != NULL && zstr(text)) {
                if (!switch_test_flag(ps, PSFLAG_SPEECH_TIMEOUT)) {
                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Lost the text, never mind....\n");
                    ps_start_utt(ps->ps, NULL);
                    switch_set_flag(ps, PSFLAG_READY);
                }
            } else if (text != NULL) {
                /* get match and confidence */
                int32_t log_prob = ps_get_prob(ps->ps, &ps->uttid);

                ps->confidence = (log_prob + 20000) / 200;
                if (ps->confidence < 0) {
                    ps->confidence = 0;
                }

                if (ps->confidence_threshold <= 0 || ps->confidence >= ps->confidence_threshold) {
                    ps->hyp = switch_core_strdup(ah->memory_pool, text);
                    switch_set_flag(ps, PSFLAG_HAS_TEXT);
                } else {
                    /* have match, but below confidence threshold */
                    switch_set_flag(ps, PSFLAG_NOMATCH);
                }
            }
        }

        if (switch_test_flag(ps, PSFLAG_SPEECH_TIMEOUT) && !switch_test_flag(ps, PSFLAG_HAS_TEXT)) {
            /* heard something, but doesn't match anything */
            switch_clear_flag(ps, PSFLAG_READY);
            switch_set_flag(ps, PSFLAG_NOMATCH);
        }

        switch_mutex_unlock(ps->flag_mutex);
    }

    /* only feed ps_process_raw when we are listening */
    if (ps->listening) {
        switch_mutex_lock(ps->flag_mutex);
        rv = ps_process_raw(ps->ps, (int16 *) data, sample_count, FALSE, FALSE);
        switch_mutex_unlock(ps->flag_mutex);
    }

    return (rv < 0) ? SWITCH_STATUS_FALSE : SWITCH_STATUS_SUCCESS;
}
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; acmod_t *acmod; fsg_search_t *fsgs; jsgf_t *jsgf; jsgf_rule_t *rule; fsg_model_t *fsg; ps_seg_t *seg; ps_lattice_t *dag; FILE *rawfh; char const *hyp, *uttid; int32 score, prob; clock_t c; int i; TEST_ASSERT(config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-dict", MODELDIR "/lm/en/turtle.dic", "-input_endian", "little", "-samprate", "16000", NULL)); TEST_ASSERT(ps = ps_init(config)); jsgf = jsgf_parse_file(DATADIR "/goforward.gram", NULL); TEST_ASSERT(jsgf); rule = jsgf_get_rule(jsgf, "<goforward.move2>"); TEST_ASSERT(rule); fsg = jsgf_build_fsg(jsgf, rule, ps->lmath, 7.5); TEST_ASSERT(fsg); fsg_model_write(fsg, stdout); ps_set_fsg(ps, "<goforward.move2>", fsg); ps_set_search(ps, "<goforward.move2>"); acmod = ps->acmod; fsgs = (fsg_search_t *) fsg_search_init(fsg, config, acmod, ps->dict, ps->d2p); setbuf(stdout, NULL); c = clock(); for (i = 0; i < 5; ++i) { int16 buf[2048]; size_t nread; int16 const *bptr; int nfr; int is_final; TEST_ASSERT(rawfh = fopen(DATADIR "/goforward.raw", "rb")); TEST_EQUAL(0, acmod_start_utt(acmod)); fsg_search_start(ps_search_base(fsgs)); is_final = FALSE; while (!feof(rawfh)) { nread = fread(buf, sizeof(*buf), 2048, rawfh); bptr = buf; while ((nfr = acmod_process_raw(acmod, &bptr, &nread, FALSE)) > 0) { while (acmod->n_feat_frame > 0) { fsg_search_step(ps_search_base(fsgs), acmod->output_frame); acmod_advance(acmod); } } hyp = fsg_search_hyp(ps_search_base(fsgs), &score, &is_final); printf("FSG: %s (%d) frame %d final %s\n", hyp, score, acmod->output_frame, is_final ? 
"FINAL" : ""); TEST_EQUAL (is_final, (acmod->output_frame > 170)); } fsg_search_finish(ps_search_base(fsgs)); hyp = fsg_search_hyp(ps_search_base(fsgs), &score, NULL); printf("FSG: %s (%d)\n", hyp, score); TEST_ASSERT(acmod_end_utt(acmod) >= 0); fclose(rawfh); } TEST_EQUAL(0, strcmp("go forward ten meters", fsg_search_hyp(ps_search_base(fsgs), &score, NULL))); ps->search = (ps_search_t *)fsgs; for (seg = ps_seg_iter(ps, &score); seg; seg = ps_seg_next(seg)) { char const *word; int sf, ef; word = ps_seg_word(seg); ps_seg_frames(seg, &sf, &ef); printf("%s %d %d\n", word, sf, ef); } c = clock() - c; printf("5 * fsg search in %.2f sec\n", (double)c / CLOCKS_PER_SEC); dag = ps_get_lattice(ps); ps_lattice_write(dag, "test_jsgf.lat"); jsgf_grammar_free(jsgf); fsg_search_free(ps_search_base(fsgs)); ps_free(ps); cmd_ln_free_r(config); TEST_ASSERT(config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-dict", MODELDIR "/lm/en/turtle.dic", "-jsgf", DATADIR "/goforward.gram", "-input_endian", "little", "-samprate", "16000", NULL)); TEST_ASSERT(ps = ps_init(config)); TEST_ASSERT(rawfh = fopen(DATADIR "/goforward.raw", "rb")); ps_decode_raw(ps, rawfh, "goforward", -1); hyp = ps_get_hyp(ps, &score, &uttid); prob = ps_get_prob(ps, &uttid); printf("%s: %s (%d, %d)\n", uttid, hyp, score, prob); TEST_EQUAL(0, strcmp("go forward ten meters", hyp)); ps_free(ps); fclose(rawfh); cmd_ln_free_r(config); TEST_ASSERT(config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-dict", MODELDIR "/lm/en/turtle.dic", "-jsgf", DATADIR "/goforward.gram", "-toprule", "goforward.move2", "-input_endian", "little", "-samprate", "16000", NULL)); TEST_ASSERT(ps = ps_init(config)); TEST_ASSERT(rawfh = fopen(DATADIR "/goforward.raw", "rb")); ps_decode_raw(ps, rawfh, "goforward", -1); hyp = ps_get_hyp(ps, &score, &uttid); prob = ps_get_prob(ps, &uttid); printf("%s: %s (%d, %d)\n", uttid, hyp, score, prob); TEST_EQUAL(0, strcmp("go forward ten 
meters", hyp)); ps_free(ps); cmd_ln_free_r(config); fclose(rawfh); TEST_ASSERT(config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-dict", MODELDIR "/lm/en/turtle.dic", "-jsgf", DATADIR "/defective.gram", NULL)); TEST_ASSERT(NULL == ps_init(config)); cmd_ln_free_r(config); return 0; }
static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[4096]; int32 k, ts, rem; char const *hyp; char const *uttid; cont_ad_t *cont; char word[256]; int32 score; if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), (int)cmd_ln_float32_r(config, "-samprate"))) == NULL) E_FATAL("Failed top open audio device\n"); /* Initialize continuous listening module */ if ((cont = cont_ad_init(ad, ad_read)) == NULL) E_FATAL("Failed to initialize voice activity detection\n"); if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); if (cont_ad_calib(cont) < 0) E_FATAL("Failed to calibrate voice activity detection\n"); for (;;) { /* Indicate listening for next utterance */ printf("READY....\n"); fflush(stdout); fflush(stderr); /* Wait data for next utterance */ while ((k = cont_ad_read(cont, adbuf, 4096)) == 0) sleep_msec(100); if (k < 0) E_FATAL("Failed to read audio\n"); /* * Non-zero amount of data received; start recognition of new utterance. * NULL argument to uttproc_begin_utt => automatic generation of utterance-id. */ if (ps_start_utt(ps, NULL) < 0) E_FATAL("Failed to start utterance\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); printf("Listening...\n"); fflush(stdout); /* Note timestamp for this first block of data */ ts = cont->read_ts; /* Decode utterance until end (marked by a "long" silence, >1sec) */ for (;;) { /* Read non-silence audio data, if any, from continuous listening module */ if ((k = cont_ad_read(cont, adbuf, 4096)) < 0) E_FATAL("Failed to read audio\n"); if (k == 0) { /* * No speech data available; check current timestamp with most recent * speech to see if more than 1 sec elapsed. If so, end of utterance. */ if ((cont->read_ts - ts) > DEFAULT_SAMPLES_PER_SEC) break; } else { /* New speech data received; note current timestamp */ ts = cont->read_ts; } /* * Decode whatever data was read above. 
*/ rem = ps_process_raw(ps, adbuf, k, FALSE, FALSE); /* If no work to be done, sleep a bit */ if ((rem == 0) && (k == 0)) sleep_msec(20); } /* * Utterance ended; flush any accumulated, unprocessed A/D data and stop * listening until current utterance completely decoded */ ad_stop_rec(ad); while (ad_read(ad, adbuf, 4096) >= 0); cont_ad_reset(cont); printf("Stopped listening, please wait...\n"); fflush(stdout); /* Finish decoding, obtain and print result */ ps_end_utt(ps); hyp = ps_get_hyp(ps, &score, &uttid); if ( hyp == NULL ) return; printf("Recognized: %s (%d %s) with prob %d\n", hyp, score, uttid, ps_get_prob(ps, NULL)); //printf("%s: %s\n", uttid, hyp); fflush(stdout); /* Exit if the first word spoken was GOODBYE */ if (hyp) { sscanf(hyp, "%s", word); if (strcmp(word, "goodbye") == 0) break; } /* Resume A/D recording for next utterance */ if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); } cont_ad_close(cont); ad_close(ad); }