int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; //int i; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-lm", MODELDIR "/lm/en/turtle.DMP", "-dict", MODELDIR "/lm/en/turtle.dic", NULL); if (config == NULL) return 1; ps = ps_init(config); if (ps == NULL) return 1; fh = fopen("goforward.raw", "rb"); if (fh == NULL) { perror("Failed to open goforward.raw"); return 1; } rv = ps_decode_raw(ps, fh, "goforward", -1); if (rv < 0) return 1; hyp = ps_get_hyp(ps, &score, &uttid); if (hyp == NULL) return 1; printf("Recognized: %s\n", hyp); fseek(fh, 0, SEEK_SET); rv = ps_start_utt(ps, "goforward"); if (rv < 0) return 1; while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } rv = ps_end_utt(ps); if (rv < 0) return 1; hyp = ps_get_hyp(ps, &score, &uttid); if (hyp == NULL) return 1; printf("Recognized: %s\n", hyp); fclose(fh); ps_free(ps); return 0; }
/// Reads the decoder's current hypothesis and forwards it to the registered
/// callback. Nothing is forwarded when no callback is set or when the
/// hypothesis is missing or empty.
void EventHandlerBasic::event(ps_decoder_t* decoder)
{
    const char* const hypothesis = ps_get_hyp(decoder, NULL);

    if (mCb == nullptr) {
        return;
    }
    if (hypothesis == NULL || hypothesis[0] == '\0') {
        return;
    }

    mCb(std::string(hypothesis));
}
/// Decodes one complete utterance from `signal` and returns the recognized
/// text.
///
/// @param signal  audio samples handed to ps_process_raw() in a single call
/// @param c       Ice call context (not used here)
/// @return the hypothesis string reported by the decoder
/// @throws PocketSphinxIce::Error if starting, processing or ending the
///         utterance fails, or if no hypothesis is available
std::string PocketSphinxServer::decode(const PocketSphinxIce::sample& signal, const Ice::Current& c)
{
    std::cout << "Decode\n";

    if (ps_start_utt(ps) < 0)
        throw PocketSphinxIce::Error("Error in ps_start_utt");

    const int processed = ps_process_raw(ps, signal.data(), signal.size(), FALSE, FALSE);

    // Always close the utterance, even when processing failed, so the decoder
    // is not left mid-utterance for the next request.
    const int ended = ps_end_utt(ps);

    if (processed < 0)
        throw PocketSphinxIce::Error("Error in ps_process_raw");
    if (ended < 0)
        throw PocketSphinxIce::Error("Error in ps_end_utt");

    int32 score;
    const char* hyp = ps_get_hyp(ps, &score);
    if (!hyp)
        throw PocketSphinxIce::Error("ps_get_hyp returned NULL");

    std::cout << "return:" << hyp << '\n';
    return hyp;
}
int main(int argc, char *argv[]) { cmd_ln_t *config; ps_decoder_t *ps; FILE *rawfh; char const *hyp; char const *uttid; int32 score; TEST_ASSERT(config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", DATADIR "/an4_ci_cont", "-lm", MODELDIR "/lm/en/turtle.DMP", "-dict", MODELDIR "/lm/en/turtle.dic", "-mllr", DATADIR "/mllr_matrices", "-samprate", "16000", NULL)); TEST_ASSERT(ps = ps_init(config)); TEST_ASSERT(rawfh = fopen(DATADIR "/goforward.raw", "rb")); ps_decode_raw(ps, rawfh, "goforward", -1); fclose(rawfh); hyp = ps_get_hyp(ps, &score, &uttid); printf("FWDFLAT (%s): %s (%d)\n", uttid, hyp, score); ps_free(ps); cmd_ln_free_r(config); return 0; }
int processRaw(const char *rawFile) { const char *hyp, *uttid; int16 buf[512]; int rv; int32 score; // Open the wav file passed from argument printf("file: %s\n", rawFile); fh = fopen(rawFile, "rb"); if (fh == NULL) { fprintf(stderr, "Unable to open input file %s\n", rawFile); return -1; } // Start utterance rv = ps_start_utt(ps); // Process buffer, 512 samples at a time while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } // Recieve the recognized string rv = ps_end_utt(ps); hyp = ps_get_hyp(ps, &score); printf("Recognized: |%s|\n", hyp); fflush(stdout); // Close file fclose(fh); return 0; }
std::tuple<std::string, double> TwitchStreamChunk::process(std::string body){ int pos = _uri.find_last_of('/'); std::string fileName = _uri.substr(pos + 1); std::ofstream file(fileName); file << body; file.flush(); file.close(); std::stringstream cmd; std::string audioFile = boost::filesystem::unique_path().native(); audioFile.append(".wav"); cmd << "ffmpeg -i " << fileName << " -vn -ac 1 " << audioFile << " > /dev/null 2>&1"; system(cmd.str().c_str()); FILE *aFile = fopen(audioFile.c_str(), "r"); ps_decode_raw(getDecoder(), aFile, -1); fclose(aFile); auto logarithm = ps_get_logmath(getDecoder()); int confidence = 1; const char * result = ps_get_hyp(getDecoder(), &confidence); double tmp = logmath_exp(logarithm, confidence); std::remove(fileName.c_str()); std::remove(audioFile.c_str()); return std::make_tuple(result == nullptr ? "" : std::string(result), tmp); }
int ofApp::engineClose() { char const *uttid; int rv; int32 score; rv = ps_end_utt(ps); if (rv < 0) { return 1; } hyp = ps_get_hyp(ps, &score); if (hyp == NULL) { return 1; } printf("NEWLINE_________\n\n\nRecognized: %s\n", hyp); sentence = hyp; process_result(); }
/*
 * End the current utterance: finish acoustic processing, search the frames
 * that are still pending, finish the phone-loop and main searches, stop the
 * performance timer and, when "-backtrace" is set, log the hypothesis with
 * its word segmentation.
 *
 * Returns the result of the last search step performed (negative on error).
 */
int ps_end_utt(ps_decoder_t *ps)
{
    int rv, i;

    acmod_end_utt(ps->acmod);

    /* Search any remaining frames. */
    if ((rv = ps_search_forward(ps)) < 0) {
        ptmr_stop(&ps->perf);
        return rv;
    }

    /* Finish phone loop search. */
    if (ps->phone_loop) {
        if ((rv = ps_search_finish(ps->phone_loop)) < 0) {
            ptmr_stop(&ps->perf);
            return rv;
        }
    }

    /* Search any frames remaining in the lookahead window.  When fewer than
     * pl_window frames were produced the start index would be negative, so
     * the flush is skipped in that case instead of stepping the search on
     * frames that do not exist. */
    if (ps->acmod->output_frame >= ps->pl_window) {
        for (i = ps->acmod->output_frame - ps->pl_window;
             i < ps->acmod->output_frame; ++i)
            ps_search_step(ps->search, i);
    }

    /* Finish main search. */
    if ((rv = ps_search_finish(ps->search)) < 0) {
        ptmr_stop(&ps->perf);
        return rv;
    }
    ptmr_stop(&ps->perf);

    /* Log a backtrace if requested. */
    if (cmd_ln_boolean_r(ps->config, "-backtrace")) {
        char const *uttid, *hyp;
        ps_seg_t *seg;
        int32 score;

        hyp = ps_get_hyp(ps, &score, &uttid);
        if (hyp != NULL) {
            E_INFO("%s: %s (%d)\n", uttid, hyp, score);
            E_INFO_NOFN("%-20s %-5s %-5s %-5s %-10s %-10s %-3s\n",
                        "word", "start", "end", "pprob", "ascr", "lscr", "lback");
            for (seg = ps_seg_iter(ps, &score); seg; seg = ps_seg_next(seg)) {
                char const *word;
                int sf, ef;
                int32 post, lscr, ascr, lback;

                word = ps_seg_word(seg);
                ps_seg_frames(seg, &sf, &ef);
                post = ps_seg_prob(seg, &ascr, &lscr, &lback);
                E_INFO_NOFN("%-20s %-5d %-5d %-1.3f %-10d %-10d %-3d\n",
                            word, sf, ef,
                            logmath_exp(ps_get_logmath(ps), post),
                            ascr, lscr, lback);
            }
        }
    }
    return rv;
}
/*!
 * Feed one block of audio to the recognizer.
 *
 * Returns SWITCH_STATUS_BREAK when the handle is closed. While the recognizer
 * is ready and holds no text yet, it checks stop_detect(); on a stop it ends
 * the utterance and either stores the hypothesis (setting PSFLAG_HAS_TEXT) or
 * restarts the utterance when the text came back empty. Audio is passed to
 * ps_process_raw() only while ps->listening is set; a negative result from it
 * yields SWITCH_STATUS_FALSE. `len` is divided by two to get the sample
 * count. The `flags` argument is not used.
 */
static switch_status_t pocketsphinx_asr_feed(switch_asr_handle_t *ah, void *data, unsigned int len, switch_asr_flag_t *flags)
{
    pocketsphinx_t *ps = (pocketsphinx_t *) ah->private_info;
    int rv = 0;

    if (switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED)) {
        return SWITCH_STATUS_BREAK;
    }

    /* Nothing to do once text is pending or while the decoder is not ready. */
    if (switch_test_flag(ps, PSFLAG_HAS_TEXT) || !switch_test_flag(ps, PSFLAG_READY)) {
        return SWITCH_STATUS_SUCCESS;
    }

    if (stop_detect(ps, (int16_t *) data, len / 2)) {
        char const *hyp;

        switch_mutex_lock(ps->flag_mutex);
        hyp = ps_get_hyp(ps->ps, &ps->score, &ps->uttid);
        if (hyp && !zstr(hyp)) {
            ps_end_utt(ps->ps);
            switch_clear_flag(ps, PSFLAG_READY);

            /* Re-read the hypothesis now that the utterance is closed. */
            hyp = ps_get_hyp(ps->ps, &ps->score, &ps->uttid);
            if (hyp) {
                if (zstr(hyp)) {
                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Lost the text, never mind....\n");
                    ps_start_utt(ps->ps, NULL);
                    switch_set_flag(ps, PSFLAG_READY);
                } else {
                    ps->hyp = switch_core_strdup(ah->memory_pool, hyp);
                    switch_set_flag(ps, PSFLAG_HAS_TEXT);
                }
            }
        }
        switch_mutex_unlock(ps->flag_mutex);
    }

    /* only feed ps_process_raw when we are listening */
    if (ps->listening) {
        switch_mutex_lock(ps->flag_mutex);
        rv = ps_process_raw(ps->ps, (int16 *) data, len / 2, FALSE, FALSE);
        switch_mutex_unlock(ps->flag_mutex);
    }

    return (rv < 0) ? SWITCH_STATUS_FALSE : SWITCH_STATUS_SUCCESS;
}
/// Feeds one block of samples to the decoder and refreshes `current_hyp`
/// with the partial hypothesis.
///
/// @param buffer  samples to decode; must not be empty
/// @return BAD_STATE when there is no decoder or no recording in progress,
///         RUNTIME_ERROR for an empty buffer or a decoder failure,
///         SUCCESS otherwise
ReturnType Recognizer::process(const std::vector<int16_t>& buffer)
{
    if ((decoder == NULL) || (!is_recording))
        return BAD_STATE;
    if (buffer.empty())
        return RUNTIME_ERROR;

    // Keep the data const instead of casting it away, and do not ignore a
    // failure reported by the decoder.
    const short* samples = reinterpret_cast<const short*>(buffer.data());
    if (ps_process_raw(decoder, samples, buffer.size(), 0, 0) < 0)
        return RUNTIME_ERROR;

    const char* h = ps_get_hyp(decoder, &score, &sentence_id);
    current_hyp = (h == NULL) ? "" : h;
    return SUCCESS;
}
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-lm", MODELDIR "/en-us/en-us.lm.bin", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", NULL); if (config == NULL) { fprintf(stderr, "Failed to create config object, see log for details\n"); return -1; } // Initialize pocketsphinx ps = ps_init(config); if (ps == NULL) { fprintf(stderr, "Failed to create recognizer, see log for details\n"); return -1; } // Open the wav file passed from argument printf("file: %s\n", argv[1]); fh = fopen(argv[1], "rb"); if (fh == NULL) { fprintf(stderr, "Unable to open input file %s\n", argv[1]); return -1; } // Start utterance rv = ps_start_utt(ps); // Process buffer, 512 samples at a time while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } // Recieve the recognized string rv = ps_end_utt(ps); hyp = ps_get_hyp(ps, &score); printf("Recognized: |%s|\n", hyp); // free memory fclose(fh); ps_free(ps); cmd_ln_free_r(config); return 0; }
/* Wraps the decoder's current hypothesis, utterance id and best score in a
 * new Hypothesis object; yields NULL when ps_get_hyp() has no hypothesis. */
SWIGINTERN Hypothesis *ps_decoder_s_getHyp(struct ps_decoder_s *self)
{
    char const *utterance_id;
    int32 path_score;
    char const *text = ps_get_hyp(self, &path_score, &utterance_id);

    if (text != NULL) {
        return new_Hypothesis(text, utterance_id, path_score);
    }
    return NULL;
}
/** * You must sucessfully call spInitListener * once before using this function. * * Reads the next block of audio from the microphone * up to SPLBUFSIZE number of samples * (defined in splistener.h). * If an utterance was completed in that block, * the transcription is stored in the string words. * * When calling this function in a realtime loop, delay * by some amount of time between calls keeping in mind * your recording's sample rate and maximum samples read * per call (ex. sleep the thread for 100 milliseconds) * so that some audio can be recorded for the next call. * * @return true if a speech session was completed and * transcribed this block, otherwise false. */ static bool spDecode() { static bool uttered = false; // lock pocketsphinx resources to make sure they // don't get freed by main thread while in use std::lock_guard<std::mutex> ps_lock(ps_mtx); if(!mic || !ps) return false; int samples_read = ad_read(mic, buf, SPLBUFSIZE); if (samples_read <= 0) { spError("failed to read audio :("); return false; } ps_process_raw(ps, buf, samples_read, FALSE, FALSE); bool talking = ps_get_in_speech(ps); // Just started talking if (talking && !uttered) { uttered = true; return false; } // Stopped talking, so transcribe what was said // and begin the next utterance if (!talking && uttered) { ps_end_utt(ps); const char *trans = ps_get_hyp(ps, NULL); if (ps_start_utt(ps) < 0) { spError("failed to start utterance :("); } uttered = false; int l = strlen(trans); if (trans && l > 0) { std::lock_guard<std::mutex> lock(words_mtx); if (words && l + 1 > words_buf_size) { delete words; words = NULL; } if (!words) { words = new char[l + 1]; words_buf_size = l + 1; } std::copy(trans, trans + l, words); words[l] = '\0'; return true; } } return false; }
/// Ends the utterance in progress and stores the final hypothesis in
/// `current_hyp` (empty when the decoder has none).
///
/// @return BAD_STATE when there is no decoder or no recording in progress,
///         RUNTIME_ERROR when ps_end_utt() fails, SUCCESS otherwise
ReturnType Recognizer::stop()
{
    const bool can_stop = (decoder != NULL) && is_recording;
    if (!can_stop) {
        return BAD_STATE;
    }

    const int end_status = ps_end_utt(decoder);
    if (end_status < 0) {
        return RUNTIME_ERROR;
    }

    const char* final_hyp = ps_get_hyp(decoder, &score, &sentence_id);
    if (final_hyp == NULL) {
        current_hyp = "";
    } else {
        current_hyp = final_hyp;
    }

    is_recording = false;
    return SUCCESS;
}
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; const char *filename = "goforward.raw"; const char *word = "goforward"; char const *hyp, *uttid; int rv; int32 score; /* setup the sphinx config */ config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-lm", MODELDIR "/lm/en/turtle.DMP", "-dict", MODELDIR "/lm/en/turtle.dic", NULL); if(config == NULL) { EXIT_ERROR; } /* initialize the config */ ps = ps_init(config); if(ps == NULL) { EXIT_ERROR; } /* open the audio file (stream?) */ fh = fopen(filename, "rb"); if(fh == NULL) { perror(filename); exit(1); } /* decode the file */ rv = ps_decode_raw(ps, fh, word, -1); if(rv < 0) { EXIT_ERROR; } /* get hypothesis */ hyp = ps_get_hyp(ps, &score, &uttid); if(hyp == NULL) { EXIT_ERROR; } printf("Recognized: %s; score: %d; uttid: %s\n", hyp, score, uttid); /* clean up */ fclose(fh); ps_free(ps); return 0; }
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-keyphrase", "marieta", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", "-kws_threshold", "1e-30", NULL); if (config == NULL) { fprintf(stderr, "Failed to create config object, see log for details\n"); return -1; } ps = ps_init(config); if (ps == NULL) { fprintf(stderr, "Failed to create recognizer, see log for details\n"); return -1; } fh = fopen("data/marieta.raw", "rb"); if (fh == NULL) { fprintf(stderr, "Unable to open input file goforward.raw\n"); return -1; } rv = ps_start_utt(ps); while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } rv = ps_end_utt(ps); hyp = ps_get_hyp(ps, &score); printf("Recognized: %s\n", hyp); fclose(fh); ps_free(ps); cmd_ln_free_r(config); return 0; }
/*
 * Pull one chunk of audio from the continuous-listening reader and drive the
 * decoder with it:
 *  - no data and no utterance in progress: nothing to do;
 *  - no data for more than DEFAULT_SAMPLES_PER_SEC timestamp units since the
 *    last audio: end the utterance and send the hypothesis (with parentheses
 *    and digits stripped) as a "message";
 *  - data available: start an utterance if needed (sending "listening") and
 *    pass the samples to the decoder.
 */
static void
gst_sphinx_sink_process_chunk (GstSphinxSink *sphinxsink)
{
  int32 k;
  int16 adbuf[REQUIRED_FRAME_SAMPLES];

  k = cont_ad_read (sphinxsink->cont, adbuf, REQUIRED_FRAME_SAMPLES);

  /* A negative count is a read error; it must not reach ps_process_raw()
   * as a sample count. */
  if (k < 0) {
    g_message ("Failed to read audio");
    return;
  }

  if (k == 0 && sphinxsink->last_ts == 0) {
    return;
  } else if (k == 0 && sphinxsink->cont->read_ts - sphinxsink->last_ts >
             DEFAULT_SAMPLES_PER_SEC) {
    int32 score;
    const char *hyp;
    char *stripped_hyp;
    int i, j;

    ps_end_utt (sphinxsink->decoder);
    if ((hyp = ps_get_hyp (sphinxsink->decoder, &score, NULL)) == NULL) {
      gst_sphinx_sink_send_message (sphinxsink, "message", "");
      g_message ("Not Recognized");
    } else {
      stripped_hyp = g_malloc (strlen (hyp) + 1);
      for (i = 0, j = 0; hyp[i] != 0; i++) {
        if (hyp[i] != '(' && hyp[i] != ')' && (hyp[i] < '0' || hyp[i] > '9')) {
          stripped_hyp[j++] = hyp[i];
        }
      }
      stripped_hyp[j] = 0;

      gst_sphinx_sink_send_message (sphinxsink, "message", stripped_hyp);
      g_message ("Recognized: %s", stripped_hyp);

      /* The same helper is called with string literals elsewhere in this
       * function, so it cannot be taking ownership of its argument; the
       * scratch buffer has to be released here. */
      g_free (stripped_hyp);
    }

    sphinxsink->last_ts = 0;
    sphinxsink->ad.listening = 0;
  } else if (k != 0) {
    if (sphinxsink->ad.listening == 0) {
      ps_start_utt (sphinxsink->decoder, NULL);
      sphinxsink->ad.listening = 1;
      gst_sphinx_sink_send_message (sphinxsink, "listening", NULL);
    }

    ps_process_raw (sphinxsink->decoder, adbuf, k, 0, 0);
    sphinxsink->last_ts = sphinxsink->cont->read_ts;
  }
}
/* * Main utterance processing loop: * for (;;) { * start utterance and wait for speech to process * decoding till end-of-utterance silence will be detected * print utterance result; * } */ static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), (int) cmd_ln_float32_r(config, "-samprate"))) == NULL) E_FATAL("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; E_INFO("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) { printf("%s\n", hyp); fflush(stdout); } if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); } sleep_msec(100); } ad_close(ad); }
/* * Continuous recognition from a file */ int recognize_from_file() { int16 adbuf[2048]; const char *fname; const char *hyp; int32 k; char str[1000]=""; uint8 utt_started, in_speech; fname = "C:/Users/Reza/Documents/GitHub/speech_agent/presentation_samples/italy1_reza.wav"; rawfd = fopen(fname, "rb"); if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) { char waveheader[44]; fread(waveheader, 1, 44, rawfd); } ps_start_utt(ps); utt_started = FALSE; while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) { ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL); if (hyp != NULL){ strncpy_s( str, hyp, strlen(hyp)); printf("%s\n", hyp); listenCallback(str); } ps_start_utt(ps); utt_started = FALSE; } } ps_end_utt(ps); fclose(rawfd); return 0; }
int main(int argc, char *argv[]) { ps_decoder_t *ps; ps_nbest_t *nbest; cmd_ln_t *config; FILE *rawfh; char const *hyp; int32 score, n; TEST_ASSERT(config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-lm", MODELDIR "/en-us/en-us.lm.bin", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", "-fwdtree", "yes", "-fwdflat", "yes", "-bestpath", "yes", "-input_endian", "little", "-samprate", "16000", NULL)); TEST_ASSERT(ps = ps_init(config)); TEST_ASSERT(rawfh = fopen(DATADIR "/goforward.raw", "rb")); ps_decode_raw(ps, rawfh, -1); fclose(rawfh); hyp = ps_get_hyp(ps, &score); printf("BESTPATH: %s (%d)\n", hyp, score); for (n = 1, nbest = ps_nbest(ps); nbest && n < 10; nbest = ps_nbest_next(nbest), n++) { ps_seg_t *seg; hyp = ps_nbest_hyp(nbest, &score); printf("NBEST %d: %s (%d)\n", n, hyp, score); for (seg = ps_nbest_seg(nbest); seg; seg = ps_seg_next(seg)) { char const *word; int sf, ef; word = ps_seg_word(seg); ps_seg_frames(seg, &sf, &ef); printf("%s %d %d\n", word, sf, ef); } } if (nbest) ps_nbest_free(nbest); ps_free(ps); cmd_ln_free_r(config); return 0; }
static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = ad_open_dev(AUDIO_DEVICE_NAME, (int) SAMPLE_RATE )) == NULL) { E_FATAL("Failed to open audio device\n"); } if (ad_start_rec(ad) < 0) { E_FATAL("Failed to start recording\n"); } if (ps_start_utt(ps) < 0) { E_FATAL("Failed to start utterance\n"); } utt_started = FALSE; printf("READY....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; printf("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) printf("%s\n", hyp); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; printf("READY....\n"); } sleep_msec(100); } ad_close(ad); }
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-lm", MODELDIR "/en-us/en-us.lm.dmp", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", NULL); if (config == NULL) return 1; ps = ps_init(config); if (ps == NULL) return 1; fh = fopen("goforward.raw", "rb"); if (fh == NULL) return -1; rv = ps_start_utt(ps); if (rv < 0) return 1; while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } rv = ps_end_utt(ps); if (rv < 0) return 1; hyp = ps_get_hyp(ps, &score); if (hyp == NULL) return 1; printf("Recognized: %s\n", hyp); fclose(fh); ps_free(ps); cmd_ln_free_r(config); return 0; }
/*
 * Print the decoder's current hypothesis with its utterance id and score.
 *
 * Returns 0 when no hypothesis is available or when the hypothesis is exactly
 * "left"; returns 1 for any other hypothesis.
 */
int printResult()
{
    char const *utt_name;
    int32 path_score;
    char const *text = ps_get_hyp(ps, &path_score, &utt_name);

    if (text != NULL) {
        printf("Result => %s, uttid: %s, score: %d\n", text, utt_name, path_score);
        return strcmp(text, "left") != 0;
    }

    printf("Error getting result.\n");
    return 0;
}
/**
 * Worker thread body: records from the configured audio device and feeds the
 * samples to the decoder until StopTaskCounter becomes non-zero. Each time
 * the decoder reports a speech -> silence transition the utterance is ended
 * and its hypothesis is passed to Manager->WordSpoken_method().
 *
 * @return 0 on a normal stop, 1 if the audio device cannot be opened,
 *         2 if recording cannot be started, 3 if the first utterance cannot
 *         be started.
 */
uint32 FSpeechRecognitionWorker::Run()
{
	char const *hyp;

	// attempt to open the default recording device
	if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
		(int)cmd_ln_float32_r(config, "-samprate"))) == NULL) {
		ClientMessage(FString(TEXT("Failed to open audio device")));
		return 1;
	}
	if (ad_start_rec(ad) < 0) {
		ClientMessage(FString(TEXT("Failed to start recording")));
		ad_close(ad); // do not leak the device on early exit
		return 2;
	}
	if (ps_start_utt(ps) < 0) {
		ClientMessage(FString(TEXT("Failed to start utterance")));
		ad_close(ad); // do not leak the device on early exit
		return 3;
	}

	while (StopTaskCounter.GetValue() == 0) {
		if ((k = ad_read(ad, adbuf, 1024)) < 0) {
			ClientMessage(FString(TEXT("Failed to read audio")));
			// A negative count is an error code, not a sample count; skip
			// this pass instead of handing it to the decoder.
			continue;
		}
		ps_process_raw(ps, adbuf, k, 0, 0);
		in_speech = ps_get_in_speech(ps);
		if (in_speech && !utt_started) {
			utt_started = 1;
		}
		if (!in_speech && utt_started) {
			/* speech -> silence transition, time to start new utterance */
			ps_end_utt(ps);
			hyp = ps_get_hyp(ps, NULL);
			if (hyp != NULL)
				Manager->WordSpoken_method(FString(hyp));
			if (ps_start_utt(ps) < 0)
				ClientMessage(FString(TEXT("Failed to start")));
			utt_started = 0;
		}
	}

	ad_close(ad);
	return 0;
}
/*
 * Pad chain function: feeds each incoming buffer to the decoder and, at most
 * once per 100 ms of buffer time, emits SIGNAL_PARTIAL_RESULT when the
 * partial hypothesis differs from the last one emitted.  The buffer is
 * unreffed before returning GST_FLOW_OK.
 */
static GstFlowReturn
gst_pocketsphinx_chain(GstPad * pad, GstBuffer * buffer)
{
    GstPocketSphinx *ps;

    ps = GST_POCKETSPHINX(GST_OBJECT_PARENT(pad));

    /* Start an utterance for the first buffer we get (i.e. we assume
     * that the VADER is "leaky") */
    if (!ps->listening) {
        ps->listening = TRUE;
        ps_start_utt(ps->ps, NULL);
    }
    ps_process_raw(ps->ps,
                   (short *)GST_BUFFER_DATA(buffer),
                   GST_BUFFER_SIZE(buffer) / sizeof(short),
                   FALSE, FALSE);

    /* Get a partial result every now and then, see if it is different.
     * GStreamer buffer timestamps are in nanoseconds, so the previous
     * literal 100*10*1000 was 1 ms, not the intended 100 ms. */
    if (ps->last_result_time == 0
        /* Check every 100 milliseconds. */
        || (GST_BUFFER_TIMESTAMP(buffer) - ps->last_result_time) > 100 * GST_MSECOND) {
        int32 score;
        char const *hyp;
        char const *uttid;

        hyp = ps_get_hyp(ps->ps, &score, &uttid);
        ps->last_result_time = GST_BUFFER_TIMESTAMP(buffer);
        if (hyp && strlen(hyp) > 0) {
            if (ps->last_result == NULL || 0 != strcmp(ps->last_result, hyp)) {
                g_free(ps->last_result);
                ps->last_result = g_strdup(hyp);
                /* Emit a signal for applications. */
                g_signal_emit(ps, gst_pocketsphinx_signals[SIGNAL_PARTIAL_RESULT],
                              0, hyp, uttid);
            }
        }
    }
    gst_buffer_unref(buffer);
    return GST_FLOW_OK;
}
int processCommands() { int32 samples; int16 audioBuf[BUFFER_SIZE]; char const *uttid; char const *hyp; while(run) { printf("Waiting for utterance...\n"); samples = waitForNextUtterance(); if(samples < 0) return -1; if(ps_start_utt(psDecoder, NULL) < 0) { fprintf(stderr, "Failed to start next utterance\n"); return -1; } ps_process_raw(psDecoder, audioBuf, samples, FALSE, FALSE); printf("Recording...\n"); fflush(stdout); record(); ad_stop_rec(audioDevice); while(ad_read(audioDevice, audioBuf, BUFFER_SIZE) >= 0); cont_ad_reset(continousAudoDevice); ps_end_utt(psDecoder); hyp = ps_get_hyp(psDecoder, NULL, &uttid); printf("Heard: %s\n", hyp); if (ad_start_rec(audioDevice) < 0) { fprintf(stderr, "Failed to start audio device.\n"); return -1; } } return 0; }
/*
 * Feed one audio frame to the recognizer.  When the decoder reports that
 * speech has just ended, the utterance is closed, any hypothesis is attached
 * to the frame metadata under "lavfi.asr.text", and a new utterance is
 * started.  The frame is then passed on to the first output.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    ASRContext *s = ctx->priv;
    AVDictionary **metadata = &in->metadata;
    const char *text;
    int speaking;

    ps_process_raw(s->ps, (const int16_t *)in->data[0], in->nb_samples, 0, 0);
    speaking = ps_get_in_speech(s->ps);

    if (speaking) {
        /* remember that an utterance is in progress */
        if (!s->utt_started)
            s->utt_started = 1;
    } else if (s->utt_started) {
        /* speech -> silence: finish this utterance and begin the next */
        ps_end_utt(s->ps);
        text = ps_get_hyp(s->ps, NULL);
        if (text != NULL)
            av_dict_set(metadata, "lavfi.asr.text", text, 0);
        ps_start_utt(s->ps);
        s->utt_started = 0;
    }

    return ff_filter_frame(ctx->outputs[0], in);
}
/*
 * Build the single candidate reported for the current decoder from its word
 * lattice.
 *
 * Lattice edges are visited in traversal order.  The source node of the first
 * edge and the destination node of every edge each contribute a token unless
 * the word is missing or starts with '<'.  Traversal stops at the first node
 * that starts before the previous token ends, and a node is skipped when
 * tkneq() says its word matches the previous token.  The utterance is then
 * filled in with one candidate, the decoder's utterance id and a score
 * floored at 0.00001.
 */
static void fsg_processor(context_t *ctx, srs_srec_utterance_t *utt, srs_srec_candidate_t *cands, srs_srec_candidate_t **sorted)
{
    filter_buf_t *filtbuf;
    decoder_set_t *decset;
    decoder_t *dec;
    logmath_t *lmath;
    const char *uttid;
    int32_t score;
    double prob;
    srs_srec_candidate_t *best;
    srs_srec_token_t *last;
    ps_lattice_t *dag;
    ps_latlink_t *link;
    ps_latnode_t *node;
    const char *word;
    int32_t frlen;
    int32_t start, end;
    int16 fef, lef;

    if (!ctx)
        return;
    if (!(filtbuf = ctx->filtbuf))
        return;
    if (!(decset = ctx->decset))
        return;
    if (!(dec = decset->curdec))
        return;

    frlen = filtbuf->frlen;
    lmath = ps_get_logmath(dec->ps);
    ps_get_hyp(dec->ps, &score, &uttid);
    prob = logmath_exp(lmath, score);

    best = cands;
    best->score = 1.0;
    best->ntoken = 0;
    last = NULL;

    dag = ps_get_lattice(dec->ps);
    link = dag ? ps_lattice_traverse_edges(dag, NULL, NULL) : NULL;

    if (link) {
        /* Source node of the very first edge. */
        ps_latlink_nodes(link, &node);
        if (node && (word = ps_latnode_word(dag, node)) && *word != '<') {
            last = best->tokens + best->ntoken++;
            last->token = tknbase(word);
            last->start = ps_latnode_times(node, &fef, &lef) * frlen;
            last->end = ((fef + lef) / 2) * frlen;
        }

        /* Destination node of every edge, starting with the first one. */
        do {
            node = ps_latlink_nodes(link, NULL);
            if (node && (word = ps_latnode_word(dag, node)) && *word != '<') {
                start = ps_latnode_times(node, &fef, &lef) * frlen;
                end = fef * frlen;

                if (last && start < (int32_t)last->end)
                    break; /* just take one candidate */

                if (!last || !tkneq(word, last->token)) {
                    last = best->tokens + best->ntoken++;
                    last->token = tknbase(word);
                    last->start = start;
                    last->end = end + frlen;
                }
            }
        } while ((link = ps_lattice_traverse_next(dag, NULL)));
    }

    sorted[0] = cands;
    sorted[1] = NULL;

    utt->id = uttid;
    if (prob < 0.00001)
        utt->score = 0.00001;
    else
        utt->score = prob;
    //utt->length = dag ? ps_lattice_n_frames(dag) * frlen : 0;
    utt->length = filtbuf->len;
    utt->ncand = 1;
    utt->cands = sorted;
}
/*
 * Build the candidate list for the current decoder from its N-best
 * hypotheses.
 *
 * For each N-best entry the segmentation is scanned from "<s>": every word
 * becomes a token (start/end in frames times frlen), "<sil>" is skipped, and
 * the candidate is closed at "</s>" or once CANDIDATE_TOKEN_MAX tokens were
 * collected.  A segmentation that runs out without "</s>" still yields a
 * candidate when it produced tokens, with its score scaled by 0.9.  Candidate
 * scores are relative to the best hypothesis probability (floored at
 * 0.00000001).  At most CANDIDATE_MAX-1 candidates are produced, followed by
 * a zeroed entry; the list is then ordered with candidate_sort().
 */
static void acoustic_processor(context_t *ctx, srs_srec_utterance_t *utt, srs_srec_candidate_t *cands, srs_srec_candidate_t **sorted)
{
    filter_buf_t *filtbuf;
    decoder_set_t *decset;
    decoder_t *dec;
    logmath_t *lmath;
    const char *uttid;
    const char *hyp;
    int32 score;
    double prob;
    ps_nbest_t *nb;
    ps_seg_t *seg;
    int32_t frlen;
    int32 start, end;
    size_t ncand;
    srs_srec_candidate_t *cand = NULL;
    srs_srec_token_t *tkn;
    int32_t length;

    if (!ctx || !(filtbuf = ctx->filtbuf) ||
        !(decset = ctx->decset) || !(dec = decset->curdec))
        return;

    frlen = filtbuf->frlen;
    lmath = ps_get_logmath(dec->ps);
    uttid = "<unknown>";
    hyp = ps_get_hyp(dec->ps, &score, &uttid);
    prob = logmath_exp(lmath, score);
    length = 0;

    if (prob < 0.00000001)
        prob = 0.00000001;

    for (nb = ps_nbest(dec->ps, 0,-1, NULL,NULL), ncand = 0;
         nb != NULL;
         nb = ps_nbest_next(nb))
    {
        if (ncand >= CANDIDATE_MAX-1) {
            /* Leaving the loop early: the iterator has to be released here.
             * (The free used to sit after the break and never ran.) */
            ps_nbest_free(nb);
            break;
        }

        if ((seg = ps_nbest_seg(nb, &score))) {
            /* Skip everything before the sentence start marker. */
            while (seg && strcmp(ps_seg_word(seg), "<s>"))
                seg = ps_seg_next(seg);

            if (!seg)
                continue;

            ps_seg_frames(seg, &start, &end);

            cand = cands + ncand;
            cand->score = logmath_exp(lmath, score) / prob;
            cand->ntoken = 0;
            length = 0;

            while ((seg = ps_seg_next(seg))) {
                if ((hyp = ps_seg_word(seg))) {
                    if (!strcmp(hyp, "</s>") ||
                        cand->ntoken >= CANDIDATE_TOKEN_MAX)
                    {
                        ncand++;
                        //memset(cand+1, 0, sizeof(srs_srec_candidate_t));
                        ps_seg_frames(seg, &start, &end);
                        ps_seg_free(seg);
                        //printf("hyp=</s> ncand=%d\n", ncand);
                        length = (end + 1) * frlen;
                        break;
                    }
                    else if (!strcmp(hyp, "<sil>")) {
                        ps_seg_frames(seg, &start, &end);
                        //printf("hyp=<sil> skip it\n");
                    }
                    else {
                        tkn = cand->tokens + cand->ntoken++;
                        tkn->token = tknbase(hyp);
                        ps_seg_frames(seg, &start, &end);
                        tkn->start = start * frlen;
                        tkn->end = (end + 1) * frlen;
                        //printf("hyp=%s (%d, %d) tkn count %d\n",
                        //      tkn->token, tkn->start,tkn->end, cand->ntoken);
                    }
                }
            } /* while seg */

            if (!seg && cand->ntoken > 0) {
                ncand++;
                cand->score *= 0.9; /* some penalty */
                //memset(cand+1, 0, sizeof(srs_srec_candidate_t));
            }

            /* Only look at the last token when there is one; with no tokens
             * the old code read tokens[-1]. */
            if (!length && cand->ntoken > 0) {
                tkn = cand->tokens + (cand->ntoken - 1);
                length = tkn->end;
            }
        }
    } /* for nb */

    /* Zero the entry after the last candidate visited.  When no candidate was
     * visited at all, `cand` was never set, so zero the first slot instead --
     * presumably this zeroed entry is what candidate_sort() treats as the
     * end of the list. */
    memset(cand ? cand + 1 : cands, 0, sizeof(srs_srec_candidate_t));

    utt->id = uttid;
    utt->score = prob;
    //utt->length = length;
    utt->length = filtbuf->len;
    utt->ncand = candidate_sort(cands, sorted);
    utt->cands = sorted;
}
/* * Continuous recognition from a file */ static void recognize_from_file() { int16 adbuf[2048]; const char *fname; const char *hyp; int32 k; uint8 utt_started, in_speech; int32 print_times = cmd_ln_boolean_r(config, "-time"); fname = cmd_ln_str_r(config, "-infile"); if ((rawfd = fopen(fname, "rb")) == NULL) { E_FATAL_SYSTEM("Failed to open file '%s' for reading", fname); } if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) { char waveheader[44]; fread(waveheader, 1, 44, rawfd); if (!check_wav_header(waveheader, (int)cmd_ln_float32_r(config, "-samprate"))) E_FATAL("Failed to process file '%s' due to format mismatch.\n", fname); } if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".mp3") == 0) { E_FATAL("Can not decode mp3 files, convert input file to WAV 16kHz 16-bit mono before decoding.\n"); } ps_start_utt(ps); utt_started = FALSE; while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) { ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL); if (hyp != NULL) printf("%s\n", hyp); if (print_times) print_word_times(); fflush(stdout); ps_start_utt(ps); utt_started = FALSE; } } ps_end_utt(ps); if (utt_started) { hyp = ps_get_hyp(ps, NULL); if (hyp != NULL) { printf("%s\n", hyp); if (print_times) { print_word_times(); } } } fclose(rawfd); }