int decode() { int ret; int16 buf[BUFFER_SIZE]; printf("Listening for input...\n"); if (ad_start_rec(ad) < 0) { printf("Error starting recording.\n"); return 0; } //check if not silent while ((ret = cont_ad_read(c_ad, buf, BUFFER_SIZE)) == 0) usleep(1000); if (ps_start_utt(ps, NULL) < 0) { printf("Failed to start utterance.\n"); return 0; } ret = ps_process_raw(ps, buf, BUFFER_SIZE, 0, 0); if (ret < 0) { printf("Error decoding.\n"); return 0; } do { ret = cont_ad_read(c_ad, buf, BUFFER_SIZE); if (ret < 0) { printf("Failed to record audio.\n"); return 0; } else if(ret > 0) { // Valid speech data read. ret = ps_process_raw(ps, buf, 4096, 0, 0); if (ret < 0) { printf("Error decoding.\n"); return 0; } } else { //no data usleep(1000); } } while(getRun()); ad_stop_rec(ad); while (ad_read(ad, buf, BUFFER_SIZE) >= 0); cont_ad_reset(c_ad); ps_end_utt(ps); return 1; }
/**
 * Decode a complete audio sample received over Ice and return the text
 * hypothesis.
 *
 * @param signal  16-bit PCM samples forming one full utterance.
 * @param c       Ice invocation context (unused).
 * @return        The recognized text.
 * @throws PocketSphinxIce::Error on any decoder failure or empty result.
 */
std::string PocketSphinxServer::decode(const PocketSphinxIce::sample& signal, const Ice::Current& c)
{
    std::cout << "Decode\n";
    int32 score;

    if (ps_start_utt(ps) < 0)
        throw PocketSphinxIce::Error("Error in ps_start_utt");

    /* BUG FIX: the ps_process_raw return value was silently discarded. */
    if (ps_process_raw(ps, signal.data(), signal.size(), FALSE, FALSE) < 0)
        throw PocketSphinxIce::Error("Error in ps_process_raw");

    if (ps_end_utt(ps) < 0)
        throw PocketSphinxIce::Error("Error in ps_end_utt");

    const char* hyp = ps_get_hyp(ps, &score);
    if (!hyp)
        throw PocketSphinxIce::Error("ps_get_hyp returned NULL");

    std::cout << "return:" << hyp << '\n';
    return hyp;
}
int processRaw(const char *rawFile) { const char *hyp, *uttid; int16 buf[512]; int rv; int32 score; // Open the wav file passed from argument printf("file: %s\n", rawFile); fh = fopen(rawFile, "rb"); if (fh == NULL) { fprintf(stderr, "Unable to open input file %s\n", rawFile); return -1; } // Start utterance rv = ps_start_utt(ps); // Process buffer, 512 samples at a time while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } // Recieve the recognized string rv = ps_end_utt(ps); hyp = ps_get_hyp(ps, &score); printf("Recognized: |%s|\n", hyp); fflush(stdout); // Close file fclose(fh); return 0; }
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; //int i; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k", "-lm", MODELDIR "/lm/en/turtle.DMP", "-dict", MODELDIR "/lm/en/turtle.dic", NULL); if (config == NULL) return 1; ps = ps_init(config); if (ps == NULL) return 1; fh = fopen("goforward.raw", "rb"); if (fh == NULL) { perror("Failed to open goforward.raw"); return 1; } rv = ps_decode_raw(ps, fh, "goforward", -1); if (rv < 0) return 1; hyp = ps_get_hyp(ps, &score, &uttid); if (hyp == NULL) return 1; printf("Recognized: %s\n", hyp); fseek(fh, 0, SEEK_SET); rv = ps_start_utt(ps, "goforward"); if (rv < 0) return 1; while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } rv = ps_end_utt(ps); if (rv < 0) return 1; hyp = ps_get_hyp(ps, &score, &uttid); if (hyp == NULL) return 1; printf("Recognized: %s\n", hyp); fclose(fh); ps_free(ps); return 0; }
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-lm", MODELDIR "/en-us/en-us.lm.bin", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", NULL); if (config == NULL) { fprintf(stderr, "Failed to create config object, see log for details\n"); return -1; } // Initialize pocketsphinx ps = ps_init(config); if (ps == NULL) { fprintf(stderr, "Failed to create recognizer, see log for details\n"); return -1; } // Open the wav file passed from argument printf("file: %s\n", argv[1]); fh = fopen(argv[1], "rb"); if (fh == NULL) { fprintf(stderr, "Unable to open input file %s\n", argv[1]); return -1; } // Start utterance rv = ps_start_utt(ps); // Process buffer, 512 samples at a time while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } // Recieve the recognized string rv = ps_end_utt(ps); hyp = ps_get_hyp(ps, &score); printf("Recognized: |%s|\n", hyp); // free memory fclose(fh); ps_free(ps); cmd_ln_free_r(config); return 0; }
/**
 * Feed one buffer of 16-bit PCM samples to the decoder and refresh the
 * cached hypothesis (`current_hyp`, plus member `score`/`sentence_id`).
 *
 * @param buffer  non-empty block of samples.
 * @return BAD_STATE if the decoder is missing or not recording,
 *         RUNTIME_ERROR for an empty buffer, SUCCESS otherwise.
 */
ReturnType Recognizer::process(const std::vector<int16_t>& buffer)
{
    if ((decoder == NULL) || (!is_recording))
        return BAD_STATE;
    if (buffer.empty())
        return RUNTIME_ERROR;

    /* ps_process_raw takes a non-const pointer but does not modify the
     * samples; make the const stripping explicit instead of hiding it in
     * a C-style cast. */
    ps_process_raw(decoder, const_cast<int16_t*>(buffer.data()),
                   buffer.size(), 0, 0);

    const char* h = ps_get_hyp(decoder, &score, &sentence_id);
    current_hyp = (h == NULL) ? "" : h;
    return SUCCESS;
}
/** * You must sucessfully call spInitListener * once before using this function. * * Reads the next block of audio from the microphone * up to SPLBUFSIZE number of samples * (defined in splistener.h). * If an utterance was completed in that block, * the transcription is stored in the string words. * * When calling this function in a realtime loop, delay * by some amount of time between calls keeping in mind * your recording's sample rate and maximum samples read * per call (ex. sleep the thread for 100 milliseconds) * so that some audio can be recorded for the next call. * * @return true if a speech session was completed and * transcribed this block, otherwise false. */ static bool spDecode() { static bool uttered = false; // lock pocketsphinx resources to make sure they // don't get freed by main thread while in use std::lock_guard<std::mutex> ps_lock(ps_mtx); if(!mic || !ps) return false; int samples_read = ad_read(mic, buf, SPLBUFSIZE); if (samples_read <= 0) { spError("failed to read audio :("); return false; } ps_process_raw(ps, buf, samples_read, FALSE, FALSE); bool talking = ps_get_in_speech(ps); // Just started talking if (talking && !uttered) { uttered = true; return false; } // Stopped talking, so transcribe what was said // and begin the next utterance if (!talking && uttered) { ps_end_utt(ps); const char *trans = ps_get_hyp(ps, NULL); if (ps_start_utt(ps) < 0) { spError("failed to start utterance :("); } uttered = false; int l = strlen(trans); if (trans && l > 0) { std::lock_guard<std::mutex> lock(words_mtx); if (words && l + 1 > words_buf_size) { delete words; words = NULL; } if (!words) { words = new char[l + 1]; words_buf_size = l + 1; } std::copy(trans, trans + l, words); words[l] = '\0'; return true; } } return false; }
/*
 * Decode raw 16-bit PCM from an open file handle as one utterance.
 * Returns the total number of samples processed.
 *
 * Strategy: if a sample budget is given, or the stream is seekable, slurp
 * everything into one buffer and decode it as a full utterance; otherwise
 * (e.g. a pipe) stream it through in 256-sample chunks.
 */
long
ps_decode_raw(ps_decoder_t *ps, FILE *rawfh,
              long maxsamps)
{
    int16 *data;
    long total, pos, endpos;

    ps_start_stream(ps);
    ps_start_utt(ps);
    /* If this file is seekable or maxsamps is specified, then decode
     * the whole thing at once. */
    if (maxsamps != -1) {
        data = ckd_calloc(maxsamps, sizeof(*data));
        total = fread(data, sizeof(*data), maxsamps, rawfh);
        /* full_utt=TRUE: the decoder sees the complete utterance at once. */
        ps_process_raw(ps, data, total, FALSE, TRUE);
        ckd_free(data);
    } else if ((pos = ftell(rawfh)) >= 0) {
        fseek(rawfh, 0, SEEK_END);
        endpos = ftell(rawfh);
        fseek(rawfh, pos, SEEK_SET);
        /* NOTE(review): endpos - pos is a byte count, so this allocates
         * twice as many samples as remain; harmless since fread() below
         * returns the true sample count — confirm against upstream. */
        maxsamps = endpos - pos;
        data = ckd_calloc(maxsamps, sizeof(*data));
        total = fread(data, sizeof(*data), maxsamps, rawfh);
        ps_process_raw(ps, data, total, FALSE, TRUE);
        ckd_free(data);
    } else {
        /* Otherwise decode it in a stream. */
        total = 0;
        while (!feof(rawfh)) {
            int16 data[256];
            size_t nread;

            nread = fread(data, sizeof(*data), sizeof(data)/sizeof(*data), rawfh);
            ps_process_raw(ps, data, nread, FALSE, FALSE);
            total += nread;
        }
    }
    ps_end_utt(ps);
    return total;
}
/*
 * Recorder callback: persist one captured buffer to the debug output file
 * and feed it to the decoder under the decoder mutex.
 *
 * NOTE(review): the 4th argument (no_search) is TRUE, so the samples are
 * buffered but the search is not advanced here — presumably another thread
 * drives the search; confirm against the rest of the class.
 */
void GqAndroidSphinx::received_buf_from_recorder(short *record_buf, unsigned long buf_size_in_byte)
{
    /* Dump raw audio for offline inspection. */
    m_fout.write((char *) record_buf, buf_size_in_byte);

    /* Serialize access to the shared decoder. */
    pthread_mutex_lock(&m_pt_mutex);
    LOGD("received_buf_from_recorder before");
    /* The length argument is in bytes; ps_process_raw wants samples. */
    ps_process_raw(m_pdecoder, record_buf, buf_size_in_byte / sizeof(int16), TRUE, FALSE);
    LOGD("received_buf_from_recorder end");
    pthread_mutex_unlock(&m_pt_mutex);
}
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-keyphrase", "marieta", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", "-kws_threshold", "1e-30", NULL); if (config == NULL) { fprintf(stderr, "Failed to create config object, see log for details\n"); return -1; } ps = ps_init(config); if (ps == NULL) { fprintf(stderr, "Failed to create recognizer, see log for details\n"); return -1; } fh = fopen("data/marieta.raw", "rb"); if (fh == NULL) { fprintf(stderr, "Unable to open input file goforward.raw\n"); return -1; } rv = ps_start_utt(ps); while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } rv = ps_end_utt(ps); hyp = ps_get_hyp(ps, &score); printf("Recognized: %s\n", hyp); fclose(fh); ps_free(ps); cmd_ln_free_r(config); return 0; }
/*
 * Recognition thread body: pull audio from the monitor node, resample it
 * to 16 kHz mono, feed it to PocketSphinx, and fire mHandler on every
 * speech -> silence transition until mStop is set.
 */
void Recognizer::run()
{
    // Create audio converter (source rate/channels -> 16 kHz mono):
    auto converter = ci::audio::dsp::Converter::create(
        mMonitorNode->getSampleRate(), 16000,
        mMonitorNode->getNumChannels(), 1,
        mMonitorNode->getFramesPerBlock() );

    // Create buffer for converted audio:
    ci::audio::Buffer destBuffer( converter->getDestMaxFramesPerBlock(),
                                  converter->getDestNumChannels() );

    bool utt_started, in_speech;

    if( ps_start_utt( mDecoder ) < 0 )
        throw std::runtime_error( "Could not start utterance" );
    utt_started = false;

    while( ! mStop ) {
        // Convert buffer; .second is the number of frames produced.
        std::pair<size_t,size_t> convertResult =
            converter->convert( &( mMonitorNode->getBuffer() ), &destBuffer );

        // Convert float samples to 16-bit PCM for the decoder:
        // NOTE(review): raw new/delete each iteration — a reusable
        // std::vector member would avoid the per-block allocation.
        int16_t* data = new int16_t[ convertResult.second ];
        convertFloatToInt16( destBuffer.getData(), data, convertResult.second );

        // Process buffer:
        ps_process_raw( mDecoder, data, convertResult.second, false, false );

        // Cleanup buffer data:
        delete[] data;

        in_speech = static_cast<bool>( ps_get_in_speech( mDecoder ) );
        if( in_speech && ! utt_started ) {
            utt_started = true;
        }
        if( ! in_speech && utt_started ) {
            // Start new utterance on speech to silence transition:
            ps_end_utt( mDecoder );
            // Pass to handler:
            if( mHandler ) mHandler->event( mDecoder );
            // Prepare for next utterance:
            if( ps_start_utt( mDecoder ) < 0 )
                throw std::runtime_error( "Could not start utterance" );
            utt_started = false;
        }

        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
    }
}
/*
 * Pull one chunk of speech from the continuous-listening module and drive
 * the utterance state machine:
 *   - silence and nothing pending      -> no-op
 *   - >1s of silence after speech      -> finish utterance, emit hypothesis
 *   - speech                           -> (start utterance if needed) decode
 */
static void gst_sphinx_sink_process_chunk (GstSphinxSink *sphinxsink)
{
    int32 k;
    int16 adbuf[REQUIRED_FRAME_SAMPLES];

    k = cont_ad_read (sphinxsink->cont, adbuf, REQUIRED_FRAME_SAMPLES);

    if (k == 0 && sphinxsink->last_ts == 0) {
        /* Silence and no utterance in progress. */
        return;
    } else if (k == 0 && sphinxsink->cont->read_ts - sphinxsink->last_ts > DEFAULT_SAMPLES_PER_SEC) {
        /* More than one second of silence since the last speech: the
         * utterance is over — finalize and publish the result. */
        int32 score;
        const char *hyp;
        char *stripped_hyp;
        int i, j;

        ps_end_utt (sphinxsink->decoder);
        if ((hyp = ps_get_hyp (sphinxsink->decoder, &score, NULL)) == NULL) {
            gst_sphinx_sink_send_message (sphinxsink, "message", "");
            g_message("Not Recognized");
        } else {
            /* Strip parentheses and digits (alternate-pronunciation
             * markers like "WORD(2)") from the hypothesis. */
            stripped_hyp = g_malloc (strlen (hyp) + 1);
            for (i=0, j=0; hyp[i] != 0; i++) {
                if (hyp[i] != '(' && hyp[i] != ')' &&
                    (hyp[i] < '0' || hyp[i] > '9')) {
                    stripped_hyp[j++] = hyp[i];
                }
            }
            stripped_hyp [j] = 0;
            /* NOTE(review): stripped_hyp is g_malloc'd and not freed here —
             * confirm gst_sphinx_sink_send_message takes ownership. */
            gst_sphinx_sink_send_message (sphinxsink, "message", stripped_hyp);
            g_message("Recognized: %s", stripped_hyp);
        }
        sphinxsink->last_ts = 0;
        sphinxsink->ad.listening = 0;
    } else if (k != 0) {
        /* Speech data: lazily open the utterance, then decode the chunk. */
        if (sphinxsink->ad.listening == 0) {
            ps_start_utt (sphinxsink->decoder, NULL);
            sphinxsink->ad.listening = 1;
            gst_sphinx_sink_send_message (sphinxsink, "listening", NULL);
        }
        ps_process_raw (sphinxsink->decoder, adbuf, k, 0, 0);
        sphinxsink->last_ts = sphinxsink->cont->read_ts;
    }
}
/* * Continuous recognition from mic */ int recognize_from_mic() { ad_rec_t *ad; int16 adbuf[2048]; const char *fname; const char* seg; int32 k; char str[1000]=""; uint8 utt_started, in_speech; if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),16000)) == NULL) perror("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) perror("Failed to start recording\n"); ps_start_utt(ps); utt_started = FALSE; ps_seg_t *psegt; while (!finished) { if ((k = ad_read(ad, adbuf, 2048)) < 0) perror("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); psegt = ps_seg_iter(ps, NULL); while (psegt!=NULL){ seg = ps_seg_word(psegt); strncpy_s( str, seg, strlen(seg)); listenCallback(str); printf("%s\n", seg); int prob = ps_seg_prob(psegt,NULL,NULL,NULL); printf("%d\n", prob); psegt = ps_seg_next(psegt); } ps_start_utt(ps); utt_started = FALSE; } Sleep(100); } ps_end_utt(ps); fclose(rawfd); return 0; }
/* * Main utterance processing loop: * for (;;) { * start utterance and wait for speech to process * decoding till end-of-utterance silence will be detected * print utterance result; * } */ static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), (int) cmd_ln_float32_r(config, "-samprate"))) == NULL) E_FATAL("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; E_INFO("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) { printf("%s\n", hyp); fflush(stdout); } if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); } sleep_msec(100); } ad_close(ad); }
static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = ad_open_dev(AUDIO_DEVICE_NAME, (int) SAMPLE_RATE )) == NULL) { E_FATAL("Failed to open audio device\n"); } if (ad_start_rec(ad) < 0) { E_FATAL("Failed to start recording\n"); } if (ps_start_utt(ps) < 0) { E_FATAL("Failed to start utterance\n"); } utt_started = FALSE; printf("READY....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; printf("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) printf("%s\n", hyp); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; printf("READY....\n"); } sleep_msec(100); } ad_close(ad); }
/* * Continuous recognition from a file */ int recognize_from_file() { int16 adbuf[2048]; const char *fname; const char *hyp; int32 k; char str[1000]=""; uint8 utt_started, in_speech; fname = "C:/Users/Reza/Documents/GitHub/speech_agent/presentation_samples/italy1_reza.wav"; rawfd = fopen(fname, "rb"); if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) { char waveheader[44]; fread(waveheader, 1, 44, rawfd); } ps_start_utt(ps); utt_started = FALSE; while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) { ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL); if (hyp != NULL){ strncpy_s( str, hyp, strlen(hyp)); printf("%s\n", hyp); listenCallback(str); } ps_start_utt(ps); utt_started = FALSE; } } ps_end_utt(ps); fclose(rawfd); return 0; }
int main(int argc, char *argv[]) { ps_decoder_t *ps; cmd_ln_t *config; FILE *fh; char const *hyp, *uttid; int16 buf[512]; int rv; int32 score; config = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-lm", MODELDIR "/en-us/en-us.lm.dmp", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", NULL); if (config == NULL) return 1; ps = ps_init(config); if (ps == NULL) return 1; fh = fopen("goforward.raw", "rb"); if (fh == NULL) return -1; rv = ps_start_utt(ps); if (rv < 0) return 1; while (!feof(fh)) { size_t nsamp; nsamp = fread(buf, 2, 512, fh); rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); } rv = ps_end_utt(ps); if (rv < 0) return 1; hyp = ps_get_hyp(ps, &score); if (hyp == NULL) return 1; printf("Recognized: %s\n", hyp); fclose(fh); ps_free(ps); cmd_ln_free_r(config); return 0; }
/*! function to feed audio to the ASR
 *
 * FreeSWITCH callback: called for every media frame. While the recognizer
 * is READY and has no pending text, run the energy-based stop detector;
 * once it fires and a non-empty hypothesis exists, finalize the utterance
 * and cache the text (PSFLAG_HAS_TEXT). Audio is only decoded while
 * ps->listening is set. All decoder access is under flag_mutex.
 */
static switch_status_t pocketsphinx_asr_feed(switch_asr_handle_t *ah, void *data, unsigned int len, switch_asr_flag_t *flags)
{
    pocketsphinx_t *ps = (pocketsphinx_t *) ah->private_info;
    int rv = 0;

    if (switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED))
        return SWITCH_STATUS_BREAK;

    if (!switch_test_flag(ps, PSFLAG_HAS_TEXT) && switch_test_flag(ps, PSFLAG_READY)) {
        /* len is in bytes; /2 converts to 16-bit sample count. */
        if (stop_detect(ps, (int16_t *) data, len / 2)) {
            char const *hyp;

            switch_mutex_lock(ps->flag_mutex);
            /* First probe: only end the utterance if there is any text. */
            if ((hyp = ps_get_hyp(ps->ps, &ps->score, &ps->uttid))) {
                if (!zstr(hyp)) {
                    ps_end_utt(ps->ps);
                    switch_clear_flag(ps, PSFLAG_READY);
                    /* Re-fetch after ps_end_utt: the final pass can still
                     * drop the hypothesis, in which case we re-arm. */
                    if ((hyp = ps_get_hyp(ps->ps, &ps->score, &ps->uttid))) {
                        if (zstr(hyp)) {
                            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Lost the text, never mind....\n");
                            ps_start_utt(ps->ps, NULL);
                            switch_set_flag(ps, PSFLAG_READY);
                        } else {
                            /* Keep a pool-owned copy for asr_get_results. */
                            ps->hyp = switch_core_strdup(ah->memory_pool, hyp);
                            switch_set_flag(ps, PSFLAG_HAS_TEXT);
                        }
                    }
                }
            }
            switch_mutex_unlock(ps->flag_mutex);
        }

        /* only feed ps_process_raw when we are listening */
        if (ps->listening) {
            switch_mutex_lock(ps->flag_mutex);
            rv = ps_process_raw(ps->ps, (int16 *) data, len / 2, FALSE, FALSE);
            switch_mutex_unlock(ps->flag_mutex);
        }

        if (rv < 0) {
            return SWITCH_STATUS_FALSE;
        }
    }
    return SWITCH_STATUS_SUCCESS;
}
/*
 * Unreal worker-thread entry point: record from the default device and
 * decode silence-delimited utterances until StopTaskCounter is raised,
 * forwarding each hypothesis to the manager.
 *
 * NOTE(review): ad, k, adbuf, in_speech and utt_started are not declared
 * here — presumably class members; confirm in the header.
 *
 * Returns 0 on clean shutdown, 1-3 identifying the failed init step.
 */
uint32 FSpeechRecognitionWorker::Run()
{
    char const *hyp;

    // attempt to open the default recording device
    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
                          (int)cmd_ln_float32_r(config, "-samprate"))) == NULL) {
        ClientMessage(FString(TEXT("Failed to open audio device")));
        return 1;
    }
    if (ad_start_rec(ad) < 0) {
        ClientMessage(FString(TEXT("Failed to start recording")));
        return 2;
    }
    if (ps_start_utt(ps) < 0) {
        ClientMessage(FString(TEXT("Failed to start utterance")));
        return 3;
    }

    while (StopTaskCounter.GetValue() == 0) {
        /* NOTE(review): a failed ad_read only logs and still feeds a
         * negative k to ps_process_raw — verify upstream handling. */
        if ((k = ad_read(ad, adbuf, 1024)) < 0)
            ClientMessage(FString(TEXT("Failed to read audio")));
        ps_process_raw(ps, adbuf, k, 0, 0);

        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = 1;
        }
        if (!in_speech && utt_started) {
            /* speech -> silence transition, time to start new utterance */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);
            if (hyp != NULL)
                Manager->WordSpoken_method(FString(hyp));
            if (ps_start_utt(ps) < 0)
                ClientMessage(FString(TEXT("Failed to start")));
            utt_started = 0;
        }
    }

    ad_close(ad);
    return 0;
}
/*
 * GStreamer chain function: decode every incoming buffer and periodically
 * emit a partial-result signal when the hypothesis changes.
 *
 * Takes ownership of `buffer` (unreffed before return).
 */
static GstFlowReturn
gst_pocketsphinx_chain(GstPad * pad, GstBuffer * buffer)
{
    GstPocketSphinx *ps;

    ps = GST_POCKETSPHINX(GST_OBJECT_PARENT(pad));

    /* Start an utterance for the first buffer we get (i.e. we assume
     * that the VADER is "leaky") */
    if (!ps->listening) {
        ps->listening = TRUE;
        ps_start_utt(ps->ps, NULL);
    }
    ps_process_raw(ps->ps,
                   (short *)GST_BUFFER_DATA(buffer),
                   GST_BUFFER_SIZE(buffer) / sizeof(short),
                   FALSE, FALSE);

    /* Get a partial result every now and then, see if it is different. */
    if (ps->last_result_time == 0
        /* Check every 100 milliseconds.
         * BUG FIX: buffer timestamps are in nanoseconds; the original
         * constant 100*10*1000 is only 1 ms, not the stated 100 ms. */
        || (GST_BUFFER_TIMESTAMP(buffer) - ps->last_result_time) > 100 * 1000 * 1000) {
        int32 score;
        char const *hyp;
        char const *uttid;

        hyp = ps_get_hyp(ps->ps, &score, &uttid);
        ps->last_result_time = GST_BUFFER_TIMESTAMP(buffer);
        if (hyp && strlen(hyp) > 0) {
            /* Only signal when the text actually changed. */
            if (ps->last_result == NULL || 0 != strcmp(ps->last_result, hyp)) {
                g_free(ps->last_result);
                ps->last_result = g_strdup(hyp);
                /* Emit a signal for applications. */
                g_signal_emit(ps, gst_pocketsphinx_signals[SIGNAL_PARTIAL_RESULT],
                              0, hyp, uttid);
            }
        }
    }
    gst_buffer_unref(buffer);
    return GST_FLOW_OK;
}
/*
 * Command loop: for each detected utterance, start decoding, record until
 * silence, flush the device, and print the hypothesis.
 *
 * NOTE(review): audioBuf is fed to ps_process_raw before anything visibly
 * writes into it — presumably waitForNextUtterance() fills a shared buffer
 * or this initial feed is a latent bug; confirm against its definition.
 *
 * Returns 0 on clean shutdown (run cleared), -1 on any failure.
 */
int processCommands()
{
    int32 samples;
    int16 audioBuf[BUFFER_SIZE];
    char const *uttid;
    char const *hyp;

    while (run) {
        printf("Waiting for utterance...\n");
        samples = waitForNextUtterance();
        if (samples < 0)
            return -1;

        if (ps_start_utt(psDecoder, NULL) < 0) {
            fprintf(stderr, "Failed to start next utterance\n");
            return -1;
        }
        /* Feed the initial speech block, then record() streams the rest. */
        ps_process_raw(psDecoder, audioBuf, samples, FALSE, FALSE);

        printf("Recording...\n");
        fflush(stdout);
        record();

        /* Drain the device and reset the silence filter before decoding
         * finishes, so the next session starts clean. */
        ad_stop_rec(audioDevice);
        while (ad_read(audioDevice, audioBuf, BUFFER_SIZE) >= 0);
        cont_ad_reset(continousAudoDevice);

        ps_end_utt(psDecoder);
        hyp = ps_get_hyp(psDecoder, NULL, &uttid);
        /* NOTE(review): hyp may be NULL here — printf("%s", NULL) is UB. */
        printf("Heard: %s\n", hyp);

        if (ad_start_rec(audioDevice) < 0) {
            fprintf(stderr, "Failed to start audio device.\n");
            return -1;
        }
    }
    return 0;
}
/*
 * Stream an audio file from the app's data directory through the decoder,
 * 512 samples at a time, then hand off to engineClose().
 *
 * @param filename  name relative to directoryString.
 * @return 0 on success, -1 if the file cannot be opened, 1 on decoder error.
 */
int ofApp::engineOpen(string filename)
{
    FILE *fh;
    int16 buf[512];
    int rv;

    fh = fopen((directoryString + filename).c_str(), "rb");
    if (fh == NULL) {
        return -1;
    }

    rv = ps_start_utt(ps);
    if (rv < 0) {
        fclose(fh);  /* BUG FIX: handle was leaked on this path */
        return 1;
    }

    while (!feof(fh)) {
        size_t nsamp;
        nsamp = fread(buf, 2, 512, fh);
        rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE);
    }

    fclose(fh);      /* BUG FIX: the file was never closed */
    engineClose();
    return 0;        /* BUG FIX: control fell off the end of a non-void
                      * function, which is undefined behavior */
}
/*
 * Stream speech from the continuous-listening device into the decoder
 * until either one second of silence follows the last speech or the
 * global `run` flag is cleared.
 */
void record()
{
    int32 samples, timeStamp, rem;
    int16 audioBuf[BUFFER_SIZE];

    /* Timestamp of the most recently seen speech. */
    timeStamp = continousAudoDevice->read_ts;

    while (run) {
        samples = cont_ad_read(continousAudoDevice, audioBuf, BUFFER_SIZE);
        if (samples == 0) {
            /* Silence: stop once >1s has passed since the last speech. */
            if ((continousAudoDevice->read_ts - timeStamp) > DEFAULT_SAMPLES_PER_SEC)
                break;
        } else {
            /* Speech: remember when we last heard it. */
            timeStamp = continousAudoDevice->read_ts;
        }
        /* Decode whatever was read (a zero-length feed is harmless). */
        rem = ps_process_raw(psDecoder, audioBuf, samples, FALSE, FALSE);
        /* Nothing decoded and nothing read: idle briefly. */
        if ((rem == 0) && (samples == 0))
            usleep(20000);
    }
}
/*
 * Flush the buffered, filtered samples of one utterance: optionally append
 * them to the raw-capture fd, then feed them to the current decoder.
 *
 * @param ctx             plugin context; tolerates NULL / partially
 *                        initialized state (silently returns).
 * @param full_utterance  passed through to ps_process_raw as full_utt.
 */
void filter_buffer_utter(context_t *ctx, bool full_utterance)
{
    decoder_set_t *decset;
    decoder_t *dec;
    filter_buf_t *filtbuf;
    int sts, cnt, size;

    /* Bail out quietly if any piece of the pipeline is missing. */
    if (!ctx || !(decset = ctx->decset) || !(dec = decset->curdec) ||
        !(filtbuf = ctx->filtbuf))
        return;

    mrp_debug("utterance length %d samples", filtbuf->len);

    if (filtbuf->len > 0) {
        if (filtbuf->fdrec >= 0) {
            /* Record the raw samples; retry only on EINTR, log and give
             * up on any other short/failed write. */
            size = filtbuf->len * sizeof(int16);
            for (;;) {
                cnt = write(filtbuf->fdrec, filtbuf->buf, size);
                if (cnt != size) {
                    if (cnt < 0 && errno == EINTR)
                        continue;
                    mrp_log_error("failed to record samples (fd %d): %s",
                                  filtbuf->fdrec, strerror(errno));
                }
                break;
            }
        }

        sts = ps_process_raw(dec->ps, filtbuf->buf, filtbuf->len,
                             FALSE, full_utterance);
        if (sts < 0)
            mrp_log_error("Failed to process %d samples", filtbuf->len);
    }
}
/*
 * Per-frame ASR callback: run the decoder over one audio frame and, on a
 * speech -> silence transition, attach the finished hypothesis to the
 * frame as "lavfi.asr.text" metadata before passing it downstream.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    ASRContext *s = ctx->priv;
    AVDictionary **meta = &in->metadata;
    const char *text;
    int speech_now;

    ps_process_raw(s->ps, (const int16_t *)in->data[0], in->nb_samples, 0, 0);

    speech_now = ps_get_in_speech(s->ps);
    if (speech_now && !s->utt_started)
        s->utt_started = 1;
    if (!speech_now && s->utt_started) {
        /* Utterance just ended: publish its text and re-arm the decoder. */
        ps_end_utt(s->ps);
        text = ps_get_hyp(s->ps, NULL);
        if (text != NULL)
            av_dict_set(meta, "lavfi.asr.text", text, 0);
        ps_start_utt(s->ps);
        s->utt_started = 0;
    }

    return ff_filter_frame(ctx->outputs[0], in);
}
/*
 * Continuous recognition from a file
 *
 * Streams -infile through the decoder, printing one hypothesis per
 * silence-delimited utterance (plus word timings when -time is set).
 * Rejects mp3 input and WAV files whose header disagrees with -samprate.
 */
static void recognize_from_file()
{
    int16 adbuf[2048];
    const char *fname;
    const char *hyp;
    int32 k;
    uint8 utt_started, in_speech;
    int32 print_times = cmd_ln_boolean_r(config, "-time");

    fname = cmd_ln_str_r(config, "-infile");
    if ((rawfd = fopen(fname, "rb")) == NULL) {
        E_FATAL_SYSTEM("Failed to open file '%s' for reading",
                       fname);
    }

    /* Consume and validate the 44-byte canonical WAV header. */
    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) {
        char waveheader[44];
        fread(waveheader, 1, 44, rawfd);
        if (!check_wav_header(waveheader, (int)cmd_ln_float32_r(config, "-samprate")))
            E_FATAL("Failed to process file '%s' due to format mismatch.\n", fname);
    }

    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".mp3") == 0) {
        E_FATAL("Can not decode mp3 files, convert input file to WAV 16kHz 16-bit mono before decoding.\n");
    }

    ps_start_utt(ps);
    utt_started = FALSE;

    while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        }
        if (!in_speech && utt_started) {
            /* Speech -> silence: report this utterance, start the next. */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);
            if (hyp != NULL)
                printf("%s\n", hyp);
            if (print_times)
                print_word_times();
            fflush(stdout);

            ps_start_utt(ps);
            utt_started = FALSE;
        }
    }
    ps_end_utt(ps);

    /* Flush an utterance still open when the file ended mid-speech. */
    if (utt_started) {
        hyp = ps_get_hyp(ps, NULL);
        if (hyp != NULL) {
            printf("%s\n", hyp);
            if (print_times) {
                print_word_times();
            }
        }
    }

    fclose(rawfd);
}
/*
 * Main utterance processing loop:
 *     for (;;) {
 *        wait for start of next utterance;
 *        decode utterance until silence of at least 1 sec observed;
 *        print utterance result;
 *     }
 *
 * Robot-command front end: each utterance of the form "GUGGUG <cmd> [arg]"
 * is translated into LED/TRACKING/TURN/MOVE control lines on stdout.
 */
static void recognize_from_microphone()
{
    ad_rec_t *ad;
    int16 adbuf[4096];
    int32 k, ts, rem;
    char const *hyp;
    char const *uttid;
    cont_ad_t *cont;
    char word[256];          /* first word — expected keyword "GUGGUG" */
    char c1[256], c2[256];   /* command and optional argument */
    int tracking = 0;
    int halted = 0;
    int LEFT = 0;
    int RIGHT = 1;
    int MOVE_CENT = 100;     //1 meter
    int numwords;

    /* Line-buffer stdout so the robot controller sees commands promptly. */
    setlinebuf(stdout);

    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
                          (int)cmd_ln_float32_r(config, "-samprate"))) == NULL)
        E_FATAL("Failed to open audio device\n");

    /* Initialize continuous listening module */
    if ((cont = cont_ad_init(ad, ad_read)) == NULL)
        E_FATAL("Failed to initialize voice activity detection\n");

    if (ad_start_rec(ad) < 0)
        E_FATAL("Failed to start recording\n");
    if (cont_ad_calib(cont) < 0)
        E_FATAL("Failed to calibrate voice activity detection\n");

    printf("LEDON BLUE\n");

    for (;;) {
        /* Indicate listening for next utterance */
        fprintf(stderr, "READY....\n");
        fflush(stderr);

        /* Wait data for next utterance */
        while ((k = cont_ad_read(cont, adbuf, 4096)) == 0)
            sleep_msec(100);

        if (k < 0)
            E_FATAL("Failed to read audio\n");

        /*
         * Non-zero amount of data received; start recognition of new utterance.
         * NULL argument to uttproc_begin_utt => automatic generation of utterance-id.
         */
        if (ps_start_utt(ps, NULL) < 0)
            E_FATAL("Failed to start utterance\n");
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        fprintf(stderr, "Listening...\n");

        /* Note timestamp for this first block of data */
        ts = cont->read_ts;

        /* Decode utterance until end (marked by a "long" silence, >1sec) */
        for (;;) {
            /* Read non-silence audio data, if any, from continuous listening module */
            if ((k = cont_ad_read(cont, adbuf, 4096)) < 0)
                E_FATAL("Failed to read audio\n");
            if (k == 0) {
                /*
                 * No speech data available; check current timestamp with most recent
                 * speech to see if more than 1 sec elapsed.  If so, end of utterance.
                 */
                if ((cont->read_ts - ts) > DEFAULT_SAMPLES_PER_SEC)
                    break;
            } else {
                /* New speech data received; note current timestamp */
                ts = cont->read_ts;
            }

            /*
             * Decode whatever data was read above.
             */
            rem = ps_process_raw(ps, adbuf, k, FALSE, FALSE);

            /* If no work to be done, sleep a bit */
            if ((rem == 0) && (k == 0))
                sleep_msec(20);
        }

        /*
         * Utterance ended; flush any accumulated, unprocessed A/D data and stop
         * listening until current utterance completely decoded
         */
        ad_stop_rec(ad);
        while (ad_read(ad, adbuf, 4096) >= 0);
        cont_ad_reset(cont);

        fprintf(stderr, "Stopped listening, please wait...\n");
        fflush(stdout);

        /* Finish decoding, obtain and print result */
        ps_end_utt(ps);
        hyp = ps_get_hyp(ps, NULL, &uttid);
        fprintf(stderr, "%s: %s\n", uttid, hyp);

        /* Dispatch robot commands when the keyword "GUGGUG" leads. */
        if (hyp) {
            numwords = sscanf(hyp, "%s %s %s", word, c1, c2);
            if(strcmp(word, "GUGGUG") == 0) {
                /* HALT / RESUME toggle the halted state and the LED. */
                if(strcmp(c1, "HALT") == 0) {
                    printf("LEDOFF BLUE\n");
                    halted = 1;
                }
                else if(strcmp(c1, "RESUME") == 0) {
                    printf("LEDON BLUE\n");
                    halted = 0;
                }
                /* BEGIN/START TRACKING and STOP TRACKING toggle tracking. */
                if(strcmp(c1, "BEGIN") == 0 || strcmp(c1, "START") == 0) {
                    if(strcmp(c2, "TRACKING") == 0 && !tracking) {
                        printf("START TRACKING\n");
                        tracking = 1;
                        halted = 0;
                    }
                }
                else if(strcmp(c1, "STOP") == 0) {
                    if(strcmp(c2, "TRACKING") == 0 && tracking) {
                        printf("STOP TRACKING\n");
                        tracking = 0;
                    }
                }
                /* Motion commands require idle (not tracking, not halted)
                 * and a full three-word utterance. */
                if(!tracking && !halted && numwords == 3) {
                    if(strcmp(c1, "TURN") == 0) {
                        if(strcmp(c2, "AROUND") == 0) {
                            printf("TURN %d 180\n", LEFT);
                        }
                        else if(strcmp(c2, "LEFT") == 0) {
                            printf("TURN %d 90\n", LEFT);
                        }
                        else if(strcmp(c2, "RIGHT") == 0) {
                            printf("TURN %d 90\n", RIGHT);
                        }
                    }
                    else if(strcmp(c1, "MOVE") == 0) {
                        if(strcmp(c2, "FORWARD") == 0) {
                            printf("MOVE 0 %d\n", MOVE_CENT);
                        }
                        else if(strcmp(c2, "BACKWARD") == 0) {
                            printf("MOVE 1 %d\n", MOVE_CENT);
                        }
                    }
                }
            }
        }

        /* Resume A/D recording for next utterance */
        if (ad_start_rec(ad) < 0)
            E_FATAL("Failed to start recording\n");
    }

    cont_ad_close(cont);
    ad_close(ad);
}
/*
 * Continuous recognition from a file
 *
 * Wraps -infile in a fake A/D device so the cont_ad silence filter can
 * segment it; each silence-delimited utterance is decoded and either its
 * word timings (-time) or its hypothesis are printed.
 */
static void recognize_from_file()
{
    cont_ad_t *cont;
    ad_rec_t file_ad = {0};
    int16 adbuf[4096];
    const char* hyp;
    const char* uttid;
    int32 k, ts, start;
    char waveheader[44];

    if ((rawfd = fopen(cmd_ln_str_r(config, "-infile"), "rb")) == NULL) {
        E_FATAL_SYSTEM("Failed to open file '%s' for reading",
                       cmd_ln_str_r(config, "-infile"));
    }

    /* Consume the 44-byte WAV header before calibration. */
    fread(waveheader, 1, 44, rawfd);

    /* Fake device descriptor: sample rate and bytes-per-sample only. */
    file_ad.sps = (int32)cmd_ln_float32_r(config, "-samprate");
    file_ad.bps = sizeof(int16);

    if ((cont = cont_ad_init(&file_ad, ad_file_read)) == NULL) {
        E_FATAL("Failed to initialize voice activity detection");
    }
    if (cont_ad_calib(cont) < 0)
        E_FATAL("Failed to calibrate voice activity detection\n");
    /* Calibration consumed audio; start decoding from the beginning.
     * NOTE(review): rewind() replays the WAV header bytes as audio —
     * confirm this is intended. */
    rewind (rawfd);

    for (;;) {
        /* Skip silence until the next utterance (or EOF: k < 0). */
        while ((k = cont_ad_read(cont, adbuf, 4096)) == 0);

        if (k < 0) {
            break;
        }

        if (ps_start_utt(ps, NULL) < 0)
            E_FATAL("ps_start_utt() failed\n");

        ps_process_raw(ps, adbuf, k, FALSE, FALSE);

        ts = cont->read_ts;
        /* Utterance start time in centiseconds, for word timing output. */
        start = ((ts - k) * 100.0) / file_ad.sps;

        for (;;) {
            if ((k = cont_ad_read(cont, adbuf, 4096)) < 0)
                break;

            if (k == 0) {
                /*
                 * No speech data available; check current timestamp with most recent
                 * speech to see if more than 1 sec elapsed. If so, end of utterance.
                 */
                if ((cont->read_ts - ts) > DEFAULT_SAMPLES_PER_SEC)
                    break;
            } else {
                /* New speech data received; note current timestamp */
                ts = cont->read_ts;
            }

            ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        }

        ps_end_utt(ps);

        if (cmd_ln_boolean_r(config, "-time")) {
            print_word_times(start);
        } else {
            hyp = ps_get_hyp(ps, NULL, &uttid);
            fprintf(stderr, "%s: %s\n", uttid, hyp);
        }
        fflush(stdout);
    }

    cont_ad_close(cont);
    fclose(rawfd);
}
/*
 * Voice-assistant main loop: sleep until a bluetooth key-press line appears
 * on the hcidump pipe, then stay ACTIVE decoding silence-delimited
 * utterances (handed to the interpreter `i`) until the configured Sleep
 * phrase — or a PLAY/PAUSE/SELECT ITEM command — puts it back to sleep.
 */
void listen::recognize_from_microphone(){
    ad_rec_t *ad;
    int16 adbuf[4096];
    int32 k, ts, rem;
    char buffer[128];
    char const *hyp;
    char const *uttid;
    cont_ad_t *cont;

    state = SLEEPING;
    /* Watch bluetooth traffic for the configured wake key-press. */
    FILE* pipe = popen(c.getValue("[General]", "Hcidump").c_str(), "r");
    std::string bufferStr;
    std::size_t found;

    if((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
                         (int)cmd_ln_float32_r(config, "-samprate"))) == NULL)
        E_FATAL("Failed to open audio device\n");

    /* Initialize continuous listening module */
    if((cont = cont_ad_init(ad, ad_read)) == NULL)
        E_FATAL("Failed to initialize voice activity detection\n");
    if(ad_start_rec(ad) < 0)
        E_FATAL("Failed to start recording\n");
    if(cont_ad_calib(cont) < 0)
        E_FATAL("Failed to calibrate voice activity detection\n");

    /* NOTE(review): loop condition keeps running while asleep even at
     * pipe EOF — confirm this is the intended exit behavior. */
    while(!feof(pipe) || state == SLEEPING){
        fgets(buffer, 128, pipe);
        bufferStr = buffer;
        found = bufferStr.find(c.getValue("[General]", "KeyPress"));
        if(found!=std::string::npos){
            /* Wake trigger seen: announce and enter the ACTIVE loop. */
            i.pauseIfPlaying();
            s.speakThis(c.getValue("[General]", "WakeUpPhrase"));
            state = ACTIVE;
            while(state != SLEEPING){
                /* Indicate listening for next utterance */
                printf("READY....\n");
                fflush(stdout);
                fflush(stderr);

                /* Wait data for next utterance */
                while ((k = cont_ad_read(cont, adbuf, 4096)) == 0){
                    sleep_msec(100);
                }

                if (k < 0)
                    E_FATAL("Failed to read audio\n");

                /*
                 * Non-zero amount of data received; start recognition of new utterance.
                 * NULL argument to uttproc_begin_utt => automatic generation of utterance-id.
                 */
                if (ps_start_utt(ps, NULL) < 0)
                    E_FATAL("Failed to start utterance\n");
                ps_process_raw(ps, adbuf, k, FALSE, FALSE);
                printf("Listening...\n");
                fflush(stdout);

                /* Note timestamp for this first block of data */
                ts = cont->read_ts;

                /* Decode utterance until end (marked by a "long" silence, >1sec) */
                for(;;){
                    /* Read non-silence audio data, if any, from continuous listening module */
                    if ((k = cont_ad_read(cont, adbuf, 4096)) < 0)
                        E_FATAL("Failed to read audio\n");
                    if (k == 0){
                        /*
                         * No speech data available; check current timestamp with most recent
                         * speech to see if more than 1 sec elapsed. If so, end of utterance.
                         */
                        if ((cont->read_ts - ts) > DEFAULT_SAMPLES_PER_SEC)
                            break;
                    } else {
                        /* New speech data received; note current timestamp */
                        ts = cont->read_ts;
                    }

                    /*
                     * Decode whatever data was read above.
                     */
                    rem = ps_process_raw(ps, adbuf, k, FALSE, FALSE);

                    /* If no work to be done, sleep a bit */
                    if ((rem == 0) && (k == 0))
                        sleep_msec(20);
                }

                /*
                 * Utterance ended; flush any accumulated, unprocessed A/D data and stop
                 * listening until current utterance completely decoded
                 */
                ad_stop_rec(ad);
                while (ad_read(ad, adbuf, 4096) >= 0);
                cont_ad_reset(cont);

                printf("Stopped listening, please wait...\n");
                fflush(stdout);

                /* Finish decoding, obtain and print result */
                ps_end_utt(ps);
                hyp = ps_get_hyp(ps, NULL, &uttid);
                fflush(stdout);

                /* Resume A/D recording for next utterance */
                if (ad_start_rec(ad) < 0)
                    E_FATAL("Failed to start recording\n");

                if(hyp != NULL){
                    /* Content comparison via std::string operator==. */
                    if(hyp == c.getValue("[General]", "Sleep")){
                        state = SLEEPING;
                        s.speakThis(c.getValue("[General]", "SleepPhrase"));
                    }else{
                        if(state != SLEEPING){
                            i.parse(hyp);
                        }
                        //Hack for play/pause/select
                        std::string hypStr = hyp;
                        if(hypStr == "PLAY ITEM" || hypStr == "PAUSE ITEM" || hypStr == "SELECT ITEM"){
                            state = SLEEPING;
                        }
                    }
                }
            }
        }
    }

    pclose(pipe);
    cont_ad_close(cont);
    ad_close(ad);
}
/* SWIG-generated overload wrapper: forwards directly to ps_process_raw.
 * NOTE(review): `self` (Decoder*) is passed as the ps_decoder_t* handle —
 * presumably Decoder is a typedef/subtype of ps_decoder_t; confirm in the
 * SWIG interface file. Generated code: do not edit by hand. */
SWIGINTERN int Decoder_processRaw__SWIG_1(Decoder *self,short const shorts[],size_t nshorts,bool no_search,bool full_utt){ return ps_process_raw(self, shorts, nshorts, no_search, full_utt); }