/** * You must sucessfully call spInitListener * once before using this function. * * Reads the next block of audio from the microphone * up to SPLBUFSIZE number of samples * (defined in splistener.h). * If an utterance was completed in that block, * the transcription is stored in the string words. * * When calling this function in a realtime loop, delay * by some amount of time between calls keeping in mind * your recording's sample rate and maximum samples read * per call (ex. sleep the thread for 100 milliseconds) * so that some audio can be recorded for the next call. * * @return true if a speech session was completed and * transcribed this block, otherwise false. */ static bool spDecode() { static bool uttered = false; // lock pocketsphinx resources to make sure they // don't get freed by main thread while in use std::lock_guard<std::mutex> ps_lock(ps_mtx); if(!mic || !ps) return false; int samples_read = ad_read(mic, buf, SPLBUFSIZE); if (samples_read <= 0) { spError("failed to read audio :("); return false; } ps_process_raw(ps, buf, samples_read, FALSE, FALSE); bool talking = ps_get_in_speech(ps); // Just started talking if (talking && !uttered) { uttered = true; return false; } // Stopped talking, so transcribe what was said // and begin the next utterance if (!talking && uttered) { ps_end_utt(ps); const char *trans = ps_get_hyp(ps, NULL); if (ps_start_utt(ps) < 0) { spError("failed to start utterance :("); } uttered = false; int l = strlen(trans); if (trans && l > 0) { std::lock_guard<std::mutex> lock(words_mtx); if (words && l + 1 > words_buf_size) { delete words; words = NULL; } if (!words) { words = new char[l + 1]; words_buf_size = l + 1; } std::copy(trans, trans + l, words); words[l] = '\0'; return true; } } return false; }
void Recognizer::run() { // Create audio converter: auto converter = ci::audio::dsp::Converter::create( mMonitorNode->getSampleRate(), 16000, mMonitorNode->getNumChannels(), 1, mMonitorNode->getFramesPerBlock() ); // Create buffer for converted audio: ci::audio::Buffer destBuffer( converter->getDestMaxFramesPerBlock(), converter->getDestNumChannels() ); bool utt_started, in_speech; if( ps_start_utt( mDecoder ) < 0 ) throw std::runtime_error( "Could not start utterance" ); utt_started = false; while( ! mStop ) { // Convert buffer: std::pair<size_t,size_t> convertResult = converter->convert( &( mMonitorNode->getBuffer() ), &destBuffer ); // Convert buffer data: int16_t* data = new int16_t[ convertResult.second ]; convertFloatToInt16( destBuffer.getData(), data, convertResult.second ); // Process buffer: ps_process_raw( mDecoder, data, convertResult.second, false, false ); // Cleanup buffer data: delete[] data; in_speech = static_cast<bool>( ps_get_in_speech( mDecoder ) ); if( in_speech && ! utt_started ) { utt_started = true; } if( ! in_speech && utt_started ) { // Start new utterance on speech to silence transition: ps_end_utt( mDecoder ); // Pass to handler: if( mHandler ) mHandler->event( mDecoder ); // Prepare for next utterance: if( ps_start_utt( mDecoder ) < 0 ) throw std::runtime_error( "Could not start utterance" ); utt_started = false; } std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); } }
/* * Continuous recognition from mic */ int recognize_from_mic() { ad_rec_t *ad; int16 adbuf[2048]; const char *fname; const char* seg; int32 k; char str[1000]=""; uint8 utt_started, in_speech; if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),16000)) == NULL) perror("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) perror("Failed to start recording\n"); ps_start_utt(ps); utt_started = FALSE; ps_seg_t *psegt; while (!finished) { if ((k = ad_read(ad, adbuf, 2048)) < 0) perror("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); psegt = ps_seg_iter(ps, NULL); while (psegt!=NULL){ seg = ps_seg_word(psegt); strncpy_s( str, seg, strlen(seg)); listenCallback(str); printf("%s\n", seg); int prob = ps_seg_prob(psegt,NULL,NULL,NULL); printf("%d\n", prob); psegt = ps_seg_next(psegt); } ps_start_utt(ps); utt_started = FALSE; } Sleep(100); } ps_end_utt(ps); fclose(rawfd); return 0; }
/* * Main utterance processing loop: * for (;;) { * start utterance and wait for speech to process * decoding till end-of-utterance silence will be detected * print utterance result; * } */ static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), (int) cmd_ln_float32_r(config, "-samprate"))) == NULL) E_FATAL("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; E_INFO("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) { printf("%s\n", hyp); fflush(stdout); } if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); } sleep_msec(100); } ad_close(ad); }
/* * Continuous recognition from a file */ int recognize_from_file() { int16 adbuf[2048]; const char *fname; const char *hyp; int32 k; char str[1000]=""; uint8 utt_started, in_speech; fname = "C:/Users/Reza/Documents/GitHub/speech_agent/presentation_samples/italy1_reza.wav"; rawfd = fopen(fname, "rb"); if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) { char waveheader[44]; fread(waveheader, 1, 44, rawfd); } ps_start_utt(ps); utt_started = FALSE; while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) { ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL); if (hyp != NULL){ strncpy_s( str, hyp, strlen(hyp)); printf("%s\n", hyp); listenCallback(str); } ps_start_utt(ps); utt_started = FALSE; } } ps_end_utt(ps); fclose(rawfd); return 0; }
static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = ad_open_dev(AUDIO_DEVICE_NAME, (int) SAMPLE_RATE )) == NULL) { E_FATAL("Failed to open audio device\n"); } if (ad_start_rec(ad) < 0) { E_FATAL("Failed to start recording\n"); } if (ps_start_utt(ps) < 0) { E_FATAL("Failed to start utterance\n"); } utt_started = FALSE; printf("READY....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; printf("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) printf("%s\n", hyp); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; printf("READY....\n"); } sleep_msec(100); } ad_close(ad); }
// Worker thread entry point: records from the default capture device
// and decodes continuously until StopTaskCounter is set. Each completed
// utterance (speech -> silence transition) is forwarded to
// Manager->WordSpoken_method().
//
// @return 0 on normal shutdown; 1/2/3 identify which setup step failed.
uint32 FSpeechRecognitionWorker::Run() {
	char const *hyp;

	// attempt to open the default recording device
	if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
	                      (int)cmd_ln_float32_r(config, "-samprate"))) == NULL) {
		ClientMessage(FString(TEXT("Failed to open audio device")));
		return 1;
	}
	if (ad_start_rec(ad) < 0) {
		ClientMessage(FString(TEXT("Failed to start recording")));
		ad_close(ad);  // BUG FIX: the device leaked on this error path
		return 2;
	}
	if (ps_start_utt(ps) < 0) {
		ClientMessage(FString(TEXT("Failed to start utterance")));
		ad_close(ad);  // BUG FIX: the device leaked on this error path
		return 3;
	}
	while (StopTaskCounter.GetValue() == 0) {
		if ((k = ad_read(ad, adbuf, 1024)) < 0) {
			ClientMessage(FString(TEXT("Failed to read audio")));
			// BUG FIX: the original fell through and passed the
			// negative count straight into ps_process_raw.
			break;
		}
		ps_process_raw(ps, adbuf, k, 0, 0);
		in_speech = ps_get_in_speech(ps);
		if (in_speech && !utt_started) {
			utt_started = 1;
		}
		if (!in_speech && utt_started) {
			/* speech -> silence transition, time to start new utterance */
			ps_end_utt(ps);
			hyp = ps_get_hyp(ps, NULL);
			if (hyp != NULL)
				Manager->WordSpoken_method(FString(hyp));
			if (ps_start_utt(ps) < 0)
				ClientMessage(FString(TEXT("Failed to start")));
			utt_started = 0;
		}
	}
	ad_close(ad);
	return 0;
}
/*
 * Per-frame callback: feed the frame's int16 samples to the
 * pocketsphinx decoder; when an utterance ends (speech -> silence),
 * attach its transcription to the frame metadata under
 * "lavfi.asr.text" and start the next utterance. The frame itself is
 * always passed through unchanged.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    ASRContext *s = ctx->priv;
    AVDictionary **metadata = &in->metadata;
    const char *hyp;
    int speaking;

    ps_process_raw(s->ps, (const int16_t *)in->data[0], in->nb_samples, 0, 0);

    speaking = ps_get_in_speech(s->ps);
    if (speaking && !s->utt_started) {
        /* silence -> speech: mark utterance in progress */
        s->utt_started = 1;
    } else if (!speaking && s->utt_started) {
        /* speech -> silence: utterance complete, record the text */
        ps_end_utt(s->ps);
        hyp = ps_get_hyp(s->ps, NULL);
        if (hyp != NULL)
            av_dict_set(metadata, "lavfi.asr.text", hyp, 0);
        ps_start_utt(s->ps);
        s->utt_started = 0;
    }

    return ff_filter_frame(ctx->outputs[0], in);
}
/* * Continuous recognition from a file */ static void recognize_from_file() { int16 adbuf[2048]; const char *fname; const char *hyp; int32 k; uint8 utt_started, in_speech; int32 print_times = cmd_ln_boolean_r(config, "-time"); fname = cmd_ln_str_r(config, "-infile"); if ((rawfd = fopen(fname, "rb")) == NULL) { E_FATAL_SYSTEM("Failed to open file '%s' for reading", fname); } if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) { char waveheader[44]; fread(waveheader, 1, 44, rawfd); if (!check_wav_header(waveheader, (int)cmd_ln_float32_r(config, "-samprate"))) E_FATAL("Failed to process file '%s' due to format mismatch.\n", fname); } if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".mp3") == 0) { E_FATAL("Can not decode mp3 files, convert input file to WAV 16kHz 16-bit mono before decoding.\n"); } ps_start_utt(ps); utt_started = FALSE; while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) { ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL); if (hyp != NULL) printf("%s\n", hyp); if (print_times) print_word_times(); fflush(stdout); ps_start_utt(ps); utt_started = FALSE; } } ps_end_utt(ps); if (utt_started) { hyp = ps_get_hyp(ps, NULL); if (hyp != NULL) { printf("%s\n", hyp); if (print_times) { print_word_times(); } } } fclose(rawfd); }
int main(int argc, char** argv) { /* init ps */ fprintf(stderr, "INIT PS\n"); cmd_ln_t* psconfig = cmd_ln_init(NULL, ps_args(), TRUE, "-hmm", MODELDIR "/en-us/en-us", "-lm", MODELDIR "/en-us/en-us.lm.bin", "-dict", MODELDIR "/en-us/cmudict-en-us.dict", NULL); if (psconfig == NULL) { fprintf(stderr, "Cannot create config for PS\n"); return -1; } ps_decoder_t* ps = ps_init(psconfig); if (ps == NULL) { fprintf(stderr, "Cannot create PS decoder\n"); return -1; } fprintf(stderr, "INIT AL\n"); /* init openal and create microphone */ ALCdevice* aldevice = alcCaptureOpenDevice(NULL,AL_MIC_FREQ,AL_FORMAT_MONO16,AL_MIC_FREQ/2); if (aldevice == NULL) { fprintf(stderr, "Cannot open AL device"); return -1; } alcCaptureStart(aldevice); /* conna capture some wods */ ALCint samplesIn = 0; short buff[AL_MIC_FREQ*2]; int ignoreCounter = 20; uint8 utt_started; uint8 in_speech; utt_started = FALSE; ps_start_utt(ps); const char* hyp; for (;;) { /* poll some data */ alcGetIntegerv(aldevice,ALC_CAPTURE_SAMPLES,1,&samplesIn); if(samplesIn>AL_MIC_CAP) { alcCaptureSamples(aldevice,buff,AL_MIC_CAP); /* actual voice processing */ ps_process_raw(ps, buff, AL_MIC_CAP, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; fprintf(stderr, "Hearing something i guess...\n"); } if (!in_speech && utt_started) { fprintf(stderr, "Processing it now...\n"); ps_end_utt(ps); hyp = ps_get_hyp(ps,NULL); if (hyp != NULL) { fprintf(stderr, "Here we go!\n\n"); fprintf(stdout,"%s\n",hyp); fflush(stdout); fprintf(stderr,"\n\nTo the next round!\n"); } ps_start_utt(ps); utt_started = FALSE; } } } return 0; }
void Node::run() { // declare variables const int BUFFER_SIZE = 2048; ad_rec_t *reader; size_t read_samples; int16 read_buffer[BUFFER_SIZE]; cmd_ln_t *read_decoder_config; ps_decoder_t *read_decoder; // open audio record reader = ad_open_dev(NULL, 16000); if (reader == NULL) { throw NodeException("Failed to open read audio device"); } // start audio record if (ad_start_rec(reader) < 0) { throw NodeException("Failed to start read audio device"); } // instance audio decoder read_decoder_config = cmd_ln_init( NULL, ps_args(), TRUE, "-logfn", "/dev/null", // turn off console log "-hmm", "/usr/local/share/pocketsphinx/model/en-us/en-us", "-lm", "/usr/local/share/pocketsphinx/model/en-us/en-us.lm.dmp", "-dict", "/usr/local/share/pocketsphinx/model/en-us/cmudict-en-us.dict", NULL ); read_decoder = ps_init(read_decoder_config); if (read_decoder == NULL) { throw NodeException("Failed to initialize audio decoder"); } // start utterance if (ps_start_utt(read_decoder) < 0) { throw NodeException("Failed to start reader utterance"); } bool utt_started = false; bool in_speech = false; char const *hypothesis; while (true) { try { // read from input device read_samples = ad_read( reader, read_buffer, BUFFER_SIZE ); if (read_samples < 0) { throw NodeException("Failed to read audio"); } // add data buffered from device to read decoder ps_process_raw( read_decoder, read_buffer, read_samples, false, false ); // check device in speech in_speech = ps_get_in_speech(read_decoder); if (in_speech && !utt_started) { utt_started = true; } // check device in speech then add data to decoder if (!in_speech && utt_started) { ps_end_utt(read_decoder); hypothesis = ps_get_hyp(read_decoder, NULL); if (hypothesis != NULL) { cout<<"ENV: "<<hypothesis<<endl; Command core_command = this->spec_.first("node-core"); int port = atoi(core_command.option("--port").c_str()); SocketWriter socket(core_command.option("--host"), port); string response = socket.write_to(hypothesis); cout << "IN: " << response << 
endl; } if (ps_start_utt(read_decoder) < 0) { throw NodeException("Failed to start reader utterance"); } utt_started = false; } sleep_milli_seconds(100); } catch (NodeException e) { cout<<"ERROR: "<<e.what()<<endl; } } }
/* * Main utterance processing loop: * for (;;) { * start utterance and wait for speech to process * decoding till end-of-utterance silence will be detected * print utterance result; * } */ static void recognize_from_microphone() { //---------------------definitions ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; ///////////////////////////// if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), (int) cmd_ln_float32_r(config, "-samprate"))) == NULL) E_FATAL("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; printf("READY1....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); // decode whatever data was read //1.try to limit the processing time here //2.Or modify the algorithm //3.Or change the parameters of the implemented functions ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { //machine processing acoustics (listening) utt_started = TRUE; printf("Listening...\n"); } if (!in_speech && utt_started) { //--------------------------------------------understanding without listening /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) { rawfd = fopen("char-array.txt", "a"); //~ fprintf(rawfd, "data\n"); //~ fclose(rawfd); if (hyp) { strcpy(word0, ""); strcpy(word1, ""); sscanf(hyp, "%s", word0); printf("%s\n", word0); } //~ printf("%s\n", hyp); // this appears on the screan fprintf(rawfd, "%s\n",hyp); // write to txt file fclose(rawfd); printf("xxxxx"); } if (ps_start_utt(ps) < 0) //open the file and start decoding E_FATAL("Failed to start utterance\n"); utt_started = FALSE; printf("READY2....\n"); const char *text = "write this to the file"; // fprintf(f,"some text :%s\n",text); // fclose(f); } sleep_msec(1); } 
ad_close(ad); }