/**
 * You must sucessfully call spInitListener
 * once before using this function.
 *
 * Reads the next block of audio from the microphone
 * up to SPLBUFSIZE number of samples
 * (defined in splistener.h).
 * If an utterance was completed in that block,
 * the transcription is stored in the string words.
 *
 * When calling this function in a realtime loop, delay
 * by some amount of time between calls keeping in mind
 * your recording's sample rate and maximum samples read
 * per call (ex. sleep the thread for 100 milliseconds)
 * so that some audio can be recorded for the next call.
 *
 * @return true if a speech session was completed and
 *         transcribed this block, otherwise false.
 */
static bool spDecode() {
    static bool uttered = false;

    // lock pocketsphinx resources to make sure they 
    // don't get freed by main thread while in use
    std::lock_guard<std::mutex> ps_lock(ps_mtx);
    if(!mic || !ps)
        return false;

    int samples_read = ad_read(mic, buf, SPLBUFSIZE);
    if (samples_read <= 0) {
        spError("failed to read audio :(");
        return false;
    }

    ps_process_raw(ps, buf, samples_read, FALSE, FALSE);
    bool talking = ps_get_in_speech(ps);

    // Just started talking
    if (talking && !uttered) {
        uttered = true;
        return false;
    }

    // Stopped talking, so transcribe what was said
    // and begin the next utterance
    if (!talking && uttered) {
        ps_end_utt(ps);
        const char *trans = ps_get_hyp(ps, NULL);

        if (ps_start_utt(ps) < 0) {
            spError("failed to start utterance :(");
        }
        uttered = false;

        int l = strlen(trans);
        if (trans && l > 0) {
            std::lock_guard<std::mutex> lock(words_mtx);
            if (words && l + 1 > words_buf_size) {
                delete words;
                words = NULL;
            }
            if (!words) {
                words = new char[l + 1];
                words_buf_size = l + 1;
            }

            std::copy(trans, trans + l, words);
            words[l] = '\0';
            
            return true;
        }
    }

    return false;
}
Exemplo n.º 2
0
void Recognizer::run()
{
    // Create audio converter:
    auto converter = ci::audio::dsp::Converter::create( mMonitorNode->getSampleRate(), 16000, mMonitorNode->getNumChannels(), 1, mMonitorNode->getFramesPerBlock() );
    // Create buffer for converted audio:
    ci::audio::Buffer destBuffer( converter->getDestMaxFramesPerBlock(), converter->getDestNumChannels() );

    bool utt_started, in_speech;

    if( ps_start_utt( mDecoder ) < 0 )
        throw std::runtime_error( "Could not start utterance" );

    utt_started = false;

    while( ! mStop ) {
        // Convert buffer:
        std::pair<size_t,size_t> convertResult = converter->convert( &( mMonitorNode->getBuffer() ), &destBuffer );

        // Convert buffer data:
        int16_t* data = new int16_t[ convertResult.second ];
        convertFloatToInt16( destBuffer.getData(), data, convertResult.second );

        // Process buffer:
        ps_process_raw( mDecoder, data, convertResult.second, false, false );

        // Cleanup buffer data:
        delete[] data;

        in_speech = static_cast<bool>( ps_get_in_speech( mDecoder ) );

        if( in_speech && ! utt_started ) {
            utt_started = true;
        }

        if( ! in_speech && utt_started ) {
            // Start new utterance on speech to silence transition:
            ps_end_utt( mDecoder );

            // Pass to handler:
            if( mHandler )
                mHandler->event( mDecoder );

            // Prepare for next utterance:
            if( ps_start_utt( mDecoder ) < 0 )
                throw std::runtime_error( "Could not start utterance" );

            utt_started = false;
        }

        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
    }
}
Exemplo n.º 3
0
/*
 * Continuous recognition from mic
 */
int
recognize_from_mic()
{
	ad_rec_t *ad;
    int16 adbuf[2048];
    const char *fname;
	const char* seg;
    int32 k;
	char str[1000]="";
    uint8 utt_started, in_speech;
	
    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),16000)) == NULL)
		perror("Failed to open audio device\n");
	if (ad_start_rec(ad) < 0)
		perror("Failed to start recording\n");
    
    ps_start_utt(ps);
    utt_started = FALSE;
	ps_seg_t *psegt;
    while (!finished) {
		if ((k = ad_read(ad, adbuf, 2048)) < 0)
			perror("Failed to read audio\n");
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        } 
        if (!in_speech && utt_started) {
            ps_end_utt(ps);
			psegt = ps_seg_iter(ps, NULL);
			while (psegt!=NULL){
				seg = ps_seg_word(psegt);
				strncpy_s( str, seg, strlen(seg));
				listenCallback(str);
				printf("%s\n", seg);
				int prob = ps_seg_prob(psegt,NULL,NULL,NULL);
				printf("%d\n", prob);
				psegt = ps_seg_next(psegt);
			}
            ps_start_utt(ps);
            utt_started = FALSE;
        }
		Sleep(100);
    }
	
    ps_end_utt(ps);
    fclose(rawfd);
	return 0;
}
Exemplo n.º 4
0
/*
 * Main utterance processing loop:
 *     for (;;) {
 *        start utterance and wait for speech to process
 *        decoding till end-of-utterance silence will be detected
 *        print utterance result;
 *     }
 */
static void
recognize_from_microphone()
{
    ad_rec_t *ad;
    int16 adbuf[2048];
    uint8 utt_started, in_speech;
    int32 k;
    char const *hyp;

    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
                          (int) cmd_ln_float32_r(config,
                                                 "-samprate"))) == NULL)
        E_FATAL("Failed to open audio device\n");
    if (ad_start_rec(ad) < 0)
        E_FATAL("Failed to start recording\n");

    if (ps_start_utt(ps) < 0)
        E_FATAL("Failed to start utterance\n");
    utt_started = FALSE;
    E_INFO("Ready....\n");

    for (;;) {
        if ((k = ad_read(ad, adbuf, 2048)) < 0)
            E_FATAL("Failed to read audio\n");
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
            E_INFO("Listening...\n");
        }
        if (!in_speech && utt_started) {
            /* speech -> silence transition, time to start new utterance  */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL );
            if (hyp != NULL) {
                printf("%s\n", hyp);
                fflush(stdout);
            }

            if (ps_start_utt(ps) < 0)
                E_FATAL("Failed to start utterance\n");
            utt_started = FALSE;
            E_INFO("Ready....\n");
        }
        sleep_msec(100);
    }
    ad_close(ad);
}
Exemplo n.º 5
0
/*
 * Continuous recognition from a file
 */
int
recognize_from_file()
{
    int16 adbuf[2048];
    const char *fname;
    const char *hyp;
    int32 k;
	char str[1000]="";
    uint8 utt_started, in_speech;
	
    fname = "C:/Users/Reza/Documents/GitHub/speech_agent/presentation_samples/italy1_reza.wav"; 
    rawfd = fopen(fname, "rb");
    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) {
        char waveheader[44];
		fread(waveheader, 1, 44, rawfd);
    }
    
    ps_start_utt(ps);
    utt_started = FALSE;
	
    while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
		 

        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        } 
        if (!in_speech && utt_started) {
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);

            if (hyp != NULL){
				strncpy_s( str, hyp, strlen(hyp));
				printf("%s\n", hyp);
				listenCallback(str);
			}
            ps_start_utt(ps);
            utt_started = FALSE;
        }
    }
	
    ps_end_utt(ps);
    fclose(rawfd);
	return 0;
	
}
Exemplo n.º 6
0
static void
recognize_from_microphone()
{
    ad_rec_t *ad;
    int16 adbuf[2048];
    uint8 utt_started, in_speech;
    int32 k;
    char const *hyp;

    if ((ad = ad_open_dev(AUDIO_DEVICE_NAME,
                          (int) SAMPLE_RATE )) == NULL) {
        E_FATAL("Failed to open audio device\n");
	}
    if (ad_start_rec(ad) < 0) {
        E_FATAL("Failed to start recording\n");
    }
    if (ps_start_utt(ps) < 0) {
        E_FATAL("Failed to start utterance\n");
    }
    utt_started = FALSE;
    printf("READY....\n");

    for (;;) {
        if ((k = ad_read(ad, adbuf, 2048)) < 0)
            E_FATAL("Failed to read audio\n");
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
            printf("Listening...\n");
        }
        if (!in_speech && utt_started) {
            /* speech -> silence transition, time to start new utterance  */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL );
            if (hyp != NULL)
                printf("%s\n", hyp);

            if (ps_start_utt(ps) < 0)
                E_FATAL("Failed to start utterance\n");
            utt_started = FALSE;
            printf("READY....\n");
        }
        sleep_msec(100);
    }
    ad_close(ad);
}
uint32 FSpeechRecognitionWorker::Run() {

	char const *hyp;
	// attempt to open the default recording device
	if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
		(int)cmd_ln_float32_r(config,
		"-samprate"))) == NULL) {
		ClientMessage(FString(TEXT("Failed to open audio device")));
		return 1;
	}
	if (ad_start_rec(ad) < 0) {
		ClientMessage(FString(TEXT("Failed to start recording")));
		return 2;
	}
	if (ps_start_utt(ps) < 0) {
		ClientMessage(FString(TEXT("Failed to start utterance")));
		return 3;
	}

	while (StopTaskCounter.GetValue() == 0) {
		if ((k = ad_read(ad, adbuf, 1024)) < 0)
			ClientMessage(FString(TEXT("Failed to read audio")));
		ps_process_raw(ps, adbuf, k, 0, 0);
		in_speech = ps_get_in_speech(ps);
		if (in_speech && !utt_started) {
			utt_started = 1;
		}
		if (!in_speech && utt_started) {
			/* speech -> silence transition, time to start new utterance  */
			ps_end_utt(ps);
			hyp = ps_get_hyp(ps, NULL);
			if (hyp != NULL)
				Manager->WordSpoken_method(FString(hyp));

			if (ps_start_utt(ps) < 0)
				ClientMessage(FString(TEXT("Failed to start")));
			utt_started = 0;
		}
	}

	ad_close(ad);
	return 0;
}
Exemplo n.º 8
0
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVDictionary **metadata = &in->metadata;
    ASRContext *s = ctx->priv;
    int have_speech;
    const char *speech;

    ps_process_raw(s->ps, (const int16_t *)in->data[0], in->nb_samples, 0, 0);
    have_speech = ps_get_in_speech(s->ps);
    if (have_speech && !s->utt_started)
        s->utt_started = 1;
    if (!have_speech && s->utt_started) {
        ps_end_utt(s->ps);
        speech = ps_get_hyp(s->ps, NULL);
        if (speech != NULL)
            av_dict_set(metadata, "lavfi.asr.text", speech, 0);
        ps_start_utt(s->ps);
        s->utt_started = 0;
    }

    return ff_filter_frame(ctx->outputs[0], in);
}
Exemplo n.º 9
0
/*
 * Continuous recognition from a file
 */
static void
recognize_from_file()
{
    int16 adbuf[2048];
    const char *fname;
    const char *hyp;
    int32 k;
    uint8 utt_started, in_speech;
    int32 print_times = cmd_ln_boolean_r(config, "-time");

    fname = cmd_ln_str_r(config, "-infile");
    if ((rawfd = fopen(fname, "rb")) == NULL) {
        E_FATAL_SYSTEM("Failed to open file '%s' for reading",
                       fname);
    }
    
    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) {
        char waveheader[44];
	fread(waveheader, 1, 44, rawfd);
	if (!check_wav_header(waveheader, (int)cmd_ln_float32_r(config, "-samprate")))
    	    E_FATAL("Failed to process file '%s' due to format mismatch.\n", fname);
    }

    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".mp3") == 0) {
	E_FATAL("Can not decode mp3 files, convert input file to WAV 16kHz 16-bit mono before decoding.\n");
    }
    
    ps_start_utt(ps);
    utt_started = FALSE;

    while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        } 
        if (!in_speech && utt_started) {
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);
            if (hyp != NULL)
        	printf("%s\n", hyp);
            if (print_times)
        	print_word_times();
            fflush(stdout);

            ps_start_utt(ps);
            utt_started = FALSE;
        }
    }
    ps_end_utt(ps);
    if (utt_started) {
        hyp = ps_get_hyp(ps, NULL);
        if (hyp != NULL) {
    	    printf("%s\n", hyp);
    	    if (print_times) {
    		print_word_times();
	    }
	}
    }
    
    fclose(rawfd);
}
Exemplo n.º 10
0
int main(int argc, char** argv)
{
    /* init ps */
    fprintf(stderr, "INIT PS\n");
    cmd_ln_t* psconfig = cmd_ln_init(NULL, ps_args(), TRUE,
        "-hmm", MODELDIR "/en-us/en-us",
        "-lm", MODELDIR "/en-us/en-us.lm.bin",
        "-dict", MODELDIR "/en-us/cmudict-en-us.dict",
        NULL);
    if (psconfig == NULL) {
        fprintf(stderr, "Cannot create config for PS\n");
        return -1;
    }
    
    ps_decoder_t* ps = ps_init(psconfig);
    if (ps == NULL) {
        fprintf(stderr, "Cannot create PS decoder\n");
        return -1;
    }

    fprintf(stderr, "INIT AL\n");
    /* init openal and create microphone */
    ALCdevice* aldevice = alcCaptureOpenDevice(NULL,AL_MIC_FREQ,AL_FORMAT_MONO16,AL_MIC_FREQ/2);
    if (aldevice == NULL) {
        fprintf(stderr, "Cannot open AL device");
        return -1;
    }
    alcCaptureStart(aldevice);

    /* conna capture some wods */
    ALCint samplesIn = 0;
    short buff[AL_MIC_FREQ*2];
    int ignoreCounter = 20;
    uint8 utt_started;
    uint8 in_speech;
    utt_started = FALSE;
    ps_start_utt(ps);
    const char* hyp;
    for (;;) {
        /* poll some data */
        alcGetIntegerv(aldevice,ALC_CAPTURE_SAMPLES,1,&samplesIn);
        if(samplesIn>AL_MIC_CAP) {
            alcCaptureSamples(aldevice,buff,AL_MIC_CAP);
            /* actual voice processing */
            ps_process_raw(ps, buff, AL_MIC_CAP, FALSE, FALSE);
            in_speech = ps_get_in_speech(ps);
            if (in_speech && !utt_started) {
                utt_started = TRUE;
                fprintf(stderr, "Hearing something i guess...\n");
            }
            if (!in_speech && utt_started) {
                fprintf(stderr, "Processing it now...\n");
                ps_end_utt(ps);
                hyp = ps_get_hyp(ps,NULL);
                if (hyp != NULL) {
                    fprintf(stderr, "Here we go!\n\n");
                    fprintf(stdout,"%s\n",hyp);
                    fflush(stdout);
                    fprintf(stderr,"\n\nTo the next round!\n");
                }
                ps_start_utt(ps);
                utt_started = FALSE;
            }

        }

    }
    return 0;
}
Exemplo n.º 11
0
    void Node::run() {
        // declare variables
        const int BUFFER_SIZE = 2048;
        ad_rec_t *reader;
        size_t read_samples;
        int16 read_buffer[BUFFER_SIZE];
        cmd_ln_t *read_decoder_config;
        ps_decoder_t *read_decoder;

        // open audio record
        reader = ad_open_dev(NULL, 16000);
        if (reader == NULL) {
            throw NodeException("Failed to open read audio device");
        }

        // start audio record
        if (ad_start_rec(reader) < 0) {
            throw NodeException("Failed to start read audio device");
        }

        // instance audio decoder
        read_decoder_config = cmd_ln_init(
                NULL,
                ps_args(),
                TRUE,
                "-logfn", "/dev/null", // turn off console log
                "-hmm", "/usr/local/share/pocketsphinx/model/en-us/en-us",
                "-lm", "/usr/local/share/pocketsphinx/model/en-us/en-us.lm.dmp",
                "-dict", "/usr/local/share/pocketsphinx/model/en-us/cmudict-en-us.dict",
                NULL
        );
        read_decoder = ps_init(read_decoder_config);
        if (read_decoder == NULL) {
            throw NodeException("Failed to initialize audio decoder");
        }

        // start utterance
        if (ps_start_utt(read_decoder) < 0) {
            throw NodeException("Failed to start reader utterance");
        }
        bool utt_started = false;
        bool in_speech = false;
        char const *hypothesis;

        while (true) {
            try {
                // read from input device
                read_samples = ad_read(
                        reader,
                        read_buffer,
                        BUFFER_SIZE
                );
                if (read_samples < 0) {
                    throw NodeException("Failed to read audio");
                }

                // add data buffered from device to read decoder
                ps_process_raw(
                        read_decoder,
                        read_buffer,
                        read_samples,
                        false,
                        false
                );

                // check device in speech
                in_speech = ps_get_in_speech(read_decoder);
                if (in_speech && !utt_started) {
                    utt_started = true;
                }

                // check device in speech then add data to decoder
                if (!in_speech && utt_started) {
                    ps_end_utt(read_decoder);
                    hypothesis = ps_get_hyp(read_decoder, NULL);
                    if (hypothesis != NULL) {
                        cout<<"ENV: "<<hypothesis<<endl;
                        Command core_command = this->spec_.first("node-core");
                        int port = atoi(core_command.option("--port").c_str());
                        SocketWriter socket(core_command.option("--host"), port);
                        string response = socket.write_to(hypothesis);
                        cout << "IN: " << response << endl;
                    }
                    if (ps_start_utt(read_decoder) < 0) {
                        throw NodeException("Failed to start reader utterance");
                    }
                    utt_started = false;
                }
                sleep_milli_seconds(100);
            } catch (NodeException e) {
                cout<<"ERROR: "<<e.what()<<endl;
            }

        }
    }
Exemplo n.º 12
0
/*
 * Main utterance processing loop:
 *     for (;;) {
 *        start utterance and wait for speech to process
 *        decoding till end-of-utterance silence will be detected
 *        print utterance result;
 *     }
 */
static void
recognize_from_microphone()
{
   //---------------------definitions	
    ad_rec_t *ad;
    int16 adbuf[2048];
    uint8 utt_started, in_speech;
    int32 k;
    char const *hyp;
   /////////////////////////////
    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
                          (int) cmd_ln_float32_r(config,
                                                 "-samprate"))) == NULL)
        E_FATAL("Failed to open audio device\n");
    if (ad_start_rec(ad) < 0)
        E_FATAL("Failed to start recording\n");

    if (ps_start_utt(ps) < 0)
        E_FATAL("Failed to start utterance\n");
    utt_started = FALSE;
    printf("READY1....\n");

    for (;;) {
        if ((k = ad_read(ad, adbuf, 2048)) < 0)
            E_FATAL("Failed to read audio\n");
            
            
         // decode whatever data was read 
         //1.try to limit the processing time here
         //2.Or modify the algorithm 
         //3.Or change the parameters of the implemented functions
        ps_process_raw(ps, adbuf, k, FALSE, FALSE); 
        
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {  //machine processing acoustics (listening)
            utt_started = TRUE;
            printf("Listening...\n");
        }
        if (!in_speech && utt_started) {  //--------------------------------------------understanding without listening
            /* speech -> silence transition, time to start new utterance  */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL );
            if (hyp != NULL) {
				rawfd = fopen("char-array.txt", "a");
			//~ fprintf(rawfd, "data\n");
			//~ fclose(rawfd);          
			
			
			
			if (hyp) {
			strcpy(word0, ""); strcpy(word1, "");
			sscanf(hyp, "%s", word0);
			printf("%s\n", word0);
		}
			
			
			
			
                //~ printf("%s\n", hyp);   // this appears on the screan
            
            
            
            
            
            
                fprintf(rawfd, "%s\n",hyp);  // write to txt file
			fclose(rawfd); 
				printf("xxxxx");
			}
            if (ps_start_utt(ps) < 0)  //open the file and start decoding
                E_FATAL("Failed to start utterance\n");
            utt_started = FALSE;
            printf("READY2....\n");
            const char *text = "write this to the file";
           // fprintf(f,"some text :%s\n",text);
           // fclose(f);
        }
        sleep_msec(1);
    }
    ad_close(ad);
}