void AUD_SuperposeReader::read(int& length, bool& eos, sample_t* buffer) { AUD_Specs specs = m_reader1->getSpecs(); AUD_Specs s2 = m_reader2->getSpecs(); if(!AUD_COMPARE_SPECS(specs, s2)) AUD_THROW(AUD_ERROR_SPECS, specs_error); int samplesize = AUD_SAMPLE_SIZE(specs); m_buffer.assureSize(length * samplesize); int len1 = length; m_reader1->read(len1, eos, buffer); if(len1 < length) memset(buffer + len1 * specs.channels, 0, (length - len1) * samplesize); int len2 = length; bool eos2; sample_t* buf = m_buffer.getBuffer(); m_reader2->read(len2, eos2, buf); for(int i = 0; i < len2 * specs.channels; i++) buffer[i] += buf[i]; length = AUD_MAX(len1, len2); eos &= eos2; }
void AUD_SuperposeReader::read(int & length, sample_t* & buffer) { AUD_Specs specs = m_reader1->getSpecs(); int samplesize = AUD_SAMPLE_SIZE(specs); if(m_buffer.getSize() < length * samplesize) m_buffer.resize(length * samplesize); buffer = m_buffer.getBuffer(); int len1 = length; sample_t* buf; m_reader1->read(len1, buf); memcpy(buffer, buf, len1 * samplesize); if(len1 < length) memset(buffer + len1 * specs.channels, 0, (length - len1) * samplesize); int len2 = length; m_reader2->read(len2, buf); for(int i = 0; i < len2 * specs.channels; i++) buffer[i] += buf[i]; length = AUD_MAX(len1, len2); }
/* compute probability for Mixture of Gaussians */ AUD_Double gmm_prob( void *hGmmHandle, AUD_Matrix *pFeatMatrix, AUD_Int32s rowIndex, AUD_Vector *pComponentProb ) { AUD_ASSERT( hGmmHandle && pFeatMatrix ); AUD_ASSERT( pFeatMatrix->dataType == AUD_DATATYPE_INT32S ); AUD_ASSERT( rowIndex < pFeatMatrix->rows ); AUD_ASSERT( !pComponentProb || ( pComponentProb && pComponentProb->dataType == AUD_DATATYPE_DOUBLE ) ); GmmModel *pState = (GmmModel*)hGmmHandle; AUD_Int32s i; AUD_ASSERT( pFeatMatrix->cols == pState->width ); AUD_Double *llr = (AUD_Double*)calloc( pState->numMix * sizeof(AUD_Double), 1 ); AUD_ASSERT( llr ); AUD_Double maxLLR = 0.; for ( i = 0; i < pState->numMix; i++ ) { loggauss( pFeatMatrix->pInt32s + rowIndex * pFeatMatrix->cols, pState->means.pInt32s + i * pState->step, pState->cvars.pInt64s + i * pState->step, pState->width, &llr[i], (pState->dets.pDouble)[i] ); if ( pComponentProb != NULL ) { pComponentProb->pDouble[i] = exp( llr[i] ); } if ( i == 0 ) { maxLLR = llr[i]; } else { maxLLR = AUD_MAX( maxLLR, llr[i] ); } } AUD_Double expDiffSum = 0., totalLLR, prob; for ( i = 0; i < pState->numMix; i++ ) { expDiffSum += exp( llr[i] - maxLLR ); } totalLLR = log( expDiffSum ) + maxLLR; prob = exp( totalLLR ); // AUDLOG( "totalLLR[%f], prob[%f]\n", totalLLR, prob ); free( llr ); llr = NULL; return prob; }
void AUD_FFMPEGWriter::write(unsigned int length, sample_t* buffer) { unsigned int samplesize = AUD_SAMPLE_SIZE(m_specs); if(m_input_size) { sample_t* inbuf = m_input_buffer.getBuffer(); while(length) { unsigned int len = AUD_MIN(m_input_size - m_input_samples, length); memcpy(inbuf + m_input_samples * m_specs.channels, buffer, len * samplesize); buffer += len * m_specs.channels; m_input_samples += len; m_position += len; length -= len; if(m_input_samples == m_input_size) { encode(inbuf); m_input_samples = 0; } } } else // PCM data, can write directly! { int samplesize = AUD_SAMPLE_SIZE(m_specs); if(m_output_buffer.getSize() != length * m_specs.channels * m_codecCtx->bits_per_coded_sample / 8) m_output_buffer.resize(length * m_specs.channels * m_codecCtx->bits_per_coded_sample / 8); m_input_buffer.assureSize(length * AUD_MAX(AUD_DEVICE_SAMPLE_SIZE(m_specs), samplesize)); sample_t* buf = m_input_buffer.getBuffer(); m_convert(reinterpret_cast<data_t*>(buf), reinterpret_cast<data_t*>(buffer), length * m_specs.channels); encode(buf); m_position += length; } }
/* compute LLR for Mixture of Gaussians */ AUD_Double gmm_llr( void *hGmmHandle, AUD_Matrix *pFeatMatrix, AUD_Int32s rowIndex, AUD_Vector *pComponentLLR ) { AUD_ASSERT( hGmmHandle && pFeatMatrix ); AUD_ASSERT( pFeatMatrix->dataType == AUD_DATATYPE_INT32S ); AUD_ASSERT( rowIndex < pFeatMatrix->rows ); AUD_ASSERT( !pComponentLLR || ( pComponentLLR && pComponentLLR->dataType == AUD_DATATYPE_DOUBLE ) ); GmmModel *pState = (GmmModel*)hGmmHandle; AUD_Int32s i; AUD_ASSERT( pFeatMatrix->cols == pState->width ); AUD_Double *llr = (AUD_Double*)calloc( pState->numMix * sizeof(AUD_Double), 1 ); AUD_ASSERT( llr ); AUD_Double maxLLR = 0., totalLLR; if ( pState->dets.len <= 0 ) { pState->dets.len = pState->numMix; pState->dets.dataType = AUD_DATATYPE_DOUBLE; AUD_Int32s ret = createVector( &(pState->dets) ); AUD_ASSERT( ret == 0 ); calcdet( &(pState->cvars), &(pState->weights), &(pState->dets) ); } for ( i = 0; i < pState->numMix; i++ ) { loggauss( pFeatMatrix->pInt32s + rowIndex * pFeatMatrix->cols, pState->means.pInt32s + i * pState->step, pState->cvars.pInt64s + i * pState->step, pFeatMatrix->cols, &llr[i], (pState->dets.pDouble)[i] ); if ( pComponentLLR != NULL ) { pComponentLLR->pDouble[i] = llr[i]; } if ( i == 0 ) { maxLLR = llr[i]; } else { maxLLR = AUD_MAX( maxLLR, llr[i] ); } } AUD_Double expDiffSum = 0.; for ( i = 0; i < pState->numMix; i++ ) { expDiffSum += exp( llr[i] - maxLLR ); } totalLLR = log( expDiffSum ) + maxLLR; free( llr ); llr = NULL; if ( isnan( totalLLR ) || totalLLR < LOG_ZERO ) { totalLLR = LOG_ZERO; } return totalLLR; }
void AUD_SoftwareDevice::AUD_SoftwareHandle::update() { int flags = 0; AUD_Vector3 SL; if(m_relative) SL = -m_location; else SL = m_device->m_location - m_location; float distance = SL * SL; if(distance > 0) distance = sqrt(distance); else flags |= AUD_RENDER_DOPPLER | AUD_RENDER_DISTANCE; if(m_pitch->getSpecs().channels != AUD_CHANNELS_MONO) { m_volume = m_user_volume; m_pitch->setPitch(m_user_pitch); return; } flags = ~(flags | m_flags | m_device->m_flags); // Doppler and Pitch if(flags & AUD_RENDER_DOPPLER) { float vls; if(m_relative) vls = 0; else vls = SL * m_device->m_velocity / distance; float vss = SL * m_velocity / distance; float max = m_device->m_speed_of_sound / m_device->m_doppler_factor; if(vss >= max) { m_pitch->setPitch(AUD_PITCH_MAX); } else { if(vls > max) vls = max; m_pitch->setPitch((m_device->m_speed_of_sound - m_device->m_doppler_factor * vls) / (m_device->m_speed_of_sound - m_device->m_doppler_factor * vss) * m_user_pitch); } } else m_pitch->setPitch(m_user_pitch); if(flags & AUD_RENDER_VOLUME) { // Distance if(flags & AUD_RENDER_DISTANCE) { if(m_device->m_distance_model == AUD_DISTANCE_MODEL_INVERSE_CLAMPED || m_device->m_distance_model == AUD_DISTANCE_MODEL_LINEAR_CLAMPED || m_device->m_distance_model == AUD_DISTANCE_MODEL_EXPONENT_CLAMPED) { distance = AUD_MAX(AUD_MIN(m_distance_max, distance), m_distance_reference); } switch(m_device->m_distance_model) { case AUD_DISTANCE_MODEL_INVERSE: case AUD_DISTANCE_MODEL_INVERSE_CLAMPED: m_volume = m_distance_reference / (m_distance_reference + m_attenuation * (distance - m_distance_reference)); break; case AUD_DISTANCE_MODEL_LINEAR: case AUD_DISTANCE_MODEL_LINEAR_CLAMPED: { float temp = m_distance_max - m_distance_reference; if(temp == 0) { if(distance > m_distance_reference) m_volume = 0.0f; else m_volume = 1.0f; } else m_volume = 1.0f - m_attenuation * (distance - m_distance_reference) / (m_distance_max - m_distance_reference); break; } case AUD_DISTANCE_MODEL_EXPONENT: case AUD_DISTANCE_MODEL_EXPONENT_CLAMPED: if(m_distance_reference == 0) m_volume = 0; else m_volume = pow(distance / m_distance_reference, -m_attenuation); break; default: m_volume = 1.0f; } } else m_volume = 1.0f; // Cone if(flags & AUD_RENDER_CONE) { AUD_Vector3 SZ = m_orientation.getLookAt(); float phi = acos(float(SZ * SL / (SZ.length() * SL.length()))); float t = (phi - m_cone_angle_inner)/(m_cone_angle_outer - m_cone_angle_inner); if(t > 0) { if(t > 1) m_volume *= m_cone_volume_outer; else m_volume *= 1 + t * (m_cone_volume_outer - 1); } } if(m_volume > m_volume_max) m_volume = m_volume_max; else if(m_volume < m_volume_min) m_volume = m_volume_min; // Volume m_volume *= m_user_volume; } // 3D Cue AUD_Quaternion orientation; if(!m_relative) orientation = m_device->m_orientation; AUD_Vector3 Z = orientation.getLookAt(); AUD_Vector3 N = orientation.getUp(); AUD_Vector3 A = N * ((SL * N) / (N * N)) - SL; float Asquare = A * A; if(Asquare > 0) { float phi = acos(float(Z * A / (Z.length() * sqrt(Asquare)))); if(N.cross(Z) * A > 0) phi = -phi; m_mapper->setMonoAngle(phi); } else m_mapper->setMonoAngle(m_relative ? m_user_pan * M_PI / 2.0 : 0); }
void AUD_SequencerReader::read(int& length, bool& eos, sample_t* buffer) { AUD_MutexLock lock(*m_sequence); if(m_sequence->m_status != m_status) { m_device.changeSpecs(m_sequence->m_specs); m_device.setSpeedOfSound(m_sequence->m_speed_of_sound); m_device.setDistanceModel(m_sequence->m_distance_model); m_device.setDopplerFactor(m_sequence->m_doppler_factor); m_status = m_sequence->m_status; } if(m_sequence->m_entry_status != m_entry_status) { std::list<boost::shared_ptr<AUD_SequencerHandle> > handles; AUD_HandleIterator hit = m_handles.begin(); AUD_EntryIterator eit = m_sequence->m_entries.begin(); int result; boost::shared_ptr<AUD_SequencerHandle> handle; while(hit != m_handles.end() && eit != m_sequence->m_entries.end()) { handle = *hit; boost::shared_ptr<AUD_SequencerEntry> entry = *eit; result = handle->compare(entry); if(result < 0) { try { handle = boost::shared_ptr<AUD_SequencerHandle>(new AUD_SequencerHandle(entry, m_device)); handles.push_back(handle); } catch(AUD_Exception&) { } eit++; } else if(result == 0) { handles.push_back(handle); hit++; eit++; } else { handle->stop(); hit++; } } while(hit != m_handles.end()) { (*hit)->stop(); hit++; } while(eit != m_sequence->m_entries.end()) { try { handle = boost::shared_ptr<AUD_SequencerHandle>(new AUD_SequencerHandle(*eit, m_device)); handles.push_back(handle); } catch(AUD_Exception&) { } eit++; } m_handles = handles; m_entry_status = m_sequence->m_entry_status; } AUD_Specs specs = m_sequence->m_specs; int pos = 0; float time = float(m_position) / float(specs.rate); float volume, frame; int len, cfra; AUD_Vector3 v, v2; AUD_Quaternion q; while(pos < length) { frame = time * m_sequence->m_fps; cfra = int(floor(frame)); len = int(ceil((cfra + 1) / m_sequence->m_fps * specs.rate)) - m_position; len = AUD_MIN(length - pos, len); len = AUD_MAX(len, 1); for(AUD_HandleIterator it = m_handles.begin(); it != m_handles.end(); it++) { (*it)->update(time, frame, m_sequence->m_fps); } m_sequence->m_volume.read(frame, &volume); if(m_sequence->m_muted) volume = 0.0f; m_device.setVolume(volume); m_sequence->m_orientation.read(frame, q.get()); m_device.setListenerOrientation(q); m_sequence->m_location.read(frame, v.get()); m_device.setListenerLocation(v); m_sequence->m_location.read(frame + 1, v2.get()); v2 -= v; m_device.setListenerVelocity(v2 * m_sequence->m_fps); m_device.read(reinterpret_cast<data_t*>(buffer + specs.channels * pos), len); pos += len; time += float(len) / float(specs.rate); } m_position += length; eos = false; }
int AUD_SuperposeReader::getPosition() const { int pos1 = m_reader1->getPosition(); int pos2 = m_reader2->getPosition(); return AUD_MAX(pos1, pos2); }
AUD_Int32s wov_adapt_gmm_si() { AUD_Error error = AUD_ERROR_NONE; AUD_Int32s ret = 0; AUD_Int8s wavPath[256] = { 0, }; AUD_Int32s data; setbuf( stdout, NULL ); setbuf( stdin, NULL ); AUDLOG( "pls give adapt wav stream's folder path:\n" ); wavPath[0] = '\0'; data = scanf( "%s", wavPath ); AUDLOG( "adapt wav stream's folder path is: %s\n", wavPath ); // step 1: read UBM model from file void *hUbm = NULL; FILE *fpUbm = fopen( WOV_UBM_GMMMODEL_FILE, "rb" ); if ( fpUbm == NULL ) { AUDLOG( "cannot open ubm model file: [%s]\n", WOV_UBM_GMMMODEL_FILE ); return AUD_ERROR_IOFAILED; } error = gmm_import( &hUbm, fpUbm ); AUD_ASSERT( error == AUD_ERROR_NONE ); fclose( fpUbm ); fpUbm = NULL; // AUDLOG( "ubm GMM as:\n" ); // gmm_show( hUbm ); AUD_Int32s i = 0, j = 0; entry *pEntry = NULL; dir *pDir = NULL; AUD_Int32s totalWinNum = 0; pDir = openDir( (const char*)wavPath ); if ( pDir == NULL ) { AUDLOG( "cannot open folder: %s\n", wavPath ); return -1; } while ( ( pEntry = scanDir( pDir ) ) ) { AUD_Int8s keywordFile[256] = { 0, }; AUD_Summary fileSummary; AUD_Int32s sampleNum = 0; snprintf( (char*)keywordFile, 256, "%s/%s", wavPath, pEntry->name ); // AUDLOG( "%s\n", keywordFile ); ret = parseWavFromFile( keywordFile, &fileSummary ); if ( ret < 0 ) { continue; } AUD_ASSERT( fileSummary.channelNum == CHANNEL_NUM && fileSummary.bytesPerSample == BYTES_PER_SAMPLE && fileSummary.sampleRate == SAMPLE_RATE ); // request memeory for template sampleNum = fileSummary.dataChunkBytes / fileSummary.bytesPerSample; for ( j = 0; j * FRAME_STRIDE + FRAME_LEN <= sampleNum; j++ ) { ; } j = j - MFCC_DELAY; totalWinNum += j; } closeDir( pDir ); pDir = NULL; AUD_Matrix featureMatrix; featureMatrix.rows = totalWinNum; featureMatrix.cols = MFCC_FEATDIM; featureMatrix.dataType = AUD_DATATYPE_INT32S; ret = createMatrix( &featureMatrix ); AUD_ASSERT( ret == 0 ); AUD_Int32s currentRow = 0; pDir = openDir( (const char*)wavPath ); while ( ( pEntry = scanDir( pDir ) ) ) { AUD_Int8s keywordFile[256] = { 0, }; AUD_Summary fileSummary; AUD_Int32s sampleNum = 0; void *hMfccHandle = NULL; snprintf( (char*)keywordFile, 256, "%s/%s", wavPath, pEntry->name ); // AUDLOG( "%s\n", keywordFile ); ret = parseWavFromFile( keywordFile, &fileSummary ); if ( ret < 0 ) { continue; } AUD_ASSERT( fileSummary.channelNum == CHANNEL_NUM && fileSummary.bytesPerSample == BYTES_PER_SAMPLE && fileSummary.sampleRate == SAMPLE_RATE ); AUD_Int32s bufLen = fileSummary.dataChunkBytes; AUD_Int16s *pBuf = (AUD_Int16s*)calloc( bufLen, 1 ); AUD_ASSERT( pBuf ); sampleNum = readWavFromFile( (AUD_Int8s*)keywordFile, pBuf, bufLen ); AUD_ASSERT( sampleNum > 0 ); // pre-processing // pre-emphasis sig_preemphasis( pBuf, pBuf, sampleNum ); // calc framing number for ( j = 0; j * FRAME_STRIDE + FRAME_LEN <= sampleNum; j++ ) { ; } // XXX: select salient frames AUD_Feature feature; feature.featureMatrix.rows = j - MFCC_DELAY; feature.featureMatrix.cols = MFCC_FEATDIM; feature.featureMatrix.dataType = AUD_DATATYPE_INT32S; feature.featureMatrix.pInt32s = featureMatrix.pInt32s + currentRow * feature.featureMatrix.cols; feature.featureNorm.len = j - MFCC_DELAY; feature.featureNorm.dataType = AUD_DATATYPE_INT64S; ret = createVector( &(feature.featureNorm) ); AUD_ASSERT( ret == 0 ); error = mfcc16s32s_init( &hMfccHandle, FRAME_LEN, WINDOW_TYPE, MFCC_ORDER, FRAME_STRIDE, SAMPLE_RATE, COMPRESS_TYPE ); AUD_ASSERT( error == AUD_ERROR_NONE ); error = mfcc16s32s_calc( hMfccHandle, pBuf, sampleNum, &feature ); AUD_ASSERT( error == AUD_ERROR_NONE ); error = mfcc16s32s_deinit( &hMfccHandle ); AUD_ASSERT( error == AUD_ERROR_NONE ); free( pBuf ); pBuf = NULL; bufLen = 0; ret = destroyVector( &(feature.featureNorm) ); AUD_ASSERT( ret == 0 ); currentRow += feature.featureMatrix.rows; } closeDir( pDir ); pDir = NULL; AUD_Matrix llrMatrix; llrMatrix.rows = totalWinNum; llrMatrix.cols = gmm_getmixnum( hUbm ); llrMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &llrMatrix ); AUD_ASSERT( ret == 0 ); AUD_Double llr = 0.; for ( j = 0; j < featureMatrix.rows; j++ ) { AUD_Vector componentLLR; componentLLR.len = llrMatrix.cols; componentLLR.dataType = AUD_DATATYPE_DOUBLE; componentLLR.pDouble = llrMatrix.pDouble + j * llrMatrix.cols; llr = gmm_llr( hUbm, &(featureMatrix), j, &componentLLR ); } AUD_Vector sumLlr; sumLlr.len = llrMatrix.cols; sumLlr.dataType = AUD_DATATYPE_DOUBLE; ret = createVector( &sumLlr ); AUD_ASSERT( ret == 0 ); AUD_Double *pSumLlr = sumLlr.pDouble; for ( j = 0; j < llrMatrix.cols; j++ ) { pSumLlr[j] = llrMatrix.pDouble[j]; } for ( i = 1; i < llrMatrix.rows; i++ ) { for ( j = 0; j < llrMatrix.cols; j++ ) { pSumLlr[j] = logadd( pSumLlr[j], *(llrMatrix.pDouble + i * llrMatrix.cols + j) ); } } #if 0 AUD_Vector bestIndex; bestIndex.len = TOP_N; bestIndex.dataType = AUD_DATATYPE_INT32S; ret = createVector( &bestIndex ); AUD_ASSERT( ret == 0 ); // get top TOP_N component ret = sortVector( &sumLlr, &bestIndex ); AUD_ASSERT( ret == 0 ); #else llr = pSumLlr[0]; for ( j = 1; j < sumLlr.len; j++ ) { llr = logadd( llr, pSumLlr[j] ); } // AUDLOG( "llr: %.f\n", llr ); AUD_Vector sortIndex; sortIndex.len = sumLlr.len; sortIndex.dataType = AUD_DATATYPE_INT32S; ret = createVector( &sortIndex ); AUD_ASSERT( ret == 0 ); ret = sortVector( &sumLlr, &sortIndex ); AUD_ASSERT( ret == 0 ); int num = 0; double val = 0.; for ( i = 0; i < sortIndex.len; i++ ) { // ln( 0.001 ) ~= -7. val = pSumLlr[sortIndex.pInt32s[i]] - llr + 7.; // AUDLOG( "%f, \n", val ); if ( val < 0 ) { break; } num++; } // AUDLOG( "\n" ); AUD_ASSERT( num > 0 ); AUDLOG( "computed component num: %d\n", num ); num = AUD_MAX( num, TOP_N ); AUDLOG( "normalized component num: %d\n", num ); AUD_Vector bestIndex; bestIndex.len = num; bestIndex.dataType = AUD_DATATYPE_INT32S; bestIndex.pInt32s = sortIndex.pInt32s; #endif int slash = '/'; char *ptr = strrchr( (char*)wavPath, slash ); ptr++; // select imposter GMM void *hImposterGmm = NULL; AUD_Int8s imposterGmmName[256] = { 0, }; snprintf( (char*)imposterGmmName, 256, "%s-imposter", ptr ); error = gmm_select( &hImposterGmm, hUbm, &bestIndex, 0 | GMM_INVERTSELECT_MASK, imposterGmmName ); AUD_ASSERT( error == AUD_ERROR_NONE ); gmm_show( hImposterGmm ); // export gmm char imposterFile[256] = { 0 }; snprintf( imposterFile, 256, "%s/%s-imposter.gmm", WOV_IMPOSTER_GMMMODEL_DIR, ptr ); AUDLOG( "Export imposter GMM Model to: %s\n", imposterFile ); FILE *fpImposterGmm = fopen( imposterFile, "wb" ); AUD_ASSERT( fpImposterGmm ); error = gmm_export( hImposterGmm, fpImposterGmm ); AUD_ASSERT( error == AUD_ERROR_NONE ); AUDLOG( "Export imposter GMM Model File Done\n" ); fclose( fpImposterGmm ); fpImposterGmm = NULL; error = gmm_free( &hImposterGmm ); AUD_ASSERT( error == AUD_ERROR_NONE ); // select keyword GMM void *hAdaptedGmm = NULL; AUD_Int8s adaptedGmmName[256] = { 0, }; snprintf( (char*)adaptedGmmName, 256, "%s", ptr ); // AUDLOG( "%s\n", adaptedGmmName ); error = gmm_select( &hAdaptedGmm, hUbm, &bestIndex, 0, adaptedGmmName ); AUD_ASSERT( error == AUD_ERROR_NONE ); ret = destroyVector( &sumLlr ); AUD_ASSERT( ret == 0 ); ret = destroyMatrix( &llrMatrix ); AUD_ASSERT( ret == 0 ); #if 0 ret = destroyVector( &bestIndex ); AUD_ASSERT( ret == 0 ); #else ret = destroyVector( &sortIndex ); AUD_ASSERT( ret == 0 ); #endif #if 1 // adapt GMM error = gmm_adapt( hAdaptedGmm, &featureMatrix ); AUD_ASSERT( error == AUD_ERROR_NONE ); #endif gmm_show( hAdaptedGmm ); // export gmm char modelFile[256] = { 0 }; snprintf( modelFile, 256, "%s/%s.gmm", WOV_KEYWORD_GMMMODEL_DIR, ptr ); AUDLOG( "Export GMM Model to: %s\n", modelFile ); FILE *fpGmm = fopen( modelFile, "wb" ); AUD_ASSERT( fpGmm ); error = gmm_export( hAdaptedGmm, fpGmm ); AUD_ASSERT( error == AUD_ERROR_NONE ); AUDLOG( "Export GMM Model File Done\n" ); fclose( fpGmm ); fpGmm = NULL; ret = destroyMatrix( &featureMatrix ); AUD_ASSERT( ret == 0 ); error = gmm_free( &hAdaptedGmm ); AUD_ASSERT( error == AUD_ERROR_NONE ); error = gmm_free( &hUbm ); AUD_ASSERT( error == AUD_ERROR_NONE ); AUDLOG( "keyword model adapt2 done\n" ); return 0; }
AUD_FFMPEGWriter::AUD_FFMPEGWriter(std::string filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate) : m_position(0), m_specs(specs), m_input_samples(0) { static const char* formats[] = { NULL, "ac3", "flac", "matroska", "mp2", "mp3", "ogg", "wav" }; m_formatCtx = avformat_alloc_context(); if (!m_formatCtx) AUD_THROW(AUD_ERROR_FFMPEG, context_error); strcpy(m_formatCtx->filename, filename.c_str()); m_outputFmt = m_formatCtx->oformat = av_guess_format(formats[format], filename.c_str(), NULL); if (!m_outputFmt) { avformat_free_context(m_formatCtx); AUD_THROW(AUD_ERROR_FFMPEG, context_error); } switch(codec) { case AUD_CODEC_AAC: m_outputFmt->audio_codec = AV_CODEC_ID_AAC; break; case AUD_CODEC_AC3: m_outputFmt->audio_codec = AV_CODEC_ID_AC3; break; case AUD_CODEC_FLAC: m_outputFmt->audio_codec = AV_CODEC_ID_FLAC; break; case AUD_CODEC_MP2: m_outputFmt->audio_codec = AV_CODEC_ID_MP2; break; case AUD_CODEC_MP3: m_outputFmt->audio_codec = AV_CODEC_ID_MP3; break; case AUD_CODEC_PCM: switch(specs.format) { case AUD_FORMAT_U8: m_outputFmt->audio_codec = AV_CODEC_ID_PCM_U8; break; case AUD_FORMAT_S16: m_outputFmt->audio_codec = AV_CODEC_ID_PCM_S16LE; break; case AUD_FORMAT_S24: m_outputFmt->audio_codec = AV_CODEC_ID_PCM_S24LE; break; case AUD_FORMAT_S32: m_outputFmt->audio_codec = AV_CODEC_ID_PCM_S32LE; break; case AUD_FORMAT_FLOAT32: m_outputFmt->audio_codec = AV_CODEC_ID_PCM_F32LE; break; case AUD_FORMAT_FLOAT64: m_outputFmt->audio_codec = AV_CODEC_ID_PCM_F64LE; break; default: m_outputFmt->audio_codec = AV_CODEC_ID_NONE; break; } break; case AUD_CODEC_VORBIS: m_outputFmt->audio_codec = AV_CODEC_ID_VORBIS; break; default: m_outputFmt->audio_codec = AV_CODEC_ID_NONE; break; } try { if(m_outputFmt->audio_codec == AV_CODEC_ID_NONE) AUD_THROW(AUD_ERROR_SPECS, codec_error); m_stream = avformat_new_stream(m_formatCtx, NULL); if(!m_stream) AUD_THROW(AUD_ERROR_FFMPEG, stream_error); m_codecCtx = m_stream->codec; m_codecCtx->codec_id = m_outputFmt->audio_codec; m_codecCtx->codec_type = AVMEDIA_TYPE_AUDIO; m_codecCtx->bit_rate = bitrate; m_codecCtx->sample_rate = int(m_specs.rate); m_codecCtx->channels = m_specs.channels; m_codecCtx->time_base.num = 1; m_codecCtx->time_base.den = m_codecCtx->sample_rate; switch(m_specs.format) { case AUD_FORMAT_U8: m_convert = AUD_convert_float_u8; m_codecCtx->sample_fmt = AV_SAMPLE_FMT_U8; break; case AUD_FORMAT_S16: m_convert = AUD_convert_float_s16; m_codecCtx->sample_fmt = AV_SAMPLE_FMT_S16; break; case AUD_FORMAT_S32: m_convert = AUD_convert_float_s32; m_codecCtx->sample_fmt = AV_SAMPLE_FMT_S32; break; case AUD_FORMAT_FLOAT32: m_convert = AUD_convert_copy<float>; m_codecCtx->sample_fmt = AV_SAMPLE_FMT_FLT; break; case AUD_FORMAT_FLOAT64: m_convert = AUD_convert_float_double; m_codecCtx->sample_fmt = AV_SAMPLE_FMT_DBL; break; default: AUD_THROW(AUD_ERROR_FFMPEG, format_error); } try { if(m_formatCtx->oformat->flags & AVFMT_GLOBALHEADER) m_codecCtx->flags |= CODEC_FLAG_GLOBAL_HEADER; AVCodec* codec = avcodec_find_encoder(m_codecCtx->codec_id); if(!codec) AUD_THROW(AUD_ERROR_FFMPEG, codec_error); if(codec->sample_fmts) { // Check if the prefered sample format for this codec is supported. const enum AVSampleFormat *p = codec->sample_fmts; for(; *p != -1; p++) { if(*p == m_stream->codec->sample_fmt) break; } if(*p == -1) { // Sample format incompatible with codec. Defaulting to a format known to work. m_stream->codec->sample_fmt = codec->sample_fmts[0]; } } if(avcodec_open2(m_codecCtx, codec, NULL)) AUD_THROW(AUD_ERROR_FFMPEG, codec_error); m_output_buffer.resize(FF_MIN_BUFFER_SIZE); int samplesize = AUD_MAX(AUD_SAMPLE_SIZE(m_specs), AUD_DEVICE_SAMPLE_SIZE(m_specs)); if(m_codecCtx->frame_size <= 1) { m_input_size = FF_MIN_BUFFER_SIZE * 8 / m_codecCtx->bits_per_coded_sample / m_codecCtx->channels; m_input_buffer.resize(m_input_size * samplesize); } else { m_input_buffer.resize(m_codecCtx->frame_size * samplesize); m_input_size = m_codecCtx->frame_size; } #ifdef FFMPEG_HAVE_ENCODE_AUDIO2 m_frame = av_frame_alloc(); if (!m_frame) AUD_THROW(AUD_ERROR_FFMPEG, codec_error); avcodec_get_frame_defaults(m_frame); m_frame->linesize[0] = m_input_size * samplesize; m_frame->format = m_codecCtx->sample_fmt; m_frame->nb_samples = m_input_size; # ifdef FFMPEG_HAVE_AVFRAME_SAMPLE_RATE m_frame->sample_rate = m_codecCtx->sample_rate; # endif # ifdef FFMPEG_HAVE_FRAME_CHANNEL_LAYOUT m_frame->channel_layout = m_codecCtx->channel_layout; # endif m_sample_size = av_get_bytes_per_sample(m_codecCtx->sample_fmt); m_frame_pts = 0; m_deinterleave = av_sample_fmt_is_planar(m_codecCtx->sample_fmt); if(m_deinterleave) m_deinterleave_buffer.resize(m_input_size * m_codecCtx->channels * m_sample_size); #endif try { if(avio_open(&m_formatCtx->pb, filename.c_str(), AVIO_FLAG_WRITE)) AUD_THROW(AUD_ERROR_FILE, file_error); avformat_write_header(m_formatCtx, NULL); } catch(AUD_Exception&) { avcodec_close(m_codecCtx); av_freep(&m_formatCtx->streams[0]->codec); throw; } } catch(AUD_Exception&) { av_freep(&m_formatCtx->streams[0]); throw; } } catch(AUD_Exception&) { av_free(m_formatCtx); throw; } }
AUD_Int32s denoise_aud( AUD_Int16s *pInBuf, AUD_Int16s *pOutBuf, AUD_Int32s inLen ) { Fft_16s *hFft = NULL; Ifft_16s *hIfft = NULL; AUD_Window16s *hWin = NULL; AUD_Int32s frameSize = 512; AUD_Int32s frameStride = 256; AUD_Int32s frameOverlap = 256; AUD_Int32s nFFT = frameSize; AUD_Int32s nSpecLen = nFFT / 2 + 1; AUD_Int32s nNoiseFrame = 6; // (AUD_Int32s)( ( 0.25 * SAMPLE_RATE - frameSize ) / frameStride + 1 ); AUD_Int32s i, j, k, m, n, ret; AUD_Int32s cleanLen = 0; // pre-emphasis // sig_preemphasis( pInBuf, pInBuf, inLen ); // init hamming module win16s_init( &hWin, AUD_WIN_HAMM, frameSize, 14 ); AUD_ASSERT( hWin ); // init fft handle fft_init( &hFft, nFFT, 15 ); AUD_ASSERT( hFft ); // init ifft handle ifft_init( &hIfft, nFFT, 15 ); AUD_ASSERT( hIfft ); AUD_Int16s *pFrame = (AUD_Int16s*)calloc( frameSize * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pFrame ); // FFT AUD_Int32s *pFFTMag = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTMag ); AUD_Int32s *pFFTRe = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTRe ); AUD_Int32s *pFFTIm = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTIm ); AUD_Int32s *pFFTCleanRe = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanRe ); AUD_Int32s *pFFTCleanIm = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanIm ); // noise spectrum AUD_Double *pNoiseEn = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pNoiseEn ); AUD_Double *pNoiseB = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pNoiseB ); AUD_Double *pXPrev = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pXPrev ); AUD_Double *pAb = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pAb ); AUD_Double *pH = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pH ); AUD_Double *pGammak = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pGammak ); AUD_Double *pKsi = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pKsi ); AUD_Double *pLogSigmak = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pLogSigmak ); AUD_Double *pAlpha = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pAlpha ); AUD_Int32s *pLinToBark = (AUD_Int32s*)calloc( nSpecLen * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pLinToBark ); AUD_Int16s *pxOld = (AUD_Int16s*)calloc( frameOverlap * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pxOld ); AUD_Int16s *pxClean = (AUD_Int16s*)calloc( nFFT * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pxClean ); /* AUD_Int32s critBandEnds[22] = { 0, 100, 200, 300, 400, 510, 630, 770, 920, 1080, 1270, 1480, 1720, 2000, 2320, 2700, 3150, 3700, 4400, 5300, 6400, 7700 }; */ AUD_Int32s critFFTEnds[CRITICAL_BAND_NUM + 1] = { 0, 4, 7, 10, 13, 17, 21, 25, 30, 35, 41, 48, 56, 64, 75, 87, 101, 119, 141, 170, 205, 247, 257 }; // generate linear->bark transform mapping k = 0; for ( i = 0; i < CRITICAL_BAND_NUM; i++ ) { while ( k >= critFFTEnds[i] && k < critFFTEnds[i + 1] ) { pLinToBark[k] = i; k++; } } AUD_Double absThr[CRITICAL_BAND_NUM] = { 38, 31, 22, 18.5, 15.5, 13, 11, 9.5, 8.75, 7.25, 4.75, 2.75, 1.5, 0.5, 0, 0, 0, 0, 2, 7, 12, 15.5 }; AUD_Double dbOffset[CRITICAL_BAND_NUM]; AUD_Double sumn[CRITICAL_BAND_NUM]; AUD_Double spread[CRITICAL_BAND_NUM]; for ( i = 0; i < CRITICAL_BAND_NUM; i++ ) { absThr[i] = pow( 10., absThr[i] / 10. ) / nFFT / ( 65535. * 65535. ); dbOffset[i] = 10. + i; sumn[i] = 0.474 + i; spread[i] = pow( 10., ( 15.81 + 7.5 * sumn[i] - 17.5 * sqrt( 1. + sumn[i] * sumn[i] ) ) / 10. ); } AUD_Double dcGain[CRITICAL_BAND_NUM]; for ( i = 0; i < CRITICAL_BAND_NUM; i++ ) { dcGain[i] = 0.; for ( j = 0; j < CRITICAL_BAND_NUM; j++ ) { dcGain[i] += spread[MABS( i - j )]; } } AUD_Matrix exPatMatrix; exPatMatrix.rows = CRITICAL_BAND_NUM; exPatMatrix.cols = nSpecLen; exPatMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &exPatMatrix ); AUD_ASSERT( ret == 0 ); // excitation pattern AUD_Int32s index = 0; for ( i = 0; i < exPatMatrix.rows; i++ ) { AUD_Double *pExpatRow = exPatMatrix.pDouble + i * exPatMatrix.cols; for ( j = 0; j < exPatMatrix.cols; j++ ) { index = MABS( i - pLinToBark[j] ); pExpatRow[j] = spread[index]; } } AUD_Int32s frameNum = (inLen - frameSize) / frameStride + 1; AUD_ASSERT( frameNum > nNoiseFrame ); // compute noise mean for ( i = 0; i < nNoiseFrame; i++ ) { win16s_calc( hWin, pInBuf + i * frameSize, pFrame ); fft_mag( hFft, pFrame, frameSize, pFFTMag ); for ( j = 0; j < nSpecLen; j++ ) { pNoiseEn[j] += pFFTMag[j] / 32768. * pFFTMag[j] / 32768.; } } for ( j = 0; j < nSpecLen; j++ ) { pNoiseEn[j] /= nNoiseFrame; } // get cirtical band mean filtered noise power AUD_Int32s k1 = 0, k2 = 0; for ( i = 0; i < CRITICAL_BAND_NUM; i++ ) { k1 = k2; AUD_Double segSum = 0.; while ( k2 >= critFFTEnds[i] && k2 < critFFTEnds[i + 1] ) { segSum += pNoiseEn[k2]; k2++; } segSum /= ( k2 - k1 ); for ( m = k1; m < k2; m++ ) { pNoiseB[m] = segSum; } } #if 0 AUDLOG( "noise band spectrum:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pNoiseB[j] ); } AUDLOG( "\n" ); #endif AUD_Matrix frameMatrix; frameMatrix.rows = nSpecLen; frameMatrix.cols = 1; frameMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &frameMatrix ); AUD_ASSERT( ret == 0 ); AUD_Double *pFrameEn = frameMatrix.pDouble; AUD_Matrix xMatrix; xMatrix.rows = nSpecLen; xMatrix.cols = 1; xMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &xMatrix ); AUD_ASSERT( ret == 0 ); AUD_Double *pX = xMatrix.pDouble; AUD_Matrix cMatrix; cMatrix.rows = CRITICAL_BAND_NUM; cMatrix.cols = 1; cMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &cMatrix ); AUD_ASSERT( ret == 0 ); AUD_Double *pC = cMatrix.pDouble; AUD_Matrix tMatrix; tMatrix.rows = 1; tMatrix.cols = CRITICAL_BAND_NUM; tMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &tMatrix ); AUD_ASSERT( ret == 0 ); AUD_Double *pT = tMatrix.pDouble; AUD_Matrix tkMatrix; tkMatrix.rows = 1; tkMatrix.cols = nSpecLen; tkMatrix.dataType = AUD_DATATYPE_DOUBLE; ret = createMatrix( &tkMatrix ); AUD_ASSERT( ret == 0 ); AUD_Double *pTk = tkMatrix.pDouble; AUD_Double dB0[CRITICAL_BAND_NUM]; AUD_Double epsilon = pow( 2, -52 ); #define ESTIMATE_MASKTHRESH( sigMatrix, tkMatrix )\ do {\ AUD_Double *pSig = sigMatrix.pDouble; \ for ( m = 0; m < exPatMatrix.rows; m++ ) \ { \ AUD_Double suma = 0.; \ AUD_Double *pExpatRow = exPatMatrix.pDouble + m * exPatMatrix.cols; \ for ( n = 0; n < exPatMatrix.cols; n++ ) \ { \ suma += pExpatRow[n] * pSig[n]; \ } \ pC[m] = suma; \ } \ AUD_Double product = 1.; \ AUD_Double sum = 0.; \ for ( m = 0; m < sigMatrix.rows; m++ ) \ { \ product *= pSig[m]; \ sum += pSig[m]; \ } \ AUD_Double power = 1. / sigMatrix.rows;\ AUD_Double sfmDB = 10. * log10( pow( product, power ) / sum / sigMatrix.rows + epsilon ); \ AUD_Double alpha = AUD_MIN( 1., sfmDB / (-60.) ); \ for ( m = 0; m < tMatrix.cols; m++ ) \ { \ dB0[m] = dbOffset[m] * alpha + 5.5; \ pT[m] = pC[m] / pow( 10., dB0[m] / 10. ) / dcGain[m]; \ pT[m] = AUD_MAX( pT[m], absThr[m] ); \ } \ for ( m = 0; m < tkMatrix.cols; m++ ) \ { \ pTk[m] = pT[pLinToBark[m]]; \ } \ } while ( 0 ) AUD_Double aa = 0.98; AUD_Double mu = 0.98; AUD_Double eta = 0.15; AUD_Double vadDecision; k = 0; // start processing for ( i = 0; i < frameNum; i++ ) { win16s_calc( hWin, pInBuf + i * frameStride, pFrame ); fft_calc( hFft, pFrame, frameSize, pFFTRe, pFFTIm ); // compute SNR vadDecision = 0.; for ( j = 0; j < nSpecLen; j++ ) { pFrameEn[j] = pFFTRe[j] / 32768. * pFFTRe[j] / 32768. + pFFTIm[j] / 32768. * pFFTIm[j] / 32768.; pGammak[j] = AUD_MIN( pFrameEn[j] / pNoiseEn[j], 40. ); if ( i > 0 ) { pKsi[j] = aa * pXPrev[j] / pNoiseEn[j] + ( 1 - aa ) * AUD_MAX( pGammak[j] - 1., 0. ); } else { pKsi[j] = aa + ( 1. - aa ) * AUD_MAX( pGammak[j] - 1., 0. ); } pLogSigmak[j] = pGammak[j] * pKsi[j] / ( 1. + pKsi[j] ) - log( 1. + pKsi[j] ); vadDecision += ( j > 0 ? 2 : 1 ) * pLogSigmak[j]; } vadDecision /= nFFT; #if 0 AUDLOG( "X prev:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pXPrev[j] ); } AUDLOG( "\n" ); #endif #if 0 AUDLOG( "gamma k:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pGammak[j] ); } AUDLOG( "\n" ); #endif #if 0 AUDLOG( "ksi:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pKsi[j] ); } AUDLOG( "\n" ); #endif #if 0 AUDLOG( "log sigma k:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pLogSigmak[j] ); } AUDLOG( "\n" ); #endif // AUDLOG( "vadDecision: %.2f\n", vadDecision ); // re-estimate noise if ( vadDecision < eta ) { for ( j = 0; j < nSpecLen; j++ ) { pNoiseEn[j] = mu * pNoiseEn[j] + ( 1. - mu ) * pFrameEn[j]; } // re-estimate crital band based noise AUD_Int32s k1 = 0, k2 = 0; for ( int band = 0; band < CRITICAL_BAND_NUM; band++ ) { k1 = k2; AUD_Double segSum = 0.; while ( k2 >= critFFTEnds[band] && k2 < critFFTEnds[band + 1] ) { segSum += pNoiseEn[k2]; k2++; } segSum /= ( k2 - k1 ); for ( m = k1; m < k2; m++ ) { pNoiseB[m] = segSum; } } #if 0 AUDLOG( "noise band spectrum:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pNoiseB[j] ); } AUDLOG( "\n" ); #endif } for ( j = 0; j < nSpecLen; j++ ) { pX[j] = AUD_MAX( pFrameEn[j] - pNoiseEn[j], 0.001 ); pXPrev[j] = pFrameEn[j]; } ESTIMATE_MASKTHRESH( xMatrix, tkMatrix ); for ( int iter = 0; iter < 2; iter++ ) { for ( j = 0; j < nSpecLen; j++ ) { pAb[j] = pNoiseB[j] + pNoiseB[j] * pNoiseB[j] / pTk[j]; pFrameEn[j] = pFrameEn[j] * pFrameEn[j] / ( pFrameEn[j] + pAb[j] ); ESTIMATE_MASKTHRESH( frameMatrix, tkMatrix ); #if 0 showMatrix( &tMatrix ); #endif } } #if 0 AUDLOG( "tk:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pTk[j] ); } AUDLOG( "\n" ); #endif pAlpha[0] = ( pNoiseB[0] + pTk[0] ) * ( pNoiseB[0] / pTk[0] ); pH[0] = pFrameEn[0] / ( pFrameEn[0] + pAlpha[0] ); pXPrev[0] *= pH[0] * pH[0]; pFFTCleanRe[0] = 0; pFFTCleanIm[0] = 0; for ( j = 1; j < nSpecLen; j++ ) { pAlpha[j] = ( pNoiseB[j] + pTk[j] ) * ( pNoiseB[j] / pTk[j] ); pH[j] = pFrameEn[j] / ( pFrameEn[j] + pAlpha[j] ); pFFTCleanRe[j] = pFFTCleanRe[nFFT - j] = (AUD_Int32s)round( pH[j] * pFFTRe[j] ); pFFTCleanIm[j] = (AUD_Int32s)round( pH[j] * pFFTIm[j] ); pFFTCleanIm[nFFT - j] = -pFFTCleanIm[j]; pXPrev[j] *= pH[j] * pH[j]; } #if 0 AUDLOG( "denoise transfer function:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pH[j] ); } AUDLOG( "\n" ); #endif #if 0 AUDLOG( "clean FFT with phase:\n" ); for ( j = 0; j < nFFT; j++ ) { AUDLOG( "%d + j%d, ", pFFTCleanRe[j], pFFTCleanIm[j] ); } AUDLOG( "\n" ); #endif ifft_real( hIfft, pFFTCleanRe, pFFTCleanIm, nFFT, pxClean ); #if 0 AUDLOG( "clean speech:\n" ); for ( j = 0; j < nFFT; j++ ) { AUDLOG( "%d, ", pxClean[j] ); } AUDLOG( "\n" ); #endif for ( j = 0; j < frameStride; j++ ) { if ( j < frameOverlap ) { pOutBuf[k + j] = pxOld[j] + pxClean[j]; pxOld[j] = pxClean[frameStride + j]; } else { pOutBuf[k + j] = pxClean[j]; } } k += frameStride; cleanLen += frameStride; } // de-emphasis // sig_deemphasis( pOutBuf, pOutBuf, cleanLen ); win16s_free( &hWin ); fft_free( &hFft ); ifft_free( &hIfft ); free( pFrame ); free( pNoiseEn ); free( pNoiseB ); free( pFFTMag ); free( pFFTRe ); free( pFFTIm ); free( pXPrev ); free( pAb ); free( pH ); free( pFFTCleanRe ); free( pFFTCleanIm ); free( pxOld ); free( pxClean ); free( pGammak ); free( pKsi ); free( pLogSigmak ); free( pAlpha ); free( pLinToBark ); ret = createMatrix( &xMatrix ); AUD_ASSERT( ret == 0 ); ret = destroyMatrix( &exPatMatrix ); AUD_ASSERT( ret == 0 ); ret = destroyMatrix( &frameMatrix ); AUD_ASSERT( ret == 0 ); ret = destroyMatrix( &cMatrix ); AUD_ASSERT( ret == 0 ); ret = destroyMatrix( &tMatrix ); AUD_ASSERT( ret == 0 ); ret = destroyMatrix( &tkMatrix ); AUD_ASSERT( ret == 0 ); return cleanLen; }
AUD_Int32s denoise_mmse( AUD_Int16s *pInBuf, AUD_Int16s *pOutBuf, AUD_Int32s inLen ) { Fft_16s *hFft = NULL; Ifft_16s *hIfft = NULL; AUD_Window16s *hWin = NULL; AUD_Int32s frameSize = 512; AUD_Int32s frameStride = 256; AUD_Int32s frameOverlap = 256; AUD_Int32s nFFT = frameSize; AUD_Int32s nSpecLen = nFFT / 2 + 1; AUD_Int32s nNoiseFrame = (AUD_Int32s)( ( 0.25 * SAMPLE_RATE - frameSize ) / frameStride + 1 );; AUD_Int32s i, j, k; AUD_Int32s cleanLen = 0; // pre-emphasis // sig_preemphasis( pInBuf, pInBuf, inLen ); // init hamming module win16s_init( &hWin, AUD_WIN_HAMM, frameSize, 14 ); AUD_ASSERT( hWin ); // init fft handle fft_init( &hFft, nFFT, 15 ); AUD_ASSERT( hFft ); // init ifft handle ifft_init( &hIfft, nFFT, 15 ); AUD_ASSERT( hIfft ); AUD_Int16s *pFrame = (AUD_Int16s*)calloc( frameSize * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pFrame ); // FFT AUD_Int32s *pFFTMag = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTMag ); AUD_Int32s *pFFTRe = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTRe ); AUD_Int32s *pFFTIm = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTIm ); AUD_Int32s *pFFTCleanRe = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanRe ); AUD_Int32s *pFFTCleanIm = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanIm ); AUD_Double *pNoiseMean = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pNoiseMean ); AUD_Int32s *pFFTCleanMag = (AUD_Int32s*)calloc( nSpecLen * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanMag ); AUD_Int16s *pxOld = (AUD_Int16s*)calloc( frameSize * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pxOld ); AUD_Int16s *pxClean = (AUD_Int16s*)calloc( frameSize * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pxClean ); for ( i = 0; (i < nNoiseFrame) && ( (i * frameStride + frameSize) < inLen ); i++ ) { win16s_calc( hWin, pInBuf + i * frameSize, pFrame ); fft_mag( hFft, pFrame, frameSize, pFFTMag ); for ( j = 0; j < nSpecLen; j++ ) { pNoiseMean[j] += (AUD_Double)pFFTMag[j]; } } // compute noise mean for ( j = 0; j < nSpecLen; j++ ) { pNoiseMean[j] /= nNoiseFrame; } AUD_Double thres = 3.; AUD_Double alpha = 2.; AUD_Double flr = 0.002; AUD_Double G = 0.9; AUD_Double snr, beta; AUD_Double tmp; AUD_Double sigEnergy, noiseEnergy; k = 0; // start processing for ( i = 0; (i * frameStride + frameSize) < inLen; i++ ) { win16s_calc( hWin, pInBuf + i * frameStride, pFrame ); fft_calc( hFft, pFrame, frameSize, pFFTRe, pFFTIm ); // compute SNR sigEnergy = 0.; noiseEnergy = 0.; for ( j = 0; j < nSpecLen; j++ ) { tmp = pFFTRe[j] / 32768. * pFFTRe[j] / 32768. + pFFTIm[j] / 32768. * pFFTIm[j] / 32768.; sigEnergy += tmp; noiseEnergy += pNoiseMean[j] / 32768. * pNoiseMean[j] / 32768.; tmp = sqrt( tmp ) * 32768.; if ( tmp > MAX_INT32S ) { AUD_ECHO pFFTMag[j] = MAX_INT32S; } else { pFFTMag[j] = (AUD_Int32s)round( tmp ); } } snr = 10. * log10( sigEnergy / noiseEnergy ); beta = berouti( snr ); // AUDLOG( "signal energy: %.2f, noise energy: %.2f, snr: %.2f, beta: %.2f\n", sigEnergy, noiseEnergy, snr, beta ); #if 0 AUDLOG( "noisy FFT:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%d, ", pFFTMag[j] ); } AUDLOG( "\n" ); #endif tmp = 0.; for ( j = 0; j < nSpecLen; j++ ) { tmp = AUD_MAX( pow( pFFTMag[j], alpha ) - beta * pow( pNoiseMean[j], alpha ), flr * pow( pNoiseMean[j], alpha ) ); pFFTCleanMag[j] = (AUD_Int32s)round( pow( tmp, 1. / alpha ) ); } // re-estimate noise if ( snr < thres ) { AUD_Double tmpNoise; for ( j = 0; j < nSpecLen; j++ ) { tmpNoise = G * pow( pNoiseMean[j], alpha ) + ( 1 - G ) * pow( pFFTMag[j], alpha ); pNoiseMean[j] = pow( tmpNoise, 1. / alpha ); } } #if 0 AUDLOG( "clean FFT:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%d, ", pFFTCleanMag[j] ); } AUDLOG( "\n" ); #endif pFFTCleanRe[0] = pFFTCleanMag[0]; pFFTCleanIm[0] = 0; AUD_Double costheta, sintheta; for ( j = 1; j < nSpecLen; j++ ) { if ( pFFTMag[j] != 0 ) { costheta = (AUD_Double)pFFTRe[j] / (AUD_Double)pFFTMag[j]; sintheta = (AUD_Double)pFFTIm[j] / (AUD_Double)pFFTMag[j]; pFFTCleanRe[nFFT - j] = pFFTCleanRe[j] = (AUD_Int32s)round( costheta * pFFTCleanMag[j] ); pFFTCleanIm[j] = (AUD_Int32s)round( sintheta * pFFTCleanMag[j] ); pFFTCleanIm[nFFT - j] = -pFFTCleanIm[j]; } else { pFFTCleanRe[nFFT - j] = pFFTCleanRe[j] = pFFTCleanMag[j]; pFFTCleanIm[nFFT - j] = pFFTCleanIm[j] = 0; } } #if 0 AUDLOG( "clean FFT with phase:\n" ); for ( j = 0; j < nFFT; j++ ) { AUDLOG( "%d + j%d, ", pFFTCleanRe[j], pFFTCleanIm[j] ); } AUDLOG( "\n" ); #endif ifft_real( hIfft, pFFTCleanRe, pFFTCleanIm, nFFT, pxClean ); #if 0 AUDLOG( "clean speech:\n" ); for ( j = 0; j < nFFT; j++ ) { AUDLOG( "%d, ", pxClean[j] ); } AUDLOG( "\n" ); #endif for ( j = 0; j < frameStride; j++ ) { pOutBuf[k + j] = pxOld[j] + pxClean[j]; pxOld[j] = pxClean[frameOverlap + j]; } k += frameStride; cleanLen += frameStride; } // de-emphasis // sig_deemphasis( pOutBuf, pOutBuf, cleanLen ); win16s_free( &hWin ); fft_free( &hFft ); ifft_free( &hIfft ); free( pFrame ); free( pNoiseMean ); free( pFFTMag ); free( pFFTRe ); free( pFFTIm ); free( pFFTCleanMag ); free( pFFTCleanRe ); free( pFFTCleanIm ); free( pxOld ); free( pxClean ); return cleanLen; }
AUD_Int32s denoise_wiener( AUD_Int16s *pInBuf, AUD_Int16s *pOutBuf, AUD_Int32s inLen ) { Fft_16s *hFft = NULL; Ifft_16s *hIfft = NULL; AUD_Window16s *hWin = NULL; _VAD_Nest vadState; AUD_Int32s frameSize = 512; AUD_Int32s frameStride = 256; AUD_Int32s nFFT = frameSize; AUD_Int32s nSpecLen = nFFT / 2 + 1; AUD_Int32s nNoiseFrame = (AUD_Int32s)( ( 0.25 * SAMPLE_RATE - frameSize ) / frameStride + 1 ); AUD_Int32s i, j, k; AUD_Int32s cleanLen = 0; vadState.meanEn = 280; vadState.nbSpeechFrame = 0; vadState.hangOver = 0; AUD_Int16s *pFrame = (AUD_Int16s*)calloc( frameSize * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pFrame ); AUD_Double *pNoiseMean = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pNoiseMean ); AUD_Double *pLamdaD = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pLamdaD ); AUD_Double *pXi = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pXi ); AUD_Double *pGamma = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pGamma ); AUD_Double *pG = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pG ); AUD_Double *pGammaNew = (AUD_Double*)calloc( nSpecLen * sizeof(AUD_Double), 1 ); AUD_ASSERT( pGammaNew ); for ( j = 0; j < nSpecLen; j++ ) { pG[j] = 1.; pGamma[j] = 1.; } // FFT AUD_Int32s *pFFTMag = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTMag ); AUD_Int32s *pFFTRe = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTRe ); AUD_Int32s *pFFTIm = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTIm ); AUD_Double *pFFTCleanMag = (AUD_Double*)calloc( nFFT * sizeof(AUD_Double), 1 ); AUD_ASSERT( pFFTCleanMag ); AUD_Int32s *pFFTCleanRe = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanRe ); AUD_Int32s *pFFTCleanIm = (AUD_Int32s*)calloc( nFFT * sizeof(AUD_Int32s), 1 ); AUD_ASSERT( pFFTCleanIm ); AUD_Int16s *pxClean = (AUD_Int16s*)calloc( nFFT * sizeof(AUD_Int16s), 1 ); AUD_ASSERT( pxClean ); memset( pOutBuf, 0, inLen * sizeof(AUD_Int16s) ); // init hamming module win16s_init( &hWin, AUD_WIN_HAMM, frameSize, 14 ); AUD_ASSERT( hWin ); // init fft handle fft_init( &hFft, nFFT, 15 ); AUD_ASSERT( hFft ); // init ifft handle ifft_init( &hIfft, nFFT, 15 ); AUD_ASSERT( hIfft ); // noise manipulation for ( i = 0; (i < nNoiseFrame) && ( (i * frameStride + frameSize) < inLen ); i++ ) { win16s_calc( hWin, pInBuf + i * frameStride, pFrame ); fft_mag( hFft, pFrame, frameSize, pFFTMag ); for ( j = 0; j < nSpecLen; j++ ) { pNoiseMean[j] += (AUD_Double)pFFTMag[j]; pLamdaD[j] += (AUD_Double)pFFTMag[j] * (AUD_Double)pFFTMag[j]; } } // compute noise mean for ( j = 0; j < nSpecLen; j++ ) { pNoiseMean[j] /= nNoiseFrame; pLamdaD[j] /= nNoiseFrame; } AUD_Int32s vadFlag = 0; AUD_Double noiseLen = 9.; AUD_Double alpha = 0.99; k = 0; for ( i = 0; (i * frameStride + frameSize) < inLen; i++ ) { win16s_calc( hWin, pInBuf + i * frameStride, pFrame ); fft_calc( hFft, pFrame, frameSize, pFFTRe, pFFTIm ); for ( j = 0; j < nSpecLen; j++ ) { pFFTMag[j] = (AUD_Int32s)round( sqrt( (AUD_Double)pFFTRe[j] * pFFTRe[j] + (AUD_Double)pFFTIm[j] * pFFTIm[j] ) ); } #if 0 AUDLOG( "noisy FFT:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%d, ", pFFTMag[j] ); } AUDLOG( "\n" ); #endif vadFlag = vad_nest( &vadState, pFrame, frameSize ); if ( vadFlag == 0 ) { for ( j = 0; j < nSpecLen; j++ ) { pNoiseMean[j] = ( noiseLen * pNoiseMean[j] + (AUD_Double)pFFTMag[j] ) / ( noiseLen + 1. ); pLamdaD[j] = ( noiseLen * pLamdaD[j] + (AUD_Double)pFFTMag[j] * pFFTMag[j] ) / ( noiseLen + 1. ); } } for ( j = 0; j < nSpecLen; j++ ) { pGammaNew[j] = (AUD_Double)pFFTMag[j] * pFFTMag[j] / pLamdaD[j]; pXi[j] = alpha * pG[j] * pG[j] * pGamma[j] + ( 1. - alpha ) * AUD_MAX( pGammaNew[j] - 1., 0. ); pGamma[j] = pGammaNew[j]; pG[j] = pXi[j] / ( pXi[j] + 1. ); pFFTCleanMag[j] = pG[j] * pFFTMag[j]; } #if 0 AUDLOG( "clean FFT:\n" ); for ( j = 0; j < nSpecLen; j++ ) { AUDLOG( "%.2f, ", pFFTCleanMag[j] ); } AUDLOG( "\n" ); #endif // combine to real/im part of IFFT pFFTCleanRe[0] = pFFTCleanMag[0]; pFFTCleanIm[0] = 0; AUD_Double costheta, sintheta; for ( j = 1; j < nSpecLen; j++ ) { if ( pFFTMag[j] != 0 ) { costheta = (AUD_Double)pFFTRe[j] / (AUD_Double)pFFTMag[j]; sintheta = (AUD_Double)pFFTIm[j] / (AUD_Double)pFFTMag[j]; pFFTCleanRe[nFFT - j] = pFFTCleanRe[j] = (AUD_Int32s)round( costheta * pFFTCleanMag[j] ); pFFTCleanIm[j] = (AUD_Int32s)round( sintheta * pFFTCleanMag[j] ); pFFTCleanIm[nFFT - j] = -pFFTCleanIm[j]; } else { pFFTCleanRe[nFFT - j] = pFFTCleanRe[j] = pFFTCleanMag[j]; pFFTCleanIm[nFFT - j] = pFFTCleanIm[j] = 0; } } ifft_real( hIfft, pFFTCleanRe, pFFTCleanIm, nFFT, pxClean ); #if 0 AUDLOG( "clean FFT with phase:\n" ); for ( j = 0; j < nFFT; j++ ) { AUDLOG( "%d + j%d, ", pFFTCleanRe[j], pFFTCleanIm[j] ); } AUDLOG( "\n" ); #endif for ( j = 0; j < frameSize; j++ ) { pOutBuf[k + j] = pOutBuf[k + j] + pxClean[j]; } k += frameStride; cleanLen += frameStride; } win16s_free( &hWin ); fft_free( &hFft ); ifft_free( &hIfft ); free( pFrame ); free( pNoiseMean ); free( pLamdaD ); free( pXi ); free( pGamma ); free( pG ); free( pGammaNew ); free( pFFTMag ); free( pFFTRe ); free( pFFTIm ); free( pFFTCleanMag ); free( pFFTCleanRe ); free( pFFTCleanIm ); free( pxClean ); return cleanLen; }