bool MatrixDouble::subtract(const MatrixDouble &a,const MatrixDouble &b){
    
    const unsigned int M = a.getNumRows();
    const unsigned int N = a.getNumCols();
    
    if( M != b.getNumRows() ){
        errorLog << "subtract(const MatrixDouble &a,const MatrixDouble &b) - Failed to add matrix! The rows do not match!";
        errorLog << " a rows: " << M << " b rows: " << b.getNumRows() << endl;
        return false;
    }
    
    if( N != b.getNumCols() ){
        errorLog << "subtract(const MatrixDouble &a,const MatrixDouble &b) - Failed to add matrix! The columns do not match!";
        errorLog << " a cols: " << N << " b cols: " << b.getNumCols() << endl;
        return false;
    }
    
    resize( M, N );
    
    UINT i,j;
    
    //Using direct pointers really helps speed up the computation time
    double **pa = a.getDataPointer();
    double **pb = b.getDataPointer();
    
    for(i=0; i<M; i++){
        for(j=0; j<N; j++){
            dataPtr[i*cols+j] = pa[i][j] - pb[i][j];
        }
    }
    
    return true;
}
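//A minimal usage sketch for the two-operand subtract, assuming the MatrixDouble
//constructors and the setAllValues method used elsewhere in this file:
//  MatrixDouble a(2,2), b(2,2), c;
//  a.setAllValues( 3.0 );
//  b.setAllValues( 1.0 );
//  if( c.subtract( a, b ) ){ /* c now holds a - b, i.e. 2.0 in every cell */ }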
bool MatrixDouble::multiple(const MatrixDouble &a,const MatrixDouble &b,const bool aTranspose){
    
    const unsigned int M = !aTranspose ? a.getNumRows() : a.getNumCols();
    const unsigned int N = !aTranspose ? a.getNumCols() : a.getNumRows();
    const unsigned int K = b.getNumRows();
    const unsigned int L = b.getNumCols();
    
    if( N != K ) {
        errorLog << "multiple(const MatrixDouble &a,const MatrixDouble &b,const bool aTranspose) - The number of rows in a (" << K << ") does not match the number of columns in matrix b (" << N << ")" << std::endl;
        return false;
    }
    
    if( !resize( M, L ) ){
        errorLog << "multiple(const MatrixDouble &b,const MatrixDouble &c,const bool bTranspose) - Failed to resize matrix!" << endl;
        return false;
    }
    
    unsigned int i, j, k = 0;
    
    //Using direct pointers really helps speed up the computation time
    double **pa = a.getDataPointer();
    double **pb = b.getDataPointer();
    
    if( aTranspose ){
        
        for(j=0; j<L; j++){
            for(i=0; i<M; i++){
                dataPtr[i*cols+j] = 0;
                for(k=0; k<K; k++){
                    dataPtr[i*cols+j] += pa[k][i] * pb[k][j];
                }
            }
        }
        
    }else{
        
        for(j=0; j<L; j++){
            for(i=0; i<M; i++){
                dataPtr[i*cols+j] = 0;
                for(k=0; k<K; k++){
                    dataPtr[i*cols+j] += pa[i][k] * pb[k][j];
                }
            }
        }
        
    }
    
    return true;
}
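//A minimal usage sketch: computing this = a' * b without building an explicit
//transpose, assuming matrices sized so the inner dimensions agree:
//  MatrixDouble a(3,2), b(3,4), c;
//  if( c.multiple( a, b, true ) ){ /* c is (2x4) and holds a' * b */ }
//Setting aTranspose reads a column-wise (pa[k][i]), which avoids the cost of
//allocating and filling a transposed copy of a.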
bool MatrixDouble::subtract(const MatrixDouble &b){
    
    if( b.getNumRows() != rows ){
        errorLog << "subtract(const MatrixDouble &b) - Failed to add matrix! The rows do not match!" << endl;
        errorLog << " rows: " << rows << " b rows: " << b.getNumRows() << endl;
        return false;
    }
    
    if( b.getNumCols() != cols ){
        errorLog << "subtract(const MatrixDouble &b) - Failed to add matrix! The rows do not match!" << endl;
        errorLog << "  cols: " << cols << " b cols: " << b.getNumCols() << endl;
        return false;
    }
    
    unsigned int i,j;
    
    //Using direct pointers really helps speed up the computation time
    double **pb = b.getDataPointer();
    
    for(i=0; i<rows; i++){
        for(j=0; j<cols; j++){
            dataPtr[i*cols+j] -= pb[i][j];
        }
    }
    
    return true;
}
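//A minimal usage sketch for the in-place variant, which subtracts b from this
//matrix directly (assumes matching dimensions):
//  MatrixDouble a(2,2), b(2,2);
//  a.setAllValues( 5.0 );
//  b.setAllValues( 2.0 );
//  a.subtract( b ); //a now holds 3.0 in every cell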
MatrixDouble MatrixDouble::multiple(const MatrixDouble &b) const{
    
    const unsigned int M = rows;
    const unsigned int N = cols;
    const unsigned int K = b.getNumRows();
    const unsigned int L = b.getNumCols();
    
    if( N != K ) {
        errorLog << "multiple(MatrixDouble b) - The number of rows in b (" << K << ") does not match the number of columns in this matrix (" << N << ")" << std::endl;
        return MatrixDouble();
    }
    
    MatrixDouble c(M,L);
    double **pb = b.getDataPointer();
    double **pc = c.getDataPointer();
    
    unsigned int i,j,k = 0;
    for(i=0; i<M; i++){
        for(j=0; j<L; j++){
            pc[i][j] = 0;
            for(k=0; k<K; k++){
                pc[i][j] += dataPtr[i*cols+k] * pb[k][j];
            }
        }
    }
    
    return c;
}
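//A minimal usage sketch for the const variant, which returns the product as a
//new matrix, or an empty MatrixDouble on a dimension mismatch:
//  MatrixDouble a(2,3), b(3,4);
//  MatrixDouble c = a.multiple( b ); //c is (2x4) and holds a * b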
bool BernoulliRBM::train_(MatrixDouble &data){
    
    const UINT numTrainingSamples = data.getNumRows();
    numInputDimensions = data.getNumCols();
    numOutputDimensions = numHiddenUnits;
    numVisibleUnits = numInputDimensions;
    
    trainingLog << "NumInputDimensions: " << numInputDimensions << endl;
    trainingLog << "NumOutputDimensions: " << numOutputDimensions << endl;
    
    if( randomizeWeightsForTraining ){
    
        //Init the weights matrix
        weightsMatrix.resize(numHiddenUnits, numVisibleUnits);
        
        double a = 1.0 / numVisibleUnits;
        for(UINT i=0; i<numHiddenUnits; i++) {
            for(UINT j=0; j<numVisibleUnits; j++) {
                weightsMatrix[i][j] = rand.getRandomNumberUniform(-a, a);
            }
        }

        //Init the bias units
        visibleLayerBias.resize( numVisibleUnits );
        hiddenLayerBias.resize( numHiddenUnits );
        std::fill(visibleLayerBias.begin(),visibleLayerBias.end(),0);
        std::fill(hiddenLayerBias.begin(),hiddenLayerBias.end(),0);
        
    }else{
        if( weightsMatrix.getNumRows() != numHiddenUnits ){
            errorLog << "train_(MatrixDouble &data) - Weights matrix row size does not match the number of hidden units!" << endl;
            return false;
        }
        if( weightsMatrix.getNumCols() != numVisibleUnits ){
            errorLog << "train_(MatrixDouble &data) - Weights matrix row size does not match the number of visible units!" << endl;
            return false;
        }
        if( visibleLayerBias.size() != numVisibleUnits ){
            errorLog << "train_(MatrixDouble &data) - Visible layer bias size does not match the number of visible units!" << endl;
            return false;
        }
        if( hiddenLayerBias.size() != numHiddenUnits ){
            errorLog << "train_(MatrixDouble &data) - Hidden layer bias size does not match the number of hidden units!" << endl;
            return false;
        }
    }
    
    //Flag that the model has been trained in case the user wants to save the model during a training iteration using an observer
    trained = true;
    
    //Make sure the data is scaled between [0 1]
    ranges = data.getRanges();
    if( useScaling ){
        for(UINT i=0; i<numTrainingSamples; i++){
            for(UINT j=0; j<numInputDimensions; j++){
                data[i][j] = scale(data[i][j], ranges[j].minValue, ranges[j].maxValue, 0, 1);
            }
        }
    }
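    //A sketch of the mapping performed by scale() above, assuming the standard
    //min-max formulation; for a [0,1] target range it reduces to:
    //  scaled = ( x - minValue ) / ( maxValue - minValue )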
    
    //Use floating-point division so ceil correctly rounds up the final partial batch
    const UINT numBatches = (UINT)ceil( numTrainingSamples / double(batchSize) );
    
    //Setup the batch indices
    vector< BatchIndexs > batchIndexs( numBatches );
    UINT startIndex = 0;
    for(UINT i=0; i<numBatches; i++){
        batchIndexs[i].startIndex = startIndex;
        batchIndexs[i].endIndex = startIndex + batchSize;
        
        //Make sure the last batch end index is not larger than the number of training examples
        if( batchIndexs[i].endIndex >= numTrainingSamples ){
            batchIndexs[i].endIndex = numTrainingSamples;
        }
        
        //Get the batch size
        batchIndexs[i].batchSize = batchIndexs[i].endIndex - batchIndexs[i].startIndex;
        
        //Set the start index for the next batch
        startIndex = batchIndexs[i].endIndex;
    }
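    //Worked example: 100 training samples with a batchSize of 32 gives
    //ceil(100/32.0) = 4 batches, with sizes {32, 32, 32, 4}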
    
    Timer timer;
    UINT i,j,n,epoch,noChangeCounter = 0;
    double startTime = 0;
    double alpha = learningRate;
    double error = 0;
    double err = 0;
    double delta = 0;
    double lastError = 0;
    vector< UINT > indexList(numTrainingSamples);
    TrainingResult trainingResult;
    MatrixDouble wT( numVisibleUnits, numHiddenUnits );       //Stores a transposed copy of the weights matrix
    MatrixDouble vW( numHiddenUnits, numVisibleUnits );       //Stores the weight velocity updates
    MatrixDouble tmpW( numHiddenUnits, numVisibleUnits );     //Stores the weight values that will be used to update the main weights matrix at each batch update
    MatrixDouble v1( batchSize, numVisibleUnits );            //Stores the real batch data during a batch update
    MatrixDouble v2( batchSize, numVisibleUnits );            //Stores the sampled batch data during a batch update
    MatrixDouble h1( batchSize, numHiddenUnits );             //Stores the hidden states given v1 and the current weightsMatrix
    MatrixDouble h2( batchSize, numHiddenUnits );             //Stores the sampled hidden states given v2 and the current weightsMatrix
    MatrixDouble c1( numHiddenUnits, numVisibleUnits );       //Stores h1' * v1
    MatrixDouble c2( numHiddenUnits, numVisibleUnits );       //Stores h2' * v2
    MatrixDouble vDiff( batchSize, numVisibleUnits );         //Stores the difference between v1-v2
    MatrixDouble hDiff( batchSize, numHiddenUnits );          //Stores the difference between h1-h2
    MatrixDouble cDiff( numHiddenUnits, numVisibleUnits );    //Stores the difference between c1-c2
    VectorDouble vDiffSum( numVisibleUnits );                 //Stores the column sum of vDiff
    VectorDouble hDiffSum( numHiddenUnits );                  //Stores the column sum of hDiff
    VectorDouble visibleLayerBiasVelocity( numVisibleUnits ); //Stores the velocity update of the visibleLayerBias
    VectorDouble hiddenLayerBiasVelocity( numHiddenUnits );   //Stores the velocity update of the hiddenLayerBias
    
    //Set all the velocity weights to zero
    vW.setAllValues( 0 );
    std::fill(visibleLayerBiasVelocity.begin(),visibleLayerBiasVelocity.end(),0);
    std::fill(hiddenLayerBiasVelocity.begin(),hiddenLayerBiasVelocity.end(),0);
    
    //Randomize the order that the training samples will be used in
    for(UINT i=0; i<numTrainingSamples; i++) indexList[i] = i;
    if( randomiseTrainingOrder ){
        std::random_shuffle(indexList.begin(), indexList.end());
    }
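    
    //Each batch update in the loop below performs one step of single-step
    //contrastive divergence (CD-1). A sketch of the update rule the code
    //implements, with b and c denoting the visible and hidden bias vectors:
    //  h1 = sample( sigmoid( v1 * W' + c ) )   //up-pass from the data
    //  v2 = sample( sigmoid( h1 * W  + b ) )   //down-pass reconstruction
    //  h2 = sigmoid( v2 * W' + c )             //second up-pass, no sampling
    //  vW = ( momentum * vW + alpha * ( h1'*v1 - h2'*v2 ) ) / batchSize
    //  W += vW
    //The bias velocities are updated analogously from the column sums of
    //(v1-v2) and (h1-h2).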
    
    //Start the main training loop
    timer.start();
    for(epoch=0; epoch<maxNumEpochs; epoch++) {
        startTime = timer.getMilliSeconds();
        error = 0;
        
        //Randomize the batch order
        std::random_shuffle(batchIndexs.begin(),batchIndexs.end());
        
        //Run each of the batch updates
        for(UINT k=0; k<numBatches; k+=batchStepSize){
            
            //Resize the data matrices; they will only be resized if the rows/cols differ
            v1.resize( batchIndexs[k].batchSize, numVisibleUnits );
            h1.resize( batchIndexs[k].batchSize, numHiddenUnits );
            v2.resize( batchIndexs[k].batchSize, numVisibleUnits );
            h2.resize( batchIndexs[k].batchSize, numHiddenUnits );
            
            //Setup the data pointers, using data pointers saves a few ms on large matrix updates
            double **w_p = weightsMatrix.getDataPointer();
            double **wT_p = wT.getDataPointer();
            double **vW_p = vW.getDataPointer();
            double **data_p = data.getDataPointer();
            double **v1_p = v1.getDataPointer();
            double **v2_p = v2.getDataPointer();
            double **h1_p = h1.getDataPointer();
            double **h2_p = h2.getDataPointer();
            double *vlb_p = &visibleLayerBias[0];
            double *hlb_p = &hiddenLayerBias[0];
            
            //Get the batch data
            UINT index = 0;
            for(i=batchIndexs[k].startIndex; i<batchIndexs[k].endIndex; i++){
                for(j=0; j<numVisibleUnits; j++){
                    v1_p[index][j] = data_p[ indexList[i] ][j];
                }
                index++;
            }
            
            //Copy a transposed version of the weights matrix, this is used to compute h1 and h2
            for(i=0; i<numHiddenUnits; i++)
                for(j=0; j<numVisibleUnits; j++)
                    wT_p[j][i] = w_p[i][j];
            
            //Compute h1
            h1.multiple(v1, wT);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numHiddenUnits; i++){
                    h1_p[n][i] = sigmoidRandom( h1_p[n][i] + hlb_p[i] );
                }
            }
            
            //Compute v2
            v2.multiple(h1, weightsMatrix);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numVisibleUnits; i++){
                    v2_p[n][i] = sigmoidRandom( v2_p[n][i] + vlb_p[i] );
                }
            }
            
            //Compute h2
            h2.multiple(v2,wT);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numHiddenUnits; i++){
                    h2_p[n][i] = sigmoid( h2_p[n][i] + hlb_p[i] );
                }
            }
            
            //Compute c1, c2 and the difference between v1-v2
            c1.multiple(h1,v1,true);
            c2.multiple(h2,v2,true);
            vDiff.subtract(v1, v2);
            
            //Compute the sum of vdiff
            for(j=0; j<numVisibleUnits; j++){
                vDiffSum[j] = 0;
                for(i=0; i<batchIndexs[k].batchSize; i++){
                    vDiffSum[j] += vDiff[i][j];
                }
            }
            
            //Compute the difference between h1 and h2
            hDiff.subtract(h1, h2);
            for(j=0; j<numHiddenUnits; j++){
                hDiffSum[j] = 0;
                for(i=0; i<batchIndexs[k].batchSize; i++){
                    hDiffSum[j] += hDiff[i][j];
                }
            }
            
            //Compute the difference between c1 and c2
            cDiff.subtract(c1,c2);
            
            //Update the weight velocities
            for(i=0; i<numHiddenUnits; i++){
                for(j=0; j<numVisibleUnits; j++){
                    vW_p[i][j] = ((momentum * vW_p[i][j]) + (alpha * cDiff[i][j])) / batchIndexs[k].batchSize;
                }
            }
            for(i=0; i<numVisibleUnits; i++){
                visibleLayerBiasVelocity[i] = ((momentum * visibleLayerBiasVelocity[i]) + (alpha * vDiffSum[i])) / batchIndexs[k].batchSize;
            }
            for(i=0; i<numHiddenUnits; i++){
                hiddenLayerBiasVelocity[i] = ((momentum * hiddenLayerBiasVelocity[i]) + (alpha * hDiffSum[i])) / batchIndexs[k].batchSize;
            }
            
            //Update the weights
            weightsMatrix.add( vW );
            
            //Update the bias for the visible layer
            for(i=0; i<numVisibleUnits; i++){
                visibleLayerBias[i] += visibleLayerBiasVelocity[i];
            }
            
            //Update the bias for the hidden layer
            for(i=0; i<numHiddenUnits; i++){
                hiddenLayerBias[i] += hiddenLayerBiasVelocity[i];
            }
            
            //Compute the reconstruction error
            err = 0;
            for(i=0; i<batchIndexs[k].batchSize; i++){
                for(j=0; j<numVisibleUnits; j++){
                    err += SQR( v1[i][j] - v2[i][j] );
                }
            }
            
            error += err / batchIndexs[k].batchSize;
        }
        error /= numBatches;
        delta = lastError - error;
        lastError = error;
        
        trainingLog << "Epoch: " << epoch+1 << "/" << maxNumEpochs;
        trainingLog << " Epoch time: " << (timer.getMilliSeconds()-startTime)/1000.0 << " seconds";
        trainingLog << " Learning rate: " << alpha;
        trainingLog << " Momentum: " << momentum;
        trainingLog << " Average reconstruction error: " << error;
        trainingLog << " Delta: " << delta << endl;
        
        //Update the learning rate
        alpha *= learningRateUpdate;
        
        trainingResult.setClassificationResult(epoch, error, this);
        trainingResults.push_back(trainingResult);
        trainingResultsObserverManager.notifyObservers( trainingResult );
        
        //Check for convergence
        if( fabs(delta) < minChange ){
            if( ++noChangeCounter >= minNumEpochs ){
                trainingLog << "Stopping training. MinChange limit reached!" << endl;
                break;
            }
        }else noChangeCounter = 0;
        
    }
    trainingLog << "Training complete after " << epoch << " epochs. Total training time: " << timer.getMilliSeconds()/1000.0 << " seconds" << endl;
    
    trained = true;
    
    return true;
}
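//A minimal usage sketch, assuming setNumHiddenUnits on BernoulliRBM and the
//setLearningRate/setMaxNumEpochs setters inherited from the GRT MLBase interface:
//  BernoulliRBM rbm;
//  rbm.setNumHiddenUnits( 50 );
//  rbm.setLearningRate( 0.1 );
//  rbm.setMaxNumEpochs( 100 );
//  //data: one training sample per row, one input dimension per column
//  if( !rbm.train_( data ) ){ /* training failed, check the error log */ }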