void MLP::modifyWeights(const integer &exampleIndex, const realnumber &learningRate)
{
    modifyDelta(m_input.col(exampleIndex), m_output.col(exampleIndex), 0);
    for (integer j = m_last; j >= 0; --j)
        m_layers[j] +=
                       learningRate
                     * m_Delta[j]
                     * addBias( (j>0) ? run(exampleIndex, j-1) : m_input.col(exampleIndex) ).transpose();
}
/*
 * Naive OpenMP convolution kernel.
 *
 * Operates entirely on file-level state (inputPlanes, outputPlanes, weights,
 * biases and the associated size variables). The work is done in two phases
 * inside a single parallel region:
 *
 *   1. Every thread takes a contiguous band of rows
 *      [ioHeight_spos, ioHeight_epos) and, for each (output plane, input
 *      plane) pair, calls convolve3x3withPad() for that band and then
 *      addVec() on the matching slice of the output plane.
 *   2. After a barrier, the output planes are distributed across threads and
 *      each plane gets addBias() followed by scaleIfLessThanX(..., 0.0, 0.1).
 *
 * NOTE(review): scaleIfLessThanX(…, 0.0, 0.1) presumably implements a leaky
 * ReLU (values below 0 scaled by 0.1) -- confirm against its definition.
 */
void myConvKernel_naive()
{
    // Scratch plane shared by all threads; each thread only consumes its own
    // row band of it (see the addVec call below).
    // NOTE(review): the assert is compiled out under NDEBUG, so an allocation
    // failure would go undetected in release builds.
    float *filterOutput_buf = (float*) _mm_malloc(sizeof(float) * outputSize, 512);
    assert(filterOutput_buf != NULL);

    // memset() takes a byte count. The planes hold floats, so the element
    // count must be scaled by sizeof(float); otherwise only part of the
    // accumulator is cleared and addVec() accumulates onto stale data.
    // Leading with sizeof() also performs the multiplication in size_t.
    memset(outputPlanes, 0, sizeof(float) * outputSize * nOutputPlanes);

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        int nthreads = omp_get_num_threads();

        // Row band owned by this thread: start is this thread's BLOCK_LOW,
        // end is the next thread's BLOCK_LOW (used as an exclusive bound).
        int ioHeight_spos = BLOCK_LOW(tid, nthreads, ioHeight);
        int ioHeight_epos = BLOCK_LOW(tid + 1, nthreads, ioHeight);

        // Same band expressed as a flat offset / element count within a plane.
        int oS_spos = ioHeight_spos * ioWidth;
        int oS_size = (ioHeight_epos - ioHeight_spos) * ioWidth;

        for (int opIndex = 0; opIndex < nOutputPlanes; opIndex++)
        {
            float *filterOutput = filterOutput_buf;
            float *outputPlane = outputPlanes + opIndex * outputSize;

            for (int ipIndex = 0; ipIndex < nInputPlanes; ipIndex++)
            {
                // One weight matrix of wSize floats per (output, input) pair.
                int wMatIndex = nInputPlanes * opIndex + ipIndex;
                float *inputPlane = inputPlanes + ipIndex * paddedInSize;
                float *weightMatrix = weights + wMatIndex * wSize;

                // Assumes convolve3x3withPad() writes only rows
                // [ioHeight_spos, ioHeight_epos) of filterOutput, which is
                // what makes sharing the scratch buffer safe -- TODO confirm.
                convolve3x3withPad(
                    inputPlane, filterOutput, weightMatrix,
                    ioHeight_spos, ioHeight_epos
                );

                // Combine this thread's band of the filter result into the
                // same band of the output plane.
                addVec(oS_size, filterOutput + oS_spos, outputPlane + oS_spos);
            }
        }

        // The loop above is not a worksharing construct, so it has no implicit
        // barrier. Every band of every plane must be complete before the
        // per-plane pass below touches whole planes.
        #pragma omp barrier

        #pragma omp for
        for (int opIndex = 0; opIndex < nOutputPlanes; opIndex++)
        {
            float *outputPlane = outputPlanes + opIndex * outputSize;
            addBias(outputSize, (float)(biases[opIndex]), outputPlane);
            scaleIfLessThanX(outputSize, outputPlane, 0.0, 0.1);
        }
    }

    _mm_free(filterOutput_buf);
}