int MaskFromMemoryBuffer::updateState(double time, double dt)
    if (imageLayer->getDataLeft() == dataLeft &&
            imageLayer->getDataTop() == dataTop &&
            imageLayer->getDataWidth() == dataRight-dataLeft &&
            imageLayer->getDataHeight() && dataBottom-dataTop) {
        return PV_SUCCESS; // mask only needs to change if the imageLayer changes its active region

    dataLeft = imageLayer->getDataLeft();
    dataRight = dataLeft+imageLayer->getDataWidth();
    dataTop = imageLayer->getDataTop();
    dataBottom = dataTop + imageLayer->getDataHeight();

    PVLayerLoc const * loc = getLayerLoc();
    for(int b = 0; b < loc->nbatch; b++) {
        pvdata_t * ABatch = getActivity() + b * getNumExtended();
        int const num_neurons = getNumNeurons();
        #pragma omp parallel for
        for(int ni = 0; ni < num_neurons; ni++) {
            PVHalo const * halo = &loc->halo;
            int const nx = loc->nx;
            int const ny = loc->ny;
            int const nf = loc->nf;
            int x = kxPos(ni, nx, ny, nf);
            int y = kyPos(ni, nx, ny, nf);
            pvadata_t a = (pvadata_t) (x>=dataLeft && x < dataRight && y >= dataTop && y < dataBottom);
            int nExt = kIndexExtended(ni, nx, ny, nf, halo->lt, halo->rt, halo->dn, halo->up);
            ABatch[nExt] = a;
    return PV_SUCCESS;
int ImageTestLayer::updateStateWrapper(double time, double dt)
   Image::updateStateWrapper(time, dt);
   const PVLayerLoc * loc = getLayerLoc();
   int nx = loc->nx;
   int ny = loc->ny;
   int nf = loc->nf;
   int nbatch = loc->nbatch;
   for(int b = 0; b < nbatch; b++){
      pvdata_t * dataBatch = data + b * getNumExtended();
      for(int nkRes = 0; nkRes < getNumNeurons(); nkRes++){
         //Calculate extended index
         int nkExt = kIndexExtended(nkRes, nx, ny, nf, loc->, loc->halo.rt, loc->halo.dn, loc->halo.up);  
         //checkVal is the value from batch index 0
         pvdata_t checkVal = dataBatch[nkExt] * 255;

         int kxGlobal = kxPos(nkRes, nx, ny, nf) + loc->kx0;
         int kyGlobal = kyPos(nkRes, nx, ny, nf) + loc->ky0; 
         int kf = featureIndex(nkRes, nx, ny, nf);

         pvdata_t expectedVal = kIndex(kxGlobal, kyGlobal, kf, loc->nxGlobal, loc->nyGlobal, nf);
         if(fabs(checkVal - expectedVal) >= 1e-5){
            std::cout << "ImageFileIO test Expected: " << expectedVal << " Actual: " << checkVal << "\n";
   return PV_SUCCESS;
/**
 * Runs Segmentify::updateState and then checks every restricted activity value.
 *
 * Each value is read from the extended activity buffer and handed to
 * checkOutputVals together with the target value from getTargetVal, both
 * addressed by global (y, x, feature) coordinates.
 *
 * @param timef  simulation time, forwarded to Segmentify::updateState
 * @param dt     simulation time step, forwarded to Segmentify::updateState
 * @return PV_SUCCESS
 */
int SegmentifyTest::updateState(double timef, double dt){
   //Do update state first
   Segmentify::updateState(timef, dt);
   const PVLayerLoc * loc = getLayerLoc();
   pvdata_t * A = getActivity();

   //Width of one row of the extended buffer, in x positions
   const int nxExt = loc->nx + loc->halo.lt + loc->halo.rt;

   for(int bi = 0; bi < loc->nbatch; bi++){
      pvdata_t * batchA = A + bi * getNumExtended();
      for(int yi = 0; yi < loc->ny; yi++){
         //Offset of the start of this row in the extended buffer
         const int rowBase = (yi + loc->halo.up) * nxExt * loc->nf;
         for(int xi = 0; xi < loc->nx; xi++){
            for(int fi = 0; fi < loc->nf; fi++){
               int extIdx = rowBase + (xi + loc->halo.lt) * loc->nf + fi;
               float actualVal = batchA[extIdx];
               float targetVal = getTargetVal(yi+loc->ky0, xi+loc->kx0, fi);
               checkOutputVals(yi+loc->ky0, xi+loc->kx0, fi, targetVal, actualVal);

               //std::cout << "Idx: (" << bi << "," << yi << "," << xi << "," << fi << ") Val: " << actualVal << " Target: " << targetVal << "\n";
            }
         }
      }
   }

   return PV_SUCCESS;
}
// set activity to global x/y/f position, using position in border/margin as required
int PlasticConnTestLayer::setActivitytoGlobalPos(){
   for (int kLocalExt = 0; kLocalExt < getNumExtended(); kLocalExt++){
      int kxLocalExt = kxPos(kLocalExt, clayer->loc.nx + clayer-> + clayer->loc.halo.rt, clayer->loc.ny + clayer->loc.halo.dn + clayer->loc.halo.up, clayer-> - clayer->;
      int kxGlobalExt = kxLocalExt + clayer->loc.kx0;
      float xScaleLog2 = clayer->xScale;
      float x0 = xOriginGlobal(xScaleLog2);
      float dx = deltaX(xScaleLog2);
      float x_global_pos = (x0 + dx * kxGlobalExt);
      clayer->activity->data[kLocalExt] = x_global_pos;
   return PV_SUCCESS;
/**
 * Recomputes the activity buffer from the membrane potential.
 *
 * The activity buffer is zeroed for getNumExtended() entries, then
 * applyVThresh_ANNLayer and applyVMax_ANNLayer are applied in that order.
 * The second call is skipped if the first one does not return PV_SUCCESS.
 *
 * @return the status of the last helper that ran
 */
int AccumulateLayer::setActivity() {
   const PVLayerLoc * loc = getLayerLoc();
   const int nx = loc->nx;
   const int ny = loc->ny;
   const int nf = loc->nf;
   const int num_neurons = nx*ny*nf;

   memset(clayer->activity->data, 0, sizeof(pvdata_t)*getNumExtended());

   // Margins are passed in lt, rt, dn, up order.
   int status = applyVThresh_ANNLayer(num_neurons, getV(), AMin, VThresh, AShift, VWidth, getCLayer()->activity->data, nx, ny, nf, loc->halo.lt, loc->halo.rt, loc->halo.dn, loc->halo.up);
   if( status == PV_SUCCESS ) {
      status = applyVMax_ANNLayer(num_neurons, getV(), AMax, getCLayer()->activity->data, nx, ny, nf, loc->halo.lt, loc->halo.rt, loc->halo.dn, loc->halo.up);
   }
   return status;
}
// Updates this layer's activity as a running average of originalLayer's activity.
// For each batch element, every restricted neuron's stored value is blended with the
// matching value read from originalLayer, then lastUpdateTime is set to the current
// simulation time. Returns status, which stays PV_SUCCESS in the code visible here.
// NOTE(review): this block looks incomplete in this view -- the closing braces are absent
// and there is presumably an `else` between the two averaging loops. Confirm against the
// full file before changing any logic.
int RunningAverageLayer::updateState(double timef, double dt) {
   int status = PV_SUCCESS;
   double deltaT = parent->getDeltaTime();
   //Check if an update is needed
   //Done in cloneVLayer
       int numNeurons = originalLayer->getNumNeurons();
       pvdata_t * A = clayer->activity->data;
       const pvdata_t * originalA = originalLayer->getCLayer()->activity->data;
       const PVLayerLoc * loc = getLayerLoc();
       const PVLayerLoc * locOriginal = originalLayer->getLayerLoc();
       int nbatch = loc->nbatch;
       //Make sure all sizes match
       //assert(locOriginal->nb == loc->nb);
       assert(locOriginal->nx == loc->nx);
       assert(locOriginal->ny == loc->ny);
       assert(locOriginal->nf == loc->nf);

       for(int b = 0; b < nbatch; b++){
          // Each batch element occupies getNumExtended() entries in its layer's buffer.
          const pvdata_t * originalABatch = originalA + b * originalLayer->getNumExtended();
          pvdata_t * ABatch = A + b * getNumExtended();
          // While numUpdateTimes is below numImagesToAverage*deltaT, the weights are derived
          // from numUpdateTimes/deltaT rather than from numImagesToAverage.
          if (numUpdateTimes < numImagesToAverage*deltaT){
#pragma omp parallel for
                for(int k=0; k<numNeurons; k++) {
                   // NOTE(review): a member name appears to be missing after `loc->` (and after
                   // `locOriginal->` below); presumably the left margin -- confirm.
                   int kExt = kIndexExtended(k, loc->nx, loc->ny, loc->nf, loc->, loc->halo.rt, loc->halo.dn, loc->halo.up);
                   // NOTE(review): the rt/dn/up margins for the original layer are taken from loc,
                   // not locOriginal; only nx/ny/nf are asserted equal above -- verify.
                   int kExtOriginal = kIndexExtended(k, locOriginal->nx, locOriginal->ny, locOriginal->nf,
                         locOriginal->, loc->halo.rt, loc->halo.dn, loc->halo.up);
                   ABatch[kExt] = ((numUpdateTimes/deltaT-1) * ABatch[kExt] + originalABatch[kExtOriginal]) * deltaT / numUpdateTimes;
#pragma omp parallel for
             // Fixed-window form: old value weighted (numImagesToAverage-1), new value weighted 1.
             for(int k=0; k<numNeurons; k++) {
                int kExt = kIndexExtended(k, loc->nx, loc->ny, loc->nf, loc->, loc->halo.rt, loc->halo.dn, loc->halo.up);
                int kExtOriginal = kIndexExtended(k, locOriginal->nx, locOriginal->ny, locOriginal->nf,
                      locOriginal->, loc->halo.rt, loc->halo.dn, loc->halo.up);
                ABatch[kExt] = ((numImagesToAverage-1) * ABatch[kExt] + originalABatch[kExtOriginal]) / numImagesToAverage;

       //Update lastUpdateTime
       lastUpdateTime = parent->simulationTime();

   return status;
//Makes a layer such that the restricted space is the index, but with spinning order be [x, y, f] as opposed to [f, x, y]
//
//Every cell of the extended buffer (margins included) is filled with
//   yGlobalRes * nxGlobal + xGlobalRes + iFeature * nxGlobal * nyGlobal
//where (xGlobalRes, yGlobalRes) is the cell's position in global restricted
//coordinates; these are negative in the left/top margins.
//timef and dt are not used. Returns PV_SUCCESS.
int InputLayer::updateState(double timef, double dt){
   //Grab layer size
   const PVLayerLoc* loc = getLayerLoc();
   const int nx = loc->nx;
   const int ny = loc->ny;
   const int nf = loc->nf;
   const int nxGlobal = loc->nxGlobal;
   const int nyGlobal = loc->nyGlobal;
   const int kx0 = loc->kx0;
   const int ky0 = loc->ky0;

   //Extended buffer dimensions
   const int nxExt = nx+loc->halo.lt+loc->halo.rt;
   const int nyExt = ny+loc->halo.dn+loc->halo.up;

   for(int b = 0; b < parent->getNBatch(); b++){
      pvdata_t * A = getActivity() + b * getNumExtended();
      //looping over ext
      for(int iY = 0; iY < nyExt; iY++){
         for(int iX = 0; iX < nxExt; iX++){
            //Calculate x and y global extended
            int xGlobalExt = iX + kx0;
            int yGlobalExt = iY + ky0;
            //Calculate x and y in restricted space
            int xGlobalRes = xGlobalExt - loc->halo.lt;
            int yGlobalRes = yGlobalExt - loc->halo.up;
            //Calculate base value
            //xGlobalRes and yGlobalRes can be negative
            int baseActivityVal = yGlobalRes * nxGlobal + xGlobalRes;

            for(int iFeature = 0; iFeature < nf; iFeature++){
               int ext_idx = kIndex(iX, iY, iFeature, nxExt, nyExt, nf);
               //Feature gives an offset, since it spins slowest
               int activityVal = baseActivityVal + iFeature * nxGlobal * nyGlobal;
               A[ext_idx] = activityVal;
            }
         }
      }
   }

   ////Printing for double checking
   ////looping over ext
   //for(int iFeature = 0; iFeature < nf; iFeature++){
   //   for(int iY = 0; iY < nyExt; iY++){
   //      for(int iX = 0; iX < nxExt; iX++){
   //         int ext_idx = kIndex(iX, iY, iFeature, nxExt, nyExt, nf);
   //         printf("%03d ", (int)A[ext_idx]);
   //      }
   //      printf("\n");
   //   }
   //   printf("\n\n");
   //}

   return PV_SUCCESS;
}
// Fills the activity buffer with ground-truth values derived from the movie layer's
// current filename: for each batch element the filename is obtained and broadcast
// over the local communicator, then each neuron i is assigned according to whether
// classes[i] occurs in the filename. Returns PV_SUCCESS.
// NOTE(review): this block looks incomplete in this view. The two groups of MPI_Bcast
// calls read like the root and non-root branches of a rank test that is not visible,
// the true/false assignments lack their if/else, and closing braces are absent.
// Confirm against the full file before changing any logic.
int FilenameParsingGroundTruthLayer::updateState(double time, double dt)
   pvdata_t * A = getCLayer()->activity->data;
   const PVLayerLoc * loc = getLayerLoc();
   int num_neurons = getNumNeurons();
   // One neuron per class is required.
   if (num_neurons != numClasses)
      pvError() << "The number of neurons in " << getName() << " is not equal to the number of classes specified in " << parent->getOutputPath() << "/classes.txt\n";

   for(int b = 0; b < loc->nbatch; b++){
      char * currentFilename = NULL;
      int filenameLen = 0;
      //TODO depending on speed of this layer, more efficient way would be to preallocate currentFilename buffer
         currentFilename = strdup(movieLayer->getFilename(b));
         //Get length of currentFilename and broadcast
         // NOTE(review): this declaration shadows the filenameLen declared above.
         int filenameLen = (int) strlen(currentFilename) + 1; //+1 for the null terminator
         //Using local communicator, as each batch MPI will handle its own run
         MPI_Bcast(&filenameLen, 1, MPI_INT, 0, parent->icCommunicator()->communicator());
         //Broadcast filename to all other local processes
         MPI_Bcast(currentFilename, filenameLen, MPI_CHAR, 0, parent->icCommunicator()->communicator());
         //Receive broadcast about length of filename
         MPI_Bcast(&filenameLen, 1, MPI_INT, 0, parent->icCommunicator()->communicator());
         // NOTE(review): as shown, this overwrites the pointer returned by strdup above.
         currentFilename = (char*)calloc(sizeof(char), filenameLen);
         //Receive filename
         MPI_Bcast(currentFilename, filenameLen, MPI_CHAR, 0, parent->icCommunicator()->communicator());

      std::string fil = currentFilename;
      pvdata_t * ABatch = A + b * getNumExtended();
      for(int i = 0; i < num_neurons; i++){
         int nExt = kIndexExtended(i, loc->nx, loc->ny, loc->nf, loc->, loc->halo.rt, loc->halo.dn, loc->halo.up);
         // fi is computed but not used in the code visible here.
         int fi = featureIndex(nExt, loc->nx+loc->halo.rt+loc->, loc->ny+loc->halo.dn+loc->halo.up, loc->nf);
         // find() returns std::string::npos when there is no match; stored in an int and
         // tested with 0 <= match below.
         int match = fil.find(classes[i]);
         if(0 <= match){
            ABatch[nExt] = gtClassTrueValue;
            // NOTE(review): as shown, this assignment always follows the one above;
            // presumably it belongs to a missing else branch -- confirm.
            ABatch[nExt] = gtClassFalseValue;
      //Free buffer, TODO, preallocate buffer to avoid this
      // NOTE(review): no call releasing currentFilename is visible here -- verify.
   return PV_SUCCESS;
/**
 * Runs ANNLayer::updateState and then checks the restricted activity.
 *
 * For each restricted cell the expected value is
 *   iFeature * 64 + yval * 16 + xval * 2 + 4.5
 * where xval and yval are the cell's global restricted coordinates divided by 2.
 * Differences of 1e-4 or more are logged through pvErrorNoExit(); if any were
 * found, the processes synchronize on an MPI barrier before returning.
 *
 * @param timef  simulation time, forwarded to ANNLayer::updateState
 * @param dt     simulation time step, forwarded to ANNLayer::updateState
 * @return PV_SUCCESS
 */
int GatePoolTestLayer::updateState(double timef, double dt) {
    //Do update state of ANN Layer first
    ANNLayer::updateState(timef, dt);

    //Grab layer size
    const PVLayerLoc* loc = getLayerLoc();
    const int nx = loc->nx;
    const int ny = loc->ny;
    const int nxGlobal = loc->nxGlobal;
    const int nyGlobal = loc->nyGlobal;
    const int nf = loc->nf;
    const int kx0 = loc->kx0;
    const int ky0 = loc->ky0;

    //Extended buffer dimensions
    const int nxExt = nx+loc->halo.lt+loc->halo.rt;
    const int nyExt = ny+loc->halo.dn+loc->halo.up;

    bool isCorrect = true;
    //Grab the activity layer of current layer
    for(int b = 0; b < loc->nbatch; b++) {
        const pvdata_t * A = getActivity() + b * getNumExtended();
        //We only care about restricted space, but iY and iX are extended
        for(int iY = loc->halo.up; iY < ny + loc->halo.up; iY++) {
            for(int iX = loc->halo.lt; iX < nx + loc->halo.lt; iX++) {
                for(int iFeature = 0; iFeature < nf; iFeature++) {
                    int ext_idx = kIndex(iX, iY, iFeature, nxExt, nyExt, nf);

                    float actualvalue = A[ext_idx];

                    int xval = (iX + kx0 - loc->halo.lt)/2;
                    int yval = (iY + ky0 - loc->halo.up)/2;
                    assert(xval >= 0 && xval < nxGlobal);
                    //yval is a y coordinate, so it is bounded by nyGlobal (was nxGlobal)
                    assert(yval >= 0 && yval < nyGlobal);

                    float expectedvalue;
                    expectedvalue = iFeature * 64 + yval * 16 + xval * 2 + 4.5;

                    if(fabs(actualvalue - expectedvalue) >= 1e-4) {
                        pvErrorNoExit() << "Connection " << name << " Mismatch at (" << iX << "," << iY << ") : actual value: " << actualvalue << " Expected value: " << expectedvalue << ".  Discrepancy is a whopping " << actualvalue - expectedvalue << "!  Horrors!" << "\n";
                        isCorrect = false;
                    }
                }
            }
        }
    }
    if(!isCorrect) {
        InterColComm * icComm = parent->icCommunicator();
        MPI_Barrier(icComm->communicator()); // If there is an error, make sure that MPI doesn't kill the run before process 0 reports the error.
        // NOTE(review): nothing aborts the run after the barrier; confirm whether a failure exit is intended here.
    }
    return PV_SUCCESS;
}
int BIDSCloneLayer::allocateDataStructures() {
   int status = CloneVLayer::allocateDataStructures();


   BIDSMovieCloneMap *blayer = dynamic_cast<BIDSMovieCloneMap*> (originalLayer->getParent()->getLayerFromName(jitterSourceName));
   if (blayer==NULL) {
      fprintf(stderr, "BIDSCloneLayer \"%s\": jitterSource \"%s\" must be a BIDSMovieCloneMap.\n", name, jitterSourceName);
   coords = blayer->getCoords();
   numNodes = blayer->getNumNodes();

   for(int i = 0; i < getNumExtended(); i++){
      this->clayer->activity->data[i] = 0;
   return status;
// Updates V and A for this layer through updateV_AccumulateLayer, zeroing the activity
// buffer first when the synced input layer has updated since lastUpdateTime.
// Parameters: time/dt are not read in the active code visible here; loc supplies
// nx/ny/nf and the halo margins; A, V, num_channels and gSynHead are forwarded to
// updateV_AccumulateLayer. Returns PV_SUCCESS.
// NOTE(review): closing braces are absent in this view, so it cannot be told from here
// whether the updateV_AccumulateLayer call sits inside `if (needsUpdate)` or after it.
// Confirm against the full file before changing any logic.
int AccumulateLayer::doUpdateState(double time, double dt, const PVLayerLoc * loc, pvdata_t * A,
      pvdata_t * V, int num_channels, pvdata_t * gSynHead)
   bool needsUpdate = false;
   if (syncedInputLayer != NULL) {
      // A later phase than the input layer uses >= on the update times; otherwise a
      // strictly greater update time is required.
      if (getPhase() > syncedInputLayer->getPhase()) {
         needsUpdate = syncedInputLayer->getLastUpdateTime() >= lastUpdateTime;
      else {
         needsUpdate = syncedInputLayer->getLastUpdateTime() > lastUpdateTime;
   if (needsUpdate) {
      // Reset the whole extended activity buffer.
      memset(clayer->activity->data, 0, sizeof(pvdata_t)*getNumExtended());
//#ifdef PV_USE_OPENCL
//   if(gpuAccelerateFlag) {
//      updateStateOpenCL(time, dt);
//      //HyPerLayer::updateState(time, dt);
//   }
//   else {
      int nx = loc->nx;
      int ny = loc->ny;
      int nf = loc->nf;
      int num_neurons = nx*ny*nf;
      // NOTE(review): a member name appears to be missing after `loc->` in the argument
      // list below; presumably the left margin -- confirm.
      updateV_AccumulateLayer(num_neurons, V, num_channels, gSynHead, A,
              AMax, AMin, VThresh, AShift, VWidth, nx, ny, nf, loc->, loc->halo.rt, loc->halo.dn, loc->halo.up);
      //Moved to publish
      //if (this->writeSparseActivity){
      //   updateActiveIndices();  // added by GTK to allow for sparse output, can this be made an inline function???
//#ifdef PV_USE_OPENCL
//   }

   return PV_SUCCESS;
int PursuitLayer::updateState(double time, double dt) {
    if (!updateReady) return PV_SUCCESS;
    int nx = getLayerLoc()->nx;
    int ny = getLayerLoc()->ny;
    int nf = getLayerLoc()->nf;
    PVHalo const * halo = &getLayerLoc()->halo;
    pvdata_t * activity = getActivity();
    memset(activity, 0, getNumExtended()*sizeof(*activity));

    int nxy = nx*ny;
    for (int kxy=0; kxy<nxy; kxy++) {
        int kf = foundFeatures[kxy];
        if (kf>=0) {
            int kx = kxPos(kxy,nx,ny,1);
            int ky = kyPos(kxy,nx,ny,1);
            int kex = kIndex(kx+halo->lt, ky+halo->up, kf, nx+halo->lt+halo->rt, ny+halo->dn+halo->up, nf); /* Is this correct? Before splitting x- and y- margin widths, the ny argument was ny*nb, which seems weird. */
            activity[kex] = gSynSparse[kxy];
    //resetGSynBuffers_HyPerLayer(getNumNeurons(), getNumChannels(), GSyn[0]);
    updateReady = false;
    return PV_SUCCESS;
int MoviePvpTestLayer::updateStateWrapper(double time, double dt)
   MoviePvp::updateStateWrapper(time, dt);
   const PVLayerLoc * loc = getLayerLoc();
   int nx = loc->nx;
   int ny = loc->ny;
   int nf = loc->nf;
   int nbatch = loc->nbatch;

   for(int b = 0; b < nbatch; b++){
      pvdata_t * dataBatch = data + b * getNumExtended();
      int frameIdx;
      if(strcmp(getBatchMethod(), "byImage") == 0){
         frameIdx = (time-1) * nbatch + b;
      else if(strcmp(getBatchMethod(), "byMovie") == 0){
         frameIdx = b * 2 + (time-1);
      for(int nkRes = 0; nkRes < getNumNeurons(); nkRes++){
         //Calculate extended index
         int nkExt = kIndexExtended(nkRes, nx, ny, nf, loc->, loc->halo.rt, loc->halo.dn, loc->halo.up);  
         //checkVal is the value from batch index 0
         pvdata_t checkVal = dataBatch[nkExt];

         int kxGlobal = kxPos(nkRes, nx, ny, nf) + loc->kx0;
         int kyGlobal = kyPos(nkRes, nx, ny, nf) + loc->ky0; 
         int kf = featureIndex(nkRes, nx, ny, nf);

         pvdata_t expectedVal = kIndex(kxGlobal, kyGlobal, kf, loc->nxGlobal, loc->nyGlobal, nf) + frameIdx*192;
         if(fabs(checkVal - expectedVal) >= 1e-5){
            std::cout << "ImageFileIO " << name << " test Expected: " << expectedVal << " Actual: " << checkVal << "\n";
   return PV_SUCCESS;
// Computes, per batch element, a center index for every segment label found in the
// layer's activity. The visible steps are:
//   1. with segmentMethod "none", copy originalLayer's activity into this layer;
//   2. scan the restricted region and track min/max global x and y per label;
//   3. non-root ranks send their label/min/max buffers to rank 0, which merges them;
//   4. rank 0 derives a center per label, stores it in centerIdx[bi], and broadcasts
//      the label and center buffers to all ranks.
// Returns PV_SUCCESS.
// NOTE(review): this block looks heavily truncated in this view -- several conditions
// and expressions end abruptly (e.g. `if(globalResXi >{`, `int centerX = + ( -;`),
// if/else lines and closing braces are absent, and some commented steps have no code
// under them. Confirm against the full file before changing any logic.
int SegmentLayer::updateState(double timef, double dt) {
   pvdata_t* srcA = originalLayer->getActivity();
   pvdata_t* thisA = getActivity();

   const PVLayerLoc* loc = getLayerLoc();

   //Segment input layer based on segmentMethod
   if(strcmp(segmentMethod, "none") == 0){
      int numBatchExtended = getNumExtendedAllBatches();
      //Copy activity over
      //Since both buffers should be identical size, we can do a memcpy here
      memcpy(thisA, srcA, numBatchExtended * sizeof(pvdata_t));
      //This case should never happen

   // The index arithmetic below has no feature term, so a single feature is required.
   assert(loc->nf == 1);

   //Clear centerIdxs
   for(int bi = 0; bi < loc->nbatch; bi++){

   for(int bi = 0; bi < loc->nbatch; bi++){
      pvdata_t* batchA = thisA + bi * getNumExtended();
      //Reset max/min buffers

      //Loop through this buffer to fill labelVec and idxVec
      //Looping through restricted, but indices are extended
      for(int yi = loc->halo.up; yi < loc->ny+loc->halo.up; yi++){
         for(int xi = loc->; xi < loc->nx+loc->; xi++){
            //Convert to local extended linear index
            int niLocalExt = yi * (loc->nx+loc->>halo.rt) + xi;
            //Convert yi and xi to global res index
            int globalResYi = yi - loc->halo.up + loc->ky0;
            int globalResXi = xi - loc-> + loc->kx0;

            //Get label value
            //Note that we're assuming that the activity here are integers,
            //even though the buffer is floats
            int labelVal = round(batchA[niLocalExt]);

            //Calculate max/min x and y for a single batch
            //If labelVal exists in map
               //Here, we're assuming the 4 maps are in sync, so we use the 
               //.at method, as it will throw an exception as opposed to the 
               //[] operator, which will simply add the key into the map
               if(globalResXi >{
                  maxX[labelVal] = globalResXi;
               if(globalResXi <{
                  minX[labelVal] = globalResXi;
               if(globalResYi >{
                  maxY[labelVal] = globalResYi;
               if(globalResYi <{
                  minY[labelVal] = globalResYi;
            //If doesn't exist, add into map with current vals
               maxX[labelVal] = globalResXi;
               minX[labelVal] = globalResXi;
               maxY[labelVal] = globalResYi;
               minY[labelVal] = globalResYi;

      //We need to mpi across processors in case a segment crosses an mpi boundary
      InterColComm * icComm = parent->icCommunicator();
      int numMpi = icComm->commSize();
      int rank = icComm->commRank();

      //Local comm rank
      //Non root processes simply send buffer size and then buffers
      int numLabels = maxX.size();

      if(rank != 0){
         //Load buffers
         //Send number of labels first
         // Every send uses the sender's rank as the message tag.
         MPI_Send(&numLabels, 1, MPI_INT, 0, rank, icComm->communicator());
         //Send labels, then max/min buffers
         MPI_Send(labelBuf, numLabels, MPI_INT, 0, rank, icComm->communicator());
         MPI_Send(maxXBuf, numLabels, MPI_INT, 0, rank, icComm->communicator());
         MPI_Send(maxYBuf, numLabels, MPI_INT, 0, rank, icComm->communicator());
         MPI_Send(minXBuf, numLabels, MPI_INT, 0, rank, icComm->communicator());
         MPI_Send(minYBuf, numLabels, MPI_INT, 0, rank, icComm->communicator());

         //Receive the full centerIdxBuf from root process
         int numCenterIdx = 0;
         MPI_Bcast(&numCenterIdx, 1, MPI_INT, 0, icComm->communicator());

         MPI_Bcast(allLabelsBuf, numCenterIdx, MPI_INT, 0, icComm->communicator());
         MPI_Bcast(centerIdxBuf, numCenterIdx, MPI_INT, 0, icComm->communicator());

         //Load buffer into centerIdx map
         loadCenterIdxMap(bi, numCenterIdx);
      //Root process stores everything
         //One recv per buffer
         for(int recvRank = 1; recvRank < numMpi; recvRank++){
            int numRecvLabels = 0;
            // NOTE(review): NULL is passed as the MPI_Status argument in these receives; the MPI
            // standard provides MPI_STATUS_IGNORE for discarding the status -- verify.
            MPI_Recv(&numRecvLabels, 1, MPI_INT, recvRank, recvRank, icComm->communicator(), NULL);

            MPI_Recv(labelBuf, numRecvLabels, MPI_INT, recvRank, recvRank, icComm->communicator(), NULL);
            MPI_Recv(maxXBuf, numRecvLabels, MPI_INT, recvRank, recvRank, icComm->communicator(), NULL);
            MPI_Recv(maxYBuf, numRecvLabels, MPI_INT, recvRank, recvRank, icComm->communicator(), NULL);
            MPI_Recv(minXBuf, numRecvLabels, MPI_INT, recvRank, recvRank, icComm->communicator(), NULL);
            MPI_Recv(minYBuf, numRecvLabels, MPI_INT, recvRank, recvRank, icComm->communicator(), NULL);

            for(int i = 0; i < numRecvLabels; i++){
               int label = labelBuf[i];
               //Add on to maps
               //If the label already exists, fill with proper max/min
                  if(maxXBuf[i] >{
                     maxX[label] = maxXBuf[i];
                  if(maxYBuf[i] >{
                     maxY[label] = maxYBuf[i];
                  if(minXBuf[i] <{
                     minX[label] = minXBuf[i];
                  if(minYBuf[i] <{
                     minY[label] = minYBuf[i];
                  maxX[label] = maxXBuf[i];
                  maxY[label] = maxYBuf[i];
                  minX[label] = minXBuf[i];
                  minY[label] = minYBuf[i];

         //Maps are now filled with all segments from the image
         //Fill centerIdx based on max/min
         for(std::map<int, int>::iterator it = maxX.begin();
               it != maxX.end(); ++it){
            int label = it->first;
            // NOTE(review): the two expressions below are truncated in this view; presumably
            // they combine the min and max maps for this label -- confirm.
            int centerX = + ( -;
            int centerY = + ( -;
            //Convert centerpoints (in global res idx) to linear idx (in global res space)
            int centerIdxVal = centerY * (loc->nxGlobal) + centerX;
            //Add to centerIdxMap
            centerIdx[bi][label] = centerIdxVal;

         //Fill centerpoint buffer
         int numCenterIdx = centerIdx[bi].size();

         int idx = 0;
         // NOTE(review): no increment of idx is visible inside this loop -- verify.
         for(std::map<int, int>::iterator it = centerIdx[bi].begin(); 
               it != centerIdx[bi].end(); ++it){
            allLabelsBuf[idx] = it->first;
            centerIdxBuf[idx] = it->second;

         //Broadcast buffers
         MPI_Bcast(&numCenterIdx, 1, MPI_INT, 0, icComm->communicator());
         MPI_Bcast(allLabelsBuf, numCenterIdx, MPI_INT, 0, icComm->communicator());
         MPI_Bcast(centerIdxBuf, numCenterIdx, MPI_INT, 0, icComm->communicator());
   } //End batch loop
   //centerIdx now stores each center coordinate of each segment
   return PV_SUCCESS;
// Computes the backward pass of batch normalization into this layer's activity.
// Inputs are the incoming gradient (originalLayer's activity), the normalized output
// (forwardLayer's activity) and the un-normalized input (forwardLayer's original layer).
// The visible steps are:
//   1. accumulate deltaVar per feature and sum it across MPI processes;
//   2. accumulate deltaMean per feature (two terms) and sum it across MPI processes;
//   3. write firstTerm + secondTerm + thirdTerm into this layer's activity;
//   4. accumulate deltaVarShift and deltaMeanShift and sum them across MPI processes.
// Returns status, which stays PV_SUCCESS in the code visible here.
// NOTE(review): closing braces are absent in this view, and the "clear" step announced
// before the deltaVar loop has no code under it. Confirm against the full file before
// changing any logic.
int BackwardsBatchNorm::updateState(double timef, double dt) {
    int status = PV_SUCCESS;

    //We are filling this activity buffer
    pvdata_t * thisA = clayer->activity->data;

    //We need the normalized input vals, orig input vals, and the input gradients
    const pvdata_t * inputGradA = originalLayer->getCLayer()->activity->data;
    const pvdata_t * forwardA = forwardLayer->getCLayer()->activity->data;
    const pvdata_t * origInputA = forwardLayer->getOriginalLayer()->getCLayer()->activity->data;
    assert(inputGradA && forwardA && origInputA);

    //Get locs for all buffers
    const PVLayerLoc * thisLoc = getLayerLoc();
    const PVLayerLoc * inputGradLoc = originalLayer->getLayerLoc();
    const PVLayerLoc * forwardLoc = forwardLayer->getLayerLoc();
    const PVLayerLoc * origInputLoc = forwardLayer->getOriginalLayer()->getLayerLoc();

    int nbatch = thisLoc->nbatch;

    //All nx, ny, and nf should be the same
    int nx = thisLoc->nx;
    int ny = thisLoc->ny;
    int nf = thisLoc->nf;

    //Get buffer margins here
    // NOTE(review): a member name appears to be missing after `->` in each x-margin line
    // below; presumably the left margin, by analogy with the y-margin lines -- confirm.
    int xThisMargin = thisLoc-> + thisLoc->halo.rt;
    int yThisMargin = thisLoc->halo.up + thisLoc->halo.dn;
    int xInputGradMargin = inputGradLoc-> + inputGradLoc->halo.rt;
    int yInputGradMargin = inputGradLoc->halo.up + inputGradLoc->halo.dn;
    int xForwardMargin = forwardLoc-> + forwardLoc->halo.rt;
    int yForwardMargin = forwardLoc->halo.up + forwardLoc->halo.dn;
    int xOrigInputMargin = origInputLoc-> + origInputLoc->halo.rt;
    int yOrigInputMargin = origInputLoc->halo.up + origInputLoc->halo.dn;

    //We also need various mean and var buffers from the forward layer
    const float* batchMean = forwardLayer->getBatchMean();
    const float* batchVar = forwardLayer->getBatchVar();
    // batchMeanShift is fetched but not read in the code visible here.
    float*       batchMeanShift = forwardLayer->getBatchMeanShift();
    float*       batchVarShift = forwardLayer->getBatchVarShift();
    float        epsilon = forwardLayer->getEpsilon();

    //Total number of neurons to divide by for each feature
    float normVal = parent->getNBatchGlobal() * thisLoc->nyGlobal * thisLoc->nxGlobal;

    //We're accumulating into delta buffers, so clear

    //Ioffe et. al. Batch Normalization

    //Calculate deltaVar
    //TODO parallelize over threads
    for(int iF = 0; iF < nf; iF++) {
        float secondTerm = -.5*(powf(batchVar[iF] + epsilon, -1.5));
        for(int b = 0; b < nbatch; b++) {
            const pvdata_t* batchOrigInputA = origInputA + b * forwardLayer->getOriginalLayer()->getNumExtended();
            const pvdata_t* batchInputGradA = inputGradA + b * originalLayer->getNumExtended();
            for(int iY = 0; iY < ny; iY++) {
                for(int iX = 0; iX < nx; iX++) {
                    // NOTE(review): throughout this function iX/iY run over the restricted range
                    // while kIndex receives extended dimensions, and no margin offset is added
                    // to iX/iY -- confirm this is intended when margins are non-zero.
                    int kExtOrigInput = kIndex(iX, iY, iF, nx+xOrigInputMargin, ny+yOrigInputMargin, nf);
                    int kExtInputGrad = kIndex(iX, iY, iF, nx+xInputGradMargin, ny+yInputGradMargin, nf);
                    float deltaNorm = batchInputGradA[kExtInputGrad] * batchVarShift[iF];
                    deltaVar[iF] += deltaNorm * (batchOrigInputA[kExtOrigInput] - batchMean[iF]);
        //Multiply deltaVar by secondTerm
        deltaVar[iF] = deltaVar[iF] * secondTerm;

    //Reduce deltaVar
#ifdef PV_USE_MPI
    MPI_Allreduce(MPI_IN_PLACE, deltaVar, nf, MPI_FLOAT, MPI_SUM, parent->icCommunicator()->globalCommunicator());
#endif // PV_USE_MPI

    //Calculate deltaMean
    //Calculate first term first
    //TODO parallelize over threads
    for(int iF = 0; iF < nf; iF++) {
        float multiplier = -1.0/(sqrtf(batchVar[iF]+epsilon));
        for(int b = 0; b < nbatch; b++) {
            const pvdata_t* batchInputGradA = inputGradA + b * originalLayer->getNumExtended();
            for(int iY = 0; iY < ny; iY++) {
                for(int iX = 0; iX < nx; iX++) {
                    int kExtInputGrad = kIndex(iX, iY, iF, nx+xInputGradMargin, ny+yInputGradMargin, nf);
                    float deltaNorm = batchInputGradA[kExtInputGrad] * batchVarShift[iF];
                    deltaMean[iF] += deltaNorm * multiplier;
    //Reduce deltaMean across mpi
#ifdef PV_USE_MPI
    MPI_Allreduce(MPI_IN_PLACE, deltaMean, nf, MPI_FLOAT, MPI_SUM, parent->icCommunicator()->globalCommunicator());
#endif // PV_USE_MPI

    //Calculate second term
    //TODO parallelize over threads
    for(int iF = 0; iF < nf; iF++) {
        float tmpMean = 0;
        for(int b = 0; b < nbatch; b++) {
            const pvdata_t* batchOrigInputA = origInputA + b * forwardLayer->getOriginalLayer()->getNumExtended();
            for(int iY = 0; iY < ny; iY++) {
                for(int iX = 0; iX < nx; iX++) {
                    int kExtOrigInput = kIndex(iX, iY, iF, nx+xOrigInputMargin, ny+yOrigInputMargin, nf);
                    tmpMean += -2 * (batchOrigInputA[kExtOrigInput] - batchMean[iF]);
        //Reduce tmpMean
#ifdef PV_USE_MPI
        MPI_Allreduce(MPI_IN_PLACE, &tmpMean, 1, MPI_FLOAT, MPI_SUM, parent->icCommunicator()->globalCommunicator());
#endif // PV_USE_MPI
        tmpMean = tmpMean / normVal;
        //Add second term to first term
        deltaMean[iF] += deltaVar[iF] * tmpMean;

    //No more sums, go with efficient loop
    //TODO Is the efficient loop better for optimization or do we put
    //features on the outer most loop for precalculation of constants over features?
    for(int b = 0; b < nbatch; b++) {
        const pvdata_t* batchOrigInputA = origInputA + b * forwardLayer->getOriginalLayer()->getNumExtended();
        const pvdata_t* batchInputGradA = inputGradA + b * originalLayer->getNumExtended();
        pvdata_t* batchThisA = thisA + b * getNumExtended();
        #pragma omp parallel for collapse(3)
        for(int iY = 0; iY < ny; iY++) {
            for(int iX = 0; iX < nx; iX++) {
                for(int iF = 0; iF < nf; iF++) {
                    int kExtOrigInput = kIndex(iX, iY, iF, nx+xOrigInputMargin, ny+yOrigInputMargin, nf);
                    int kExtInputGrad = kIndex(iX, iY, iF, nx+xInputGradMargin, ny+yInputGradMargin, nf);
                    int kExtThis = kIndex(iX, iY, iF, nx+xThisMargin, ny+yThisMargin, nf);
                    // Output is the sum of three terms built from deltaNorm, deltaVar and deltaMean.
                    float deltaNorm = batchInputGradA[kExtInputGrad] * batchVarShift[iF];
                    float firstTerm = deltaNorm/sqrtf(batchVar[iF] + epsilon);
                    float secondTerm = deltaVar[iF] * (2*(batchOrigInputA[kExtOrigInput] - batchMean[iF])/normVal);
                    float thirdTerm = deltaMean[iF]/normVal;
                    batchThisA[kExtThis] = firstTerm + secondTerm + thirdTerm;

    //We calculate delta varShift and deltaMeanShift here
    //TODO parallelize over threads
    //Since we're summing into delta*shift buffers, we have to sequentialize over features
    for(int iF = 0; iF < nf; iF++) {
        for(int b = 0; b < nbatch; b++) {
            const pvdata_t* batchForwardA = forwardA + b * forwardLayer->getNumExtended();
            const pvdata_t* batchInputGradA = inputGradA + b * originalLayer->getNumExtended();
            for(int iY = 0; iY < ny; iY++) {
                for(int iX = 0; iX < nx; iX++) {
                    int kExtInputGrad = kIndex(iX, iY, iF, nx+xInputGradMargin, ny+yInputGradMargin, nf);
                    int kExtForwardA = kIndex(iX, iY, iF, nx+xForwardMargin, ny + yForwardMargin, nf);
                    deltaVarShift[iF] += batchInputGradA[kExtInputGrad] * batchForwardA[kExtForwardA];
                    deltaMeanShift[iF] += batchInputGradA[kExtInputGrad];

    //Reduce delta*Shift across all mpi
#ifdef PV_USE_MPI
    MPI_Allreduce(MPI_IN_PLACE, deltaVarShift, nf, MPI_FLOAT, MPI_SUM, parent->icCommunicator()->globalCommunicator());
    MPI_Allreduce(MPI_IN_PLACE, deltaMeanShift, nf, MPI_FLOAT, MPI_SUM, parent->icCommunicator()->globalCommunicator());
#endif // PV_USE_MPI

    //TODO implement learning rule for meanShift and varShift

    return status;