/* * ======== delayMicroseconds ======== * Delay for the given number of microseconds. */ void delayMicroseconds(unsigned int us) { if (us <7) { //The overhead in calling and returning from this function takes about 6us } else if (us <=20) { int time; for (time = 5*(us-6); time > 0; time--) { asm(" nop"); } } else if (us < 70) { int time; for (time = 5*us; time > 0; time--) { asm(" nop"); } } else { uint32_t t0, deltaT; Types_FreqHz freq; Timestamp_getFreq(&freq); deltaT = us * (freq.lo/1000000); t0 = Timestamp_get32(); while ((Timestamp_get32()-t0) < deltaT) { ; } } }
/*
 *  ======== doLoad ========
 *  Spin until the timestamp has moved 'count' ticks past the value read
 *  on entry.
 *
 *  count - number of timestamp ticks to burn.
 */
Void doLoad(ULong count)
{
    /* termination timestamp: entry timestamp plus the requested load */
    ULong deadline = Timestamp_get32() + count;
    ULong current;

    for (;;) {
        current = Timestamp_get32();

        /* leave once the remaining distance is no longer below 'count' */
        if ((deadline - current) >= count) {
            break;
        }
    }
}
void readPPM(int id, int height, int width, unsigned char *rgbPtr) { const unsigned char* im_data = (id == 0) ? im0_data : im1_data; static int i = 0; static unsigned int time = 0; unsigned int now; if (i == 0) { // 32bits precision is not sufficient here. Needs 64bits instead. now = Timestamp_get32(); unsigned int delta = (now - time) / NB_ITERATION_MEASURED; float fps = 1000000000.0 / (float) delta; System_printf("fps: %f\n", fps); time = Timestamp_get32(); } i = (i + 1) % NB_ITERATION_MEASURED; memcpy((void*)rgbPtr,(void*)im_data, 3*height*width*sizeof(unsigned char)); }
/*
 *  ======== tsk0 ========
 *  Message-passing loop: receive a message from this processor's queue and
 *  forward it to the 'next' queue, NUMLOOPS times.  The processor whose
 *  selfId is 0 injects the first message, timestamps every receive and
 *  prints statistics after the final one.
 *
 *  arg0, arg1 - unused task arguments.
 */
Void tsk0(UArg arg0, UArg arg1)
{
    MessageQ_Msg msg;
    Int          status;

    System_printf("tsk0 starting\n");

    /* Make the shared-region heap available to MessageQ under HEAP_ID */
    MessageQ_registerHeap((IHeap_Handle)SharedRegion_getHeap(0), HEAP_ID);

    /* Keep retrying until the 'next' remote queue can be opened */
    for (;;) {
        status = MessageQ_open(nextQueueName, &nextQueueId);
        if (status >= 0) {
            break;
        }
    }

    if (selfId == 0) {
        /* This processor creates the message and starts it circulating */
        msg = MessageQ_alloc(HEAP_ID, MSGSIZE);
        if (msg == NULL) {
            System_abort("MessageQ_alloc failed\n");
        }

        status = MessageQ_put(nextQueueId, msg);
        if (status < 0) {
            System_abort("MessageQ_put failed\n");
        }
    }

    numReceived = 0;
    while (numReceived < NUMLOOPS) {
        /* Block until the message arrives */
        status = MessageQ_get(messageQ, &msg, MessageQ_FOREVER);
        if (status < 0) {
            System_abort("MessageQ_get failed\n");
        }

        if (selfId == 0) {
            rawtimestamps[numReceived] = Timestamp_get32();

            /* Last lap: report and stop without forwarding again */
            if (numReceived == NUMLOOPS - 1) {
                printStatistics();
                break;
            }
        }

        /* Pass the message on to the next processor */
        status = MessageQ_put(nextQueueId, msg);
        if (status < 0) {
            System_abort("MessageQ_put failed\n");
        }

        numReceived++;
    }

    System_exit(0);
}
void main(void) { Uint32 timestamp, delta; timestamp = Timestamp_get32(); EDMA3_DRV_Result edmaResult = EDMA3_DRV_SOK; unsigned const short *L1d_p9; unsigned const short *L1d_p10; unsigned short *mainSourceImg; unsigned short *referenceImg1; L1d_p9 = (unsigned const short *) Memory_alloc((IHeap_Handle) l2Heap, (NUM_BYTES1), 0, NULL); L1d_p10 = (unsigned const short *) Memory_alloc((IHeap_Handle) l2Heap, (NUM_BYTES3), 0, NULL); referenceImg1 = (unsigned short *) Memory_alloc((IHeap_Handle) l2Heap, (NUM_BYTES1), 0, NULL); mainSourceImg = (unsigned short *) Memory_alloc((IHeap_Handle) l2Heap, (NUM_BYTES3), 0, NULL); int i; edmaWait4Completion(1); for (i = 0; i < 1024; i++) { mainSourceImg[i] = 1; } edmaInit(); edmaInitiateXfer(L1d_p10, mainSourceImg, NUM_BYTES2, 8, 1, NUM_BYTES2, NUM_BYTES2, 1, 1, 0, 1); edmaWait4Completion(0); edmaInitiateXfer(L1d_p9, referenceImg1, NUM_BYTES2, 2, 1, NUM_BYTES2, NUM_BYTES2, 1, 1, 0, 1); edmaWait4Completion(0); delta = Timestamp_get32()-timestamp; }
/*
 *  ======== tsk0_func ========
 *  On the processor whose MultiProc id is 0: record a timestamp in the next
 *  rawtimestamps slot and send the first Notify event to dstProc.  On every
 *  processor the task then exits.
 *
 *  arg0, arg1 - unused task arguments.
 */
Void tsk0_func(UArg arg0, UArg arg1)
{
    Int rc;

    if (MultiProc_self() == 0) {
        /* Stamp the moment the event is about to be sent */
        rawtimestamps[seq++] = Timestamp_get32();

        rc = Notify_sendEvent(dstProc, INTERRUPT_LINE, EVENTID, NULL, TRUE);
        if (rc < 0) {
            System_abort("Notify_sendEvent failed\n");
        }
    }

    Task_exit();
}
/*
 *  ======== cbFxn ========
 *  Notify callback.  Records a timestamp for the current sequence number
 *  (only when selfId is 0), advances the sequence number, and sends the
 *  event on to dstProc.
 *
 *  procId, lineId, eventId, arg, payload - callback arguments; none of them
 *  is read by this function.
 */
Void cbFxn(UInt16 procId, UInt16 lineId, UInt32 eventId, UArg arg, UInt32 payload)
{
    Int rc;

    if (selfId == 0) {
        rawtimestamps[seq] = Timestamp_get32();
    }
    seq++;

    /* Forward the event to the next processor */
    rc = Notify_sendEvent(dstProc, INTERRUPT_LINE, EVENTID, NULL, TRUE);
    if (rc < 0) {
        System_abort("Notify_sendEvent failed\n");
    }
}
/*
 *  ======== Utils_IntLatencyCalculate ========
 *  Track the spacing between successive calls and remember which task was
 *  running whenever the spacing exceeds the allowed budget.
 *
 *  latencyMeasure - measurement state; nothing happens unless ->start is set.
 *  intId          - not used by the current implementation.
 *
 *  First call (prevIntTime == 0): derive timerFreqPerMicroSec from the
 *  timestamp frequency and clear numLateInts.
 *  Later calls with prevIntTime < current time: convert the tick delta with
 *  timerFreqPerMicroSec and, if it exceeds expectedInterruptInterval +
 *  maxAllowedLatency, store Task_self() in the lateIntIrp ring and bump
 *  numLateInts.
 *  A call where the timestamp is not greater than the previous one only
 *  updates prevIntTime.
 */
Void Utils_IntLatencyCalculate(Utils_IntLatencyMeasure * latencyMeasure, UInt intId)
{
    UInt32 nowTicks;
    UInt32 prevTicks;

    if (!latencyMeasure->start) {
        return;
    }

    nowTicks  = Timestamp_get32();
    prevTicks = latencyMeasure->prevIntTime;

    if (prevTicks == 0) {
        /* First sample: set up the tick conversion factor */
        Types_FreqHz freq;
        Bits64 ticksPerUs;

        Timestamp_getFreq(&freq);

        ticksPerUs = freq.hi;
        ticksPerUs <<= 32;
        ticksPerUs |= freq.lo;
        ticksPerUs /= UTILS_FREQPERMICROSEC_DIV_FACTOR;

        latencyMeasure->timerFreqPerMicroSec = (UInt32) ticksPerUs;
        latencyMeasure->numLateInts = 0;
        // latencyMeasure->hHwi = Hwi_getHandle(intId);
    }
    else if (prevTicks < nowTicks) {
        UInt32 elapsed = (nowTicks - prevTicks) / latencyMeasure->timerFreqPerMicroSec;
        UInt32 budget  = latencyMeasure->expectedInterruptInterval
                         + latencyMeasure->maxAllowedLatency;

        if (elapsed > budget) {
            /* lateIntIrp is used as a ring indexed by the running late count */
            UInt32 slot = latencyMeasure->numLateInts % UTILS_INTLATENCY_LATE_IRP_COUNT;

            latencyMeasure->lateIntIrp[slot] = (UInt32) Task_self();
            latencyMeasure->numLateInts++;
        }
    }

    latencyMeasure->prevIntTime = nowTicks;
}
/*
 *  ======== dumpTime ========
 *  Store the current 32-bit timestamp in dumpBuffer[id], then write back and
 *  invalidate that one int in the L2 cache.
 *
 *  id         - index of the slot to write.
 *  dumpBuffer - array receiving the timestamps.
 */
void dumpTime(int id, int* dumpBuffer)
{
    int *slot = dumpBuffer + id;

    *slot = Timestamp_get32();
    CACHE_wbInvL2(slot, sizeof(int), CACHE_WAIT);
}
/**
 * Send all prepared IPC messages to all cores and return the calculation result (ssd/jac/hess)
 *
 * One prepared message per participating core is put on that core's queue
 * (highest core index first).  The function then blocks until the same number
 * of replies has been read from h_receive_queue and sums the per-core results.
 *
 * @param ProcessingType   selects what is accumulated: pt_ssd sums SSD only,
 *                         pt_ssdJacHess additionally sums JD and JD2; any other
 *                         value accumulates nothing.
 * @param number_of_cores  number of cores to use; the cores with indices
 *                         CORE_AMOUNT-number_of_cores .. CORE_AMOUNT-1 are addressed.
 * @param SSD              out: sum of out_SSD over all replies (written only for
 *                         pt_ssd / pt_ssdJacHess).
 * @param JD               out: element-wise sum of out_JD (pt_ssdJacHess only).
 * @param JD2              out: element-wise sum of out_JD2 (pt_ssdJacHess only).
 *
 * On any send/receive failure the function logs, calls shutdown_message_q()
 * and returns; the output buffers may then hold partial sums.
 */
void send_to_cores(const processing_type_e ProcessingType, const uint32_T number_of_cores, real32_T *SSD, real32_T JD[3], real32_T JD2[9])
{
    process_message_t * p_msg = 0;
    uint16_t msgId = 0;
    int32_T ret_val=0;
#ifdef _TRACE_MC_
    Types_FreqHz freq;
    float processing_time=0;
    /* Unsigned so that the tick difference stays well defined across a counter wrap */
    uint32_t ts1, ts2;
#endif
    int32_t j;
    int32_t i;

#ifdef _TRACE_MC_
    logout("[MAIN  ] Execute Process (ProcessingType=%u)\n", ProcessingType);     //trace
    Timestamp_getFreq(&freq);
#endif

#ifdef _DO_ERROR_CHECKS_
    if(NULL == h_receive_queue)
    {
        /* Without a reply queue nothing can be received: bail out instead of continuing */
        logout("No master msg receive queue available.\n");
        ret_val = -1;
        goto mcip_process_error;
    }

    if ((number_of_cores <= 0) || (number_of_cores > max_core))
    {
        logout("Invalid number_of_cores: It should be between 1 to %u\n", max_core);
        ret_val = -1;
        goto mcip_process_error;
    }
#endif

    //CACHING NOTE:
    //The picture data was cache write backed after images have been received. More
    //data is not to be cache write backed as we pass all other data (also arrays
    //element by element) to the cores using the message queue. Results are passed
    //back also using the message interface as we don't receive bulk data results.

#ifdef _TRACE_MC_
    ts1 = Timestamp_get32();
#endif

    /* Send messages to processing cores, start at the highest core */
    for (i = CORE_AMOUNT-1; i >= (int)(CORE_AMOUNT-number_of_cores); i-- )
    {
        p_msg = p_queue_msg[i];
        MessageQ_setMsgId(p_msg, ++msgId);
        MessageQ_setReplyQueue(h_receive_queue, (MessageQ_Msg)p_msg);

#ifdef _TRACE_MC_
        logout("[MAIN  ] Start process on core %u (ProcessingType=%u)\n", p_msg->core_id, ProcessingType, p_msg->info.NewImageDataArrived);     //trace
#endif

        /* send the message to the remote processor */
        if (MessageQ_put(queue_id[p_msg->core_id], (MessageQ_Msg)p_msg) < 0)
        {
            logout("MessageQ_put had a failure error\n");
            ret_val = -1;
            goto mcip_process_error;
        }
    }

    //All cores have invalidated their cache to read new image data.
    //Next time cache invalidation is no more necessary (until new image data arrives).
    g_NewImageDataArrived = 0;
#ifdef _TRACE_MC_
    logout("[MAIN  ] Reset g_NetImageDataArrived signal to %d.\n", g_NewImageDataArrived);
#endif

    //Clear result buffers (will be summed up, have to start at 0)
    if(pt_ssd == ProcessingType || pt_ssdJacHess == ProcessingType)
    {
        (*SSD)=0;
        if(pt_ssdJacHess == ProcessingType)
        {
            memset(JD, 0, sizeof(real32_T) * 3);
            memset(JD2, 0, sizeof(real32_T) * 9);
        }
    }

    //ToDo: Once it looked like all other cores finished calculating before core 0 started. Why ?
    //One could think of having no mcip_core_task at the main core and call the calculation directly instead ... Use _TRACE_MC_ (only) to see this
    //ToDo: When adding a big sleep command to the processing functions one should see if there's something wrong

    /* Receive the result */
    for (i = (CORE_AMOUNT-number_of_cores); i < CORE_AMOUNT; i++)
    {
        if (MessageQ_get(h_receive_queue, (MessageQ_Msg *)&p_msg, MessageQ_FOREVER) < 0)
        {
            logout("This should not happen since timeout is forever\n");
            ret_val = -1;
            /*
             * p_msg was not updated by the failed call, so it still refers to a
             * stale message.  Skip tracing/accumulation for this iteration; the
             * error is reported after the loop.
             */
            continue;
        }
        /* NOTE: a per-reply check of p_msg->info.flag was disabled in the original code. */

#ifdef _TRACE_MC_
        if(pt_ssd == ProcessingType || pt_ssdJacHess == ProcessingType)
        {
            logout("[MAIN  ] process answer received from core %u (SSD=%f, ProcessingType=%u)\n", p_msg->core_id, (double)p_msg->info.out_SSD, ProcessingType);     //trace
            if(pt_ssdJacHess == ProcessingType)
            {
                logout("[MAIN  ] JD = [%f %f %f], JD2 = [%f ... %f ... %f]\n", (double)p_msg->info.out_JD[0], (double)p_msg->info.out_JD[1], (double)p_msg->info.out_JD[2],
                        (double)p_msg->info.out_JD2[0], (double)p_msg->info.out_JD2[4], (double)p_msg->info.out_JD2[8]);
            }
        }
        else
        {
            logout("[MAIN  ] process answer received from core %u (ProcessingType=%u)\n", p_msg->core_id, ProcessingType);     //trace
        }
#endif

        //Sum up the results
        if(pt_ssd == ProcessingType || pt_ssdJacHess == ProcessingType)
        {
            (*SSD) += p_msg->info.out_SSD;

            if(pt_ssdJacHess == ProcessingType)
            {
                for(j=0; j<3; j++)
                {
                    JD[j] += p_msg->info.out_JD[j];
                }
                for(j=0; j<9; j++)
                {
                    JD2[j] += p_msg->info.out_JD2[j];
                }
            }
        }
    }

    if (ret_val == -1)
    {
        goto mcip_process_error;
    }

#ifdef _TRACE_MC_
    ts2 = Timestamp_get32();
    processing_time = ((float)(ts2 - ts1) / (float)freq.lo);
    if(pt_ssd == ProcessingType || pt_ssdJacHess == ProcessingType)
    {
        logout("[MAIN  ] SSD calculated in: %f s. Result = %f\n", processing_time, (double)(*SSD));     //trace
        if(pt_ssdJacHess == ProcessingType)
        {
            logout("[MAIN  ] JD = [%f %f %f], JD2 = [%f ... %f ... %f]\n", (double)JD[0], (double)JD[1], (double)JD[2], (double)JD2[0], (double)JD2[4], (double)JD2[8]);
        }
    }
    else
    {
        logout("[MAIN  ] Image shrinked in: %f s.\n", processing_time);     //trace
    }
#endif

    return;

mcip_process_error:
    logout("mcip_process_error !!! \n");
    shutdown_message_q();
}
/*
 *  ======== Timer_checkFreq ========
 *  Sanity-check the configured timer frequency against the Timestamp
 *  counter.  A temporary one-shot copy of the timer is run for TIMERCOUNTS
 *  counts with interrupts disabled; the ratio of elapsed timestamp ticks to
 *  elapsed timer counts is compared with the ratio of the two reported
 *  frequencies.  If the measured ratio is off by more than a factor of two
 *  in either direction, Timer_E_freqMismatch is raised with the configured
 *  and the measured frequency.
 *
 *  obj - timer to check; it is copied, not modified.
 */
Void Timer_checkFreq(Timer_Object *obj)
{
    UInt key;
    UInt32 timerCountStart, timerCountEnd, tsCountStart, tsCountEnd;
    UInt32 deltaTs, deltaCnt;
    UInt32 measuredRatio;
    Types_FreqHz timerFreq, timestampFreq;
    UInt freqRatio;
    UInt32 actualFrequency;
    Timer_Object tempObj;

    /*
     *  Make a temporary copy of 'obj' and modify it to be used for the timer
     *  frequency check.  Set the period to Timer_MAX_PERIOD to ensure that
     *  the timer does not roll over while performing the check.
     */
    memcpy((void *)&tempObj, (void *)obj, sizeof(Timer_Object));
    tempObj.period = Timer_MAX_PERIOD;
    tempObj.periodType = Timer_PeriodType_COUNTS;
    tempObj.runMode = Timer_RunMode_ONESHOT;
    tempObj.startMode = Timer_StartMode_USER;

    /* Initialize the timer registers */
    Timer_deviceConfig(&tempObj, NULL);

    /* Get the frequencies of the Timer and the Timestamp */
    Timer_getFreq(&tempObj, &timerFreq);
    Timestamp_getFreq(&timestampFreq);

    /* Assume that timer frequency is less than 2^32 Hz */
    Assert_isTrue(timestampFreq.hi == 0 && timerFreq.hi == 0, NULL);

    freqRatio = timestampFreq.lo / timerFreq.lo;

    key = Hwi_disable();

    /*
     *  Warning: halting the core between Timer_start and the point of
     *  code indicated below can cause the frequency check to fail.  This
     *  is because the DMTimer will continue to run while this core is halted,
     *  thus causing the ratio between timer counts to change.
     */
    Timer_start(&tempObj);

    /* Record the initial timer & timestamp counts */
    timerCountStart = Timer_getCount(&tempObj);
    tsCountStart = Timestamp_get32();

    /* Wait for 'TIMERCOUNTS' timer counts to elapse */
    while (Timer_getCount(&tempObj) < timerCountStart + TIMERCOUNTS) {
        ;
    }

    timerCountEnd = Timer_getCount(&tempObj);

    /* Record the timestamp ticks that have elapsed during the above loop */
    tsCountEnd = Timestamp_get32();

    /* End of code segment where core should not be halted */

    Hwi_restore(key);

    deltaTs = tsCountEnd - tsCountStart;
    deltaCnt = timerCountEnd - timerCountStart;

    /* Measured timestamp ticks per timer count */
    measuredRatio = deltaTs / deltaCnt;

    /* Check the timer frequency.  Allow a margin of error. */
    if ((measuredRatio > freqRatio * 2) || (measuredRatio < freqRatio / 2)) {
        /* Scale the timestamp frequency by the measured count ratio (64-bit to avoid overflow) */
        actualFrequency = ((UInt64)timestampFreq.lo * (UInt64)deltaCnt) / (UInt64)deltaTs;
        Error_raise(NULL, Timer_E_freqMismatch,
                Timer_module->intFreqs[obj->id].lo, actualFrequency);
    }
}
/*
 *  ======== matlab_c_ssdRigid2D (value + gradient + Hessian approximation) ========
 *  For rows i_from .. i_to-1 of the reference image, map every pixel through
 *  the rigid transform w = (angle, shift x, shift y) into the template image,
 *  interpolate the template there and accumulate:
 *    - the sum of squared differences (returned as 0.5 * sum),
 *    - grad[3]: sum of difference * derivative term per parameter,
 *    - H[9]:    sum of products of the derivative terms (symmetric 3x3,
 *               stored row-major).
 *
 *  dataR, mR, omegaR - reference image data, size, and domain (only
 *                      omegaR[0], omegaR[2] and mR[0] are read here).
 *  dataT, mT, omegaT - template image data, size, and domain.
 *  w                 - transform parameters.
 *  MarginAddon       - padding; [0] widens a row, [1] is the number of
 *                      padded rows skipped at the start of dataT.
 *  DSPRange          - not used by this function.
 *  i_from, i_to      - half-open row range to process.
 *  grad, H           - outputs, fully overwritten.
 */
t_real matlab_c_ssdRigid2D(t_pixel* dataR, int mR[2], t_real omegaR[4], t_pixel* dataT, int mT[2], t_real omegaT[4], const t_real w[3], const unsigned int MarginAddon[3], const t_real DSPRange[4], const unsigned int i_from, const unsigned int i_to, t_real grad[3], t_real H[9])
{
#ifdef _TRACE_PAPER_FIGURES_
    uint32_t time_s = Timestamp_get32();
#endif

    /* Row width of the template including its margin */
    const unsigned int paddedWidth = (mT[0] + MarginAddon[0]);
    /* Skip the padded rows so dataT points at the first row containing data */
    const unsigned int uImageStart = paddedWidth * MarginAddon[1];
    dataT += uImageStart;

#ifdef _DO_ERROR_CHECKS_
    CalcMinMaxIndex(MarginAddon, mT, uImageStart);
#endif

    /* Rotation terms and the constant part of the coordinate mapping */
    const t_real sinW = sin(w[0]);
    const t_real cosW = cos(w[0]);
    const t_real shiftX = (w[1] - omegaT[0] + cosW*omegaR[0] - sinW*omegaR[2]) + 0.5f*(cosW - sinW - 1.0f);
    const t_real shiftY = (w[2] - omegaT[2] + sinW*omegaR[0] + cosW*omegaR[2]) + 0.5f*(sinW + cosW - 1.0f);

    /* Scalar accumulators (kept as scalars, see the OpenMP reduction note in the original) */
    t_real sumSq = 0.0f;
    t_real g0 = 0.0f, g1 = 0.0f, g2 = 0.0f;
    t_real h00 = 0.0f, h01 = 0.0f, h02 = 0.0f;
    t_real h11 = 0.0f, h12 = 0.0f, h22 = 0.0f;

    // Variables are declared inside loop on purpose. No speedup was observed when setting up
    // variables outside and declaring them as private, yet errors can easily remain
    // undetected that way.
    //#pragma omp parallel for reduction(+: fval, grad_r0, grad_r1, grad_r2, HR_00, HR_01, HR_02, HR_11, HR_12, HR_22) if(useOpenMP)
    for (int row=i_from; row<i_to/*mR[1]*/; row++)
    {
        const t_real gridY = omegaR[2] + (row+0.5f);
        const int rowBase = row*mR[0];

        //#pragma MUST_ITERATE(3)
        //#pragma MUST_ITERATE(2, ,2)        //ToDo: Would this make the code faster ? As we have padding borders (which might have to be sized +1 pixel below then) it might work to round i to the next multiple of 2 ...
        for (int col=0; col<mR[0]; col++)
        {
            t_real dTx, dTy;

            // Template coordinates belonging to the current pixel;
            // shifts for cell-centered discretization are part of shiftX/shiftY
            const t_real tplX = cosW*col - sinW*row + shiftX;    //ToDo: What is faster ? To loop with a float value and omit the multiplications here ? Or to loop with integer value and to multiply here ?
            const t_real tplY = sinW*col + cosW*row + shiftY;

            // Difference template - reference (factor -1 is included here)
            const t_real diff = linearInterPoint2D(dataT, tplX, tplY, dTx, dTy, paddedWidth
#ifdef _TEST_DIRICHLET_AS_IF_THERE_WAS_NO_PADDING_
                    , mT[0], mT[1]
#endif
                    ) - (dataR[rowBase+col]);

            // Current grid position in world coordinates
            const t_real gridX = omegaR[0] + (col+0.5f);

            const t_real dxx = dTx*gridX;
            const t_real dxy = dTx*gridY;
            const t_real dyx = dTy*gridX;
            const t_real dyy = dTy*gridY;

            // Derivative term for the rotation parameter
            const t_real rotTerm = cosW*dyx - sinW*dxx - cosW*dxy - sinW*dyy;

            g0 += diff*rotTerm;
            g1 += diff*dTx;
            g2 += diff*dTy;

            h00 += rotTerm*rotTerm;
            h01 += rotTerm*dTx;
            h02 += rotTerm*dTy;
            h11 += dTx*dTx;
            h12 += dTx*dTy;
            h22 += dTy*dTy;

            sumSq += diff*diff;
        }
    }

    grad[0] = g0;
    grad[1] = g1;
    grad[2] = g2;

    // Upper triangle (row-major 3x3)
    H[0] = h00;
    H[1] = h01;
    H[2] = h02;
    H[4] = h11;
    H[5] = h12;
    H[8] = h22;

    // Mirror into the lower triangle
    H[3] = H[1];
    H[6] = H[2];
    H[7] = H[5];

    t_real fRetVal = 0.5f*sumSq;

#ifdef _TRACE_PAPER_FIGURES_
    uint32_t time_e = Timestamp_get32();
    uint32_t TimeForDoubleLoop = time_e-time_s;
    logout("SSDJacHess took %d us.\n", TimeForDoubleLoop / DSP_CLOCKRATE_US);
#endif

    return fRetVal;
} // end of computeFunctionValueAndDerivatives
/*
 *  ======== matlab_c_ssdRigid2D (value only) ========
 *  For rows i_from .. i_to-1 of the reference image, map every pixel through
 *  the rigid transform w = (angle, shift x, shift y) into the template image,
 *  interpolate the template there, and return half the sum of squared
 *  differences between reference and interpolated template.
 *
 *  dataR, mR, omegaR - reference image data, size, and domain (only
 *                      omegaR[0], omegaR[2] and mR[0] are read here).
 *  dataT, mT, omegaT - template image data, size, and domain.
 *  w                 - transform parameters.
 *  MarginAddon       - padding; [0] widens a row, [1] is the number of
 *                      padded rows skipped at the start of dataT.
 *  DSPRange          - not used by this function.
 *  i_from, i_to      - half-open row range to process.
 */
t_real matlab_c_ssdRigid2D(t_pixel* dataR, int mR[2], t_real omegaR[4], t_pixel* dataT, int mT[2], t_real omegaT[4], const t_real w[3], const unsigned int MarginAddon[3], const t_real DSPRange[4], const unsigned int i_from, const unsigned int i_to)
{
#ifdef _TRACE_PAPER_FIGURES_
    uint32_t time_s = Timestamp_get32();
#endif

    /* Row width of the template including its margin */
    const unsigned int paddedWidth = (mT[0] + MarginAddon[0]);
    /* Skip the padded rows so dataT points at the first row containing data */
    const unsigned int uImageStart = paddedWidth * MarginAddon[1];
    dataT += uImageStart;

#ifdef _DO_ERROR_CHECKS_
    CalcMinMaxIndex(MarginAddon, mT, uImageStart);
#endif

    /* Rotation terms and the constant part of the coordinate mapping */
    const t_real sinW = sin(w[0]);
    const t_real cosW = cos(w[0]);
    const t_real shiftX = (w[1] - omegaT[0] + cosW*omegaR[0] - sinW*omegaR[2]) + 0.5f*(cosW - sinW - 1.0f);
    const t_real shiftY = (w[2] - omegaT[2] + sinW*omegaR[0] + cosW*omegaR[2]) + 0.5f*(sinW + cosW - 1.0f);

    t_real sumSq = 0.0f;

    // Variables are declared inside loop on purpose. No speedup was observed when setting up
    // variables outside and declaring them as private, yet errors can easily remain
    // undetected that way.
    //#pragma omp parallel for reduction(+: fval) if(useOpenMP)
    for (int row=i_from; row<i_to/*mR[1]*/; row++)
    {
        const int rowBase = row*mR[0];

        //#pragma MUST_ITERATE(5)
        //#pragma MUST_ITERATE(2, ,2)        //ToDo: Would this make the code faster ? As we have padding borders (which might have to be sized +1 pixel below then) it might work to round i to the next multiple of 2 ...
        for (int col=0; col<mR[0]; col++)
        {
            // Template coordinates belonging to the current pixel;
            // shifts for cell-centered discretization are part of shiftX/shiftY
            const t_real tplX = cosW*col - sinW*row + shiftX;
            const t_real tplY = sinW*col + cosW*row + shiftY;

            // Difference reference - template.
            // TODO (rbe): the subtraction order is the opposite of the one used in the
            // gradient/Hessian variant. Does this make sense?
            const t_real diff = dataR[rowBase+col] - linearInterPoint2D(dataT, tplX, tplY, paddedWidth
#ifdef _TEST_DIRICHLET_AS_IF_THERE_WAS_NO_PADDING_
                    , mT[0], mT[1]
#endif
                    );

            sumSq += diff*diff;
        }
    }

    t_real fRetVal = 0.5f*sumSq;

#ifdef _TRACE_PAPER_FIGURES_
    uint32_t time_e = Timestamp_get32();
    uint32_t TimeForDoubleLoop = time_e-time_s;
    logout("SSD took %d us.\n", TimeForDoubleLoop / DSP_CLOCKRATE_US);
#endif

    return fRetVal;
} // end of computeFunctionValueAffine