int recursiveFindOverlap(dTree* parent, int overlap, Path currentPath){ if(parent == NULL || parent->path == NULL){ return overlap; }else{ printf("parent index: %i", parent->index); int currentOverlap = calcOverlap(currentPath, parent->path); if (currentOverlap > overlap){ overlap = currentOverlap; } return recursiveFindOverlap(parent->parent, overlap, currentPath); } }
quint64 sw_cuda_cpp::estimateNeededRamAmount(int seqLibLength, ScoreType qProfLen, int queryLength, const U2::SmithWatermanSettings::SWResultView resultView) { const int overlapLength = calcOverlap(queryLength); const int partsNumber = calcPartsNumber(seqLibLength, overlapLength); const int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber); const int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize); int directionMatrixSize = 0; int backtraceBeginsSize = 0; if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { directionMatrixSize = sizeof(int) * queryLength * seqLibLength; backtraceBeginsSize = sizeof(int) * sizeRow * 2; } const quint64 memToAlloc = 3 * sizeRow * sizeof(ScoreType) + directionMatrixSize + backtraceBeginsSize; return memToAlloc; }
quint64 sw_cuda_cpp::estimateNeededGpuMemory( int seqLibLength, ScoreType qProfLen, int queryLength, const U2::SmithWatermanSettings::SWResultView resultView) { int sizeP = qProfLen * sizeof(ScoreType); int sizeL = (seqLibLength) * sizeof(char); const int overlapLength = calcOverlap(queryLength); int partsNumber = calcPartsNumber(seqLibLength, overlapLength); int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber); int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize); int sizeN = 7 * sizeRow * sizeof(ScoreType); int directionMatrixSize = 0; int backtraceBeginsSize = 0; if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { directionMatrixSize = sizeof(int) * queryLength * seqLibLength; backtraceBeginsSize = sizeof(int) * sizeRow * 2; } quint64 memToAlloc = sizeL + sizeP + sizeN; //see cudaMallocs in sw_cuda.cu for details return memToAlloc * 1.2; //just for safety }
QList<resType> calculateOnGPU(const char * seqLib, int seqLibLength, ScoreType* queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView) { //TODO: calculate maximum alignment length const int overlapLength = calcOverlap(queryLength); int partsNumber = calcPartsNumber(seqLibLength, overlapLength); int queryDevider = 1; if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) { queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH; } int partQuerySize = (queryLength + queryDevider - 1) / queryDevider; int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber); int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize); u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider)); u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength)); u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize)); //************************** declare some temp variables on host ScoreType* tempRow = new ScoreType[sizeRow]; ScoreType* zerroArr = new ScoreType[sizeRow]; for (int i = 0; i < sizeRow; i++) { zerroArr[i] = 0; } ScoreType* directionRow = new ScoreType[sizeRow]; size_t directionMatrixSize = 0; size_t backtraceBeginsSize = 0; int * globalMatrix = NULL; int * backtraceBegins = NULL; if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { directionMatrixSize = seqLibLength * queryLength * sizeof(int); backtraceBeginsSize = 2 * sizeRow * sizeof(int); globalMatrix = new int[directionMatrixSize / sizeof(int)]; backtraceBegins = new int[backtraceBeginsSize / sizeof(int)]; memset(globalMatrix, 0, directionMatrixSize); memset(backtraceBegins, 0, backtraceBeginsSize); } //************************** sizes of arrays size_t sizeQ = sizeRow * sizeof(ScoreType); size_t sizeQQ = (sizeRow) * sizeof(ScoreType); size_t sizeP = qProfLen * sizeof(ScoreType); size_t sizeL = (seqLibLength) * sizeof(char); //************************** declare arrays on device char * g_seqLib; ScoreType* g_queryProfile; ScoreType* g_HdataMax; ScoreType* g_HdataUp; ScoreType* g_HdataRec; ScoreType* g_HdataTmp; ScoreType* g_FdataUp; ScoreType* g_directionsUp; ScoreType* g_directionsMax; ScoreType* g_directionsRec; int * g_directionsMatrix = NULL; int * g_backtraceBegins = NULL; //************************** allocate global memory on device cudaMalloc((void **)& g_seqLib, sizeL); cudaMalloc((void **)& g_queryProfile, sizeP); cudaMalloc((void **)& g_HdataMax, sizeQ); cudaMalloc((void **)& g_HdataUp, sizeQ); cudaMalloc((void **)& g_FdataUp, sizeQ); cudaMalloc((void **)& g_directionsUp, sizeQ); cudaMalloc((void **)& g_directionsMax, sizeQ); cudaMalloc((void **)& g_HdataRec, sizeQ); cudaMalloc((void **)& g_directionsRec, sizeQ); if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize); cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize); } u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7 + directionMatrixSize + backtraceBeginsSize) / 1024)); //************************** copy from host to device cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice); cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice); cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice); cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice); cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice); cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice); cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice); cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice); cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice); if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice); cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice); } //************************** start calculation int BLOCK_SIZE = partsNumber; dim3 dimBlock(BLOCK_SIZE); dim3 dimGrid(partQuerySize); //move constants variables to constant cuda memory setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength, queryLength, gapOpen, gapExtension, maxScore, partQuerySize, U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG, U2::SmithWatermanAlgorithm::STOP); size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3; u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size)); // start main loop for (int i = 0; i < queryDevider; i++) { calculateMatrix_wrap( dimBlock.x, dimGrid.x, g_seqLib, g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax, g_FdataUp, g_directionsUp, g_directionsRec, g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins); cudaError hasErrors = cudaThreadSynchronize(); if (hasErrors != 0) { u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors)); } //revert arrays g_HdataTmp = g_HdataRec; g_HdataRec = g_HdataUp; g_HdataUp = g_HdataTmp; g_HdataTmp = g_directionsRec; g_directionsRec = g_directionsUp; g_directionsUp = g_HdataTmp; } //Copy vectors on host and find actual results cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost); cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost); if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost); cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost); } QList<resType> pas; resType res; for (int j = 0; j < (sizeRow); j++) { if (tempRow[j] >= maxScore) { res.refSubseq.startPos = directionRow[j]; res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1); res.score = tempRow[j]; if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { qint32 pairAlignOffset = 0; qint32 row = backtraceBegins[2 * j]; qint32 column = backtraceBegins[2 * j + 1]; while(U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) { if(U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) { res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG; row--; column--; } else if(U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) { res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP; column--; } else if(U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) { res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT; row--; } if(0 >= row || 0 >= column) { break; } } res.patternSubseq.startPos = row; res.patternSubseq.length = backtraceBegins[2 * j] - row + 1; } pas.append(res); } } //deallocation memory cudaFree(g_seqLib); cudaFree(g_queryProfile); cudaFree(g_HdataMax); cudaFree(g_HdataUp); cudaFree(g_HdataRec); cudaFree(g_FdataUp); cudaFree(g_directionsUp); cudaFree(g_directionsMax); cudaFree(g_directionsRec); if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) { cudaFree(g_directionsMatrix); cudaFree(g_backtraceBegins); } delete[] tempRow; delete[] directionRow; delete[] zerroArr; delete[] globalMatrix; delete[] backtraceBegins; return pas; }
/*FGROUP DebCon---------------------------------------------------setswtrig This routine should be called to set trigger before the repetitive call of startswtrig(). (or there is GenSwtrg() combining both) TODO: - BCmask is not used. Should be given as symbolic name (similarly pf protection) in one if input parameters - p/f is disabled. If set, we should wait at least 'L0 Protection interval' before generating sw trigger (see ctp_pf.doc) Inputs: ------- trigtype: 'a' - asynchronous 's' - synchronous, noncalibration 'c' - callibration roc: -> INT,FO* boards BC : BCmask for trigtype 'a' bunch crossing for the trigtype 'c' or 's' 0xfff: retrieve CALIBRATION_BC from ltuproxy ctprodets: 24 bits representing CTP readout (L2 board). I.e. bits 23..0 corresponds to detectors 23..0 with DAQ numbering (i.e. daqdet# in VALID.LTUS file) It is used for programming: - L2 board - BUSY board - FO boards - for decision if trigger starts from LM level (i.e. TRD in) Operation: ---------- Set: L0: bc, bcmask is OFF now, flags: syn/asyn cal, PF:off L1: PF:off L2: PF:off, list of detectors- ctprodets BUSY: BUSY_CLUSTER_TEST word FOs: FO_TEST_CLUSTER words: INT: INTtcset (roc + CIT flag in case of Cal. trigger) RC: 0: if successfully set 1: bad parameter(s): BC>3563 or bad trigtype */ int setswtrig(char trigtype, int roc, w32 BC, w32 ctprodets){ w32 daqonoff, word, INTtcset, busyclusterT, overlap, bsysc[NCLUST+1]; int rc=0, i, idet, ifo, iconnector; w32 testclust[NFO],rocs[NFO]; /* following 3 lines are in gcalib.c #ifdef SIMVME srand(73); #endif */ #define TRDECSM 0x10 if( (((BC>3563) && (BC!=0xfff)) && (trigtype !='a')) ){ printf("Error: setswtrig: BC>3563 %i \n",BC); rc=1; goto RET; }; if(ctprodets & TRDECSM) { LMSTART=1; } else { LMSTART=0;}; INTtcset= roc<<1; // INT board // L0 board p/f, masks off // P/F BCM4 BCM3 BCM2 BCM1 word=(1<<18)+(1<<17)+(1<<16)+(1<<15)+(1<<14); switch(trigtype){ case 'a': word=word+0; if(BC !=0) { // mask (4bits, 1:use mask 0: do not use mask) word= word & (~(BC&0xf)); }; //printf("setswtrig: asynchr trigger 0x%x \n",word); break; case 's': { //word=word+(1<<12)+BC; int mybc=BC-vmer32(L0_BCOFFSETr2); if(mybc<0) BC=mybc+3564; else BC=mybc; word=word+(1<<12)+BC; // from 12.11.2015 (c707) //printf("setswtrig: synchr trigger 0x%x \n",word); break; } case 'c': if(BC==0xfff) { BC= getCALIBBC2(ctprodets); }; { //BC= BC - 97; // CALIB_BC: 3011 ORBIT_BC: 91 //BC= BC - 2; // CALIB_BC: 3011 ORBIT_BC: 3560 int mybc = BC - 2 - vmer32(L0_BCOFFSETr2); // from 12.11.2015 (c707) /* BC-2: software trigger in 3011 gives BCID: 3008 calib. trigger in 3011 gives BCID: 3006 We do not know why we program BC-2 for cal. trigger (but it was programmed this way before c707) Before c707: SOD (sync trigger), generated in BC 1750 was seen in DAQ in BC 1762 -this is very likely for TRD-run, when BCID is higher by 15, i.e. 1750 + 15 -3 = 1762). */ if(mybc<0) BC=mybc+3564; else BC=mybc; BC=BC%3564; word=word+(1<<12)+(1<<13)+BC; INTtcset= INTtcset | 1; //printf("setswtrig: calib trigger 0x%x \n",word); } break; default: printf("Error: setswtrig: unknown type of trigger %c \n",trigtype); rc=1; goto RET; }; TRIGTYPE= trigtype; // L0 board -p/f prot. off if(l0C0()) { if(LMSTART != 0) word= word | 0x80000; vmew32(L0_TCSETr2,word); } else { vmew32(L0_TCSET,word); }; word=(1<<18); vmew32(L1_TCSET,word); // L1 board p/f prot. off word=(1<<24)+ctprodets; daqonoff= vmer32(INT_DDL_EMU) &0xf; if((daqonoff==0) || (daqonoff==0x3b)) { // ctp readout active or emulated, set TRIGGER bit //the bit has to be set for CALIBRATION events too! // if not set, EVB complains (run 67492) word= word | (1<<CTPLTUECSN); // from Monday 23rd bit17 back //word=word ; // temporary suppress bit17 till Monday 23rd //printf("setswtrig: bit17 not set\n"); //}; }; vmew32(L2_TCSET,word); // L2 board p/f off if(DBGswtrg4) printf("setswtrig:L2_TCSET set to:%x\n", word); vmew32(INT_TCSET,INTtcset); // INT board /* set BUSY_CLUSTER word (bits 23..0 for detectors 24..1 connected to BUSY) */ busyclusterT= findBUSYinputs(ctprodets); //no bit17! vmew32(BUSY_CLUSTER, busyclusterT); /* we should update BUSY_OVERLAP word (at least bits corresponding to combinations 1T 2T 3T 4T 5T 6T should be updated: */ bsysc[0]= busyclusterT; for(i=1;i<NCLUST+1;i++){ bsysc[i]=vmer32(BUSY_CLUSTER+i*4); }; overlap= calcOverlap(bsysc); vmew32(BUSY_OVERLAP, overlap); if(DBGswtrg4) { printf("setswtrig: BUSY/SET_CLUSTER: 0x%x BUSY_OVERLAP:0x%x ctprodets:0x%x\n", busyclusterT, overlap, ctprodets); }; /* set corresponding FO boards: */ for(i=0;i<NFO;i++){testclust[i]=0;rocs[i]=0;} for(idet=0;idet<NDETEC;idet++){ if((ctprodets & (1<<idet))!=0) { // no bit17! if(Detector2Connector(idet,&ifo,&iconnector)) continue; //not connected testclust[ifo]=testclust[ifo] +(1<<(16+iconnector)); //TestCluster if(trigtype=='c') { // cal. trigger, set CALFLAG testclust[ifo]=testclust[ifo] | 0x100000 ; }; rocs[ifo]=rocs[ifo]+(roc<<(4*iconnector)); if(DBGswtrg4) {printf( "setswtrig ifo=%i icon=%i testcl=0x%x roc=0x%x BC:%d dets:0x%x\n", ifo,iconnector,testclust[ifo],rocs[ifo], BC, ctprodets); }; }; }; for(ifo=0;ifo<NFO;ifo++){ // set all FOs always //printf("FO:%d\n",ifo); if((notInCrate(ifo+FO1BOARD)==0)) { w32 vmeaddr; vmeaddr= FO_TESTCLUSTER+BSP*(ifo+1); ifoglob[ifo]= testclust[ifo] | rocs[ifo]; if((DBGswtrg4==1)&&(testclust[ifo]!=0)) printf("setswtrig FO:%d Waddr: 0x%x data: 0x%x\n", ifo, vmeaddr, rocs[ifo] | testclust[ifo]); vmew32(vmeaddr, rocs[ifo] | testclust[ifo]); } }; RET: return(rc); }