Idx2BWT * BWTLoad2BWT(const char * indexFilePrefix, const char * saFileNameExtension) { Idx2BWT * idx2BWT = malloc(sizeof(Idx2BWT)); BWT * bwt; BWT * rev_bwt; HSP * hsp; char bwtFilename[MAX_INDEX_FILENAME_LENGTH]; char bwtOccFilename[MAX_INDEX_FILENAME_LENGTH]; char saFilename[MAX_INDEX_FILENAME_LENGTH]; char rev_bwtFilename[MAX_INDEX_FILENAME_LENGTH]; char rev_bwtOccFilename[MAX_INDEX_FILENAME_LENGTH]; char packedDnaFilename[MAX_INDEX_FILENAME_LENGTH]; char annotationFilename[MAX_INDEX_FILENAME_LENGTH]; char ambiguityFilename[MAX_INDEX_FILENAME_LENGTH]; char translateFilename[MAX_INDEX_FILENAME_LENGTH]; strcpy(bwtFilename,indexFilePrefix); strcpy(bwtOccFilename,indexFilePrefix); strcpy(saFilename,indexFilePrefix); strcpy(rev_bwtFilename,indexFilePrefix); strcpy(rev_bwtOccFilename,indexFilePrefix); strcpy(packedDnaFilename,indexFilePrefix); strcpy(annotationFilename,indexFilePrefix); strcpy(ambiguityFilename,indexFilePrefix); strcpy(translateFilename,indexFilePrefix); strcat(bwtFilename,".bwt"); strcat(bwtOccFilename,".fmv"); strcat(saFilename,saFileNameExtension); strcat(rev_bwtFilename,".rev.bwt"); strcat(rev_bwtOccFilename,".rev.fmv"); strcat(packedDnaFilename,".pac"); strcat(annotationFilename,".ann"); strcat(ambiguityFilename,".amb"); strcat(translateFilename,".tra"); MMMasterInitialize(3, 0, FALSE, NULL); MMPool * mmPool = MMPoolCreate(2097152); bwt = BWTLoad(mmPool, bwtFilename, bwtOccFilename, saFilename, NULL, NULL, NULL); rev_bwt = BWTLoad(mmPool, rev_bwtFilename, rev_bwtOccFilename, NULL, NULL, NULL, NULL); hsp = HSPLoad(mmPool, packedDnaFilename, annotationFilename, ambiguityFilename,translateFilename, 1); HSPFillCharMap(idx2BWT->charMap); HSPFillComplementMap(idx2BWT->complementMap); idx2BWT->bwt = bwt; idx2BWT->rev_bwt = rev_bwt; idx2BWT->hsp = hsp; idx2BWT->mmPool = mmPool; return idx2BWT; }
unsigned int HSPParseFASTAToPacked(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName, const unsigned int FASTARandomSeed, const int maskLowerCase) { FILE *FASTAFile, *annotationFile, *packedDNAFile; Annotation *chrAnnotation; int chrAnnAllocated = 256; int blockAllocated = 256; char c; int chrNum, blockNum; unsigned int i, l; int nCount; unsigned int chrLen, usefulCharNum, numCharInBuffer, totalNumChar; unsigned char charMap[255]; char *chrSeq, *p; unsigned int chrAllocated = 65536; unsigned char buffer[PACKED_BUFFER_SIZE]; unsigned char packedBuffer[PACKED_BUFFER_SIZE / 4]; chrLen = usefulCharNum = numCharInBuffer = totalNumChar = chrNum = blockNum = i = l = nCount = 0; FASTAFile = (FILE*)fopen64(FASTAFileName, "r"); if (FASTAFile == NULL) { fprintf(stderr, "ParseFASTToPacked() : Cannot open FASTAFileName!\n"); exit(1); } annotationFile = (FILE*)fopen64(annotationFileName, "w"); if (annotationFile == NULL) { fprintf(stderr, "ParseFASTToPacked() : Cannot open annotationFileName!\n"); exit(1); } packedDNAFile = (FILE*)fopen64(packedDNAFileName, "wb"); if (packedDNAFile == NULL) { fprintf(stderr, "ParseFASTToPacked() : Cannot open packedDNAFileName!\n"); exit(1); } /* * ambiguityFile = (FILE*)fopen64(ambiguityFileName, "w"); * if (ambiguityFile == NULL) { * fprintf(stderr, "ParseFASTToPacked() : Cannot open ambiguityFileName!\n"); * exit(1); * } * */ HSPFillCharMap(charMap); c = (char)getc(FASTAFile); if (c != '>') { fprintf(stderr, "ParseFASTToPacked() : FASTA file does not begin with '>'!\n"); exit(1); } chrAnnotation = (Annotation *)malloc(sizeof(Annotation)*chrAnnAllocated); chrSeq = (char*)malloc(sizeof(char)*chrAllocated); chrNum = blockNum = usefulCharNum = numCharInBuffer = 0; while(!feof(FASTAFile)){ if (feof(FASTAFile)) break; if (chrNum == chrAnnAllocated){ chrAnnAllocated <<= 1; chrAnnotation = (Annotation *)realloc(chrAnnotation, sizeof(Annotation)*chrAnnAllocated); // printf("%d\n", chrNum); } l=0; c = (char)getc(FASTAFile); while(!feof(FASTAFile) && c!='\t' && c!=' ' && c!='\n' && l<MAX_SEQ_NAME_LENGTH){ chrAnnotation[chrNum].chrName[l]=c; l++; c=(char)getc(FASTAFile); } chrAnnotation[chrNum].chrName[l]='\0'; while(c!='\n'){ c=(char)getc(FASTAFile); } chrLen = 0; while(c!='>' && !feof(FASTAFile)){ if (c!='\n'){ if (c>='a' && c<='z'){ c+='A'-'a'; } if (chrLen >= chrAllocated){ chrAllocated <<= 1; chrSeq = (char*)realloc(chrSeq, sizeof(char)*chrAllocated); } *(chrSeq+chrLen) = c; chrLen += 1; } c=(char)getc(FASTAFile); } if (chrLen < 75) continue; //* i=0; p=chrSeq; while (ambiguityCount[charMap[(int)*p]] == 1 && i++ != chrLen) p++; if (i == chrLen) { blockNum = 1; chrAnnotation[chrNum].blockInChr = (ChrBlock *)malloc(sizeof(ChrBlock)*blockNum); chrAnnotation[chrNum].chrStart = usefulCharNum; chrAnnotation[chrNum].blockNum = blockNum; chrAnnotation[chrNum].blockInChr[0].blockStart = usefulCharNum; chrAnnotation[chrNum].blockInChr[0].ori = 0; usefulCharNum += chrLen; chrAnnotation[chrNum].chrEnd = usefulCharNum-1; chrAnnotation[chrNum].blockInChr[0].blockEnd = usefulCharNum-1; i=0; while(i<chrLen){ if (numCharInBuffer >= PACKED_BUFFER_SIZE) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE); fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile); numCharInBuffer = 0; } buffer[numCharInBuffer++] = chrSeq[i++]; } } else { i=0; p = chrSeq; while (ambiguityCount[charMap[(int)*p]]!=1 && ++i!=chrLen) p++; if (i<10) { i = 0; p = chrSeq;} blockNum = 1; chrAnnotation[chrNum].blockInChr = (ChrBlock *)malloc(sizeof(ChrBlock)*blockAllocated); chrAnnotation[chrNum].chrStart = usefulCharNum; chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i; chrAnnotation[chrNum].blockInChr[blockNum-1].blockStart = usefulCharNum; int len=0; while (i<chrLen) { if(ambiguityCount[charMap[(int)*p]] == 1){ if (numCharInBuffer >= PACKED_BUFFER_SIZE) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE); fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile); numCharInBuffer = 0; } buffer[numCharInBuffer++] = *p++; i++; usefulCharNum++; len++; }else{ nCount = 0; while((ambiguityCount[charMap[(int)*p]]!=1) && i<chrLen){ nCount++; i++; p++; } if (nCount<10) { do { if (numCharInBuffer >= PACKED_BUFFER_SIZE) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE); fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile); numCharInBuffer = 0; } buffer[numCharInBuffer++] = 'G'; usefulCharNum++; len++; } while(--nCount>0); } else { if (i<chrLen) { chrAnnotation[chrNum].blockInChr[blockNum-1].blockEnd = usefulCharNum -1; chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i-nCount-len; if (blockNum == blockAllocated){ blockAllocated <<= 1; chrAnnotation[chrNum].blockInChr = (ChrBlock *)realloc(chrAnnotation[chrNum].blockInChr, sizeof(ChrBlock)*blockAllocated); } blockNum++; len=0; chrAnnotation[chrNum].blockInChr[blockNum-1].blockStart = usefulCharNum; } else { i-=nCount; break; } } } } chrAnnotation[chrNum].blockInChr[blockNum-1].blockEnd = usefulCharNum-1; chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i-len; chrAnnotation[chrNum].blockNum = blockNum; chrAnnotation[chrNum].chrEnd = usefulCharNum-1; } //*/ chrNum++; totalNumChar += chrLen; } if (numCharInBuffer > 0) { ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, numCharInBuffer); fwrite(packedBuffer, 1, (numCharInBuffer + 3) / 4, packedDNAFile); numCharInBuffer = 0; } if (totalNumChar % 4 == 0) { c = 0; fwrite(&c, 1, 1, packedDNAFile); } c = (char)(totalNumChar % 4); fwrite(&c, 1, 1, packedDNAFile); fclose(packedDNAFile); fprintf(annotationFile, "%u\t%d\t%d\n", totalNumChar, chrNum, FASTARandomSeed); int j=0; int total = 0; for (i=0;i<chrNum;i++) { fprintf(annotationFile, "%d\t%s\n", (int)strlen(chrAnnotation[i].chrName), chrAnnotation[i].chrName); total += chrAnnotation[i].blockNum; } fprintf(annotationFile, "%d\n", total); // fprintf(stderr, "total block, %d, %d, %d\n",i,chrNum, total); for(i=0;i<chrNum;i++){ for(j=0;j<chrAnnotation[i].blockNum;j++){ fprintf(annotationFile,"%d\t%u\t%u\t%u\n",i, chrAnnotation[i].blockInChr[j].blockStart, chrAnnotation[i].blockInChr[j].blockEnd, chrAnnotation[i].blockInChr[j].ori); } free(chrAnnotation[i].blockInChr); } free(chrAnnotation); fclose(annotationFile); return chrNum; }
int main(int argc, char** argv) { char c; unsigned int i, j, k; MMPool *mmPool; dictionary *programInput, *ini; char argumentText[11] = "argument:0"; double startTime; double elapsedTime = 0, totalElapsedTime = 0; BWT *bwt; HSP *hsp; unsigned char charMap[256]; unsigned char *convertedKey; unsigned int *packedKey; unsigned int found; unsigned int numberOfPattern, numberOfPatternFound; unsigned int saIndexLeft, saIndexRight; SaIndexGroupNew *saIndexGroup; SaIndexList *tempSaIndex1, *tempSaIndex2; unsigned int numberOfSaIndexGroup; HitList *hitList; unsigned int textPositionMatched, textPositionRetrieved; unsigned long long totalTextPositionMatched, totalTextPositionRetrieved; BWTSaRetrievalStatistics BWTSaRetrievalStatistics; FILE *logFile; init(textPositionRetrieved); // to avoid compiler warning only init(textPositionMatched); // to avoid compiler warning only programInput = ParseInput(argc, argv); ini = ParseIniFile(); ProcessIni(); ValidateIni(); PrintIni(); if (Confirmation == TRUE) { printf("BWT Search. Press Y to go or N to cancel. "); c = (char)getchar(); while (c != 'y' && c != 'Y' && c != 'n' && c!= 'N') { c = (char)getchar(); } if (c == 'n' || c == 'N') { exit(0); } } startTime = setStartTime(); MMMasterInitialize(1, 0, FALSE, NULL); mmPool = MMPoolCreate(PoolSize); // Load Database printf("Loading Database.."); fflush(stdout); bwt = BWTLoad(mmPool, BWTCodeFileName, BWTOccValueFileName, SaValueFileName, NULL, SaIndexFileName, NULL); HSPFillCharMap(charMap); hsp = HSPLoad(mmPool, PackedDNAFileName, AnnotationFileName, AmbiguityFileName, MAX_SEARCH_PATTERN_LENGTH / CHAR_PER_WORD); if (bwt->textLength != hsp->dnaLength) { fprintf(stderr, "BWT-BLAST: Database length inconsistent!\n"); exit(1); } if (LogFileName[0] != '\0') { logFile = fopen64(LogFileName, "w"); if (logFile == NULL) { fprintf(stderr, "Cannot open log file!\n"); exit(1); } } else { logFile = NULL; } packedKey = MMPoolDispatch(mmPool, WordPackedLengthFromText(MAX_SEARCH_PATTERN_LENGTH, BIT_PER_CHAR)); convertedKey = MMPoolDispatch(mmPool, MAX_SEARCH_PATTERN_LENGTH); saIndexGroup = MMUnitAllocate(MaxNumberOfHitGroups * sizeof(SaIndexGroup)); hitList = MMUnitAllocate(MaxNumberOfTextPosition * sizeof(HitList)); tempSaIndex1 = MMUnitAllocate(MaxNumberOfTextPosition * sizeof(SaIndexList)); tempSaIndex2 = MMUnitAllocate(MaxNumberOfTextPosition * sizeof(SaIndexList)); elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); // Process search pattern files for (i=1; i<=NumberOfSearchPatternFile; i++) { argumentText[9] = '0' + (char)(i + 2); iniparser_copystring(programInput, argumentText, PatternFileName, PatternFileName, MAX_FILENAME_LEN); printf("Loading search pattern : %s\n", PatternFileName); numberOfPattern = ProcessSearchPattern(); printf("Finished loading search pattern.\n"); elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Searching for pattern : %s\n", PatternFileName); } if (SABinarySearch == TRUE) { printf("Start forward search with SA index.\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Forward search with SA index.\n"); } numberOfPatternFound = 0; totalTextPositionMatched = 0; totalTextPositionRetrieved = 0; BWTInitializeSaRetrievalStatistics(&BWTSaRetrievalStatistics); for (j=0; j<numberOfPattern; j++) { //ConvertTextToWordPacked(searchPattern + searchPatternPosition[j], packedKey, charMap, ALPHABET_SIZE, // searchPatternPosition[j+1] - searchPatternPosition[j]); //found = BWTForwardSearchSaIndex(packedKey, searchPatternPosition[j+1] - searchPatternPosition[j], // bwt, hsp->packedDNA, &saIndexLeft, &saIndexRight); ConvertTextToCode(searchPattern + searchPatternPosition[j], convertedKey, charMap, searchPatternPosition[j+1] - searchPatternPosition[j]); found = BWTSaBinarySearch(convertedKey, searchPatternPosition[j+1] - searchPatternPosition[j], bwt, hsp->packedDNA, &saIndexLeft, &saIndexRight, packedKey); if (found) { numberOfPatternFound++; textPositionMatched = saIndexRight - saIndexLeft + 1; totalTextPositionMatched += textPositionMatched; if (FindTextPosition) { saIndexGroup->startSaIndex = saIndexLeft; if (textPositionMatched <= MaxNumberOfTextPosition) { saIndexGroup->numOfMatch = textPositionMatched; } else { saIndexGroup->numOfMatch = MaxNumberOfTextPosition; printf("Max no of text positions reached! Only %u out of %u text positions retrieved.\n", MaxNumberOfTextPosition, textPositionMatched); } saIndexGroup->posQuery = 0; saIndexGroup->info = 0; textPositionRetrieved = BWTTextPosition(bwt, saIndexGroup, 1, hitList, tempSaIndex1, tempSaIndex2, &BWTSaRetrievalStatistics, FALSE); totalTextPositionRetrieved += textPositionRetrieved; } } if (LogFileName[0] != '\0') { if (found) { fprintf(logFile, "Pattern number %u found. SA Index from %u to %u. ", j + 1, saIndexLeft, saIndexRight); if (FindTextPosition) { fprintf(logFile, "%u matches of %u located. ", textPositionRetrieved, textPositionMatched); for (k=0; k<textPositionRetrieved; k++) { fprintf(logFile, "%u ", hitList[k].posText); } } } else { fprintf(logFile, "Pattern number %u not found.", j + 1); } fprintf(logFile, "\n"); } } printf("Finished forward search with SA index.\n"); printf("%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { printf("Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Finished forward search with SA index.\n"); fprintf(logFile, "%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { fprintf(logFile, "Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } fprintf(logFile, "Elapsed time = "); printElapsedTime(logFile, FALSE, FALSE, TRUE, 4, elapsedTime); fprintf(logFile, "\n"); } } if (BackwardSearch == TRUE) { printf("Start backward search.\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Backward search.\n"); } numberOfPatternFound = 0; totalTextPositionMatched = 0; totalTextPositionRetrieved = 0; BWTInitializeSaRetrievalStatistics(&BWTSaRetrievalStatistics); for (j=0; j<numberOfPattern; j++) { ConvertTextToCode(searchPattern + searchPatternPosition[j], convertedKey, charMap, searchPatternPosition[j+1] - searchPatternPosition[j]); found = BWTBackwardSearch(convertedKey, searchPatternPosition[j+1] - searchPatternPosition[j], bwt, &saIndexLeft, &saIndexRight); if (found) { numberOfPatternFound++; textPositionMatched = saIndexRight - saIndexLeft + 1; totalTextPositionMatched += textPositionMatched; if (FindTextPosition) { saIndexGroup->startSaIndex = saIndexLeft; if (textPositionMatched <= MaxNumberOfTextPosition) { saIndexGroup->numOfMatch = textPositionMatched; } else { saIndexGroup->numOfMatch = MaxNumberOfTextPosition; printf("Max no of text positions reached! Only %u out of %u text positions retrieved.\n", MaxNumberOfTextPosition, textPositionMatched); } saIndexGroup->posQuery = 0; saIndexGroup->info = 0; textPositionRetrieved = BWTTextPosition(bwt, saIndexGroup, 1, hitList, tempSaIndex1, tempSaIndex2, &BWTSaRetrievalStatistics, FALSE); totalTextPositionRetrieved += textPositionRetrieved; } } if (LogFileName[0] != '\0') { if (found) { fprintf(logFile, "Pattern number %u found. SA Index from %u to %u. Total %u occurrences ", j + 1, saIndexLeft, saIndexRight, saIndexRight - saIndexLeft + 1); if (FindTextPosition) { fprintf(logFile, "%u matches of %u located. ", textPositionRetrieved, textPositionMatched); for (k=0; k<textPositionRetrieved; k++) { fprintf(logFile, "%u ", hitList[k].posText); } } } else { fprintf(logFile, "Pattern number %u not found.", j + 1); } fprintf(logFile, "\n"); } } printf("Finished backward search.\n"); printf("%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { printf("Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Finished backward search.\n"); fprintf(logFile, "%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { fprintf(logFile, "Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } fprintf(logFile, "Elapsed time = "); printElapsedTime(logFile, FALSE, FALSE, TRUE, 4, elapsedTime); fprintf(logFile, "\n"); } } if (BackwardSearchCheck == TRUE) { printf("Start backward search with text.\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Backward search with text.\n"); } numberOfPatternFound = 0; totalTextPositionMatched = 0; totalTextPositionRetrieved = 0; BWTInitializeSaRetrievalStatistics(&BWTSaRetrievalStatistics); for (j=0; j<numberOfPattern; j++) { ConvertTextToCode(searchPattern + searchPatternPosition[j], convertedKey, charMap, searchPatternPosition[j+1] - searchPatternPosition[j]); ConvertTextToWordPacked(searchPattern + searchPatternPosition[j], packedKey, charMap, ALPHABET_SIZE, searchPatternPosition[j+1] - searchPatternPosition[j]); found = BWTBackwardSearchCheckWithText(convertedKey, packedKey, searchPatternPosition[j+1] - searchPatternPosition[j], bwt, hsp->packedDNA, TextCheckCostFactor, MaxNumberOfTextPosition, hitList, &saIndexLeft, &saIndexRight); if (found) { numberOfPatternFound++; textPositionMatched = saIndexRight - saIndexLeft + 1; totalTextPositionMatched += textPositionMatched; // Find text position if text check not used if (FindTextPosition && saIndexLeft != 0) { saIndexGroup->startSaIndex = saIndexLeft; if (textPositionMatched <= MaxNumberOfTextPosition) { saIndexGroup->numOfMatch = textPositionMatched; } else { saIndexGroup->numOfMatch = MaxNumberOfTextPosition; printf("Max no of text positions reached! Only %u out of %u text positions retrieved.\n", MaxNumberOfTextPosition, textPositionMatched); } saIndexGroup->posQuery = 0; saIndexGroup->info = 0; textPositionRetrieved = BWTTextPosition(bwt, saIndexGroup, 1, hitList, tempSaIndex1, tempSaIndex2, &BWTSaRetrievalStatistics, FALSE); totalTextPositionRetrieved += textPositionRetrieved; } else { textPositionRetrieved = textPositionMatched; totalTextPositionRetrieved += textPositionRetrieved; } } if (LogFileName[0] != '\0') { if (found) { fprintf(logFile, "Pattern number %u found. ", j + 1); if (FindTextPosition) { fprintf(logFile, "%u matches of %u located. ", textPositionRetrieved, textPositionMatched); for (k=0; k<textPositionRetrieved; k++) { fprintf(logFile, "%u ", hitList[k].posText); } } } else { fprintf(logFile, "Pattern number %u not found.", j + 1); } fprintf(logFile, "\n"); } } printf("Finished backward search with text.\n"); printf("%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { printf("Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Finished backward search with text.\n"); fprintf(logFile, "%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { fprintf(logFile, "Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } fprintf(logFile, "Elapsed time = "); printElapsedTime(logFile, FALSE, FALSE, TRUE, 4, elapsedTime); fprintf(logFile, "\n"); } } if (HammingDistSearch == TRUE) { printf("Start hamming distance %u approximate match.\n", MaxErrorAllowed); if (LogFileName[0] != '\0') { fprintf(logFile, "Hamming distance %u approximate match.\n", MaxErrorAllowed); } numberOfPatternFound = 0; totalTextPositionMatched = 0; totalTextPositionRetrieved = 0; BWTInitializeSaRetrievalStatistics(&BWTSaRetrievalStatistics); for (j=0; j<numberOfPattern; j++) { ConvertTextToCode(searchPattern + searchPatternPosition[j], convertedKey, charMap, searchPatternPosition[j+1] - searchPatternPosition[j]); numberOfSaIndexGroup = BWTHammingDistMatchOld(convertedKey, searchPatternPosition[j+1] - searchPatternPosition[j], bwt, MaxErrorAllowed, saIndexGroup, MaxNumberOfHitGroups, 0, 0); if (numberOfSaIndexGroup) { numberOfPatternFound++; textPositionMatched = 0; for (k=0; k<numberOfSaIndexGroup; k++) { textPositionMatched += saIndexGroup[k].numOfMatch; } if (textPositionMatched > MaxNumberOfTextPosition) { textPositionMatched = 0; for (k=0; k<numberOfSaIndexGroup && textPositionMatched < MaxNumberOfTextPosition; k++) { textPositionMatched += saIndexGroup[k].numOfMatch; } numberOfSaIndexGroup = k - 1; saIndexGroup[numberOfSaIndexGroup].numOfMatch -= textPositionMatched - MaxNumberOfTextPosition; for (; k<numberOfSaIndexGroup; k++) { textPositionMatched += saIndexGroup[k].numOfMatch; } printf("Max no of text positions reached! Only %u out of %u text positions retrieved.\n", MaxNumberOfTextPosition, textPositionMatched); } totalTextPositionMatched += textPositionMatched; if (FindTextPosition) { textPositionRetrieved = BWTTextPosition(bwt, saIndexGroup, numberOfSaIndexGroup, hitList, tempSaIndex1, tempSaIndex2, &BWTSaRetrievalStatistics, FALSE); totalTextPositionRetrieved += textPositionRetrieved; } } if (LogFileName[0] != '\0') { if (numberOfSaIndexGroup) { fprintf(logFile, "Pattern number %u found. %u matches", j + 1, textPositionMatched); if (FindTextPosition) { fprintf(logFile, "%u matches of %u located. ", textPositionRetrieved, textPositionMatched); for (k=0; k<textPositionRetrieved; k++) { fprintf(logFile, "%u ", hitList[k].posText); } } } else { fprintf(logFile, "Pattern number %u not found.", j + 1); } fprintf(logFile, "\n"); } } printf("Finished hamming distance search.\n"); printf("%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { printf("Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Finished hamming distance search.\n"); fprintf(logFile, "%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { fprintf(logFile, "Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } fprintf(logFile, "Elapsed time = "); printElapsedTime(logFile, FALSE, FALSE, TRUE, 4, elapsedTime); fprintf(logFile, "\n"); } } if (EditDistSearch == TRUE) { printf("Start edit distance %u approximate match.\n", MaxErrorAllowed); if (LogFileName[0] != '\0') { fprintf(logFile, "Edit distance %u approximate match.\n", MaxErrorAllowed); } numberOfPatternFound = 0; totalTextPositionMatched = 0; totalTextPositionRetrieved = 0; BWTInitializeSaRetrievalStatistics(&BWTSaRetrievalStatistics); for (j=0; j<numberOfPattern; j++) { ConvertTextToCode(searchPattern + searchPatternPosition[j], convertedKey, charMap, searchPatternPosition[j+1] - searchPatternPosition[j]); numberOfSaIndexGroup = BWTEditDistMatchOld(convertedKey, searchPatternPosition[j+1] - searchPatternPosition[j], bwt, MaxErrorAllowed, (SaIndexGroupWithLengthError*)saIndexGroup, MaxNumberOfHitGroups); if (numberOfSaIndexGroup > MaxNumberOfHitGroups) { fprintf(stderr, "numberOfSaIndexGroup > MaxNumberOfHitGroups!\n"); } if (numberOfSaIndexGroup) { numberOfPatternFound++; textPositionMatched = 0; for (k=0; k<numberOfSaIndexGroup; k++) { textPositionMatched += saIndexGroup[k].numOfMatch; } if (textPositionMatched > MaxNumberOfTextPosition) { textPositionMatched = 0; for (k=0; k<numberOfSaIndexGroup && textPositionMatched < MaxNumberOfTextPosition; k++) { textPositionMatched += saIndexGroup[k].numOfMatch; } numberOfSaIndexGroup = k - 1; saIndexGroup[numberOfSaIndexGroup].numOfMatch -= textPositionMatched - MaxNumberOfTextPosition; for (; k<numberOfSaIndexGroup; k++) { textPositionMatched += saIndexGroup[k].numOfMatch; } printf("Max no of text positions reached! Only %u out of %u text positions retrieved.\n", MaxNumberOfTextPosition, textPositionMatched); } totalTextPositionMatched += textPositionMatched; if (FindTextPosition) { textPositionRetrieved = BWTTextPosition(bwt, saIndexGroup, numberOfSaIndexGroup, hitList, tempSaIndex1, tempSaIndex2, &BWTSaRetrievalStatistics, FALSE); totalTextPositionRetrieved += textPositionRetrieved; } } if (LogFileName[0] != '\0') { if (numberOfSaIndexGroup) { fprintf(logFile, "Pattern number %u found. %u matches. ", j + 1, textPositionMatched); if (FindTextPosition) { fprintf(logFile, "%u matches of %u located. ", textPositionRetrieved, textPositionMatched); for (k=0; k<textPositionRetrieved; k++) { fprintf(logFile, "%u ", hitList[k].posText); } } } else { fprintf(logFile, "Pattern number %u not found.", j + 1); } fprintf(logFile, "\n"); } } printf("Finished edit distance search.\n"); printf("%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { printf("Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } elapsedTime = getElapsedTime(startTime) - totalElapsedTime; printf("Elapsed time = "); printElapsedTime(stdout, FALSE, FALSE, TRUE, 4, elapsedTime); totalElapsedTime += elapsedTime; printf("\n"); if (LogFileName[0] != '\0') { fprintf(logFile, "Finished edit distance search.\n"); fprintf(logFile, "%u patterns out of %u found. %llu of %llu matches located.\n", numberOfPatternFound, numberOfPattern, totalTextPositionRetrieved, totalTextPositionMatched); if (FindTextPosition) { fprintf(logFile, "Hits: BWT/Cached/Diag/Dup : %u/%u/%u/%u\n", BWTSaRetrievalStatistics.bwtSaRetrieved, BWTSaRetrievalStatistics.cachedSaRetrieved, BWTSaRetrievalStatistics.saDiagonalLinked, BWTSaRetrievalStatistics.saDuplicated); } fprintf(logFile, "Elapsed time = "); printElapsedTime(logFile, FALSE, FALSE, TRUE, 4, elapsedTime); fprintf(logFile, "\n"); } } } MMPoolReturn(mmPool, packedKey, WordPackedLengthFromText(MAX_SEARCH_PATTERN_LENGTH, BIT_PER_CHAR)); MMPoolReturn(mmPool, convertedKey, MAX_SEARCH_PATTERN_LENGTH); HSPFree(mmPool, hsp, MAX_SEARCH_PATTERN_LENGTH / CHAR_PER_WORD); BWTFree(mmPool, bwt); if (searchPattern != NULL) { MMUnitFree(searchPattern, searchPatternAllocated); } if (searchPatternPosition != NULL) { MMUnitFree(searchPatternPosition, searchPatternPositionAllocated); } MMUnitFree(saIndexGroup, MaxNumberOfHitGroups * sizeof(unsigned int)); MMUnitFree(hitList, MaxNumberOfTextPosition * sizeof(unsigned int)); MMPoolFree(mmPool); iniparser_freedict(programInput); iniparser_freedict(ini); return 0; }