// exact extend // leav_len: leaving length to extend // type: 1 - backward, 0 - forward int bwt_extend_exact(const Idx2BWT *bi_bwt, ubyte_t *seq, int len, int *leav_len, int type, bwtint_t *sa_begin, bwtint_t *sa_end, bwtint_t *rev_sa_begin, bwtint_t *rev_sa_end) { bwtint_t k, l, rev_k, rev_l; int i; k = *sa_begin; l = *sa_end; rev_k = *rev_sa_begin; rev_l = *rev_sa_end; if(type == 1){ for(i = *leav_len; i > 0; --i){ if(seq[i - 1] > 3) break; BWTSARangeBackward_Bidirection(bi_bwt, seq[i], &k, &l, &rev_k, &rev_l); if(k > l) break; else{ *sa_begin = k; *sa_end = l; *rev_sa_begin = rev_k; *rev_sa_end = rev_l; } } } else { for(i = (*leav_len); i > 0; --i){ if(seq[len-i] > 3) break; BWTSARangeForward_Bidirection(bi_bwt, seq[len-i], &k, &l, &rev_k, &rev_l); if(k > l) break; else{ *sa_begin = k; *sa_end = l; *rev_sa_begin = rev_k; *rev_sa_end = rev_l; } } } *leav_len = i; return *sa_end - (*sa_begin) + 1; }
int main() { int i,j,k,c; //Variables for backward and forward search unsigned int l,r,rev_l,rev_r; //Variables for search all sa ranges functions unsigned int result_l[ALPHABET_SIZE]; unsigned int result_r[ALPHABET_SIZE]; unsigned int result_rev_l[ALPHABET_SIZE]; unsigned int result_rev_r[ALPHABET_SIZE]; //Variables for result unsigned int offset; int sequenceId; unsigned int saCount; //Variables for pattern char pattern[1024]; strcpy(pattern,"AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA"); int patternLength = strlen(pattern); // Load up the index with the below statement printf("Loading index ... "); fflush(stdout); Idx2BWT * idx2BWT = BWTLoad2BWT("ncbi.genome.fa.index",".sa"); printf("DONE\n\n"); // Convert the pattern into 2BWT recognised coding scheme unsigned char packedPattern[1024]; BWTConvertPattern(idx2BWT,pattern,patternLength,packedPattern); // The following performs a backward search of the pattern // =================================================================================== // | printf("Performing backward search of the pattern..\n"); BWTSARangeInitial(idx2BWT,packedPattern[patternLength-1],&l,&r); for (i=patternLength-2;i>=0;i--) { BWTSARangeBackward(idx2BWT,packedPattern[i],&l,&r); } printf("SA Range being = %u %u (%u)\n\n",l,r,r-l+1); // | // =================================================================================== // The following performs a forward search of the pattern // =================================================================================== // | printf("Performing forward search of the pattern..\n"); BWTSARangeInitial(idx2BWT,packedPattern[0],&l,&r); BWTSARangeInitial(idx2BWT,packedPattern[0],&rev_l,&rev_r); for (i=1;i<patternLength;i++) { BWTSARangeForward_Bidirection(idx2BWT,packedPattern[i],&l,&r,&rev_l,&rev_r); } printf("SA Range being = %u %u %u %u (%u)\n\n",l,r,rev_l,rev_r,r-l+1); // | // =================================================================================== // The following performs a bi-directional search of the pattern // Starting from the middle of the pattern, first move right, then move left. // =================================================================================== // | printf("Performing bi-directional search of the pattern..\n"); j = patternLength / 2; BWTSARangeInitial(idx2BWT,packedPattern[j],&l,&r); BWTSARangeInitial(idx2BWT,packedPattern[j],&rev_l,&rev_r); for (i=j+1;i<patternLength;i++) { BWTSARangeForward_Bidirection(idx2BWT,packedPattern[i],&l,&r,&rev_l,&rev_r); } for (i=j-1;i>=0;i--) { BWTSARangeBackward_Bidirection(idx2BWT,packedPattern[i],&l,&r,&rev_l,&rev_r); } printf("SA Range being = %u %u %u %u (%u)\n\n",l,r,rev_l,rev_r,r-l+1); // | // =================================================================================== // The following performs a 1-mismatch search of the pattern // =================================================================================== // | // | printf("Performing 1-mismatch search of the pattern..\n"); saCount = 0; j = patternLength / 2; BWTSARangeInitial(idx2BWT,packedPattern[patternLength-1],&l,&r); for (i=patternLength-2;i>j-1;i--) { BWTSARangeBackward(idx2BWT,packedPattern[i],&l,&r); } for (i=j-1;i>=0;i--) { BWTAllSARangesBackward(idx2BWT,l,r,result_l,result_r); for (c=0;c<ALPHABET_SIZE;c++) { if (c==packedPattern[i]) continue; unsigned int err_l=result_l[c]; unsigned int err_r=result_r[c]; for (k=i-1;k>=0;k--) { if (err_l>err_r) break; BWTSARangeBackward(idx2BWT,packedPattern[k],&err_l,&err_r); } if (err_l<=err_r && k<0) { //An SA range of occurrence is found (err_l,err_r) saCount+=err_r-err_l+1; } } l=result_l[packedPattern[i]]; r=result_r[packedPattern[i]]; } BWTSARangeInitial(idx2BWT,packedPattern[0],&l,&r); BWTSARangeInitial(idx2BWT,packedPattern[0],&rev_l,&rev_r); for (i=1;i<j;i++) { BWTSARangeForward_Bidirection(idx2BWT,packedPattern[i],&l,&r,&rev_l,&rev_r); } for (i=j;i<patternLength;i++) { BWTAllSARangesForward_Bidirection(idx2BWT,l,r,rev_l,rev_r,result_l,result_r,result_rev_l,result_rev_r); for (c=0;c<ALPHABET_SIZE;c++) { if (c==packedPattern[i]) continue; unsigned int err_l=result_l[c]; unsigned int err_r=result_r[c]; unsigned int rev_err_l=result_rev_l[c]; unsigned int rev_err_r=result_rev_r[c]; for (k=i+1;k<patternLength;k++) { if (err_l>err_r) break; BWTSARangeForward_Bidirection(idx2BWT,packedPattern[k],&err_l,&err_r,&rev_err_l,&rev_err_r); } if (err_l<=err_r && k>=patternLength) { //An SA range of occurrence is found (err_l,err_r) saCount+=err_r-err_l+1; } } l=result_l[packedPattern[i]]; r=result_r[packedPattern[i]]; rev_l=result_rev_l[packedPattern[i]]; rev_r=result_rev_r[packedPattern[i]]; } printf("%u SA-indexes/occurrences were found.\n\n",saCount); // | // | // =================================================================================== // The following output the first 5 position of the pattern // =================================================================================== // | // | j=(r-l+1<5)?r-l+1:5; printf("Reporting %d arbitrary occurrences..\n",j); for (i=0;i<j;i++) { BWTRetrievePositionFromSAIndex(idx2BWT,l+i,&sequenceId,&offset); printf("Occurrence found in sequence #%d with offset %u\n",sequenceId,offset); } // | // | // =================================================================================== // Free up the 2BWT index printf("\nFree index ... "); fflush(stdout); BWTFree2BWT(idx2BWT); printf("DONE\n"); return 0; }