Esempio n. 1
0
// Exact-match extension of a bidirectional SA range, one character per step.
//
// Starting from the ranges passed in through sa_begin/sa_end and
// rev_sa_begin/rev_sa_end, consumes up to *leav_len characters of seq and
// stops at the first character code greater than 3 or as soon as a step
// yields an empty range (k > l).
//
//   bi_bwt   : index handle, passed through to the BWTSARange*_Bidirection calls
//   seq      : character codes of the sequence
//   len      : length of seq (only used to locate characters when going forward)
//   leav_len : in  - leaving length, i.e. number of characters still to extend
//              out - number of characters left unconsumed when extension stopped
//   type     : 1 - backward (consumes seq[*leav_len - 1] down to seq[0])
//              0 - forward  (consumes seq[len - *leav_len] up to seq[len - 1])
//   sa_begin, sa_end, rev_sa_begin, rev_sa_end :
//              in  - current ranges
//              out - ranges after the last successful step; untouched if the
//                    very first step already fails
//
// Returns *sa_end - *sa_begin + 1 for the range stored on exit.
int bwt_extend_exact(const Idx2BWT *bi_bwt, ubyte_t *seq, int len, int *leav_len, int type,
        bwtint_t *sa_begin, bwtint_t *sa_end, bwtint_t *rev_sa_begin, bwtint_t *rev_sa_end)
{
    bwtint_t k, l, rev_k, rev_l;
    int i;
    k = *sa_begin;
    l = *sa_end;
    rev_k = *rev_sa_begin;
    rev_l = *rev_sa_end;
    for(i = *leav_len; i > 0; --i){
        // With i characters left, the next one to consume is seq[i - 1] when
        // going backward and seq[len - i] when going forward. The same value
        // is both validated and used for the extension (the backward branch
        // used to validate seq[i - 1] but extend with seq[i]).
        const ubyte_t c = (type == 1) ? seq[i - 1] : seq[len - i];
        if(c > 3)
            break;
        if(type == 1)
            BWTSARangeBackward_Bidirection(bi_bwt, c, &k, &l, &rev_k, &rev_l);
        else
            BWTSARangeForward_Bidirection(bi_bwt, c, &k, &l, &rev_k, &rev_l);
        if(k > l)
            break;
        // Commit only non-empty ranges so the caller always sees the last
        // range that still matched.
        *sa_begin = k;
        *sa_end = l;
        *rev_sa_begin = rev_k;
        *rev_sa_end = rev_l;
    }
    *leav_len = i;
    return *sa_end - (*sa_begin) + 1;
}
Esempio n. 2
0
// Demo driver: loads an index, then runs a backward, a forward, a
// bi-directional and a 1-mismatch search of one hard-coded pattern, printing
// the resulting SA ranges, and finally reports up to five occurrences.
// Always returns 0.
int main() {
    // Current SA range and the companion range maintained by the
    // *_Bidirection calls.
    unsigned int lo, hi, revLo, revHi;

    // Per-character output ranges filled by the BWTAllSARanges* calls.
    unsigned int nextLo[ALPHABET_SIZE];
    unsigned int nextHi[ALPHABET_SIZE];
    unsigned int nextRevLo[ALPHABET_SIZE];
    unsigned int nextRevHi[ALPHABET_SIZE];

    // Outputs of BWTRetrievePositionFromSAIndex.
    int sequenceId;
    unsigned int offset;

    int pos;

    // The pattern, as text.
    char text[1024] = "AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA";
    const int patternLength = (int)strlen(text);

    // Load the index.
    // NOTE(review): the returned handle is used without any check - confirm
    // that BWTLoad2BWT cannot hand back NULL on failure.
    printf("Loading index ... ");
    fflush(stdout);
    Idx2BWT *index = BWTLoad2BWT("ncbi.genome.fa.index", ".sa");
    printf("DONE\n\n");

    // Translate the text pattern into the codes the index calls take.
    unsigned char codes[1024];
    BWTConvertPattern(index, text, patternLength, codes);

    // ---- Backward search: start at the last character, walk to the first.
    printf("Performing backward search of the pattern..\n");
    pos = patternLength - 1;
    BWTSARangeInitial(index, codes[pos], &lo, &hi);
    while (--pos >= 0) {
        BWTSARangeBackward(index, codes[pos], &lo, &hi);
    }
    printf("SA Range being = %u %u (%u)\n\n", lo, hi, hi - lo + 1);

    // ---- Forward search: start at the first character, walk to the last.
    printf("Performing forward search of the pattern..\n");
    BWTSARangeInitial(index, codes[0], &lo, &hi);
    BWTSARangeInitial(index, codes[0], &revLo, &revHi);
    pos = 1;
    while (pos < patternLength) {
        BWTSARangeForward_Bidirection(index, codes[pos], &lo, &hi, &revLo, &revHi);
        ++pos;
    }
    printf("SA Range being = %u %u %u %u (%u)\n\n", lo, hi, revLo, revHi, hi - lo + 1);

    // ---- Bi-directional search: start in the middle, extend to the right
    // end first, then to the left end.
    printf("Performing bi-directional search of the pattern..\n");
    int mid = patternLength / 2;
    BWTSARangeInitial(index, codes[mid], &lo, &hi);
    BWTSARangeInitial(index, codes[mid], &revLo, &revHi);
    for (pos = mid + 1; pos < patternLength; ++pos) {
        BWTSARangeForward_Bidirection(index, codes[pos], &lo, &hi, &revLo, &revHi);
    }
    for (pos = mid; pos-- > 0; ) {
        BWTSARangeBackward_Bidirection(index, codes[pos], &lo, &hi, &revLo, &revHi);
    }
    printf("SA Range being = %u %u %u %u (%u)\n\n", lo, hi, revLo, revHi, hi - lo + 1);

    // ---- 1-mismatch search, in two passes split at the middle:
    //   pass 1 matches the right half exactly (backward) and tries every
    //          substitution at each position of the left half;
    //   pass 2 matches the left half exactly (forward) and tries every
    //          substitution at each position of the right half.
    printf("Performing 1-mismatch search of the pattern..\n");
    unsigned int hits = 0;
    mid = patternLength / 2;

    // Pass 1: exact backward match of positions patternLength-1 .. mid.
    BWTSARangeInitial(index, codes[patternLength - 1], &lo, &hi);
    for (pos = patternLength - 2; pos >= mid; --pos) {
        BWTSARangeBackward(index, codes[pos], &lo, &hi);
    }

    for (pos = mid - 1; pos >= 0; --pos) {
        const unsigned char expected = codes[pos];
        BWTAllSARangesBackward(index, lo, hi, nextLo, nextHi);
        for (int sym = 0; sym < ALPHABET_SIZE; ++sym) {
            if (sym != expected) {
                // Substitute sym at pos, then match the rest exactly.
                unsigned int errLo = nextLo[sym];
                unsigned int errHi = nextHi[sym];
                int rest = pos - 1;
                while (rest >= 0 && errLo <= errHi) {
                    BWTSARangeBackward(index, codes[rest], &errLo, &errHi);
                    --rest;
                }
                // Count only if the whole remainder was consumed and the
                // range is still non-empty.
                if (rest < 0 && errLo <= errHi) {
                    hits += errHi - errLo + 1;
                }
            }
        }
        // Continue the exact branch with the expected character.
        lo = nextLo[expected];
        hi = nextHi[expected];
    }

    // Pass 2: exact forward match of positions 0 .. mid-1.
    BWTSARangeInitial(index, codes[0], &lo, &hi);
    BWTSARangeInitial(index, codes[0], &revLo, &revHi);
    for (pos = 1; pos < mid; ++pos) {
        BWTSARangeForward_Bidirection(index, codes[pos], &lo, &hi, &revLo, &revHi);
    }

    for (pos = mid; pos < patternLength; ++pos) {
        const unsigned char expected = codes[pos];
        BWTAllSARangesForward_Bidirection(index, lo, hi, revLo, revHi,
                                          nextLo, nextHi, nextRevLo, nextRevHi);
        for (int sym = 0; sym < ALPHABET_SIZE; ++sym) {
            if (sym != expected) {
                // Substitute sym at pos, then match the rest exactly.
                unsigned int errLo = nextLo[sym];
                unsigned int errHi = nextHi[sym];
                unsigned int errRevLo = nextRevLo[sym];
                unsigned int errRevHi = nextRevHi[sym];
                int rest = pos + 1;
                while (rest < patternLength && errLo <= errHi) {
                    BWTSARangeForward_Bidirection(index, codes[rest],
                                                  &errLo, &errHi, &errRevLo, &errRevHi);
                    ++rest;
                }
                if (rest >= patternLength && errLo <= errHi) {
                    hits += errHi - errLo + 1;
                }
            }
        }
        // Continue the exact branch with the expected character.
        lo = nextLo[expected];
        hi = nextHi[expected];
        revLo = nextRevLo[expected];
        revHi = nextRevHi[expected];
    }
    printf("%u SA-indexes/occurrences were found.\n\n", hits);

    // ---- Report at most five positions. lo/hi here are whatever the exact
    // branch of pass 2 above left behind.
    const unsigned int rangeSize = hi - lo + 1;
    int shown;
    if (rangeSize < 5) {
        shown = rangeSize;
    } else {
        shown = 5;
    }
    printf("Reporting %d arbitrary occurrences..\n", shown);
    for (int n = 0; n < shown; ++n) {
        BWTRetrievePositionFromSAIndex(index, lo + n, &sequenceId, &offset);
        printf("Occurrence found in sequence #%d with offset %u\n", sequenceId, offset);
    }

    // Release the index.
    printf("\nFree index ... ");
    fflush(stdout);
    BWTFree2BWT(index);
    printf("DONE\n");

    return 0;
}