/*
 * checksuflinks
 *
 * Consistency check for suffix links. For every non-singleton child
 * interval of [i, j] the lcp value of the child is compared with the lcp
 * value of its suffix link interval (as returned by getSuflink); if the
 * child's lcp value is not exactly one larger, both intervals and their
 * lcp values are reported through DBG. The check then recurses into the
 * child.
 *
 * s     suffix array passed on to getChildintervals, getlcpval, getSuflink
 * i, j  borders of the interval whose children are checked
 *
 * A singleton input interval (i == j) is ignored.
 */
void checksuflinks(Suffixarray *s, Uint i, Uint j) {
  Uint k, childlcp, suflcp, *space = NULL;
  PairUint *child, childsuf;
  Container *children;

  /* ignore singletons as initial input */
  if (i == j) {
    return;
  }

  children = getChildintervals(space, s, i, j, 0);

  for (k = 0; k < bl_containerSize(children); k++) {
    child = (PairUint *) bl_containerGet(children, k);

    /*
     * Exclude singletons. Only this child is skipped: returning here
     * would leave the remaining siblings unchecked and would bypass the
     * release of the container below.
     */
    if (child->a == child->b) {
      continue;
    }

    /* check suflink of child */
    childlcp = getlcpval(s, child->a, child->b);
    childsuf = getSuflink(s, child->a, child->b);
    suflcp = getlcpval(s, childsuf.a, childsuf.b);

    if (childlcp != suflcp + 1) {
      DBG("suf[%u, %u, %u]=[%u, %u, %u]\n", child->a, child->b, childlcp,
          childsuf.a, childsuf.b, suflcp);
    }

    /* recursively check all children of child */
    checksuflinks(s, child->a, child->b);
  }

  bl_containerDestruct(children, NULL);
  free(children);
}
/*
 * getCharInterval
 *
 * Looks among the child intervals of [i, j] for the first one whose
 * leftmost suffix carries the character ch at offset lcp(i, j).
 *
 * space  handle passed through to getChildintervals and wrapList
 * s      suffix array (suftab and suffixptr are read)
 * i, j   borders of the parent interval
 * pos    not used by this function
 * ch     character to look for
 *
 * Returns the borders of the first matching child interval. If i == j or
 * no child matches, the pair (a = 1, b = 0) is returned.
 */
PairUint getCharInterval(void *space, Suffixarray *s, Uint i, Uint j, Uint pos, char ch) {
  PairUint result;
  PairUint *cand;
  List *kids;
  Uint depth;
  Uint k;

  /* default answer when nothing is found */
  result.a = 1;
  result.b = 0;

  if (i != j) {
    kids = getChildintervals(space, s, i, j);
    depth = getlcpval(s, i, j);

    for (k = 0; k < kids->length; k++) {
      cand = (PairUint *) kids->nodes[k].data;

      /* compare the character right behind the common prefix */
      if (s->suffixptr[s->suftab[cand->a]][depth] != ch) {
        continue;
      }

      result.a = cand->a;
      result.b = cand->b;
      break;
    }

    /* the child list and its intervals are no longer needed */
    wrapList(space, kids, destructinterval);
  }

  return result;
}
/*
 * constructsuflinks
 *
 * Fills s->suflink_l and s->suflink_r.
 *
 * Step 1: starting with [0, numofsuffixes-1], all non-singleton intervals
 *         reachable through getChildintervals are visited with the help of
 *         two stacks (left and right borders). Each visited interval is
 *         appended to the list that belongs to its lcp value.
 * Step 2: for every lcp value i >= 1 for which list i and list i-1 both
 *         exist, findslinkinterval is asked for the suffix link interval
 *         of each interval in list i. The result is stored in
 *         suflink_l / suflink_r at the index given by getfirstlindex.
 *
 * space  handle passed to all allocation helpers
 * s      suffix array; suflink_l and suflink_r are (re)assigned here and
 *        zero-initialised before they are filled
 *
 * All temporary interval lists are released before returning.
 */
void constructsuflinks(void *space, Suffixarray *s) {
  Uint i, j, a, b, k, nooflists, lcp, pos;
  Stack istack;
  Stack jstack;
  List *children, **lists;
  PairUint **data, *interval, slinkinterval;

  /* one list per possible lcp value 0..maxlcp */
  nooflists = maxlcp(s) + 1;
  lists = ALLOCMEMORY(space, NULL, List*, nooflists);
  memset(lists, 0, sizeof(List*) * nooflists);

  initStack(space, &istack, 1000);
  initStack(space, &jstack, 1000);

  stackpush(space, &istack, 0);
  stackpush(space, &jstack, s->numofsuffixes-1);

  while (!stackisempty(&istack)) {
    i = stackpop(&istack);
    j = stackpop(&jstack);
    lcp = getlcpval(s, i, j);

    /* lists are created lazily, only for lcp values that occur */
    if (lists[lcp] == NULL) {
      lists[lcp] = initList(space, 10);
    }
    addinterval(space, lists[lcp], i, j);

    children = getChildintervals(space, s, i, j);
    data = (PairUint**) dataList(space, children);

    /* push children right-to-left; singletons are dropped */
    for (k = children->length; k > 0; k--) {
      a = data[k-1]->a;
      b = data[k-1]->b;
      FREEMEMORY(space, data[k-1]);

      if (a != b) {
        stackpush(space, &istack, a);
        stackpush(space, &jstack, b);
      }
    }

    /* the intervals were released above, so no destructor is passed */
    FREEMEMORY(space, data);
    wrapList(space, children, NULL);
  }

  destructStack(space, &istack);
  destructStack(space, &jstack);

  s->suflink_l = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes);
  s->suflink_r = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes);
  memset(s->suflink_l, 0, sizeof(Uint) * s->numofsuffixes);
  memset(s->suflink_r, 0, sizeof(Uint) * s->numofsuffixes);

  for (i = 1; i < nooflists; i++) {
    if (lists[i] != NULL && lists[i-1] != NULL) {
      for (j = 0; j < lists[i]->length; j++) {
        slinkinterval = findslinkinterval(space, s, lists, i, j);

        interval = (PairUint*) lists[i]->nodes[j].data;
        pos = getfirstlindex(s, interval->a, interval->b);

        s->suflink_l[pos] = slinkinterval.a;
        s->suflink_r[pos] = slinkinterval.b;
      }
    }

    /*
     * List i-1 is not consulted by this loop any more. Slots of lcp
     * values that never occurred are still NULL and must be skipped.
     */
    if (lists[i-1] != NULL) {
      wrapList(space, lists[i-1], destructinterval);
    }
  }

  /* the loop above stops one short: release the list of the largest lcp */
  if (lists[nooflists-1] != NULL) {
    wrapList(space, lists[nooflists-1], destructinterval);
  }

  FREEMEMORY(space, lists);
  return;
}
int main(int argc, char** argv) { char* content; Uint contentlen, i, j, k, l, id, lines=0; stringset_t *set, *set2, **csv, *que; CharSequence **s; Suffixarray *sarray; MultiCharSeq *mseq; PairSint d, *matches = NULL; Uint totallength = 0; Uint wsize=10; Uint counter=0; Uint all=0; int *space = NULL; char *pattern= "GGAAGAAAGCGTGGGGTTTG"; char *pattern2= "TGATTAGTGATTAGTGATTA"; char *pattern3= "ACAAACATAT"; char *start; time_t startsuf, endsuf; double difsuf; Uint noofchildren; List *list; PairUint **childinterval; gnuplot_ctrl *h; double *genome; //set = readfasta(&space, "HP26695.fasta"); //csv = readcsv(&space, "HP12_GCTC.inserts", "", &lines); /*s = ALLOCMEMORY(&space, NULL, CharSequence *, set->noofstrings); for(i=0; i < set->noofstrings/2; i++) { totallength += set->strings[i].len; s[i] = ALLOCMEMORY(&space, NULL, CharSequence, 1); s[i]->sequence = set->strings[i].str; s[i]->length = set->strings[i].len; / printf("%s,", set->strings[i].str); printf("\n"); / }*/ s = ALLOCMEMORY(&space, NULL, CharSequence *, 1); s[0] = ALLOCMEMORY(&space, NULL, CharSequence, 1); s[0]->sequence = pattern3; s[0]->length = strlen(pattern3); genome = ALLOCMEMORY(&space, NULL, double, totallength); memset(genome, 0, sizeof(double)*totallength); time (&startsuf); sarray = constructSufArr(&space, s, 1/*set->noofstrings/2*/, NULL); constructLcp(space, sarray); dumplcptab(sarray); constructchildtab(space, sarray); time (&endsuf); difsuf = difftime (endsuf, startsuf); printf("noofsuffixes: %d\n", sarray->numofsuffixes); dumpchildtab(sarray); dumpSufArr(sarray); list = getChildintervals(space, sarray, 0, 5); childinterval = (PairUint**) dataList(space, list); for(i=0; i < list->length; i++) { printf("[%d,%d]\n", childinterval[i]->a, childinterval[i]->b); } constructsuflinks(space, sarray); for(k=1; k < lines; k+=2) { // printf("searching %s\n", csv[k]->strings[0].str); if(csv[k]->strings[0].len > 8) { if(wsize > csv[k]->strings[0].len) { d=mmsearch(sarray, csv[k]->strings[0].str, 
csv[k]->strings[0].len, 0, 0, sarray->numofsuffixes-1); // printf("suffixes were found at positions (%d, %d)\n",d.a, d.b); for (j=d.a; j <= d.b; j++) { genome[sarray->suftab[j]]++; } } else { for(l=0; l < csv[k]->strings[0].len-wsize; l++) { d=mmsearch(sarray, &csv[k]->strings[0].str[l], wsize, 0, 0, sarray->numofsuffixes-1); // printf("suffixes were found at positions (%d, %d)\n",d.a, d.b); for (j=d.a; j <= d.b; j++) { genome[sarray->suftab[j]]++; /* start = sarray->suffixptr[sarray->suftab[j]]; printf("pattern was: %s\n", &csv[k]->strings[0].str[l]); printf("suffix found: "); for (i=0; i < wsize; i++) { printf("%c", start[i]); } printf("\n"); id = getMultiCharSeqIndex(sarray->seq, sarray->suffixptr[sarray->suftab[j]]); printf("found in sequence: %d\n", id); */ } } } if (d.a < d.b) counter++; all++;} } destructStringset(&space, set); writeY("out.xy", genome, totallength); /*h = gnuplot_init(); gnuplot_setstyle(h, "points"); gnuplot_cmd(h, "set title 'IMBISS - seed statistics' -28,0 font'Helvetica,15'"); gnuplot_cmd(h, "set label 'seed length: %d' at graph 0.05,0.95 font 'Helvetica, 12'", totallength); gnuplot_set_xlabel(h, "matches"); gnuplot_set_ylabel(h, "position"); gnuplot_plot_x(h, genome, totallength, "position"); */ printf ("sliding windows of %d sequences (of %d) found\n", counter, all); printf ("Building the suffixarray has taken %f seconds.\n", difsuf); printf ("Total length of suffixarray was %d\n", totallength); while(1); return EXIT_SUCCESS; }