Example #1
0
/*
 * checksuflinks
 *
 * Debugging aid: recursively walks the child intervals of [i, j] and
 * reports (via DBG) every non-singleton child whose lcp value is not
 * exactly one larger than the lcp value of its suffix-link interval.
 *
 * s     suffix array queried through getChildintervals, getlcpval
 *       and getSuflink
 * i, j  boundaries of the interval to start from; nothing is done
 *       when i == j
 */
void checksuflinks(Suffixarray *s, Uint i, Uint j){
  Uint k, childlcp, suflcp, *space = NULL;
  PairUint *child;
  PairUint childsuf;
  Container *children;

  // ignore singletons as initial input
  if (i == j){
    return;
  }

  children = getChildintervals(space, s, i, j, 0);
  for (k = 0; k < bl_containerSize(children); k++){
    child = (PairUint *) bl_containerGet(children, k);

    // exclude singletons: skip only this child so that the remaining
    // siblings are still checked and the container is released below
    if (child->a == child->b){
      continue;
    }

    // check suflink of child
    childlcp = getlcpval(s, child->a, child->b);
    childsuf = getSuflink(s, child->a, child->b);
    suflcp = getlcpval(s, childsuf.a, childsuf.b);
    if (childlcp != suflcp + 1){
      DBG("suf[%u, %u, %u]=[%u, %u, %u]\n", child->a, child->b, childlcp,
          childsuf.a, childsuf.b, suflcp);
    }

    // recursively check all children of child
    checksuflinks(s, child->a, child->b);
  }

  bl_containerDestruct(children, NULL);
  free(children);
}
/*
 * getCharInterval
 *
 * Searches the child intervals of [i, j] for the first one whose left
 * boundary suffix carries the character ch at offset getlcpval(s, i, j).
 *
 * Returns the boundaries of that child. When i == j or no child matches,
 * the pair (a = 1, b = 0) is returned instead.
 *
 * The parameter pos is not used by this function.
 */
PairUint
getCharInterval(void *space,
            Suffixarray *s,
            Uint i,
            Uint j,
            Uint pos,
            char ch)
{
    PairUint result;
    PairUint *cur;
    List *childlist;
    Uint depth;

    /* default answer: a > b marks "nothing found" */
    result.a = 1;
    result.b = 0;

    if (i == j) {
        return result;
    }

    childlist = getChildintervals(space, s, i, j);
    depth = getlcpval(s, i, j);

    /* i is no longer needed as a boundary and serves as the list index,
     * as in the original loop */
    i = 0;
    while (i < childlist->length) {
        cur = (PairUint*) childlist->nodes[i].data;
        if (s->suffixptr[s->suftab[cur->a]][depth] == ch) {
            result.a = cur->a;
            result.b = cur->b;
            break;
        }
        i++;
    }

    /* release the list together with its interval elements */
    wrapList(space, childlist, destructinterval);
    return result;
}
/*
 * constructsuflinks
 *
 * Allocates and fills s->suflink_l and s->suflink_r.
 *
 * Phase 1: starting from [0, s->numofsuffixes-1], all non-singleton
 * intervals reachable through getChildintervals are visited with an
 * explicit pair of stacks. Every visited interval is appended to
 * lists[lcp], where lcp = getlcpval(s, i, j).
 *
 * Phase 2: for every level i >= 1 for which both lists[i] and lists[i-1]
 * exist, the interval returned by findslinkinterval is stored for each
 * interval of level i at the index given by getfirstlindex.
 *
 * All per-level lists are released before returning.
 *
 * space  passed through to the allocation and container helpers
 * s      suffix array; its suflink_l / suflink_r members are overwritten
 */
void
constructsuflinks(void *space, Suffixarray *s) {

  Uint   i,
         j,
         a,
         b,
         k,
         nooflists,
         lcp,
         pos;
  Stack  istack;
  Stack  jstack;

  List   *children,
         **lists;
  PairUint **data,
           slinkinterval;

  /* one list slot per possible lcp value 0 .. maxlcp(s) */
  nooflists = maxlcp(s) +1;
  lists = ALLOCMEMORY(space, NULL, List*, nooflists);
  memset(lists, 0, sizeof(List*)*nooflists);

  initStack(space, &istack, 1000);
  initStack(space, &jstack, 1000);

  stackpush(space, &istack, 0);
  stackpush(space, &jstack, s->numofsuffixes-1);

  while(!stackisempty(&istack)) {
    i = stackpop(&istack);
    j = stackpop(&jstack);
    lcp = getlcpval(s, i, j);

    /* create the list for this lcp level on first use */
    if (lists[lcp] == NULL) {
      lists[lcp] = initList(space, 10);
    }

    addinterval(space, lists[lcp], i, j);

    children = getChildintervals(space, s, i, j);
    data = (PairUint**) dataList(space, children);

    /* children are pushed in reverse order; singletons are dropped */
    for(k=children->length; k > 0; k--) {
      a = data[k-1]->a;
      b = data[k-1]->b;

      FREEMEMORY(space, data[k-1]);

      if(a != b) {
        stackpush(space, &istack, a);
        stackpush(space, &jstack, b);
      }
    }

    /* the elements were freed above, so the list is wrapped without a
     * per-element destructor */
    FREEMEMORY(space, data);
    wrapList(space, children, NULL);
  }

  destructStack(space, &istack);
  destructStack(space, &jstack);

  s->suflink_l = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes);
  s->suflink_r = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes);
  memset(s->suflink_l, 0, sizeof(Uint)*s->numofsuffixes);
  memset(s->suflink_r, 0, sizeof(Uint)*s->numofsuffixes);

  for(i=1; i < nooflists; i++) {
    if(lists[i] != NULL && lists[i-1] != NULL) {
      for(j=0; j < lists[i]->length; j++) {
        slinkinterval = findslinkinterval(space, s, lists, i, j);
        pos = getfirstlindex(s, ((PairUint*)lists[i]->nodes[j].data)->a,
                                ((PairUint*)lists[i]->nodes[j].data)->b);
        s->suflink_l[pos]=slinkinterval.a;
        s->suflink_r[pos]=slinkinterval.b;
      }
    }

    /* level i-1 is not needed any more once level i has been handled;
     * levels without any interval were never allocated */
    if(lists[i-1] != NULL) {
      wrapList(space, lists[i-1], destructinterval);
      lists[i-1] = NULL;
    }
  }

  /* the deepest level never appears as lists[i-1] in the loop above
   * (and the loop does not run at all when nooflists == 1) */
  if(lists[nooflists-1] != NULL) {
    wrapList(space, lists[nooflists-1], destructinterval);
    lists[nooflists-1] = NULL;
  }

  FREEMEMORY(space, lists);
  return;
}
/*
 * Test driver for the suffix array routines.
 *
 * Builds a suffix array over the fixed string pattern3, constructs the
 * lcp and child tables, dumps them, prints the child intervals of [0, 5]
 * and constructs the suffix links.
 *
 * The query loop over csv only runs when lines > 0. With the file input
 * below disabled, lines stays 0 and the loop is skipped.
 *
 * Returns EXIT_SUCCESS.
 */
int main(int argc, char** argv) {
  Uint i, j, k, l, lines=0;
  stringset_t *set = NULL, **csv = NULL;
  CharSequence **s;
  Suffixarray *sarray;
  PairSint d;
  Uint totallength = 0;
  Uint wsize=10;
  Uint counter=0;
  Uint all=0;
  int *space = NULL;
  char *pattern3= "ACAAACATAT";
  time_t startsuf, endsuf;
  double difsuf;
  List *list;
  PairUint **childinterval;
  double *genome;

  (void) argc;
  (void) argv;

  /* File input is disabled; re-enable to feed the query loop below:
   *   set = readfasta(&space, "HP26695.fasta");
   *   csv = readcsv(&space, "HP12_GCTC.inserts", "", &lines);
   */

  s = ALLOCMEMORY(&space, NULL, CharSequence *, 1);
  s[0] = ALLOCMEMORY(&space, NULL, CharSequence, 1);
  s[0]->sequence = pattern3;
  s[0]->length = strlen(pattern3);

  /* account for the indexed sequence so that genome is not allocated
   * with zero elements and the final report shows the real length */
  totallength += s[0]->length;

  genome = ALLOCMEMORY(&space, NULL, double, totallength);
  memset(genome, 0, sizeof(double)*totallength);

  time (&startsuf);
  sarray = constructSufArr(&space, s, 1, NULL);
  constructLcp(space, sarray);
  dumplcptab(sarray);
  constructchildtab(space, sarray);
  time (&endsuf);
  difsuf = difftime (endsuf, startsuf);

  printf("noofsuffixes: %d\n", sarray->numofsuffixes);

  dumpchildtab(sarray);
  dumpSufArr(sarray);

  /* NOTE(review): the interval [0, 5] is hard-coded for pattern3, and
   * list / childinterval are not released afterwards - confirm. */
  list = getChildintervals(space, sarray, 0, 5);
  childinterval = (PairUint**) dataList(space, list);
  for(i=0; i < list->length; i++) {
    printf("[%u,%u]\n", childinterval[i]->a, childinterval[i]->b);
  }

  constructsuflinks(space, sarray);

  for(k=1; k < lines; k+=2) {
    if(csv[k]->strings[0].len > 8) {
      /* start from a defined value: the window loop below may not
       * execute a single search */
      d.a = 0;
      d.b = 0;

      if(wsize > csv[k]->strings[0].len) {
        /* query shorter than the window: search it as a whole */
        d=mmsearch(sarray, csv[k]->strings[0].str, csv[k]->strings[0].len, 0, 0, sarray->numofsuffixes-1);
        for (j=d.a; j <= d.b; j++) {
          genome[sarray->suftab[j]]++;
        }
      } else {
        /* NOTE(review): this bound skips the last window, which starts
         * at len - wsize - confirm whether that is intended. */
        for(l=0; l < csv[k]->strings[0].len-wsize; l++) {
          d=mmsearch(sarray, &csv[k]->strings[0].str[l], wsize, 0, 0, sarray->numofsuffixes-1);
          for (j=d.a; j <= d.b; j++) {
            genome[sarray->suftab[j]]++;
          }
        }
      }

      if (d.a < d.b) counter++;
      all++;
    }
  }

  /* set is only assigned when the fasta input above is enabled */
  if (set != NULL) {
    destructStringset(&space, set);
  }
  writeY("out.xy", genome, totallength);

  printf ("sliding windows of %u sequences (of %u) found\n", counter, all);
  printf ("Building  the suffixarray has taken %f seconds.\n", difsuf);
  printf ("Total length of suffixarray was %u\n", totallength);
  return EXIT_SUCCESS;
}