Example #1
0
/*
 * Command-line driver for the suffix sorter.
 *
 * Parses the options with getopt(), loads the input file into memory,
 * builds its suffix array with ds_ssort(), and then runs whichever of the
 * optional follow-up steps were requested:
 *   -b bwtfile   write the bwt            -w safile   write the sa
 *   -W lcpfile   write the lcp            -c          check the sa ordering
 *   -p num       print the sa on screen
 * The remaining options (-d -l -x -v -f -r -u -T -B) only set the global
 * tuning variables; see the usage text below.
 *
 * Returns 0 on success, after printing the usage text (no input file was
 * named), or when the input file is empty; returns 1 on an I/O or
 * allocation failure.  Calls exit(1) on an unknown option or when
 * check_global_variables() returns non-zero.
 */
int main(int argc, char *argv[])
{
  void write_sa(char *filename, int *p, int n);
  void write_lcp(char *filename, UChar *x,int *p, int n);
  void write_bwt(char *filename, UChar *x,int *p, int n);
  void check_sa_ordering(UChar *x,int *p, int n, int);
  void print_sa_onscreen(UChar *x,int *p, int n, int);
  int c, *p, n;
  int print_sa, check_sa, overshoot;
  long file_len;                       /* raw ftell() result, before narrowing */
  const long max_int = (long) (~0u >> 1);   /* INT_MAX without <limits.h> */
  UChar *x;
  clock_t end,start, end_real, start_real;
  struct tms r;
  double tot_time = 0.0;
  double tot_time_real = 0.0;
  extern char *optarg;
  extern int optind, opterr, optopt;
  char *fnam, *sa_filename;
  char *lcp_filename,*bwt_filename;   /* names for (optional) lcp and bwt files */
  FILE *f;

  /* ------------ set default values ------------- */
  set_global_variables();
  print_sa = check_sa = 0;
  sa_filename = NULL;
  lcp_filename = NULL;
  bwt_filename = NULL;

  /* ------------- read options from command line ----------- */
  opterr = 0;      /* getopt stays quiet; the '?' case reports errors itself */
  while ((c = getopt(argc, argv, "b:d:l:p:r:w:cvux:f:T:W:B:")) != -1) {
    switch (c)
      {
      case 'b':
        bwt_filename = optarg; break;
      case 'c':
        check_sa++; break;
      case 'd':
        Anchor_dist = atoi(optarg); break;
      case 'l':
        Shallow_limit = atoi(optarg); break;
      case 'p':
        print_sa = atoi(optarg); break;
      case 'x':
        _ds_Word_size = atoi(optarg); break;
      case 'v':
        _ds_Verbose++; break;
      case 'w':
        sa_filename = optarg; break;
      case 'f':
        Max_pseudo_anchor_offset = atoi(optarg); break;
      case 'r':
        B2g_ratio = atoi(optarg); break;
      case 'u':
        Update_anchor_ranks = 1; break;
      case 'T':
        Mk_qs_thresh = atoi(optarg); break;
      case 'W':
        lcp_filename = optarg; break;
      case 'B':
        Blind_sort_ratio = atoi(optarg); break;
      case '?':
        fprintf(stderr,"Unknown option: %c -main-\n", optopt);
        exit(1);
      default:
        break;
      }
  }

  /* The first non-option argument is the input file; without it, show help. */
  if (optind >= argc) {
    fprintf(stderr, "Usage:\n\t%s [-b bwtfile][-cuv][-d dist]",argv[0]);
    fprintf(stderr, "[-l len][-p num][-f maxoff][-r ratio]\n");
    fprintf(stderr,
            "\t   [-T thresh][-w safile][-W lcpfile][-x wsize][-B ratio]");
    fprintf(stderr, " file\n\n");
    fprintf(stderr,"\t-b bwtfile  write bwt to bwtfile\n");
    fprintf(stderr,
            "\t-B ratio    blind_sort ratio [def. %d]\n",Blind_sort_ratio);
    fprintf(stderr,"\t-c          check the sa (could be very slow)\n");
    fprintf(stderr,"\t-d dist     anchor distance [def. %d]\n",Anchor_dist);
    fprintf(stderr,"\t-f maxoff   Maximum offset for forward ");
    fprintf(stderr,"pseudo-anchors [def. %d]\n",Max_pseudo_anchor_offset);
    fprintf(stderr,
            "\t-l len      shallow sort limit [def. %d]\n",Shallow_limit);
    fprintf(stderr,
            "\t-r ratio    bucket to group max ratio [def. %d]\n",B2g_ratio);
    fprintf(stderr,"\t-p num      print num char of each suffix [def. 0]\n");
    fprintf(stderr,
            "\t-T thresh   Threshold for mk-qs [def. %d]\n", Mk_qs_thresh);
    fprintf(stderr,"\t-u          updates anchor ranks in get_rank()\n");
    fprintf(stderr,"\t-v          produces a verbose output\n");
    fprintf(stderr,"\t-w safile   write sa to safile\n");
    fprintf(stderr,
            "\t-W lcpfile  check sa and write lcp to lcpfile (very slow)\n");
    fprintf(stderr,
            "\t-x wsize    word size in mkqs (default %d)\n\n",_ds_Word_size);
    return 0;
  }
  fnam = argv[optind];

  if (_ds_Verbose) {
    fprintf(stderr,"Command line: ");
    for (c = 0; c < argc; c++)
      fprintf(stderr,"%s ",argv[c]);
    fprintf(stderr,"\n");
  }

  /* -------- check parameters ------------- */
  if (check_global_variables()) {
    exit(1);
  }

  /* ---------- open file and determine its size ----------- */
  f = fopen(fnam, "rb");
  if (f == NULL) {
    perror(fnam);
    return 1;
  }
  if (fseek(f, 0L, SEEK_END)) {
    perror(fnam);
    fclose(f);
    return 1;
  }
  file_len = ftell(f);
  if (file_len < 0) {                  /* ftell() reports failure as -1 */
    perror(fnam);
    fclose(f);
    return 1;
  }
  if (file_len == 0) {
    fprintf(stderr, "%s: file empty\n", fnam);
    fclose(f);
    return 0;
  }
  if (file_len > max_int) {
    /* The sorter indexes the text with int, so a longer file cannot be
       handled; refuse instead of silently truncating the length. */
    fprintf(stderr, "%s: file too large\n", fnam);
    fclose(f);
    return 1;
  }
  n = (int) file_len;

  /* ------ allocate memory for text and sa ------- */
  overshoot = compute_overshoot();
  p = malloc((n)*sizeof *p);               /* sa */
  x = malloc((n+overshoot)*sizeof *x);     /* text, plus overshoot extra bytes */
  if (p == NULL || x == NULL) {
    fprintf(stderr, "malloc failed\n");
    free(x);
    free(p);
    fclose(f);
    return 1;
  }

  /* ------------ read input text --------------- */
  rewind(f);
  c = (int) fread(x, (size_t) 1, (size_t) n, f);
  if (c != n) {
    fprintf(stderr,"Error in read() (%d vs %d) (main)\n",c,n);
    perror(fnam);
    free(x);
    free(p);
    fclose(f);
    return 1;
  }
  fclose(f);

  /* ---------  start measuring time ------------- */
  if (_ds_Verbose)
    fprintf(stderr,"Starting sa construction ... \n");
  start_real = times(&r);
  start = (r.tms_utime+r.tms_stime);     /* user + system */
  ds_ssort(x, p, n);
  end_real = times(&r);
  end = (r.tms_utime+r.tms_stime);       /* user + system */
  /* NOTE: the tick-to-seconds conversion is disabled (it relied on CLK_TCK),
   * so tot_time and tot_time_real are still 0.0 when printed below:
   *   tot_time      = ((double) (end-start))/CLK_TCK;
   *   tot_time_real = ((double) (end_real-start_real))/CLK_TCK;
   */
  printf("Elapsed time: %.2f seconds (user+sys). Total real time: %.2f.\n",
         tot_time, tot_time_real);

  /* --------------- write bwt to a file */
  if (bwt_filename != NULL)
    write_bwt(bwt_filename,x,p,n);

  /* --------------- write sa to a file */
  if (sa_filename != NULL)
    write_sa(sa_filename,p,n);

  /* --------------- write lcp to a file */
  if (lcp_filename != NULL)
    write_lcp(lcp_filename,x,p,n);

  /* ------------ check sa -------- */
  if (check_sa)
    check_sa_ordering(x,p,n,check_sa);

  /* ----- display sa ------- */
  if (print_sa)
    print_sa_onscreen(x,p,n,print_sa);

  /* deallocate and exit */
  free(x);
  free(p);
  return 0;
}
Example #2
0
/*
 * Build the psi/BW structure for the text named by fname and, optionally,
 * write the sampled SA/ISA index file.
 *
 * csa     descriptor, passed by value; csa.id selects the encoder, csa.D
 *         and csa.D2 are the SA and ISA sampling steps (0 disables the
 *         respective table).  The struct copy is modified locally only.
 * fname   input file name handed to the *_makeindex() routine.
 * fidx    path of the index file to create; it is released with free()
 *         before returning, so the caller must not use it afterwards.
 * psi_id  ignored: it is overwritten with csa.id on entry.
 * idx_id  when >= 0 the SA/ISA samples are computed and fidx is written.
 * coded   forwarded to lf_dna_makeindex() only.
 *
 * Terminates the process with exit() when the id is unknown, when D or D2
 * is not smaller than n, or when fidx cannot be opened.
 */
void csa_new_from_bwt(CSA csa, char *fname, char *fidx, int psi_id, int idx_id, bool coded)
{
	int k;
	i64 i,j,v,m;
	FILE *f2;
	i64 psize,isize;
	i64 n;
	const char *size_label;	/* "Psi" or "BW " prefix of the size report */

	psi_id = csa.id;
	if (psi_id >= 0) {
		printf("create psi: id=%d\n",psi_id);
	}
	if (idx_id >= 0) {
		printf("create idx: id=%d D=%d D2=%d\n",idx_id,csa.D,csa.D2);
	}

	psize = 0;

	if (psi_id >= 0) {
		/* Every supported id builds its structure and then reports n and
		   the structure size; only the builder and the label differ. */
		size_label = "Psi";
		switch (psi_id & 0x3f) {
			case ID_DIFF_GAMMA:
			case ID_DIFF_GAMMA_RL:
			case ID_DIFF_GAMMA_SPARSE:
			case ID_DIFF_GAMMA_RL_SPARSE:
				psize = psi1_makeindex(&csa, fname);
				break;
			case ID_DIFF_GAMMA_RR:
				psize = psi12_makeindex(&csa, fname);
				break;
			case ID_BWT_DNA:
				psize = lf_dna_makeindex(&csa, fname, coded);
				size_label = "BW ";
				break;
			case ID_BWT_BIT:
				psize = lf_bit_makeindex(&csa, fname);
				size_label = "BW ";
				break;
			case ID_BWT_WT:
			case ID_BWT_WT_HUF:
			case ID_BWT_WT_DENSE:
			case ID_BWT_WT_SPARSE4:
			case ID_BWT_WT_RR:
				psize = lf_wt_makeindex(&csa, fname);
				size_label = "BW ";
				break;
#if 0
			case ID_BWT_HUF:
				psize = lf_bwt_makeindex(&csa, fname);
				size_label = "BW ";
				break;
#endif
			case ID_SPARSE4:
				psize = psi2_makeindex(&csa, fname);
				break;
			default:
				printf("psi_id = %d\n",psi_id);
				exit(1);
		}
		printf("n     %ld\n",csa.n);
		printf("%s   %ld bytes (%1.3f bpc)\n",
				size_label,psize,(double)psize*8/csa.n);
	}

	/* number of bytes used to store one value in the range 0..n */
	csa.k = (blog(csa.n+1)+1+8-1)/8;

	/* Compact the alphabet: characters with a non-zero count in C get
	   consecutive codes 0..m-1; K[c+1] holds the running total of the
	   counts of all smaller codes, starting from 1. */
	for (i=0; i<SIGMA; i++) csa.CtoA[i] = -1;
	csa.K[-1+1] = 1;
	for (m=0,v=1,i=0; i<SIGMA; i++) {
		if (csa.C[i]>0) {
			csa.AtoC[m] = i;
			csa.CtoA[i] = m;
			csa.K[m+1] = v;
			v += csa.C[i];
			m++;
		}
	}
	csa.K[m+1] = v;
	csa.m = m;

	if (csa.D >= csa.n) {
		printf("D=%d >= n=%ld\n",csa.D,csa.n);
		exit(0);
	}
	if (csa.D2 >= csa.n) {
		printf("D2=%d >= n=%ld\n",csa.D2,csa.n);
		exit(0);
	}

	if (idx_id >= 0) {
		n = csa.n;
		k = csa.k;

		/* ---- compute the sampled SA and ISA ---- */
		if (csa.D > 0) csa.SA = (uchar *) mymalloc(((n-1)/csa.D+1+1)*k);
		if (csa.D2 > 0) csa.ISA = (uchar *) mymalloc(((n-1)/csa.D2+1+1)*k);

		if (csa.D != 0 || csa.D2 != 0) {
			switch (psi_id & 0x3f) {
				case ID_DIFF_GAMMA:
				case ID_DIFF_GAMMA_RL:
				case ID_DIFF_GAMMA_SPARSE:
				case ID_DIFF_GAMMA_RL_SPARSE:
				case ID_SPARSE4:
				case ID_DIFF_GAMMA_RR:
					/* follow psi from rank 0: step i reaches rank j, i.e. sa[j] = i */
					j = 0;
					for (i=0; i<=n; i++) {
						display_progressbar("making sa ",i,n);
						j = csa.psi(&csa,j);
						if (csa.D > 0 && j % csa.D == 0) {
							putuint(csa.SA,j / csa.D,i,k);
						}
						if (csa.D2 > 0 && i % csa.D2 == 0) {
							putuint(csa.ISA,i / csa.D2,j,k);
						}
					}
					break;
				case ID_BWT_DNA:
				case ID_BWT_BIT:
				case ID_BWT_WT:
				case ID_BWT_WT_HUF:
				case ID_BWT_WT_DENSE:
				case ID_BWT_WT_SPARSE4:
				case ID_BWT_WT_RR:
				case ID_BWT_HUF:
					/* follow LF from rank 0, visiting positions n-1 down to 0 */
					j = 0;
					for (i=n-1; i>=0; i--) {
						display_progressbar("making sa ",i,n);
						v = csa.LF(&csa,j);
						j = v;
						if (csa.D > 0 && j % csa.D == 0) putuint(csa.SA, j/csa.D , i, k);
						if (csa.D2 > 0 && i % csa.D2 == 0) putuint(csa.ISA, i/csa.D2, j, k);
					}
					/* csa.SA is only allocated when D > 0; writing the
					   sentinel unconditionally would store through an
					   unset pointer when only the ISA is requested. */
					if (csa.D > 0) putuint(csa.SA,0,n,k);
					break;
				default:
					break;
			}
		}

		/* ---- write the index file ---- */
		f2 = fopen(fidx,"wb");
		if (f2 == NULL) {
			perror("csa2_new1: ");
			exit(1);
		}

		isize = 0;

		writeint(4,VERSION,f2); /* version */
		isize += 4;

		writeint(1,ID_HEADER,f2); /* header ID */
		isize += 1;
		isize = write_header(&csa, f2, isize);

		if (csa.D > 0) {
			writeint(1,ID_SA,f2);
			isize += 1;
			isize = write_sa(&csa, f2, isize);
		}

		if (csa.D2 > 0) {
			writeint(1,ID_ISA,f2);
			isize += 1;
			isize = write_isa(&csa, f2, isize);
		}

		fclose(f2);

		if (csa.D > 0) free(csa.SA);
		if (csa.D2 > 0) free(csa.ISA);

		printf("Total %ld bytes (%1.3f bpc)\n",(psize+isize),
				(double)(psize+isize)*8/csa.n);
	}
	free(fidx);
}
Example #3
0
void csa_new_from_bwt(int argc, char *argv[])
{
  i64 i,j,v,m;
  FILE *f2;
  i64 psize,isize;
  i64 n;
  int k;
  char *fname,*fidx;
  char *p;
  int psi_id, idx_id;
  CSA csa;
  int sigma;

  csa.sigma = 256; /* default alphabet size */
  csa.k2 = 1;

//  for (i=0; i<SIGMA+2; i++) csa.C[i] = 0;
//  for (i=0; i<SIGMA; i++) csa.C[i] = 0;

  fname = NULL;  fidx = NULL;
  psi_id = idx_id = -1;
  for (i=1; i<argc; i++) {
    p = argv[i];
    if (p[0] == '-') {
      p++;
      switch (toupper(p[0])) {
      case 'I':
      // -I[n]:[D]:[D2]
        p++;
        idx_id = 0;
        csa_options(&csa, p);
        break;
      case 'P':
      // -P[n]:[L]
        p++;
        psi_id = 0;
        psi_options(&csa, p);
        break;
      case 'C':
      // -C[s]
        p++;
        sigma_options(&csa, p);
        break;
      default:
        printf("??? no such option %s\n",argv[i]);
        exit(1);
      }
    } else {
      fname = argv[i];
      k = strlen(fname);
      fidx = mymalloc(k+5);
      sprintf(fidx,"%s.idx",fname);
    }
  }
  if (fname == NULL) {
    printf("no input file.\n");
    exit(0);
  }
  printf("sigma = %d k2 = %d\n", csa.sigma, csa.k2);
  sigma = csa.sigma;

  csa.C = mymalloc(sizeof(*csa.C)*sigma); //
  csa.CtoA = mymalloc(sizeof(*csa.CtoA)*sigma); //
  csa.AtoC = mymalloc(sizeof(*csa.AtoC)*sigma); //
  csa.K = mymalloc(sizeof(*csa.K)*(sigma+2)); //
  for (i=0; i<sigma; i++) csa.C[i] = 0;


  psi_id = csa.id;
  if (psi_id >= 0) {
    printf("create psi: id=%d\n",psi_id);
  }
  if (idx_id >= 0) {
    printf("create idx: id=%d D=%d D2=%d\n",idx_id,csa.D,csa.D2);
  }

  psize = 0;

  if (psi_id >= 0) {
    switch (psi_id & 0x3f) {
    case ID_DIFF_GAMMA:
    case ID_DIFF_GAMMA_RL:
    case ID_DIFF_GAMMA_SPARSE:
    case ID_DIFF_GAMMA_RL_SPARSE:
      psize = psi1_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("Psi   %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_DIFF_GAMMA_RR:
      psize = psi12_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("Psi   %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_DNA:
      psize = lf_dna_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_DNA2:
      psize = lf_dna2_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_BIT:
      psize = lf_bit_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_WT:
    case ID_BWT_WT_HUF:
    case ID_BWT_WT_DENSE:
    case ID_BWT_WT_SPARSE4:
    case ID_BWT_WT_RR:
      psize = lf_wt_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
#if 0
    case ID_BWT_HUF:
      psize = lf_bwt_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("BW    %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
#endif
    case ID_SPARSE4:
      psize = psi2_makeindex(&csa, fname);
      printf("n     %ld\n",csa.n);
      printf("Psi   %ld bytes (%1.3f bpc)\n",
              psize,(double)psize*8/csa.n);
      break;
    default:
      printf("psi_id = %d\n",psi_id);
      exit(1);
    }
  }

  csa.k = (blog(csa.n+1)+1+8-1)/8;

  for (i=0; i<sigma; i++) csa.CtoA[i] = -1;
  csa.K[-1+1] = 1;
  for (m=0,v=1,i=0; i<sigma; i++) {
    if (csa.C[i]>0) {
      csa.AtoC[m] = i;
      csa.CtoA[i] = m;
      csa.K[m+1] = v;
//      printf("i=%ld v = %ld C[i] = %ld\n",i,v,csa.C[i]);
      v += csa.C[i];
      m++;
    }
  }
  csa.K[m+1] = v;
  csa.m = m;

  if (csa.D >= csa.n) {
    printf("D=%d >= n=%ld\n",csa.D,csa.n);
    exit(0);
  }
  if (csa.D2 >= csa.n) {
    printf("D2=%d >= n=%ld\n",csa.D2,csa.n);
    exit(0);
  }

  if (idx_id >= 0) {
    n = csa.n;
    k = csa.k;
////  compute SA and ISA
    if (csa.D > 0) csa.SA = mymalloc(((n-1)/csa.D+1+1)*k);
    if (csa.D2 > 0) csa.ISA = mymalloc(((n-1)/csa.D2+1+1)*k);
    if (csa.D == 0 && csa.D2 == 0) goto brk;

    switch (psi_id & 0x3f) {
    case ID_DIFF_GAMMA:
    case ID_DIFF_GAMMA_RL:
    case ID_DIFF_GAMMA_SPARSE:
    case ID_DIFF_GAMMA_RL_SPARSE:
    case ID_SPARSE4:
    case ID_DIFF_GAMMA_RR:
      j = 0;
      for (i=0; i<=n; i++) {
        display_progressbar("making sa ",i,n);
        j = csa.psi(&csa,j);
  //  sa[j] = i;
        if (csa.D > 0 && j % csa.D == 0) {
          putuint(csa.SA,j / csa.D,i,k);
        }
        if (csa.D2 > 0 && i % csa.D2 == 0) {
          putuint(csa.ISA,i / csa.D2,j,k);
        }
      }
//      putuint(csa.SA,0,n,k);
      break;
    case ID_BWT_DNA:
    case ID_BWT_DNA2:
    case ID_BWT_BIT:
    case ID_BWT_WT:
    case ID_BWT_WT_HUF:
    case ID_BWT_WT_DENSE:
    case ID_BWT_WT_SPARSE4:
    case ID_BWT_WT_RR:
    case ID_BWT_HUF:
      j = 0;
      for (i=n-1; i>=0; i--) {
        display_progressbar("making sa ",i,n);
        v = csa.LF(&csa,j);
//        printf("LF[%ld] = %ld\n",j,v);
        j = v;
        if (csa.D > 0 && j % csa.D == 0) putuint(csa.SA, j/csa.D , i, k);
        if (csa.D2 > 0 && i % csa.D2 == 0) putuint(csa.ISA, i/csa.D2, j, k);
      }
//      putuint(csa.SA,0,n,k);
      if (csa.D > 0) putuint(csa.SA,0,n,k); // 2011-12-20
      break;
    default:
      break;
    }
brk:
////      write idx
    f2 = fopen(fidx,"wb"); /* directory */
    if (f2 == NULL) {
      perror("csa2_new1: ");
      exit(1);
    }

    isize = 0;

    writeint(4,VERSION,f2); /* version */
    isize += 4;

    writeint(1,ID_HEADER,f2); // header ID
    isize += 1;
    isize = write_header(&csa, f2, isize);

    if (csa.D > 0) {
      writeint(1,ID_SA,f2);
      isize += 1;
      isize = write_sa(&csa, f2, isize);
    }

    if (csa.D2 > 0) {
      writeint(1,ID_ISA,f2);
      isize += 1;
      isize = write_isa(&csa, f2, isize);
    }


    fclose(f2);

    if (csa.D > 0) free(csa.SA);
    if (csa.D2 > 0) free(csa.ISA);

    printf("Total %ld bytes (%1.3f bpc)\n",(psize+isize),
                (double)(psize+isize)*8/csa.n);
  }
  free(fidx);
}