int main(int argc, char *argv[]) { void write_sa(char *filename, int *p, int n); void write_lcp(char *filename, UChar *x,int *p, int n); void write_bwt(char *filename, UChar *x,int *p, int n); void check_sa_ordering(UChar *x,int *p, int n, int); void print_sa_onscreen(UChar *x,int *p, int n, int); int c, *p, n; int print_sa, check_sa, num_opt,overshoot; UChar *x; clock_t end,start, end_real, start_real; struct tms r; double tot_time = 0.0; double tot_time_real = 0.0; extern char *optarg; extern int optind, opterr, optopt; char *fnam, *sa_filename; char *lcp_filename,*bwt_filename; // names for (optional) lcp and bwt files FILE *f; /* ------------ set default values ------------- */ set_global_variables(); print_sa=check_sa=0; sa_filename = NULL; lcp_filename = NULL; bwt_filename = NULL; /* ------------- read options from command line ----------- */ num_opt = opterr = 0; while ((c=getopt(argc, argv, "b:d:l:p:r:w:cvux:f:T:W:B:")) != -1) { switch (c) { case 'b': bwt_filename = optarg; break; case 'c': check_sa++; break; case 'd': Anchor_dist = atoi(optarg); break; case 'l': Shallow_limit = atoi(optarg); break; case 'p': print_sa = atoi(optarg); break; case 'x': _ds_Word_size = atoi(optarg); break; case 'v': _ds_Verbose++; break; case 'w': sa_filename = optarg; break; case 'f': Max_pseudo_anchor_offset = atoi(optarg); break; case 'r': B2g_ratio = atoi(optarg); break; case 'u': Update_anchor_ranks = 1; break; case 'T': Mk_qs_thresh = atoi(optarg); break; case 'W': lcp_filename = optarg; break; case 'B': Blind_sort_ratio = atoi(optarg); break; case '?': fprintf(stderr,"Unknown option: %c -main-\n", optopt); exit(1); } num_opt++; } if(optind<argc) fnam=argv[optind]; else { fprintf(stderr, "Usage:\n\t%s [-b bwtfile][-cuv][-d dist]",argv[0]); fprintf(stderr, "[-l len][-p num][-f maxoff][-r ratio]\n"); fprintf(stderr, "\t [-T thresh][-w safile][-W lcpfile][-x wsize][-B ratio]"); fprintf(stderr, " file\n\n"); fprintf(stderr,"\t-b bwtfile write bwt to bwtfile\n"); 
fprintf(stderr, "\t-B ratio blind_sort ratio [def. %d]\n",Blind_sort_ratio); fprintf(stderr,"\t-c check the sa (could be very slow)\n"); fprintf(stderr,"\t-d dist anchor distance [def. %d]\n",Anchor_dist); fprintf(stderr,"\t-f maxoff Maximum offset for forward "); fprintf(stderr,"pseudo-anchors [def. %d]\n",Max_pseudo_anchor_offset); fprintf(stderr, "\t-l len shallow sort limit [def. %d]\n",Shallow_limit); fprintf(stderr, "\t-r ratio bucket to group max ratio [def. %d]\n",B2g_ratio); fprintf(stderr,"\t-p num print num char of each suffix [def. 0]\n"); fprintf(stderr, "\t-T thresh Threshold for mk-qs [def. %d]\n", Mk_qs_thresh); fprintf(stderr,"\t-u updates anchor ranks in get_rank()\n"); fprintf(stderr,"\t-v produces a verbose output\n"); fprintf(stderr,"\t-w safile write sa to safile\n"); fprintf(stderr, "\t-W lcpfile check sa and write lcp to lcpfile (very slow)\n"); fprintf(stderr, "\t-x wsize word size in mkqs (default %d)\n\n",_ds_Word_size); return 0; } if(_ds_Verbose) { fprintf(stderr,"Command line: "); for(c=0;c<argc;c++) fprintf(stderr,"%s ",argv[c]); fprintf(stderr,"\n"); } /* -------- check parameters ------------- */ if(check_global_variables()) { exit(1); } /* ---------- open file and read text ----------- */ if (! (f=fopen(fnam, "rb"))) { perror(fnam); return 1; } if (fseek(f, 0L, SEEK_END)) { perror(fnam); return 1; } n=ftell(f); if (n==0) { fprintf(stderr, "%s: file empty\n", fnam); return 0; } // ------ allocate memory for text and sa ------- overshoot = compute_overshoot(); p=malloc((n)*sizeof *p); // sa x=malloc((n+overshoot)*sizeof *x); // text if (! p || ! 
x) { fprintf(stderr, "malloc failed\n"); return 1; } // ------------ read input text --------------- rewind(f); c=fread(x, (size_t) 1, (size_t) n, f); // lseek(fileno(f),0,SEEK_SET); // c=read(fileno(f), x, (size_t) n); if(c!=n) { fprintf(stderr,"Error in read() (%d vs %d) (main)\n",c,n); perror(fnam); return 1; } fclose(f); /* --------- start measuring time ------------- */ if(_ds_Verbose) fprintf(stderr,"Starting sa construction ... \n"); start_real = times(&r); start = (r.tms_utime+r.tms_stime); /* user + system */ ds_ssort(x, p, n); end_real = times(&r); end = (r.tms_utime+r.tms_stime); /* user + system */ // tot_time = ((double) (end-start))/CLK_TCK; //tot_time_real = ((double) (end_real-start_real))/CLK_TCK; printf("Elapsed time: %.2f seconds (user+sys). Total real time: %.2f.\n", tot_time, tot_time_real); // --------------- write bwt to a file if(bwt_filename!=NULL) write_bwt(bwt_filename,x,p,n); // --------------- write sa to a file if(sa_filename!=NULL) write_sa(sa_filename,p,n); // --------------- write lcp to a file if(lcp_filename!=NULL) write_lcp(lcp_filename,x,p,n); // ------------ check sa -------- if(check_sa) check_sa_ordering(x,p,n,check_sa); // ----- display sa ------- if(print_sa) print_sa_onscreen(x,p,n,print_sa); // deallocate and exit free(x); free(p); return 0; }
/*
 * Build the psi / BW component from the file `fname` and, when
 * idx_id >= 0, the sampled SA / ISA index, which is written to `fidx`.
 *
 * csa     passed by value; every field set here lives only in this copy.
 * fname   input file name handed to the *_makeindex() routines.
 * fidx    name of the index file to create; it is free()d before
 *         returning, so the caller must pass a heap pointer it no longer
 *         uses.
 * psi_id  ignored on entry: it is overwritten with csa.id.
 * idx_id  >= 0 requests construction and writing of the index file.
 * coded   forwarded to lf_dna_makeindex().
 *
 * The function terminates the process (exit) on an unknown psi id, when
 * D or D2 is not smaller than n, or when the index file cannot be opened.
 */
void csa_new_from_bwt(CSA csa, char *fname, char *fidx, int psi_id, int idx_id, bool coded)
{
  int k;
  i64 i,j,v,m;
  FILE *f2;
  i64 psize,isize;
  i64 n;

  psi_id = csa.id;
  if (psi_id >= 0) {
    printf("create psi: id=%d\n",psi_id);
  }
  if (idx_id >= 0) {
    printf("create idx: id=%d D=%d D2=%d\n",idx_id,csa.D,csa.D2);
  }

  /* ---- build the psi / BW structure selected by the low 6 bits of the id */
  psize = 0;
  if (psi_id >= 0) {
    switch (psi_id & 0x3f) {
    case ID_DIFF_GAMMA:
    case ID_DIFF_GAMMA_RL:
    case ID_DIFF_GAMMA_SPARSE:
    case ID_DIFF_GAMMA_RL_SPARSE:
      psize = psi1_makeindex(&csa, fname);
      printf("n %ld\n",csa.n);
      printf("Psi %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
    case ID_DIFF_GAMMA_RR:
      psize = psi12_makeindex(&csa, fname);
      printf("n %ld\n",csa.n);
      printf("Psi %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_DNA:
      psize = lf_dna_makeindex(&csa, fname, coded);
      printf("n %ld\n",csa.n);
      printf("BW %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_BIT:
      psize = lf_bit_makeindex(&csa, fname);
      printf("n %ld\n",csa.n);
      printf("BW %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
    case ID_BWT_WT:
    case ID_BWT_WT_HUF:
    case ID_BWT_WT_DENSE:
    case ID_BWT_WT_SPARSE4:
    case ID_BWT_WT_RR:
      psize = lf_wt_makeindex(&csa, fname);
      printf("n %ld\n",csa.n);
      printf("BW %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
#if 0
    case ID_BWT_HUF:
      psize = lf_bwt_makeindex(&csa, fname);
      printf("n %ld\n",csa.n);
      printf("BW %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
#endif
    case ID_SPARSE4:
      psize = psi2_makeindex(&csa, fname);
      printf("n %ld\n",csa.n);
      printf("Psi %ld bytes (%1.3f bpc)\n",
             psize,(double)psize*8/csa.n);
      break;
    default:
      printf("psi_id = %d\n",psi_id);
      exit(1);
    }
  }

  /* presumably the number of bytes used per stored integer -- it is the
     per-entry size of the SA/ISA buffers below; confirm against blog() */
  csa.k = (blog(csa.n+1)+1+8-1)/8;

  /* ---- alphabet mapping: characters with C[i] > 0 get consecutive codes
     0..m-1, and K[] holds the running sum of their counts starting at 1 */
  for (i=0; i<SIGMA; i++) csa.CtoA[i] = -1;
  csa.K[-1+1] = 1;
  for (m=0,v=1,i=0; i<SIGMA; i++) {
    if (csa.C[i]>0) {
      csa.AtoC[m] = i;
      csa.CtoA[i] = m;
      csa.K[m+1] = v;
      v += csa.C[i];
      m++;
    }
  }
  csa.K[m+1] = v;
  csa.m = m;

  if (csa.D >= csa.n) {
    printf("D=%d >= n=%ld\n",csa.D,csa.n);
    exit(0);
  }
  if (csa.D2 >= csa.n) {
    printf("D2=%d >= n=%ld\n",csa.D2,csa.n);
    exit(0);
  }

  if (idx_id >= 0) {
    n = csa.n;
    k = csa.k;

    /* ---- compute SA and ISA samples; a buffer exists only when its
       sampling interval (D for SA, D2 for ISA) is positive */
    if (csa.D > 0) csa.SA = (uchar *) mymalloc(((n-1)/csa.D+1+1)*k);
    if (csa.D2 > 0) csa.ISA = (uchar *) mymalloc(((n-1)/csa.D2+1+1)*k);
    if (csa.D == 0 && csa.D2 == 0) goto brk;

    switch (psi_id & 0x3f) {
    case ID_DIFF_GAMMA:
    case ID_DIFF_GAMMA_RL:
    case ID_DIFF_GAMMA_SPARSE:
    case ID_DIFF_GAMMA_RL_SPARSE:
    case ID_SPARSE4:
    case ID_DIFF_GAMMA_RR:
      /* follow psi from 0; at step i the current position j has sa[j] = i */
      j = 0;
      for (i=0; i<=n; i++) {
        display_progressbar("making sa ",i,n);
        j = csa.psi(&csa,j);
        if (csa.D > 0 && j % csa.D == 0) {
          putuint(csa.SA,j / csa.D,i,k);
        }
        if (csa.D2 > 0 && i % csa.D2 == 0) {
          putuint(csa.ISA,i / csa.D2,j,k);
        }
      }
      break;
    case ID_BWT_DNA:
    case ID_BWT_BIT:
    case ID_BWT_WT:
    case ID_BWT_WT_HUF:
    case ID_BWT_WT_DENSE:
    case ID_BWT_WT_SPARSE4:
    case ID_BWT_WT_RR:
    case ID_BWT_HUF:
      /* follow LF from 0 while i counts down from n-1 */
      j = 0;
      for (i=n-1; i>=0; i--) {
        display_progressbar("making sa ",i,n);
        v = csa.LF(&csa,j);
        j = v;
        if (csa.D > 0 && j % csa.D == 0) putuint(csa.SA, j/csa.D , i, k);
        if (csa.D2 > 0 && i % csa.D2 == 0) putuint(csa.ISA, i/csa.D2, j, k);
      }
      /* csa.SA is only allocated when D > 0; writing entry 0 without this
         guard stores through an unallocated pointer when only D2 is set */
      if (csa.D > 0) putuint(csa.SA,0,n,k);
      break;
    default:
      break;
    }

brk:
    /* ---- write idx: version, header, then the optional SA / ISA parts */
    f2 = fopen(fidx,"wb"); /* directory */
    if (f2 == NULL) {
      perror("csa2_new1: ");
      exit(1);
    }

    isize = 0;

    writeint(4,VERSION,f2); /* version */
    isize += 4;

    writeint(1,ID_HEADER,f2); /* header ID */
    isize += 1;
    isize = write_header(&csa, f2, isize);

    if (csa.D > 0) {
      writeint(1,ID_SA,f2);
      isize += 1;
      isize = write_sa(&csa, f2, isize);
    }

    if (csa.D2 > 0) {
      writeint(1,ID_ISA,f2);
      isize += 1;
      isize = write_isa(&csa, f2, isize);
    }

    fclose(f2);

    if (csa.D > 0) free(csa.SA);
    if (csa.D2 > 0) free(csa.ISA);

    printf("Total %ld bytes (%1.3f bpc)\n",(psize+isize),
           (double)(psize+isize)*8/csa.n);
  }

  free(fidx);
}
void csa_new_from_bwt(int argc, char *argv[]) { i64 i,j,v,m; FILE *f2; i64 psize,isize; i64 n; int k; char *fname,*fidx; char *p; int psi_id, idx_id; CSA csa; int sigma; csa.sigma = 256; /* default alphabet size */ csa.k2 = 1; // for (i=0; i<SIGMA+2; i++) csa.C[i] = 0; // for (i=0; i<SIGMA; i++) csa.C[i] = 0; fname = NULL; fidx = NULL; psi_id = idx_id = -1; for (i=1; i<argc; i++) { p = argv[i]; if (p[0] == '-') { p++; switch (toupper(p[0])) { case 'I': // -I[n]:[D]:[D2] p++; idx_id = 0; csa_options(&csa, p); break; case 'P': // -P[n]:[L] p++; psi_id = 0; psi_options(&csa, p); break; case 'C': // -C[s] p++; sigma_options(&csa, p); break; default: printf("??? no such option %s\n",argv[i]); exit(1); } } else { fname = argv[i]; k = strlen(fname); fidx = mymalloc(k+5); sprintf(fidx,"%s.idx",fname); } } if (fname == NULL) { printf("no input file.\n"); exit(0); } printf("sigma = %d k2 = %d\n", csa.sigma, csa.k2); sigma = csa.sigma; csa.C = mymalloc(sizeof(*csa.C)*sigma); // csa.CtoA = mymalloc(sizeof(*csa.CtoA)*sigma); // csa.AtoC = mymalloc(sizeof(*csa.AtoC)*sigma); // csa.K = mymalloc(sizeof(*csa.K)*(sigma+2)); // for (i=0; i<sigma; i++) csa.C[i] = 0; psi_id = csa.id; if (psi_id >= 0) { printf("create psi: id=%d\n",psi_id); } if (idx_id >= 0) { printf("create idx: id=%d D=%d D2=%d\n",idx_id,csa.D,csa.D2); } psize = 0; if (psi_id >= 0) { switch (psi_id & 0x3f) { case ID_DIFF_GAMMA: case ID_DIFF_GAMMA_RL: case ID_DIFF_GAMMA_SPARSE: case ID_DIFF_GAMMA_RL_SPARSE: psize = psi1_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("Psi %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; case ID_DIFF_GAMMA_RR: psize = psi12_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("Psi %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; case ID_BWT_DNA: psize = lf_dna_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("BW %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; case ID_BWT_DNA2: psize = lf_dna2_makeindex(&csa, fname); printf("n 
%ld\n",csa.n); printf("BW %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; case ID_BWT_BIT: psize = lf_bit_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("BW %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; case ID_BWT_WT: case ID_BWT_WT_HUF: case ID_BWT_WT_DENSE: case ID_BWT_WT_SPARSE4: case ID_BWT_WT_RR: psize = lf_wt_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("BW %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; #if 0 case ID_BWT_HUF: psize = lf_bwt_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("BW %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; #endif case ID_SPARSE4: psize = psi2_makeindex(&csa, fname); printf("n %ld\n",csa.n); printf("Psi %ld bytes (%1.3f bpc)\n", psize,(double)psize*8/csa.n); break; default: printf("psi_id = %d\n",psi_id); exit(1); } } csa.k = (blog(csa.n+1)+1+8-1)/8; for (i=0; i<sigma; i++) csa.CtoA[i] = -1; csa.K[-1+1] = 1; for (m=0,v=1,i=0; i<sigma; i++) { if (csa.C[i]>0) { csa.AtoC[m] = i; csa.CtoA[i] = m; csa.K[m+1] = v; // printf("i=%ld v = %ld C[i] = %ld\n",i,v,csa.C[i]); v += csa.C[i]; m++; } } csa.K[m+1] = v; csa.m = m; if (csa.D >= csa.n) { printf("D=%d >= n=%ld\n",csa.D,csa.n); exit(0); } if (csa.D2 >= csa.n) { printf("D2=%d >= n=%ld\n",csa.D2,csa.n); exit(0); } if (idx_id >= 0) { n = csa.n; k = csa.k; //// compute SA and ISA if (csa.D > 0) csa.SA = mymalloc(((n-1)/csa.D+1+1)*k); if (csa.D2 > 0) csa.ISA = mymalloc(((n-1)/csa.D2+1+1)*k); if (csa.D == 0 && csa.D2 == 0) goto brk; switch (psi_id & 0x3f) { case ID_DIFF_GAMMA: case ID_DIFF_GAMMA_RL: case ID_DIFF_GAMMA_SPARSE: case ID_DIFF_GAMMA_RL_SPARSE: case ID_SPARSE4: case ID_DIFF_GAMMA_RR: j = 0; for (i=0; i<=n; i++) { display_progressbar("making sa ",i,n); j = csa.psi(&csa,j); // sa[j] = i; if (csa.D > 0 && j % csa.D == 0) { putuint(csa.SA,j / csa.D,i,k); } if (csa.D2 > 0 && i % csa.D2 == 0) { putuint(csa.ISA,i / csa.D2,j,k); } } // putuint(csa.SA,0,n,k); break; case ID_BWT_DNA: case 
ID_BWT_DNA2: case ID_BWT_BIT: case ID_BWT_WT: case ID_BWT_WT_HUF: case ID_BWT_WT_DENSE: case ID_BWT_WT_SPARSE4: case ID_BWT_WT_RR: case ID_BWT_HUF: j = 0; for (i=n-1; i>=0; i--) { display_progressbar("making sa ",i,n); v = csa.LF(&csa,j); // printf("LF[%ld] = %ld\n",j,v); j = v; if (csa.D > 0 && j % csa.D == 0) putuint(csa.SA, j/csa.D , i, k); if (csa.D2 > 0 && i % csa.D2 == 0) putuint(csa.ISA, i/csa.D2, j, k); } // putuint(csa.SA,0,n,k); if (csa.D > 0) putuint(csa.SA,0,n,k); // 2011-12-20 break; default: break; } brk: //// write idx f2 = fopen(fidx,"wb"); /* directory */ if (f2 == NULL) { perror("csa2_new1: "); exit(1); } isize = 0; writeint(4,VERSION,f2); /* version */ isize += 4; writeint(1,ID_HEADER,f2); // header ID isize += 1; isize = write_header(&csa, f2, isize); if (csa.D > 0) { writeint(1,ID_SA,f2); isize += 1; isize = write_sa(&csa, f2, isize); } if (csa.D2 > 0) { writeint(1,ID_ISA,f2); isize += 1; isize = write_isa(&csa, f2, isize); } fclose(f2); if (csa.D > 0) free(csa.SA); if (csa.D2 > 0) free(csa.ISA); printf("Total %ld bytes (%1.3f bpc)\n",(psize+isize), (double)(psize+isize)*8/csa.n); } free(fidx); }