int main(int argc, char **argv) { OverlapStore *ovs = NULL; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-O") == 0) { ovs = AS_OVS_openOverlapStore(argv[++arg]); } else { err++; } arg++; } if ((ovs == NULL) || (err)) { fprintf(stderr, "usage: %s -O ovlStore < unitigs.cgb > fixedUnitigs.cgb\n", argv[0]); exit(1); } GenericMesg *pmesg = NULL; while ((ReadProtoMesg_AS(stdin, &pmesg) != EOF)) { if (pmesg->t == MESG_IUM) fixUnitig((IntUnitigMesg *)(pmesg->m), ovs); WriteProtoMesg_AS(stdout, pmesg); } exit(0); }
int main (int argc, char **argv) { char *gkpName = NULL; char *tigName = NULL; int32 tigVers = -1; vector<char *> tigInputs; tgStoreType tigType = tgStoreModify; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-T") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((err) || (gkpName == NULL) || (tigName == NULL) || (tigInputs.size() == 0)) { fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v>\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G <gkpStore> Path to the gatekeeper store\n"); fprintf(stderr, " -T <tigStore> <v> Path to the tigStore and version to add tigs to\n"); fprintf(stderr, "\n"); fprintf(stderr, " Remove store versions before <v>. Data present in versions before <v>\n"); fprintf(stderr, " are copied to version <v>. Files for the earlier versions are removed.\n"); fprintf(stderr, "\n"); fprintf(stderr, " WARNING! This code HAS NOT been tested with canu.\n"); fprintf(stderr, "\n"); if (gkpName == NULL) fprintf(stderr, "ERROR: no gatekeeper store (-G) supplied.\n"); if (tigName == NULL) fprintf(stderr, "ERROR: no tig store (-T) supplied.\n"); exit(1); } operationCompress(tigName, tigVers); exit(0); }
int main(int argc, char **argv) { merylStreamReader *AF = 0L; merylStreamReader *TF = 0L; merylStreamReader *AC = 0L; merylStreamReader *DC = 0L; merylStreamReader *CO = 0L; uint32 AFmode = 0; uint32 TFmode = 0; char dumpSCZFname[1024] = {0}; // single contig, zero frags char dumpMCZFname[1024] = {0}; // low contig, zero frags char dumpMCSFname[1024] = {0}; // medium contig, low frags char dumpMCMFname[1024] = {0}; // everything else, contig > frags bool beVerbose = false; argc = AS_configure(argc, argv); int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-af") == 0) { // All frags ++arg; AFmode = findMode(argv[arg]); AF = new merylStreamReader(argv[arg]); AF->nextMer(); } else if (strcmp(argv[arg], "-tf") == 0) { // Trimmed frags ++arg; TFmode = findMode(argv[arg]); TF = new merylStreamReader(argv[arg]); TF->nextMer(); } else if (strcmp(argv[arg], "-ac") == 0) { // All contigs AC = new merylStreamReader(argv[++arg]); AC->nextMer(); } else if (strcmp(argv[arg], "-dc") == 0) { // Degenerate contigs DC = new merylStreamReader(argv[++arg]); DC->nextMer(); } else if (strcmp(argv[arg], "-co") == 0) { // Contigs CO = new merylStreamReader(argv[++arg]); CO->nextMer(); } else if (strcmp(argv[arg], "-dump") == 0) { arg++; dumpFlag = true; sprintf(dumpSCZFname, "%s.0.singlecontig.zerofrag.fasta", argv[arg]); sprintf(dumpMCZFname, "%s.1.multiplecontig.zerofrag.fasta", argv[arg]); sprintf(dumpMCSFname, "%s.2.multiplecontig.lowfrag.fasta", argv[arg]); sprintf(dumpMCMFname, "%s.3.multiplecontig.multiplefrag.fasta", argv[arg]); } else if (strcmp(argv[arg], "-v") == 0) { beVerbose = true; } else { fprintf(stderr, "unknown option '%s'\n", argv[arg]); } arg++; } if ((AF == 0L) && (TF == 0L) && (AC == 0L) && (DC == 0L) && (CO == 0L)) { fprintf(stderr, "usage: %s [opts] [-v] [-dump prefix]\n", argv[0]); fprintf(stderr, "At least one fragcounts and one contigcounts are needed.\n"); fprintf(stderr, " -af | -tf fragcounts\n"); fprintf(stderr, " -ac | -dc | -co contigcounts \n"); 
fprintf(stderr, "Dumping is probably only useful with exactly one frag and\n"); fprintf(stderr, "one contig, but I'll let you do it with any number.\n"); exit(1); } if ((AF == 0L) && (TF == 0L)) { fprintf(stderr, "ERROR - need at least one of -af, -tf\n"); exit(1); } if ((AC == 0L) && (DC == 0L) && (CO == 0L)) { fprintf(stderr, "ERROR - need at least one of -ac, -dc, -co\n"); exit(1); } // Check mersizes. // uint32 merSize = 0; uint32 ms[5] = { 0 }; if (AF) merSize = ms[0] = AF->merSize(); if (TF) merSize = ms[1] = TF->merSize(); if (AC) merSize = ms[2] = AC->merSize(); if (DC) merSize = ms[3] = DC->merSize(); if (CO) merSize = ms[4] = CO->merSize(); bool differ = false; if ((ms[0] > 0) && (ms[0] != merSize)) differ = true; if ((ms[1] > 0) && (ms[1] != merSize)) differ = true; if ((ms[2] > 0) && (ms[2] != merSize)) differ = true; if ((ms[3] > 0) && (ms[3] != merSize)) differ = true; if ((ms[4] > 0) && (ms[4] != merSize)) differ = true; if (differ) { fprintf(stderr, "error: mer size differ.\n"); fprintf(stderr, " AF - "F_U32"\n", ms[0]); fprintf(stderr, " TF - "F_U32"\n", ms[1]); fprintf(stderr, " AC - "F_U32"\n", ms[2]); fprintf(stderr, " DC - "F_U32"\n", ms[3]); fprintf(stderr, " CO - "F_U32"\n", ms[4]); exit(1); } if (dumpFlag) { errno = 0; dumpSCZF = fopen(dumpSCZFname, "w"); dumpMCZF = fopen(dumpMCZFname, "w"); dumpMCSF = fopen(dumpMCSFname, "w"); dumpMCMF = fopen(dumpMCMFname, "w"); if (errno) fprintf(stderr, "Failed to open the dump files: %s\n", strerror(errno)), exit(1); } uint32 AFvsAC[NUMCATEGORIES][NUMCATEGORIES]; uint32 AFvsDC[NUMCATEGORIES][NUMCATEGORIES]; uint32 AFvsCO[NUMCATEGORIES][NUMCATEGORIES]; uint32 TFvsAC[NUMCATEGORIES][NUMCATEGORIES]; uint32 TFvsDC[NUMCATEGORIES][NUMCATEGORIES]; uint32 TFvsCO[NUMCATEGORIES][NUMCATEGORIES]; for (uint32 i=0; i<NUMCATEGORIES; i++) for (uint32 j=0; j<NUMCATEGORIES; j++) { AFvsAC[i][j] = 0; AFvsDC[i][j] = 0; AFvsCO[i][j] = 0; TFvsAC[i][j] = 0; TFvsDC[i][j] = 0; TFvsCO[i][j] = 0; } // The default constructor for 
kMer sets the mer to size 0, all A. // We need it to be the proper size, and all T. kMer minmer(merSize); // Don't care what we pick, as long as it's a mer in the set. // if (AF && AF->validMer()) minmer = AF->theFMer(); if (TF && TF->validMer()) minmer = TF->theFMer(); if (AC && AC->validMer()) minmer = AC->theFMer(); if (DC && DC->validMer()) minmer = DC->theFMer(); if (CO && CO->validMer()) minmer = CO->theFMer(); speedCounter *C = new speedCounter(" Examining: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose); bool morestuff = true; while (morestuff) { // Find any mer in our set if (AF && AF->validMer()) minmer = AF->theFMer(); if (TF && TF->validMer()) minmer = TF->theFMer(); if (AC && AC->validMer()) minmer = AC->theFMer(); if (DC && DC->validMer()) minmer = DC->theFMer(); if (CO && CO->validMer()) minmer = CO->theFMer(); // Find the smallest mer in our set if (AF && AF->validMer() && (AF->theFMer() < minmer)) minmer = AF->theFMer(); if (TF && TF->validMer() && (TF->theFMer() < minmer)) minmer = TF->theFMer(); if (AC && AC->validMer() && (AC->theFMer() < minmer)) minmer = AC->theFMer(); if (DC && DC->validMer() && (DC->theFMer() < minmer)) minmer = DC->theFMer(); if (CO && CO->validMer() && (CO->theFMer() < minmer)) minmer = CO->theFMer(); // We need to do up to six comparisons here. 
if (AF && AC) compare(AF, AC, minmer, AFmode, AFvsAC); if (AF && DC) compare(AF, DC, minmer, AFmode, AFvsDC); if (AF && CO) compare(AF, CO, minmer, AFmode, AFvsCO); if (TF && AC) compare(TF, AC, minmer, TFmode, TFvsAC); if (TF && DC) compare(TF, DC, minmer, TFmode, TFvsDC); if (TF && CO) compare(TF, CO, minmer, TFmode, TFvsCO); C->tick(); #if 0 if (C->tick()) { char stringjunk[256]; fprintf(stderr, "\nMM %s\n", minmer.merToString(stringjunk)); if (AF) fprintf(stderr, "AF %s\n", AF->theFMer().merToString(stringjunk)); if (TF) fprintf(stderr, "TF %s\n", TF->theFMer().merToString(stringjunk)); if (AC) fprintf(stderr, "AC %s\n", AC->theFMer().merToString(stringjunk)); if (DC) fprintf(stderr, "DC %s\n", DC->theFMer().merToString(stringjunk)); if (CO) fprintf(stderr, "CO %s\n", CO->theFMer().merToString(stringjunk)); } #endif // Advance to the next mer, if we were just used morestuff = false; if ((AF) && (AF->theFMer() == minmer)) morestuff |= AF->nextMer(); if ((TF) && (TF->theFMer() == minmer)) morestuff |= TF->nextMer(); if ((AC) && (AC->theFMer() == minmer)) morestuff |= AC->nextMer(); if ((DC) && (DC->theFMer() == minmer)) morestuff |= DC->nextMer(); if ((CO) && (CO->theFMer() == minmer)) morestuff |= CO->nextMer(); } delete C; // output if ((AF) && (AC)) output("all frags vs all contigs", AFmode, AFvsAC); if ((AF) && (DC)) output("all frags vs deg. contigs", AFmode, AFvsDC); if ((AF) && (CO)) output("all frags vs non-deg. contigs", AFmode, AFvsCO); if ((TF) && (AC)) output("trimmed frags vs all contigs", TFmode, TFvsAC); if ((TF) && (DC)) output("trimmed frags vs deg. contigs", TFmode, TFvsDC); if ((TF) && (CO)) output("trimmed frags vs non-deg. contigs", TFmode, TFvsCO); delete AF; delete TF; delete AC; delete DC; delete CO; exit(0); }
// For every query sequence, reports either the single best-matching database sequence or
// the best two-way "split" (one database sequence for the front, another for the back),
// scored by counts of shared k-mers.
//
// Steps, as visible below:
//   1. Parse options with getopt ("fFg:hk:q:d:m:t").  -q opens the query file (and also
//      uses it as the database if none is open yet), -d opens the database file.
//   2. Load both sequence sets with get_sequences() and index the database k-mers with
//      calc_kmer_members().
//   3. For each query i, fill hitCounts[j][k]: row j starts as a copy of row j-1 and is
//      incremented for each database sequence k that contains the k-mer ending at
//      position j, so each row is a running total along the query.
//   4. Pick the database sequence with the highest total (bestsimple), use DP_Compare()
//      to narrow the [startbestmatch, endbestmatch] range, then scan candidate
//      breakpoints every GRANULARITY positions for the best front+back pair.
//   5. With -t, re-align against the front/back sequences and trim the range further.
//   6. Print "best split" when the split score beats the single score, else "best match".
//
// Input validation is done with assert(); nothing allocated here is freed before return.
// NOTE(review): Profiles, ProfilesB, ori and first are declared but never used below.
int main(int argc, char *argv[]) {
  int K=-1,KB=-1;   // presumably the last valid index of each sequence set (K+1 is printed as the count) -- confirm in get_sequences()
  char **Seqs;
  char **Names;
  char **SeqsB;
  char **NamesB;
  char *seqfilename=NULL,*dbfilename=NULL;
  int internalCompare=0; /* whether query and database sequences are the same */
  int *Profiles;
  int *ProfilesB=NULL;
  int ori;
  int first=1;
  FILE *seqfile=NULL, *dbfile=NULL;
  int *kmerCounts,*kmerIndex;
  int *len,*lenB;
  int *frontDiscount;
  int i,j;
  int maxlen=0;     // longest query length; sizes the hitCounts row array
  int minlen=40;
  int doTrimming=0;

  argc = AS_configure(argc, argv);

  { /* Parse the argument list using "man 3 getopt". */
    int ch,errflg=0;
    optarg = NULL;
    while (!errflg && ((ch = getopt(argc, argv, "fFg:hk:q:d:m:t")) != EOF)) {
      switch(ch) {
      case 'f':
        fullnames=1;
        break;
      case 'F':
        dbfullnames=1;
        break;
      case 'g':
        GRANULARITY = atoi(optarg);
        assert(GRANULARITY>0&&GRANULARITY<500);
        break;
      case 'h':
        usage(argv[0]);
        break;
      case 'q':
        // Query file.  If no database has been opened yet, the same file is opened a
        // second time as the database (self comparison unless -d follows).
        if(seqfile!=NULL)fclose(seqfile);
        seqfile=fopen(optarg,"r");
        assert(seqfile!=NULL);
        seqfilename=optarg;
        if(dbfile==NULL){
          dbfile=fopen(optarg,"r");
          assert(dbfile!=NULL);
          dbfilename=optarg;
        }
        break;
      case 'd':
        if(dbfile!=NULL){
          fclose(dbfile);
        }
        dbfile=fopen(optarg,"r");
        assert(dbfile!=NULL);
        dbfilename=optarg;
        break;
      case 'k':
        ksize=atoi(optarg);
        assert(ksize>0&&ksize<14);
        break;
      case 'm':
        minlen=atoi(optarg);
        assert(minlen>=40&&minlen<1400);
        break;
      case 't':
        doTrimming=1;
        fprintf(stderr,"Will trim to common range\n");
        break;
      case '?':
        errflg++;
        usage(argv[0]);
        break;
      default :
        fprintf( stderr, "Unrecognized option -%c\n", optopt);
        errflg++;
        usage(argv[0]);
      }
    }
  }

  assert(seqfile!=NULL);
  assert(dbfile!=NULL);

  // Same pointer or same file name means the query set is compared against itself;
  // in that case hits of a sequence against itself are skipped further down.
  if(seqfilename==dbfilename || strcmp(seqfilename,dbfilename)==0){
    internalCompare=1;
  } else {
    internalCompare=0;
  }

  get_sequences(seqfile,&K,&Seqs,&Names);
  assert(K>=0);
  fprintf(stderr,"Read in %d query sequences\n",K+1);

  len = (int *)safe_malloc(sizeof(int)*(K+1));
  for(i=0;i<=K;i++){
    len[i]=strlen(Seqs[i]);
    if(maxlen<len[i]){
      maxlen=len[i];
    }
  }

  if(dbfullnames)fullnames=1;

  get_sequences(dbfile,&KB,&SeqsB,&NamesB);
  assert(KB>=0);
  fprintf(stderr,"Read in %d database sequences\n",KB+1);

  lenB = (int *)safe_malloc(sizeof(int)*(KB+1));
  frontDiscount = (int *)safe_malloc(sizeof(int)*(KB+1));
  for(i=0;i<=KB;i++){
    lenB[i]=strlen(SeqsB[i]);
    frontDiscount[i]=0;
  }

  // Base-to-2-bit code table; every character other than ACGT/acgt maps to -1.
  // NOTE(review): only entries 0..127 are initialized here, while lookups below index
  // Map with a plain char -- confirm inputs cannot contain bytes outside that range.
  for (i = 0; i < 128; i++){
    Map[i] = -1;
  }
  Map['a'] = Map['A'] = 0;
  Map['c'] = Map['C'] = 1;
  Map['g'] = Map['G'] = 2;
  Map['t'] = Map['T'] = 3;

  calc_kmer_members((const char **)SeqsB,ksize,&kmerCounts,&kmerIndex,KB+1);
  fprintf(stderr,"Built index\n");

  {
    // hitCounts[position in query][database sequence]; maxlen rows of KB+1 counters.
    int **hitCounts=NULL;
    hitCounts = (int **) safe_malloc(maxlen*sizeof(int*));
    for(j= 0; j<maxlen; j++){
      hitCounts[j] = (int *) safe_malloc((KB+1)*sizeof(int));
    }

    for (i = 0; i <= K; i++){
      int k;
      int kword = 0;      // 2-bit packed k-mer ending at the current position
      int h = -ksize;     // set from the position of the last unmapped character; gates counting below
      int bestfront=-1, bestback=-1;
      int bestscore=-1,bestloc=-1;
      int bestsimple=-1,simplescore=-1;
      int ilen,ilenlessone;
      int startbestmatch=0;
      int endbestmatch = len[i]-1;

      ilen=len[i];
      ilenlessone=ilen-1;

      // Zero the first ksize rows; later rows are overwritten by the copy-forward below.
      // NOTE(review): this touches rows 0..ksize-1 even if maxlen < ksize -- confirm
      // queries are always at least ksize long.
      for(k=0;k<ksize;k++){
        for(j= 0; j<=KB; j++){
          hitCounts[k][j] = 0;
        }
      }

      // Prime kword with the first ksize-1 characters (no lookups yet).
      for(j=0;j<ksize-1;j++){
        int x = Map[(int) (Seqs[i][j])];
        if (x >= 0){
          kword = (kword << 2) | x;
        }else{
          kword <<= 2;
          h = j-(ksize-1);
        }
      }

      // Slide over the rest of the query, one character at a time.
      while(Seqs[i][j]!='\0'){
        int x = Map[(int) (Seqs[i][j])];
        if (x >= 0){
          kword = ((kword << 2) | x) & kmax;
        }else{
          kword <<= 2;
          h = j-(ksize-1);
        }

        // Row j starts as a copy of row j-1, making each row a running total.
        // NOTE(review): with ksize == 1 the first j here is 0 and this reads row -1.
        for(k=0;k<=KB;k++){
          hitCounts[j][k]=hitCounts[j-1][k];
        }

        if (j >= h+ksize){
          for(k=kmerCounts[kword];k<kmerCounts[kword+1];k++){
            if(internalCompare && kmerIndex[k]==i)continue;
            // Count each database sequence at most once per position; this relies on
            // equal kmerIndex entries being adjacent within a bucket.
            if(k==kmerCounts[kword]||(kmerIndex[k]!=kmerIndex[k-1])){
              hitCounts[j][kmerIndex[k]]++;
            }
          }
        }
        j++;
      }

      // Best single database sequence = largest running total at the last position.
      for( k=0;k<=KB;k++){
        if(hitCounts[ilenlessone][k]>simplescore){
          simplescore = hitCounts[ilenlessone][k];
          bestsimple=k;
        }
      }

      // Align the query to the best single match, doubling the error rate (0.02 up to,
      // but not including, 0.4) until DP_Compare() returns an overlap, and use the
      // overlap ends to narrow the range considered below.
      {
        ALNoverlap *ovl=NULL;
        double erate=0.02;
        // below, .9 fudge factor may be necessary to handle cases where some matching kmers are random out of order matches
        int minovl=simplescore *.9;
        while(erate<.4){
          ovl = DP_Compare(Seqs[i],
                           SeqsB[bestsimple],
                           -lenB[bestsimple]+minovl,
                           len[i]-minovl,
                           strlen(Seqs[i]),
                           strlen(SeqsB[bestsimple]),
                           0,
                           erate,
                           1e-6,
                           maxim(minlen,minovl),
                           AS_FIND_LOCAL_ALIGN_NO_TRACE);
          if(ovl!=NULL)break;
          erate*=2.;
        }
        if(ovl!=NULL){
          if(ovl->begpos>0){
            startbestmatch=ovl->begpos;
          }
          if(ovl->endpos<0){
            endbestmatch=len[i]+ovl->endpos;
            assert(endbestmatch>0);
          }
        }
      }

      // printf("startbestmatch init at %d\n",startbestmatch);
      // printf("endbestmatch init at %d\n",endbestmatch);

      // Hits accumulated before the start of the range are subtracted from front scores.
      // NOTE(review): index is startbestmatch+ksize-2, which is -1 when startbestmatch
      // is 0 and ksize is 1.
      for(k=0;k<=KB;k++){
        frontDiscount[k]=hitCounts[startbestmatch+ksize-2][k];
      }

      // Try a breakpoint every GRANULARITY positions; the score is the best front
      // count (range start .. j) plus the best back count (j .. range end).
      for(j=startbestmatch+ksize-1;j<=endbestmatch;j+=GRANULARITY){
        int maxfront=-1;
        int maxback=-1;
        int whichfront=-1,whichback=-1;
        for(k=0;k<=KB;k++){
          if(hitCounts[j][k]-frontDiscount[k]>maxfront){
            maxfront=hitCounts[j][k]-frontDiscount[k];
            whichfront=k;
          }
          if(hitCounts[endbestmatch][k]-hitCounts[j][k]>maxback){
            maxback=hitCounts[endbestmatch][k]-hitCounts[j][k];
            whichback=k;
          }
        }
        if(maxfront+maxback>bestscore){
          bestscore=maxfront+maxback;
          bestfront=whichfront;
          bestback=whichback;
          bestloc=j;
#ifdef DEBUG_SEGMENTATION
          fprintf(stderr,"New best %d: loc %d seqs %s / %s bestfront %d bestback %d\n",
                  bestscore,
                  bestloc,NamesB[bestfront],NamesB[bestback],bestfront,bestback);
#endif
        }
      }
      // NOTE(review): if the loop above never runs (range shorter than ksize-1),
      // bestloc/bestfront/bestback stay -1 and are used as indices below -- confirm
      // that cannot happen for real inputs.

      if(doTrimming){
        int frontstart=startbestmatch;
        int frontend=endbestmatch;
        int backstart=startbestmatch;
        int backend=endbestmatch;
        ALNoverlap *ovl=NULL;
        double erate=0.02;
        // below, .9 fudge factor may be necessary to handle cases where some matching kmers are random out of order matches
        int minovl=hitCounts[bestloc][bestfront]*.9;

        // Re-align against the front sequence, unless it is the one already aligned above.
        if(bestfront!=bestsimple){
          while(erate<.4){
            ovl = DP_Compare(Seqs[i],
                             SeqsB[bestfront],
                             -lenB[bestfront]+minovl,
                             len[i]-minovl,
                             strlen(Seqs[i]),
                             strlen(SeqsB[bestfront]),
                             0,
                             erate,
                             1e-6,
                             maxim(minlen,minovl),
                             AS_FIND_LOCAL_ALIGN_NO_TRACE);
            if(ovl!=NULL)break;
            erate*=2.;
          }
          if(ovl!=NULL){
            if( minim(frontend,len[i]+ovl->endpos) <= maxim(frontstart,ovl->begpos) ){
              // complain
              fprintf(stderr,
                      "trouble with overlap found at erate %f: ahang %d bhang %d => frontend %d <= frontstart %d\n"
                      ">Qseq\n%s\n>Dseq\n%s\n",
                      erate,
                      ovl->begpos,
                      ovl->endpos,
                      frontend,
                      frontstart,Seqs[i],SeqsB[bestfront]);
              // and do nothing!
            } else {
              // update
              frontstart = maxim(frontstart,ovl->begpos);
              frontend = minim(frontend,len[i]+ovl->endpos);
            }
          }
        }

        erate=0.02;
        // below, .9 fudge factor may be necessary to handle cases where some matching kmers are random out of order matches
        minovl=(hitCounts[ilenlessone][bestback]-hitCounts[bestloc][bestback])*.9;
        // fprintf(stderr,"DEBUG: minovl = %d\n",hitCounts[ilenlessone][bestback]);
        ovl=NULL;

        // Same for the back sequence.  Unlike the front call, the band passed here is
        // not narrowed by minovl.
        if(bestback!=bestsimple){
          // fprintf(stderr,"initial settings: backstart, backend to %d %d\n",backstart,backend);
          while(erate<.4){
            ovl = DP_Compare(Seqs[i],
                             SeqsB[bestback],
                             -lenB[bestback],
                             len[i],
                             strlen(Seqs[i]),
                             strlen(SeqsB[bestback]),
                             0,
                             erate,
                             1e-6,
                             maxim(minlen,minovl),
                             AS_FIND_LOCAL_ALIGN_NO_TRACE);
            //ovl = DP_Compare(Seqs[i],SeqsB[bestback],-lenB[bestback],len[i],0,erate,1e-6,40,AS_FIND_LOCAL_ALIGN_NO_TRACE);
            if(ovl!=NULL)break;
            erate*=2.;
          }
          if(ovl!=NULL){
            if( minim(backend,len[i]+ovl->endpos) <= maxim(backstart,ovl->begpos) ){
              // complain
              fprintf(stderr,
                      "trouble with overlap found at erate %f: ahang %d bhang %d => backend %d <= backstart %d\n"
                      ">Qseq\n%s\n>Dseq\n%s\n",
                      erate,
                      ovl->begpos,
                      ovl->endpos,
                      backend,
                      backstart,Seqs[i],SeqsB[bestback]);
              // and do nothing
            } else {
              // update
              backstart=maxim(backstart,ovl->begpos);
              backend=minim(backend,len[i]+ovl->endpos);
              // fprintf(stderr,"Updating backstart, backend to %d %d\n",backstart,backend);
            }
          }
        }

        // things are problematic if the overlap is to a region that doesn't come close to the implied breakpoint (bestloc);
        // however, partial sequence issues can make an absolute test fail, hence the constant 100 below:
        if(frontstart<bestloc+100&&frontend>bestloc-100){
          startbestmatch= (startbestmatch > frontstart ? startbestmatch : frontstart);
          endbestmatch= (endbestmatch < frontend ? endbestmatch : frontend);
        }
        if(backstart<bestloc+100&&backend>bestloc-100){
          startbestmatch= (startbestmatch > backstart ? startbestmatch : backstart);
          endbestmatch= (endbestmatch < backend ? endbestmatch : backend);
        }
        // fprintf(stderr,"Final bestmatch start %d end %d\n",startbestmatch,endbestmatch);

        // Recompute both scores over the trimmed range.
        simplescore=hitCounts[endbestmatch][bestsimple]-hitCounts[startbestmatch+ksize-2][bestsimple];
        bestscore=
          hitCounts[endbestmatch][bestback]-hitCounts[bestloc][bestback]+
          hitCounts[bestloc][bestfront]-hitCounts[startbestmatch+ksize-2][bestfront];
      }

      // printf("startbestmatch final at %d\n",startbestmatch);
      // printf("endbestmatch final at %d\n",endbestmatch);
      assert(startbestmatch<=bestloc+100&&endbestmatch>bestloc-100);

      // If, over the final range, the front or back sequence alone scores at least as
      // well as the current best single match, it becomes the best single match.
      if(simplescore <= hitCounts[endbestmatch][bestfront]-hitCounts[startbestmatch+ksize-2][bestfront]){
        bestsimple=bestfront;
        simplescore=hitCounts[endbestmatch][bestfront]-hitCounts[startbestmatch+ksize-2][bestfront];
        // fprintf(stderr,"Resetting best simple, presumably due to trimming (now bestfront)\n");
      }
      if( simplescore <= hitCounts[endbestmatch][bestback]-hitCounts[startbestmatch+ksize-2][bestback]){
        bestsimple=bestback;
        simplescore=hitCounts[endbestmatch][bestback]-hitCounts[startbestmatch+ksize-2][bestback];
        // fprintf(stderr,"Resetting best simple, presumably due to trimming (now bestback)\n");
      }

      // A split is reported only when it strictly beats the best single sequence.
      if(bestscore>simplescore){
        printf("%s (len=%d): best split = %d %s : %s (score = %d from %d to %d ; separately, scores = %d and %d; best single %s scores %d )\n",
               Names[i],ilen,bestloc,NamesB[bestfront],NamesB[bestback],
               bestscore,startbestmatch,endbestmatch,
               hitCounts[endbestmatch][bestfront]-hitCounts[startbestmatch+ksize-2][bestfront],
               hitCounts[endbestmatch][bestback]-hitCounts[startbestmatch+ksize-2][bestback],
               NamesB[bestsimple],simplescore);
      } else {
        printf("%s (len=%d): best match = %s (from %d to %d, score = %d )\n",
               Names[i],ilen,NamesB[bestsimple],startbestmatch,endbestmatch,simplescore);
      }
    }
  }

  return (0);
}
int main(int argc, char **argv) { feParameters *G = new feParameters(); argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { G->gkpStorePath = argv[++arg]; } else if (strcmp(argv[arg], "-R") == 0) { G->bgnID = atoi(argv[++arg]); G->endID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-O") == 0) { G->ovlStorePath = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { G->errorRate = atof(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { G->minOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-o") == 0) { // For 'corrections' file output G->outputFileName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { G->numThreads = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-d") == 0) { G->Degree_Threshold = strtol(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-k") == 0) { G->Kmer_Len = strtol(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-p") == 0) { G->Use_Haplo_Ct = FALSE; } else if (strcmp(argv[arg], "-V") == 0) { G->Vote_Qualify_Len = strtol(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-x") == 0) { G->End_Exclude_Len = strtol(argv[++arg], NULL, 10); } else { fprintf(stderr, "Unknown option '%s'\n", argv[arg]); err++; } arg++; } if (G->numThreads == 0) err++; if (err > 0) { fprintf(stderr, "usage: %s[-ehp][-d DegrThresh][-k KmerLen][-x ExcludeLen]\n", argv[0]); fprintf(stderr, " [-F OlapFile][-S OlapStore][-o CorrectFile]\n"); fprintf(stderr, " [-t NumPThreads][-v VerboseLevel]\n"); fprintf(stderr, " [-V Vote_Qualify_Len]\n"); fprintf(stderr, " <FragStore> <lo> <hi>\n"); fprintf(stderr, "\n"); fprintf(stderr, "Makes corrections to fragment sequence based on overlaps\n"); fprintf(stderr, "and recomputes overlaps on corrected fragments\n"); fprintf(stderr, "Fragments come from <FragStore> <lo> and <hi> specify\n"); fprintf(stderr, "the range of fragments to modify\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "-d set keep flag 
on end of frags with less than this many olaps\n"); fprintf(stderr, "-F specify file of sorted overlaps to use (in the format produced\n"); fprintf(stderr, " by get-olaps\n"); fprintf(stderr, "-h print this message\n"); fprintf(stderr, "-k minimum exact-match region to prevent change\n"); fprintf(stderr, "-o specify output file to hold correction info\n"); fprintf(stderr, "-p don't use haplotype counts to correct\n"); fprintf(stderr, "-S specify the binary overlap store containing overlaps to use\n"); fprintf(stderr, "-t set number of p-threads to use\n"); fprintf(stderr, "-v specify level of verbose outputs, higher is more\n"); fprintf(stderr, "-V specify number of exact match bases around an error to vote to change\n"); fprintf(stderr, "-x length of end of exact match to exclude in preventing change\n"); if (G->numThreads == 0) fprintf(stderr, "ERROR: number of compute threads (-t) must be larger than zero.\n"); exit(1); } // // Initialize Globals // double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN); Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS); for (uint32 i = 0; i <= AS_MAX_READLEN; i++) G->Error_Bound[i] = (int)ceil(i * G->errorRate); // // // gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath); if (G->bgnID < 1) G->bgnID = 1; if (gkpStore->gkStore_getNumReads() < G->endID) G->endID = gkpStore->gkStore_getNumReads(); Read_Frags(G, gkpStore); Read_Olaps(G, gkpStore); // Now sort them! sort(G->olaps, G->olaps + G->olapsLen); //fprintf (stderr, "Before Stream_Old_Frags Num_Olaps = "F_S64"\n", Num_Olaps); Threaded_Stream_Old_Frags(G, gkpStore); //fprintf (stderr, " Failed overlaps = %d\n", Failed_Olaps); gkpStore->gkStore_close(); //Output_Details(G); Output_Corrections(G); delete G; exit(0); }
int main(int argc, char **argv) { char *gkpName = 0L; char *ovsName = 0L; char *iniClrName = NULL; char *maxClrName = NULL; char *outClrName = NULL; uint32 errorValue = AS_OVS_encodeEvalue(0.015); uint32 minAlignLength = 40; uint32 minReadLength = 64; char *outputPrefix = NULL; char logName[FILENAME_MAX] = {0}; char sumName[FILENAME_MAX] = {0}; FILE *logFile = 0L; FILE *sumFile = 0L; uint32 idMin = 1; uint32 idMax = UINT32_MAX; uint32 minEvidenceOverlap = 40; uint32 minEvidenceCoverage = 1; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { ovsName = argv[++arg]; } else if (strcmp(argv[arg], "-Ci") == 0) { iniClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Cm") == 0) { maxClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Co") == 0) { outClrName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { double erate = atof(argv[++arg]); errorValue = AS_OVS_encodeEvalue(erate); } else if (strcmp(argv[arg], "-l") == 0) { minAlignLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-ol") == 0) { minEvidenceOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-oc") == 0) { minEvidenceCoverage = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { AS_UTL_decodeRange(argv[++arg], idMin, idMax); } else { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); err++; } arg++; } if ((gkpName == NULL) || (ovsName == NULL) || (outputPrefix == NULL) || (err)) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G gkpStore path to read store\n"); fprintf(stderr, " -O ovlStore path to overlap store\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o name output prefix, for 
logging\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n"); //fprintf(stderr, " -Cm clearFile path to maximal clear ranges\n"); fprintf(stderr, " -Co clearFile path to ouput clear ranges\n"); fprintf(stderr, "\n"); fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n"); //fprintf(stderr, " -l length ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -ol l the minimum evidence overlap length\n"); fprintf(stderr, " -oc c the minimum evidence overlap coverage\n"); fprintf(stderr, " evidence overlaps must overlap by 'l' bases to be joined, and\n"); fprintf(stderr, " must be at least 'c' deep to be retained\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n"); fprintf(stderr, "\n"); exit(1); } gkStore *gkp = gkStore::gkStore_open(gkpName); ovStore *ovs = new ovStore(ovsName, gkp); clearRangeFile *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, gkp); clearRangeFile *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, gkp); clearRangeFile *outClr = (outClrName == NULL) ? NULL : new clearRangeFile(outClrName, gkp); if (outClr) // If the outClr file exists, those clear ranges are loaded. We need to reset them // back to 'untrimmed' for now. outClr->reset(gkp); if (iniClr && outClr) // An iniClr file was supplied, so use those as the initial clear ranges. 
outClr->copy(iniClr); if (outputPrefix) { sprintf(logName, "%s.log", outputPrefix); sprintf(sumName, "%s.summary", outputPrefix); errno = 0; logFile = fopen(logName, "w"); if (errno) fprintf(stderr, "Failed to open log file '%s' for writing: %s\n", logName, strerror(errno)), exit(1); sumFile = fopen(sumName, "w"); if (errno) fprintf(stderr, "Failed to open summary file '%s' for writing: %s\n", sumName, strerror(errno)), exit(1); fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n"); fprintf(sumFile, "Overlap error rate <= %.4f fraction error\n", AS_OVS_decodeEvalue(errorValue)); fprintf(sumFile, "Overlap min overlap >= %u base%s (for 'largest covered')\n", minEvidenceOverlap, (minEvidenceOverlap == 1) ? "" : "s"); fprintf(sumFile, "Overlap min coverage >= %u read%s (for 'largest covered')\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s"); } uint32 ovlLen = 0; uint32 ovlMax = 64 * 1024; ovOverlap *ovl = ovOverlap::allocateOverlaps(gkp, ovlMax); memset(ovl, 0, sizeof(ovOverlap) * ovlMax); char logMsg[1024] = {0}; if (idMin < 1) idMin = 1; if (idMax > gkp->gkStore_getNumReads()) idMax = gkp->gkStore_getNumReads(); fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads.\n", idMin, idMax, gkp->gkStore_getNumReads()); for (uint32 id=idMin; id<=idMax; id++) { gkRead *read = gkp->gkStore_getRead(id); gkLibrary *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID()); logMsg[0] = 0; // If the fragment is deleted, do nothing. If the fragment was deleted AFTER overlaps were // generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments // we skip. // if ((iniClr) && (iniClr->isDeleted(id) == true)) continue; // If it did not request trimming, do nothing. Similar to the above, we'll get overlaps to // fragments we skip. 
// if ((libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) && (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE)) continue; // Decide on the initial trimming. We copied any iniClr into outClr above, and if there wasn't // an iniClr, then outClr is the full read. uint32 ibgn = outClr->bgn(id); uint32 iend = outClr->end(id); // Set the, ahem, initial final trimming. bool isGood = false; uint32 fbgn = ibgn; uint32 fend = iend; // Load overlaps. uint32 nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax); // Trim! if (nLoaded == 0) { // No overlaps, so mark it as junk. isGood = false; } else if (libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) { // Use the largest region covered by overlaps as the trim assert(ovlLen > 0); assert(id == ovl[0].a_iid); isGood = largestCovered(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, errorValue, minEvidenceOverlap, minEvidenceCoverage, minReadLength); assert(fbgn <= fend); } else if (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE) { // Use the largest region covered by overlaps as the trim assert(ovlLen > 0); assert(id == ovl[0].a_iid); isGood = bestEdge(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, errorValue, minEvidenceOverlap, minEvidenceCoverage, minReadLength); assert(fbgn <= fend); } else { // Do nothing. Really shouldn't get here. assert(0); continue; } // Enforce the maximum clear range if ((isGood) && (maxClr)) { isGood = enforceMaximumClearRange(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, maxClr); assert(fbgn <= fend); } // // Trimmed. Make sense of the result, write some logs, and update the output. // // If bad trimming or too small, write the log and keep going. // if ((isGood == false) || (fend - fbgn < minReadLength)) { outClr->setbgn(id) = fbgn; outClr->setend(id) = fend; outClr->setDeleted(id); // Gah, just obliterates the clear range. fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tDEL%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? 
"" : logMsg); } // If we didn't change anything, also write a log. // else if ((ibgn == fbgn) && (iend == fend)) { fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOC%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); continue; } // Otherwise, we actually did something. else { outClr->setbgn(id) = fbgn; outClr->setend(id) = fend; fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tMOD%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); } } gkp->gkStore_close(); delete ovs; delete iniClr; delete maxClr; delete outClr; fclose(logFile); fclose(sumFile); exit(0); }
int main(int argc, char **argv) { char *gkpName = NULL; char *ovlName = NULL; char *outPrefix = NULL; uint32 bgnID = 0; uint32 endID = UINT32_MAX; uint32 ovlSelect = 0; double ovlAtMost = AS_OVS_encodeEvalue(1.0); double ovlAtLeast = AS_OVS_encodeEvalue(0.0); double expectedMean = 30.0; double expectedStdDev = 7.0; bool toFile = true; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) gkpName = argv[++arg]; else if (strcmp(argv[arg], "-O") == 0) ovlName = argv[++arg]; else if (strcmp(argv[arg], "-o") == 0) outPrefix = argv[++arg]; else if (strcmp(argv[arg], "-C") == 0) { expectedMean = atof(argv[++arg]); expectedStdDev = atof(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) toFile = false; else if (strcmp(argv[arg], "-b") == 0) bgnID = atoi(argv[++arg]); else if (strcmp(argv[arg], "-e") == 0) endID = atoi(argv[++arg]); else if (strcmp(argv[arg], "-overlap") == 0) { arg++; if (strcmp(argv[arg], "5") == 0) ovlSelect |= OVL_5; else if (strcmp(argv[arg], "3") == 0) ovlSelect |= OVL_3; else if (strcmp(argv[arg], "contained") == 0) ovlSelect |= OVL_CONTAINED; else if (strcmp(argv[arg], "container") == 0) ovlSelect |= OVL_CONTAINER; else if (strcmp(argv[arg], "partial") == 0) ovlSelect |= OVL_PARTIAL; else if (strcmp(argv[arg], "atmost") == 0) ovlAtMost = atof(argv[++arg]); else if (strcmp(argv[arg], "atleast") == 0) ovlAtLeast = atof(argv[++arg]); else { fprintf(stderr, "ERROR: unknown -overlap '%s'\n", argv[arg]); exit(1); } } else { fprintf(stderr, "%s: unknown option '%s'.\n", argv[0], argv[arg]); err++; } arg++; } if (gkpName == NULL) err++; if (ovlName == NULL) err++; if (outPrefix == NULL) err++; if (err) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -o outPrefix [-b bgnID] [-e endID] ...\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "Generates statistics for an overlap store. 
By default all possible classes\n"); fprintf(stderr, "are generated, options can disable specific classes.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -C mean stddev Expect coverage at mean +- stddev\n"); fprintf(stderr, " -c Write stats to stdout, not to a file\n"); fprintf(stderr, "\n"); fprintf(stderr, "Outputs:\n"); fprintf(stderr, "\n"); fprintf(stderr, " outPrefix.per-read.log One line per read, giving readID, read length and classification.\n"); fprintf(stderr, " outPrefix.summary The primary statistical output.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Overlap Selection:\n"); fprintf(stderr, " -overlap 5 5' overlaps only\n"); fprintf(stderr, " -overlap 3 3' overlaps only\n"); fprintf(stderr, " -overlap contained contained overlaps only\n"); fprintf(stderr, " -overlap container container overlaps only\n"); fprintf(stderr, " -overlap partial overlap is not valid for assembly\n"); fprintf(stderr, "\n"); fprintf(stderr, " An overlap is classified as exactly one of 5', 3', contained or container.\n"); fprintf(stderr, " By default, all overlaps are selected. Specifying any of these options will\n"); fprintf(stderr, " restrict overlaps to just those classifications. E.g., '-overlap 5 -overlap 3'\n"); fprintf(stderr, " will select dovetail overlaps off either end of the read.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -overlap atmost x at most fraction x error (overlap-erate <= x)\n"); fprintf(stderr, " -overlap atleast x at least fraction x error (x <= overlap-erate)\n"); fprintf(stderr, "\n"); fprintf(stderr, " Overlaps can be further filtered by fraction error. Usually, this will be an\n"); fprintf(stderr, " 'atmost' filtering to use only the higher qualtiy overlaps.\n"); fprintf(stderr, "\n"); fprintf(stderr, " A contained read has at least one container overlap. Container read -> ---------------\n"); fprintf(stderr, " A container read has at least one contained overlap. 
Contained overlap -> -----\n"); fprintf(stderr, "\n"); exit(1); } // Set the default to 'all' if nothing set. if (ovlSelect == 0) ovlSelect = 0xff; // Open inputs, find limits. gkStore *gkpStore = gkStore::gkStore_open(gkpName); ovStore *ovlStore = new ovStore(ovlName, gkpStore); if (endID > gkpStore->gkStore_getNumReads()) endID = gkpStore->gkStore_getNumReads(); if (endID < bgnID) fprintf(stderr, "ERROR: invalid bgn/end range bgn=%u end=%u; only %u reads in the store\n", bgnID, endID, gkpStore->gkStore_getNumReads()), exit(1); ovlStore->setRange(bgnID, endID); // Allocate output histograms. histogramStatistics *readNoOlaps = new histogramStatistics; // Bad reads! (read length) histogramStatistics *readHole = new histogramStatistics; histogramStatistics *readHump = new histogramStatistics; histogramStatistics *readNo5 = new histogramStatistics; histogramStatistics *readNo3 = new histogramStatistics; histogramStatistics *olapHole = new histogramStatistics; // Hole size (sum of holes if more than one) histogramStatistics *olapHump = new histogramStatistics; // Hump size (sum of humps if more than one) histogramStatistics *olapNo5 = new histogramStatistics; // 5' uncovered size histogramStatistics *olapNo3 = new histogramStatistics; // 3' uncovered size histogramStatistics *readLowCov = new histogramStatistics; // Good reads! (read length) histogramStatistics *readUnique = new histogramStatistics; histogramStatistics *readRepeatCont = new histogramStatistics; histogramStatistics *readRepeatDove = new histogramStatistics; histogramStatistics *readSpanRepeat = new histogramStatistics; histogramStatistics *readUniqRepeatCont = new histogramStatistics; histogramStatistics *readUniqRepeatDove = new histogramStatistics; histogramStatistics *readUniqAnchor = new histogramStatistics; histogramStatistics *covrLowCov = new histogramStatistics; // Good reads! 
(overlap length) histogramStatistics *covrUnique = new histogramStatistics; histogramStatistics *covrRepeatCont = new histogramStatistics; histogramStatistics *covrRepeatDove = new histogramStatistics; histogramStatistics *covrSpanRepeat = new histogramStatistics; histogramStatistics *covrUniqRepeatCont = new histogramStatistics; histogramStatistics *covrUniqRepeatDove = new histogramStatistics; histogramStatistics *covrUniqAnchor = new histogramStatistics; histogramStatistics *olapLowCov = new histogramStatistics; // Good reads! (overlap length) histogramStatistics *olapUnique = new histogramStatistics; histogramStatistics *olapRepeatCont = new histogramStatistics; histogramStatistics *olapRepeatDove = new histogramStatistics; histogramStatistics *olapSpanRepeat = new histogramStatistics; histogramStatistics *olapUniqRepeatCont = new histogramStatistics; histogramStatistics *olapUniqRepeatDove = new histogramStatistics; histogramStatistics *olapUniqAnchor = new histogramStatistics; // Coverage interval lists, of all overlaps selected. // Open outputs. char N[FILENAME_MAX]; sprintf(N, "%s.per-read.log", outPrefix); FILE *LOG = fopen(N, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); // Compute! 
uint32 overlapsMax = 1024 * 1024; uint32 overlapsLen = 0; ovOverlap *overlaps = ovOverlap::allocateOverlaps(gkpStore, overlapsMax); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); while (overlapsLen > 0) { uint32 readID = overlaps[0].a_iid; uint32 readLen = gkpStore->gkStore_getRead(readID)->gkRead_sequenceLength(); intervalList<uint32> cov; uint32 covID = 0; bool readCoverage5 = false; bool readCoverage3 = false; bool readContained = false; bool readContainer = false; bool readPartial = false; for (uint32 oo=0; oo<overlapsLen; oo++) { bool is5prime = (overlaps[oo].overlapAEndIs5prime() == true) && (ovlSelect & OVL_5) && (overlaps[oo].overlap5primeIsPartial() == false); bool is3prime = (overlaps[oo].overlapAEndIs3prime() == true) && (ovlSelect & OVL_3) && (overlaps[oo].overlap3primeIsPartial() == false); bool isContained = (overlaps[oo].overlapAIsContained() == true) && (ovlSelect & OVL_CONTAINED); bool isContainer = (overlaps[oo].overlapAIsContainer() == true) && (ovlSelect & OVL_CONTAINER); bool isPartial = (overlaps[oo].overlapIsPartial() == true) && (ovlSelect & OVL_PARTIAL); // Ignore the overlap? if ((is5prime == false) && (is3prime == false) && (isContained == false) && (isContainer == false) && (isPartial == false)) continue; if (overlaps[oo].evalue() < ovlAtLeast) continue; if (overlaps[oo].evalue() > ovlAtMost) continue; readCoverage5 |= is5prime; // If there is a 5' overlap, the read isn't missing 5' coverage readCoverage3 |= is3prime; readContained |= isContained; // Read is contained in something else readContainer |= isContainer; // Read is a container of somethign else readPartial |= isPartial; cov.add(overlaps[oo].a_bgn(), overlaps[oo].a_end() - overlaps[oo].a_bgn()); } // If we filtered all the overlaps, just get out of here. Yeah, some code duplication, // but cleaner than sticking an if block around the rest of the loop. 
if (cov.numberOfIntervals() == 0) { readNoOlaps->add(readLen); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } // Generate a depth-of-coverage map, then merge intervals intervalList<uint32> depth(cov); cov.merge(); // Analyze the intervals, save per-read information to the log. uint32 lastInt = cov.numberOfIntervals() - 1; uint32 bgn = cov.lo(0); uint32 end = cov.hi(lastInt); bool contiguous = (lastInt == 0) ? true : false; bool readFullCoverage = (lastInt == 0) && (bgn == 0) && (end == readLen); bool readMissingMiddle = (lastInt != 0); uint32 holeSize = 0; uint32 no5Size = bgn; uint32 no3Size = readLen - end; for (uint32 ii=1; ii<cov.numberOfIntervals(); ii++) holeSize += cov.lo(ii) - cov.hi(ii-1); // Handle bad cases. If it's a partial overlap, ignore the is5prime and is3prime markings. if (readMissingMiddle == true) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-missing"); readHole->add(readLen); olapHole->add(holeSize); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } if ((readCoverage5 == false) && (readCoverage3 == false) && (readContained == false) && (readPartial == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-only"); readHump->add(readLen); olapHump->add(no5Size + no3Size); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } if ((readCoverage5 == false) && (readContained == false) && (readPartial == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-5-prime"); readNo5->add(readLen); olapNo5->add(no5Size); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } if ((readCoverage3 == false) && (readContained == false) && (readPartial == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-3-prime"); readNo3->add(readLen); olapNo3->add(no3Size); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } // Handle good cases. For partial overlaps, bgn and end are not the extent of the read. 
if (readPartial == false) { assert(bgn == 0); assert(end == readLen); assert(contiguous == true); assert(readFullCoverage == true); } // Compute mean and std.dev of coverage. From this, we decide if the read is 'unique', // 'repeat' or 'mixed'. If 'mixed', we then need to decide if the read spans a repeat, or // joins unique and repeat. double covMean = 0; double covStdDev = 0; for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covMean += (depth.hi(ii) - depth.lo(ii)) * depth.depth(ii); covMean /= readLen; for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covStdDev += (depth.hi(ii) - depth.lo(ii)) * (depth.depth(ii) - covMean) * (depth.depth(ii) - covMean); covStdDev = sqrt(covStdDev / (readLen - 1)); // Classify each interval as either 'l'owcoverage, 'u'nique or 'r'epeat. char *classification = new char [depth.numberOfIntervals()]; for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) { if (depth.depth(ii) < expectedMean - 3 * expectedStdDev) { classification[ii] = 'l'; } else if (depth.depth(ii) < expectedMean + 3 * expectedStdDev) { classification[ii] = 'u'; } else { classification[ii] = 'r'; } } // Try to detect if a read is part unique and part repeat. bool isLowCov = false; bool isUnique = false; bool isRepeat = false; bool isSpanRepeat = false; bool isUniqRepeat = false; bool isUniqAnchor = false; int32 bgni = 0; int32 endi = depth.numberOfIntervals() - 1; char type5 = classification[bgni]; char typem = 0; char type3 = classification[endi]; while ((bgni <= endi) && (type5 == classification[bgni])) bgni++; bgni--; while ((bgni <= endi) && (type3 == classification[endi])) endi--; endi++; // All the same classification? if (bgni == endi) { isLowCov = (type5 == 'l'); isUnique = (type5 == 'u'); isRepeat = (type5 == 'r'); } // Nope, if we aren't the same, assume it is uniqRepeat. else if (type5 != type3) { isUniqRepeat = true; } // Nope, the same on both ends. Assume we're just flipped. 
else { if (type5 == 'r') isUniqAnchor = true; else isSpanRepeat = true; } // Now, do something with it. // LOG - readID readLen classification if (isLowCov) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "low-cov"); readLowCov->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrLowCov->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if (isUnique) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "unique"); readUnique->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrUnique->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if ((isRepeat) && (readContained == true)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "contained-repeat"); readRepeatCont->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrRepeatCont->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if ((isRepeat) && (readContained == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "dovetail-repeat"); readRepeatDove->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrRepeatDove->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if (isSpanRepeat) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "span-repeat"); readSpanRepeat->add(readLen); olapSpanRepeat->add(depth.lo(endi) - depth.hi(bgni)); } if ((isUniqRepeat) && (readContained == true)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-cont"); readUniqRepeatCont->add(readLen); } if ((isUniqRepeat) && (readContained == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-dove"); readUniqRepeatDove->add(readLen); } if (isUniqAnchor) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-anchor"); readUniqAnchor->add(readLen); olapUniqAnchor->add(depth.lo(endi) - depth.hi(bgni)); } // Done. Read more data. overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); } fclose(LOG); // Done with logging. 
readHole->finalizeData(); olapHole->finalizeData(); readHump->finalizeData(); olapHump->finalizeData(); readNo5->finalizeData(); olapNo5->finalizeData(); readNo3->finalizeData(); olapNo3->finalizeData(); readLowCov->finalizeData(); olapLowCov->finalizeData(); covrLowCov->finalizeData(); readUnique->finalizeData(); olapUnique->finalizeData(); covrUnique->finalizeData(); readRepeatCont->finalizeData(); olapRepeatCont->finalizeData(); covrRepeatCont->finalizeData(); readRepeatDove->finalizeData(); olapRepeatDove->finalizeData(); covrRepeatDove->finalizeData(); readSpanRepeat->finalizeData(); olapSpanRepeat->finalizeData(); readUniqRepeatCont->finalizeData(); olapUniqRepeatCont->finalizeData(); readUniqRepeatDove->finalizeData(); olapUniqRepeatDove->finalizeData(); readUniqAnchor->finalizeData(); olapUniqAnchor->finalizeData(); LOG = stdout; if (toFile == true) { sprintf(N, "%s.summary", outPrefix); LOG = fopen(N, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); } fprintf(LOG, "category reads read length feature size or coverage analysis\n"); fprintf(LOG, "---------------- ------- ---------------------- ------------------------ --------------------\n"); fprintf(LOG, "middle-missing %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHole->numberOfObjects(), readHole->mean(), readHole->stddev(), olapHole->mean(), olapHole->stddev()); fprintf(LOG, "middle-hump %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHump->numberOfObjects(), readHump->mean(), readHump->stddev(), olapHump->mean(), olapHump->stddev()); fprintf(LOG, "no-5-prime %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo5->numberOfObjects(), readNo5->mean(), readNo5->stddev(), olapNo5->mean(), olapNo5->stddev()); fprintf(LOG, "no-3-prime %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo3->numberOfObjects(), readNo3->mean(), readNo3->stddev(), olapNo3->mean(), olapNo3->stddev()); 
fprintf(LOG, "\n"); fprintf(LOG, "low-coverage %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, potential for lower quality consensus)\n", readLowCov->numberOfObjects(), readLowCov->mean(), readLowCov->stddev(), covrLowCov->mean(), covrLowCov->stddev()); fprintf(LOG, "unique %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, perfect, yay)\n", readUnique->numberOfObjects(), readUnique->mean(), readUnique->stddev(), covrUnique->mean(), covrUnique->stddev()); fprintf(LOG, "repeat-cont %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (potential for consensus errors, no impact on assembly)\n", readRepeatCont->numberOfObjects(), readRepeatCont->mean(), readRepeatCont->stddev(), covrRepeatCont->mean(), covrRepeatCont->stddev()); fprintf(LOG, "repeat-dove %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (hard to assemble, likely won't assemble correctly or even at all)\n", readRepeatDove->numberOfObjects(), readRepeatDove->mean(), readRepeatDove->stddev(), covrRepeatDove->mean(), covrRepeatDove->stddev()); fprintf(LOG, "\n"); fprintf(LOG, "span-repeat %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (read spans a large repeat, usually easy to assemble)\n", readSpanRepeat->numberOfObjects(), readSpanRepeat->mean(), readSpanRepeat->stddev(), olapSpanRepeat->mean(), olapSpanRepeat->stddev()); fprintf(LOG, "uniq-repeat-cont %7"F_U64P" %10.2f +- %-8.2f (should be uniquely placed, low potential for consensus errors, no impact on assembly)\n", readUniqRepeatCont->numberOfObjects(), readUniqRepeatCont->mean(), readUniqRepeatCont->stddev()); fprintf(LOG, "uniq-repeat-dove %7"F_U64P" %10.2f +- %-8.2f (will end contigs, potential to misassemble)\n", readUniqRepeatDove->numberOfObjects(), readUniqRepeatDove->mean(), readUniqRepeatDove->stddev()); fprintf(LOG, "uniq-anchor %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (repeat read, with unique section, probable bad read)\n", readUniqAnchor->numberOfObjects(), readUniqAnchor->mean(), readUniqAnchor->stddev(), 
olapUniqAnchor->mean(), olapUniqAnchor->stddev()); if (toFile == true) fclose(LOG); delete ovlStore; gkpStore->gkStore_close(); exit(0); }
int main(int argc, char **argv) { int msglist[NUM_OF_REC_TYPES + 1]; FILE *outfile[NUM_OF_REC_TYPES + 1]; off_t count[NUM_OF_REC_TYPES + 1]; off_t size[NUM_OF_REC_TYPES + 1]; int i; for (i=0; i<=NUM_OF_REC_TYPES; i++) { msglist[i] = 0; outfile[i] = 0L; count[i] = 0; size[i] = 0; } int arg = 1; int inc = 0; int err = 0; int msg = 0; argc = AS_configure(argc, argv); while (arg < argc) { if (strcmp(argv[arg], "-i") == 0) { inc = 1; } else if (strcmp(argv[arg], "-x") == 0) { inc = 0; } else if (strcmp(argv[arg], "-o") == 0) { errno = 0; FILE *F = fopen(argv[++arg], "w"); if (errno) fprintf(stderr, "%s: failed to open output file '%s': %s\n", argv[0], argv[arg], strerror(errno)), exit(1); // Depending on the include flag, we either write all messages // listed in our msglist (or write all message not in the // msglist) to the freshly opened file. // if (inc) { // Include message i in the output if it was listed for (i=1; i<=NUM_OF_REC_TYPES; i++) if ((outfile[i] == NULL) && (msglist[i] > 0)) outfile[i] = F; } else { // Include message i in the output if it was not listed for (i=1; i<=NUM_OF_REC_TYPES; i++) if ((outfile[i] == NULL) && (msglist[i] == 0)) outfile[i] = F; } for (i=0; i<=NUM_OF_REC_TYPES; i++) msglist[i] = 0; } else if (strcmp(argv[arg], "-m") == 0) { int type = GetMessageType(argv[++arg]); if ((type >= 1) && (type <= NUM_OF_REC_TYPES)) { msglist[type]++; msg++; } else { fprintf(stderr, "%s: invalid message type '%s'.\n", argv[0], argv[arg]); err = 1; } } else if (strcmp(argv[arg], "-h") == 0) { err = 1; } else { int type = GetMessageType(argv[arg]); if ((type >= 1) && (type <= NUM_OF_REC_TYPES)) { msglist[type]++; msg++; } else { fprintf(stderr, "%s: invalid option '%s'.\n", argv[0], argv[arg]); err = 1; } } arg++; } if (err) usage(argv[0]), exit(1); // Assume everything else goes to stdout. We need to obey the inc // flag, still, though. 
// if (inc) { // Include message i in the output if it was listed for (i=1; i<=NUM_OF_REC_TYPES; i++) if ((outfile[i] == NULL) && (msglist[i] > 0)) outfile[i] = stdout; } else { // Include message i in the output if it was not listed for (i=1; i<=NUM_OF_REC_TYPES; i++) if ((outfile[i] == NULL) && (msglist[i] == 0)) outfile[i] = stdout; } GenericMesg *pmesg; off_t currPos = 0; off_t prevPos = 0; while (ReadProtoMesg_AS(stdin, &pmesg) != EOF) { assert(pmesg->t <= NUM_OF_REC_TYPES); currPos = AS_UTL_ftell(stdin); if (outfile[pmesg->t] != NULL) { count[pmesg->t]++; size[pmesg->t] += currPos - prevPos; WriteProtoMesg_AS(outfile[pmesg->t], pmesg); } prevPos = currPos; } for (i=0; i<=NUM_OF_REC_TYPES; i++) if (count[i] > 0) fprintf(stderr, "%s num "F_OFF_T" size "F_OFF_T" avg %f\n", MessageTypeName[i], count[i], size[i], (double)size[i] / count[i]); exit(0); }
int main(int argc, char **argv) { char *ovlName = NULL; uint32 maxJob = 0; bool deleteIntermediates = true; bool doExplicitTest = false; bool doFixes = false; char name[FILENAME_MAX]; argc = AS_configure(argc, argv); int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-O") == 0) { ovlName = argv[++arg]; } else if (strcmp(argv[arg], "-F") == 0) { maxJob = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-f") == 0) { doFixes = true; } else if (strcmp(argv[arg], "-t") == 0) { doExplicitTest = true; ovlName = argv[++arg]; } else if (strcmp(argv[arg], "-nodelete") == 0) { deleteIntermediates = false; } else { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); } arg++; } if (ovlName == NULL) err++; if ((maxJob == 0) && (doExplicitTest == false)) err++; if (err) { fprintf(stderr, "usage: %s ...\n", argv[0]); fprintf(stderr, " -O x.ovlStore path to overlap store to build the final index for\n"); fprintf(stderr, " -F s number of slices used in bucketizing/sorting\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t x.ovlStore explicitly test a previously constructed index\n"); fprintf(stderr, " -f when testing, also create a new 'idx.fixed' which might\n"); fprintf(stderr, " resolve rare problems\n"); fprintf(stderr, "\n"); fprintf(stderr, " -nodelete do not remove intermediate files when the index is\n"); fprintf(stderr, " successfully created\n"); fprintf(stderr, "\n"); fprintf(stderr, " DANGER DO NOT USE DO NOT USE DO NOT USE DANGER\n"); fprintf(stderr, " DANGER DANGER\n"); fprintf(stderr, " DANGER This command is difficult to run by hand. DANGER\n"); fprintf(stderr, " DANGER Use ovStoreCreate instead. 
DANGER\n"); fprintf(stderr, " DANGER DANGER\n"); fprintf(stderr, " DANGER DO NOT USE DO NOT USE DO NOT USE DANGER\n"); fprintf(stderr, "\n"); if (ovlName == NULL) fprintf(stderr, "ERROR: No overlap store (-O) supplied.\n"); if ((maxJob == 0) && (doExplicitTest == false)) fprintf(stderr, "ERROR: One of -F (number of slices) or -t (test a store) must be supplied.\n"); exit(1); } // Do the test, and maybe fix things up. if (doExplicitTest == true) { bool passed = testIndex(ovlName, doFixes); exit((passed == true) ? 0 : 1); } // Check that all segments are present. Every segment should have an info file. uint32 cntJob = 0; for (uint32 i=1; i<=maxJob; i++) { uint32 complete = 0; sprintf(name, "%s/%04d", ovlName, i); if (AS_UTL_fileExists(name, FALSE, FALSE) == true) complete++; else fprintf(stderr, "ERROR: Segment "F_U32" data not present (%s)\n", i, name); sprintf(name, "%s/%04d.info", ovlName, i); if (AS_UTL_fileExists(name, FALSE, FALSE) == true) complete++; else fprintf(stderr, "ERROR: Segment "F_U32" info not present (%s)\n", i, name); sprintf(name, "%s/%04d.index", ovlName, i); if (AS_UTL_fileExists(name, FALSE, FALSE) == true) complete++; else fprintf(stderr, "ERROR: Segment "F_U32" index not present (%s)\n", i, name); if (complete == 3) cntJob++; } if (cntJob != maxJob) { fprintf(stderr, "ERROR: Expected "F_U32" segments, only found "F_U32".\n", maxJob, cntJob); exit(1); } // Merge the stuff. mergeInfoFiles(ovlName, maxJob); // Diagnostics. if (testIndex(ovlName, false) == false) { fprintf(stderr, "ERROR: index failed tests.\n"); exit(1); } // Remove intermediates. For the buckets, we keep going until there are 10 in a row not present. // During testing, on a microbe using 2850 buckets, some buckets were empty. if (deleteIntermediates == false) { fprintf(stderr, "\n"); fprintf(stderr, "Not removing intermediate files. 
Finished.\n"); exit(0); } fprintf(stderr, "\n"); fprintf(stderr, "Removing intermediate files.\n"); // Removing indices is easy, beacuse we know how many there are. for (uint32 i=1; i<=maxJob; i++) { sprintf(name, "%s/%04u.index", ovlName, i); AS_UTL_unlink(name); sprintf(name, "%s/%04u.info", ovlName, i); AS_UTL_unlink(name); } // We don't know how many buckets there are, so we remove until we fail to find ten // buckets in a row. for (uint32 missing=0, i=1; missing<10; i++) { sprintf(name, "%s/bucket%04d", ovlName, i); if (AS_UTL_fileExists(name, TRUE, FALSE) == FALSE) { missing++; continue; } missing = 0; sprintf(name, "%s/bucket%04d/sliceSizes", ovlName, i); AS_UTL_unlink(name); sprintf(name, "%s/bucket%04d", ovlName, i); rmdir(name); } fprintf(stderr, "Finished.\n"); exit(0); }
//  Compute consensus for unitig (IUM) and contig (ICM) messages using an external
//  SeqAn executable.
//
//  Options (parsed below):
//    -G gkpStore   gatekeeper store (required)     -S n   gatekeeper partition to load
//    -c file       input message file (required)
//    -o file       output message file (required); also used as the prefix for work files
//    -s program    SeqAn executable (required)
//    -w dir        working directory
//    -u seqStore   sequence store, needed when ICM messages are present
//    -V n / -p n   sequence store version / partition
//
//  For every IUM or ICM message with an empty consensus, the clear range of each read
//  (and, for contigs, each unitig consensus) is written as FastA to a temporary file,
//  updateRecord() / updateICMRecord() is called, and the message is written to the
//  output.  Messages of any other type are not copied to the output.
//
int32 main(int32 argc, char **argv) {
  int32  arg = 1;
  int32  err = 0;

  char  *gkpStoreName   = NULL;
  int32  gkpStorePart   = 0;
  char  *msgFile        = NULL;
  char  *outputFileName = NULL;
  char  *seqAn          = NULL;
  char  *wrkDir         = NULL;
  char  *seqStoreName   = NULL;
  int32  seqStoreVer    = 0;
  int32  seqStorePart   = 0;

  argc = AS_configure(argc, argv);

  while (arg < argc) {
    if        (strcmp(argv[arg], "-c") == 0) {
      msgFile = argv[++arg];

    } else if (strcmp(argv[arg], "-G") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-S") == 0) {
      gkpStorePart = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputFileName = argv[++arg];

    } else if (strcmp(argv[arg], "-s") == 0) {
      seqAn = argv[++arg];

    } else if (strcmp(argv[arg], "-w") == 0) {
      wrkDir = argv[++arg];

    } else if (strcmp(argv[arg], "-u") == 0) {
      seqStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-V") == 0) {
      seqStoreVer = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-p") == 0) {
      seqStorePart = atoi(argv[++arg]);

    } else {
      err++;
    }

    arg++;
  }

  if ((err) || (gkpStoreName == NULL) || (msgFile == NULL) || (outputFileName == NULL) || seqAn == NULL) {
    fprintf(stderr, "USAGE: SeqAn_CNS -G <gkpStore> -c <input.cgb> -o <output.cgi> -s <seqan_executable> [-u seqstore, required for contig consensus] [-w working directory]\n");
    exit(1);
  }

  gkStore *gkpStore = new gkStore(gkpStoreName, FALSE, FALSE);

  gkpStore->gkStore_loadPartition(gkpStorePart);

  gkFragment   fr;
  GenericMesg *pmesg;
  tSequenceDB *sequenceDB = NULL;

  FILE *infp  = fopen(msgFile,"r");
  FILE *tempReads;
  FILE *outfp = fopen(outputFileName, "w");

  //  Both streams are handed straight to the message reader/writer below, so stop
  //  here if either could not be opened.

  if (infp == NULL) {
    fprintf(stderr, "ERROR: failed to open input '%s' for reading.\n", msgFile);
    exit(1);
  }

  if (outfp == NULL) {
    fprintf(stderr, "ERROR: failed to open output '%s' for writing.\n", outputFileName);
    exit(1);
  }

  char  fileName[AS_SEQAN_MAX_BUFFER_LENGTH];
  char *prefix = outputFileName;

  getFileName(prefix, wrkDir, AS_SEQAN_INPUT_NAME, fileName);

  int32 i = 0;

  while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) {
    int32 freeMem = 0;

    if (pmesg->t == MESG_IUM) {
      IntUnitigMesg *ium_mesg = (IntUnitigMesg *)pmesg->m;

      if (strlen(ium_mesg->consensus) == 0) {
        tempReads = fopen(fileName,"w");

        if (tempReads == NULL) {
          fprintf(stderr, "ERROR: failed to open temporary file '%s' for writing.\n", fileName);
          exit(1);
        }

        for (i =0; i < ium_mesg->num_frags; i++) {
          // get the fragment sequence
          gkpStore->gkStore_getFragment(ium_mesg->f_list[i].ident, &fr, GKFRAGMENT_QLT);

          uint32  clrBeg   = fr.gkFragment_getClearRegionBegin();
          uint32  clrEnd   = fr.gkFragment_getClearRegionEnd  ();
          char   *seqStart = fr.gkFragment_getSequence();
          char   *seq      = seqStart+clrBeg;

          //  'seq' already starts at clrBeg, so the clear range ends clrEnd-clrBeg
          //  characters in; terminating at seq[clrEnd] would be clrBeg too far.
          //  NOTE(review): assumes the sequence buffer is writable up to clrEnd - confirm.
          seq[clrEnd - clrBeg] = 0;

          AS_UTL_writeFastA(tempReads, seq, clrEnd-clrBeg,
                            ">" F_IID "," F_IID "\n", ium_mesg->f_list[i].position.bgn, ium_mesg->f_list[i].position.end);
        }

        fclose(tempReads);

        updateRecord(ium_mesg, fileName, seqAn, prefix, wrkDir);
        freeMem = 1;
      }

      WriteProtoMesg_AS(outfp, pmesg);

      //  Only release consensus/quality when this pass through the loop replaced them.
      if (freeMem) {
        safe_free(ium_mesg->consensus);
        safe_free(ium_mesg->quality);
      }

    } else if (pmesg->t == MESG_ICM) {
      IntConConMesg *icm_mesg = (IntConConMesg *)pmesg->m;

      if (seqStoreName == NULL) {
        fprintf(stderr, "USAGE: The -u option is required for contig consensus\n");
        exit(1);
      }

      //  The sequence store is opened on first use.
      if (sequenceDB == NULL) {
        sequenceDB = openSequenceDB(seqStoreName, FALSE, seqStoreVer);
        openSequenceDBPartition(sequenceDB, seqStorePart);
      }

      if (strlen(icm_mesg->consensus) == 0) {
        tempReads = fopen(fileName,"w");

        if (tempReads == NULL) {
          fprintf(stderr, "ERROR: failed to open temporary file '%s' for writing.\n", fileName);
          exit(1);
        }

        for (i =0; i < icm_mesg->num_pieces; i++) {
          // get the fragment sequence
          gkpStore->gkStore_getFragment(icm_mesg->pieces[i].ident, &fr, GKFRAGMENT_QLT);

          uint32  clrBeg   = fr.gkFragment_getClearRegionBegin();
          uint32  clrEnd   = fr.gkFragment_getClearRegionEnd  ();
          char   *seqStart = fr.gkFragment_getSequence();
          char   *seq      = seqStart+clrBeg;

          //  Terminate at the end of the clear range, relative to 'seq' (see the
          //  unitig case above).
          seq[clrEnd - clrBeg] = 0;

          AS_UTL_writeFastA(tempReads, seq, clrEnd-clrBeg,
                            ">" F_IID "," F_IID "\n", icm_mesg->pieces[i].position.bgn, icm_mesg->pieces[i].position.end);
        }

        // now handle the unitig messages
        for (i =0; i < icm_mesg->num_unitigs; i++) {
          //  NOTE(review): these two arrays are created per unitig and never released
          //  in this function - confirm whether that is intended.
          VA_TYPE(char) *ungappedSequence = CreateVA_char(0);
          VA_TYPE(char) *ungappedQuality  = CreateVA_char(0);

          MultiAlignT *uma = loadMultiAlignTFromSequenceDB(sequenceDB, icm_mesg->unitigs[i].ident, 1);
          assert(uma != NULL);

          GetMultiAlignUngappedConsensus(uma, ungappedSequence, ungappedQuality);

          char *seq = Getchar(ungappedSequence,0);

          AS_UTL_writeFastA(tempReads, seq, strlen(seq),
                            ">" F_IID "," F_IID "\n", icm_mesg->unitigs[i].position.bgn, icm_mesg->unitigs[i].position.end);
        }

        fclose(tempReads);

        updateICMRecord(icm_mesg, fileName, seqAn, prefix, wrkDir);
        freeMem = 1;
      }

      WriteProtoMesg_AS(outfp, pmesg);

      if (freeMem) {
        safe_free(icm_mesg->consensus);
        safe_free(icm_mesg->quality);
      }
    }
  }

  fclose(infp);
  fclose(outfp);

  return 0;
}
int main (int argc, char *argv[]) { int32 checkpointVers = 0; int32 tigStoreVers = 0; GlobalData = new Globals_CGW(); argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-g") == 0) { strcpy(GlobalData->gkpStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-t") == 0) { strcpy(GlobalData->tigStoreName, argv[++arg]); tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { strcpy(GlobalData->outputPrefix, argv[++arg]); checkpointVers = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((GlobalData->gkpStoreName[0] == 0) || (GlobalData->tigStoreName[0] == 0) || (err)) { fprintf(stderr, "usage: %s -g gkpStore [-o prefix] [-s firstUID] [-n namespace] [-E server] [-h]\n", argv[0]); fprintf(stderr, " -g gkpStore mandatory path to the gkpStore\n"); fprintf(stderr, " -t tigStore version mandatory path to the tigStore and version\n"); fprintf(stderr, " -c checkpoint version optional path to a checkpoint and version\n"); fprintf(stderr, "\n"); exit(1); } LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, checkpointVers, FALSE); vector<instrumentLIB> lib; for (int32 i=0; i<GetNumDistTs(ScaffoldGraph->Dists); i++) { DistT *dptr = GetDistT(ScaffoldGraph->Dists, i); lib.push_back(instrumentLIB(i, dptr->mu, dptr->sigma, true)); } GraphNodeIterator scaffolds; CIScaffoldT *scaffold; InitGraphNodeIterator(&scaffolds, ScaffoldGraph->ScaffoldGraph, GRAPH_NODE_DEFAULT); while ((scaffold = NextGraphNodeIterator(&scaffolds)) != NULL) { if(scaffold->type != REAL_SCAFFOLD) continue; //if (scaffold->id != 14) // continue; instrumentSCF scf(scaffold); scf.analyze(lib); scf.report(); } DestroyScaffoldGraph(ScaffoldGraph); return(0); }
int main (int argc, char **argv) { char *gkpName = NULL; char *tigName = NULL; uint32 tigVers = UINT32_MAX; uint32 tigPart = UINT32_MAX; char *tigFileName = NULL; uint32 utgBgn = UINT32_MAX; uint32 utgEnd = UINT32_MAX; char *outResultsName = NULL; char *outLayoutsName = NULL; char *outSeqNameA = NULL; char *outSeqNameQ = NULL; char *outPackageName = NULL; FILE *outResultsFile = NULL; FILE *outLayoutsFile = NULL; FILE *outSeqFileA = NULL; FILE *outSeqFileQ = NULL; FILE *outPackageFile = NULL; char *inPackageName = NULL; char algorithm = 'P'; uint32 numThreads = 0; bool forceCompute = false; double errorRate = 0.12; double errorRateMax = 0.40; uint32 minOverlap = 40; int32 numFailures = 0; bool showResult = false; double maxCov = 0.0; uint32 maxLen = UINT32_MAX; uint32 verbosity = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-T") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); tigPart = atoi(argv[++arg]); if (argv[arg][0] == '.') tigPart = UINT32_MAX; if (tigVers == 0) fprintf(stderr, "invalid tigStore version (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1); if (tigPart == 0) fprintf(stderr, "invalid tigStore partition (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1); } else if (strcmp(argv[arg], "-u") == 0) { AS_UTL_decodeRange(argv[++arg], utgBgn, utgEnd); } else if (strcmp(argv[arg], "-t") == 0) { tigFileName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { outResultsName = argv[++arg]; } else if (strcmp(argv[arg], "-L") == 0) { outLayoutsName = argv[++arg]; } else if (strcmp(argv[arg], "-A") == 0) { outSeqNameA = argv[++arg]; } else if (strcmp(argv[arg], "-Q") == 0) { outSeqNameQ = argv[++arg]; } else if (strcmp(argv[arg], "-quick") == 0) { algorithm = 'Q'; } else if (strcmp(argv[arg], "-pbdagcon") == 0) { algorithm = 'P'; } else if 
(strcmp(argv[arg], "-utgcns") == 0) { algorithm = 'U'; } else if (strcmp(argv[arg], "-threads") == 0) { numThreads = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-p") == 0) { inPackageName = argv[++arg]; } else if (strcmp(argv[arg], "-P") == 0) { outPackageName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { errorRate = atof(argv[++arg]); } else if (strcmp(argv[arg], "-em") == 0) { errorRateMax = atof(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-f") == 0) { forceCompute = true; } else if (strcmp(argv[arg], "-v") == 0) { showResult = true; } else if (strcmp(argv[arg], "-V") == 0) { verbosity++; } else if (strcmp(argv[arg], "-maxcoverage") == 0) { maxCov = atof(argv[++arg]); } else if (strcmp(argv[arg], "-maxlength") == 0) { maxLen = atof(argv[++arg]); } else { fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((gkpName == NULL) && (inPackageName == NULL)) err++; if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL)) err++; if (err) { fprintf(stderr, "usage: %s [opts]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " INPUT\n"); fprintf(stderr, " -G g Load reads from gkStore 'g'\n"); fprintf(stderr, " -T t v p Load unitigs from tgStore 't', version 'v', partition 'p'.\n"); fprintf(stderr, " Expects reads will be in gkStore partition 'p' as well\n"); fprintf(stderr, " Use p='.' to specify no partition\n"); fprintf(stderr, " -t file Test the computation of the unitig layout in 'file'\n"); fprintf(stderr, " 'file' can be from:\n"); fprintf(stderr, " 'tgStoreDump -d layout' (human readable layout format)\n"); fprintf(stderr, " 'utgcns -L' (human readable layout format)\n"); fprintf(stderr, " 'utgcns -O' (binary multialignment format)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -p package Load unitig and read from 'package' created with -P. 
This\n"); fprintf(stderr, " is usually used by developers.\n"); fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, " ALGORITHM\n"); fprintf(stderr, " -quick No alignments, just paste read sequence into the unitig positions.\n"); fprintf(stderr, " This is very fast, but the consensus sequence is formed from a mosaic\n"); fprintf(stderr, " of read sequences, and there can be large indel. This is useful for\n"); fprintf(stderr, " checking intermediate assembly structure by mapping to reference, or\n"); fprintf(stderr, " possibly for use as input to a polishing step.\n"); fprintf(stderr, " -pbdagcon Use pbdagcon (https://github.com/PacificBiosciences/pbdagcon).\n"); fprintf(stderr, " This is fast and robust. It is the default algorithm. It does not\n"); fprintf(stderr, " generate a final multialignment output (the -v option will not show\n"); fprintf(stderr, " anything useful).\n"); fprintf(stderr, " -utgcns Use utgcns (the original Celera Assembler consensus algorithm)\n"); fprintf(stderr, " This isn't as fast, isn't as robust, but does generate a final multialign\n"); fprintf(stderr, " output.\n"); fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, " OUTPUT\n"); fprintf(stderr, " -O results Write computed tigs to binary output file 'results'\n"); fprintf(stderr, " -L layouts Write computed tigs to layout output file 'layouts'\n"); fprintf(stderr, " -A fasta Write computed tigs to fasta output file 'fasta'\n"); fprintf(stderr, " -Q fastq Write computed tigs to fastq output file 'fastq'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -P package Create a copy of the inputs needed to compute the unitigs. This\n"); fprintf(stderr, " file can then be sent to the developers for debugging. The unitig(s)\n"); fprintf(stderr, " are not processed and no other outputs are created. 
Ideally,\n"); fprintf(stderr, " only one unitig is selected (-u, below).\n"); fprintf(stderr, "\n"); fprintf(stderr, " TIG SELECTION (if -T input is used)\n"); fprintf(stderr, " -u b Compute only unitig ID 'b' (must be in the correct partition!)\n"); fprintf(stderr, " -u b-e Compute only unitigs from ID 'b' to ID 'e'\n"); fprintf(stderr, " -f Recompute unitigs that already have a multialignment\n"); fprintf(stderr, " -maxlength l Do not compute consensus for unitigs longer than l bases.\n"); fprintf(stderr, "\n"); fprintf(stderr, " PARAMETERS\n"); fprintf(stderr, " -e e Expect alignments at up to fraction e error\n"); fprintf(stderr, " -em m Don't ever allow alignments more than fraction m error\n"); fprintf(stderr, " -l l Expect alignments of at least l bases\n"); fprintf(stderr, " -maxcoverage c Use non-contained reads and the longest contained reads, up to\n"); fprintf(stderr, " C coverage, for consensus generation. The default is 0, and will\n"); fprintf(stderr, " use all reads.\n"); fprintf(stderr, "\n"); fprintf(stderr, " LOGGING\n"); fprintf(stderr, " -v Show multialigns.\n"); fprintf(stderr, " -V Enable debugging option 'verbosemultialign'.\n"); fprintf(stderr, "\n"); if ((gkpName == NULL) && (inPackageName == NULL)) fprintf(stderr, "ERROR: No gkpStore (-G) and no package (-p) supplied.\n"); if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL)) fprintf(stderr, "ERROR: No tigStore (-T) OR no test unitig (-t) OR no package (-p) supplied.\n"); exit(1); } errno = 0; // Open output files. If we're creating a package, the usual output files are not opened. 
if (outPackageName) outPackageFile = fopen(outPackageName, "w"); if (errno) fprintf(stderr, "Failed to open output package file '%s': %s\n", outPackageName, strerror(errno)), exit(1); if ((outResultsName) && (outPackageName == NULL)) outResultsFile = fopen(outResultsName, "w"); if (errno) fprintf(stderr, "Failed to open output results file '%s': %s\n", outResultsName, strerror(errno)), exit(1); if ((outLayoutsName) && (outPackageName == NULL)) outLayoutsFile = fopen(outLayoutsName, "w"); if (errno) fprintf(stderr, "Failed to open output layout file '%s': %s\n", outLayoutsName, strerror(errno)), exit(1); if ((outSeqNameA) && (outPackageName == NULL)) outSeqFileA = fopen(outSeqNameA, "w"); if (errno) fprintf(stderr, "Failed to open output FASTA file '%s': %s\n", outSeqNameA, strerror(errno)), exit(1); if ((outSeqNameQ) && (outPackageName == NULL)) outSeqFileQ = fopen(outSeqNameQ, "w"); if (errno) fprintf(stderr, "Failed to open output FASTQ file '%s': %s\n", outSeqNameQ, strerror(errno)), exit(1); if (numThreads > 0) { omp_set_num_threads(numThreads); fprintf(stderr, "number of threads = %d (command line)\n", numThreads); fprintf(stderr, "\n"); } else { fprintf(stderr, "number of threads = %d (OpenMP default)\n", omp_get_max_threads()); fprintf(stderr, "\n"); } // Open gatekeeper for read only, and load the partitioned data if tigPart > 0. 
gkStore *gkpStore = NULL; tgStore *tigStore = NULL; FILE *tigFile = NULL; FILE *inPackageFile = NULL; map<uint32, gkRead *> *inPackageRead = NULL; map<uint32, gkReadData *> *inPackageReadData = NULL; if (gkpName) { fprintf(stderr, "-- Opening gkpStore '%s' partition %u.\n", gkpName, tigPart); gkpStore = gkStore::gkStore_open(gkpName, gkStore_readOnly, tigPart); } if (tigName) { fprintf(stderr, "-- Opening tigStore '%s' version %u.\n", tigName, tigVers); tigStore = new tgStore(tigName, tigVers); } if (tigFileName) { fprintf(stderr, "-- Opening tigFile '%s'.\n", tigFileName); errno = 0; tigFile = fopen(tigFileName, "r"); if (errno) fprintf(stderr, "Failed to open input tig file '%s': %s\n", tigFileName, strerror(errno)), exit(1); } if (inPackageName) { fprintf(stderr, "-- Opening package file '%s'.\n", inPackageName); errno = 0; inPackageFile = fopen(inPackageName, "r"); if (errno) fprintf(stderr, "Failed to open input package file '%s': %s\n", inPackageName, strerror(errno)), exit(1); } // Report some sizes. fprintf(stderr, "sizeof(abBead) "F_SIZE_T"\n", sizeof(abBead)); fprintf(stderr, "sizeof(abColumn) "F_SIZE_T"\n", sizeof(abColumn)); fprintf(stderr, "sizeof(abAbacus) "F_SIZE_T"\n", sizeof(abAbacus)); fprintf(stderr, "sizeof(abSequence) "F_SIZE_T"\n", sizeof(abSequence)); // Decide on what to compute. Either all unitigs, or a single unitig, or a special case test. uint32 b = 0; uint32 e = UINT32_MAX; if (tigStore) { if (utgEnd > tigStore->numTigs() - 1) utgEnd = tigStore->numTigs() - 1; if (utgBgn != UINT32_MAX) { b = utgBgn; e = utgEnd; } else { b = 0; e = utgEnd; } fprintf(stderr, "-- Computing unitig consensus for b="F_U32" to e="F_U32" with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n", b, e, errorRate, errorRateMax, minOverlap); } else { fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n", errorRate, errorRateMax, minOverlap); } fprintf(stderr, "\n"); // I don't like this loop control. 
for (uint32 ti=b; (e == UINT32_MAX) || (ti <= e); ti++) { tgTig *tig = NULL; // If a tigStore, load the tig. The tig is the owner; it cannot be deleted by us. if (tigStore) tig = tigStore->loadTig(ti); // If a tigFile or a package, create a new tig and fill it. Obviously, we own it. if (tigFile || inPackageFile) { tig = new tgTig(); if (tig->loadFromStreamOrLayout((tigFile != NULL) ? tigFile : inPackageFile) == false) { delete tig; break; } } // No tig loaded, keep going. if (tig == NULL) continue; // If a package, populate the read and readData maps with data from the package. if (inPackageFile) { inPackageRead = new map<uint32, gkRead *>; inPackageReadData = new map<uint32, gkReadData *>; for (int32 ii=0; ii<tig->numberOfChildren(); ii++) { uint32 readID = tig->getChild(ii)->ident(); gkRead *read = (*inPackageRead)[readID] = new gkRead; gkReadData *data = (*inPackageReadData)[readID] = new gkReadData; gkStore::gkStore_loadReadFromStream(inPackageFile, read, data); if (read->gkRead_readID() != readID) fprintf(stderr, "ERROR: package not in sync with tig. package readID = %u tig readID = %u\n", read->gkRead_readID(), readID); assert(read->gkRead_readID() == readID); } } // More 'not liking' - set the verbosity level for logging. tig->_utgcns_verboseLevel = verbosity; // Are we parittioned? Is this tig in our partition? 
if (tigPart != UINT32_MAX) { uint32 missingReads = 0; for (uint32 ii=0; ii<tig->numberOfChildren(); ii++) if (gkpStore->gkStore_getReadInPartition(tig->getChild(ii)->ident()) == NULL) missingReads++; if (missingReads) { //fprintf(stderr, "SKIP unitig %u with %u reads found only %u reads in partition, skipped\n", // tig->tigID(), tig->numberOfChildren(), tig->numberOfChildren() - missingReads); continue; } } if (tig->length(true) > maxLen) { fprintf(stderr, "SKIP unitig %d of length %d (%d children) - too long, skipped\n", tig->tigID(), tig->length(true), tig->numberOfChildren()); continue; } if (tig->numberOfChildren() == 0) { fprintf(stderr, "SKIP unitig %d of length %d (%d children) - no children, skipped\n", tig->tigID(), tig->length(true), tig->numberOfChildren()); continue; } bool exists = tig->consensusExists(); if (tig->numberOfChildren() > 1) fprintf(stderr, "Working on unitig %d of length %d (%d children)%s%s\n", tig->tigID(), tig->length(true), tig->numberOfChildren(), ((exists == true) && (forceCompute == false)) ? " - already computed" : "", ((exists == true) && (forceCompute == true)) ? " - already computed, recomputing" : ""); // Process the tig. Remove deep coverage, create a consensus object, process it, and report the results. // before we add it to the store. unitigConsensus *utgcns = new unitigConsensus(gkpStore, errorRate, errorRateMax, minOverlap); savedChildren *origChildren = NULL; bool success = exists; // Save the tig in the package? // // The original idea was to dump the tig and all the reads, then load the tig and process as normal. // Sadly, stashContains() rearranges the order of the reads even if it doesn't remove any. The rearranged // tig couldn't be saved (otherwise it would be rearranged again). So, we were in the position of // needing to save the original tig and the rearranged reads. Impossible. 
// // Instead, we save the origianl tig and original reads -- including any that get stashed -- then // load them all back into a map for use in consensus proper. It's a bit of a pain, and could // have way more reads saved than necessary. if (outPackageFile) { utgcns->savePackage(outPackageFile, tig); fprintf(stderr, " Packaged unitig %u into '%s'\n", tig->tigID(), outPackageName); } // Compute consensus if it doesn't exist, or if we're forcing a recompute. But only if we // didn't just package it. if ((outPackageFile == NULL) && ((exists == false) || (forceCompute == true))) { origChildren = stashContains(tig, maxCov, true); switch (algorithm) { case 'Q': success = utgcns->generateQuick(tig, inPackageRead, inPackageReadData); break; case 'P': default: success = utgcns->generatePBDAG(tig, inPackageRead, inPackageReadData); break; case 'U': success = utgcns->generate(tig, inPackageRead, inPackageReadData); break; } } // If it was successful (or existed already), output. Success is always false if the unitig // was packaged, regardless of if it existed already. if (success == true) { if ((showResult) && (gkpStore)) // No gkpStore if we're from a package. Dang. tig->display(stdout, gkpStore, 200, 3); unstashContains(tig, origChildren); if (outResultsFile) tig->saveToStream(outResultsFile); if (outLayoutsFile) tig->dumpLayout(outLayoutsFile); if (outSeqFileA) tig->dumpFASTA(outSeqFileA, true); if (outSeqFileQ) tig->dumpFASTQ(outSeqFileQ, true); } // Report failures. if ((success == false) && (outPackageFile == NULL)) { fprintf(stderr, "unitigConsensus()-- unitig %d failed.\n", tig->tigID()); numFailures++; } // Clean up, unloading or deleting the tig. delete utgcns; // No real reason to keep this until here. delete origChildren; // Need to keep it until after we display() above. 
if (tigStore) tigStore->unloadTig(tig->tigID(), true); // Tell the store we're done with it if (tigFile) delete tig; } finish: delete tigStore; gkpStore->gkStore_close(); if (tigFile) fclose(tigFile); if (outResultsFile) fclose(outResultsFile); if (outLayoutsFile) fclose(outLayoutsFile); if (outPackageFile) fclose(outPackageFile); if (inPackageFile) fclose(inPackageFile); if (numFailures) { fprintf(stderr, "WARNING: Total number of unitig failures = %d\n", numFailures); fprintf(stderr, "\n"); fprintf(stderr, "Consensus did NOT finish successfully.\n"); } else { fprintf(stderr, "Consensus finished successfully. Bye.\n"); } return(numFailures != 0); }
int main(int argc, char **argv) { int toBinary = 0; int toASCII = 0; int format = FORMAT_NONE; argc = AS_configure(argc, argv); maxError = AS_OVS_encodeQuality(1.0); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-ovl") == 0) { format = FORMAT_OVL; } else if (strcmp(argv[arg], "-obt") == 0) { format = FORMAT_OBT; } else if (strncmp(argv[arg], "-minlength", 5) == 0) { minLength = atoi(argv[++arg]); } else if (strncmp(argv[arg], "-maxerror", 5) == 0) { double e = atof(argv[++arg]); maxError = AS_OVS_encodeQuality(e); } else if (strncmp(argv[arg], "-nocontainment", 4) == 0) { // aka, only dovetail noContainment = 1; } else if (strncmp(argv[arg], "-nodovetail", 4) == 0) { // aka, only containment noDovetail = 1; } else if (strcmp(argv[arg], "-gkp") == 0) { gkpStoreName = argv[++arg]; } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((format == FORMAT_NONE) || (err)) { fprintf(stderr, "usage: %s [-ovl | -obt] < input > output\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "Filters overlaps (raw binary files, not the store) based on\n"); fprintf(stderr, "length, error, dovetail or containment.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -ovl -- overlaps are OVL\n"); fprintf(stderr, " -obt -- overlaps are OBT\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l -- throw out overlaps shorter than l\n"); fprintf(stderr, " -maxerror e -- throw out overlaps with more than fraction e error\n"); fprintf(stderr, " -nocontainment -- throw out containment overlaps\n"); fprintf(stderr, " -nodovetail -- throw out dovetail overlaps\n"); fprintf(stderr, "\n"); fprintf(stderr, " -gkp gkpstore Needed for -ovl or -nocontainment or -nodovetail\n"); exit(1); } if (noDovetail || noContainment || (format == FORMAT_OVL)) { gkStore *gkp = new gkStore(gkpStoreName, FALSE, FALSE); gkStream *fs = new gkStream(gkp, 0, 0, GKFRAGMENT_INF); gkFragment fr; numReads = gkp->gkStore_getNumFragments(); readLength = (fragInfo 
*)safe_malloc(sizeof(fragInfo) * numReads); fprintf(stderr, "Reading gkpStore to get clear ranges for "F_U32" reads.\n", numReads); while (fs->next(&fr)) { AS_IID iid = fr.gkFragment_getReadIID(); readLength[iid].len = fr.gkFragment_getSequenceLength(); readLength[iid].beg = fr.gkFragment_getClearRegionBegin(); readLength[iid].end = fr.gkFragment_getClearRegionEnd (); } delete fs; delete gkp; } fprintf(stderr, "WARNING:\n"); fprintf(stderr, "WARNING: This has not been fully tested. Only -obt -minlength\n"); fprintf(stderr, "WARNING: is guaranteed (unless it doesn't work). Though,\n"); fprintf(stderr, "WARNING: -maxerror is pretty trivial.\n"); fprintf(stderr, "WARNING:\n"); switch (format) { case FORMAT_OVL: filterOVL(); break; case FORMAT_OBT: filterOBT(); break; default: fprintf(stderr, "%s: unknown format (%d)?!?!\n", argv[0], format); return(1); break; } return(0); }
int main(int argc, char **argv) { char bolfile_name[FILENAME_MAX] = {0}; char Outfile_Name[FILENAME_MAX] = {0}; int illegal; char * p; argc = AS_configure(argc, argv); Min_Olap_Len = AS_OVERLAP_MIN_LEN; // set after configure int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { Doing_Partial_Overlaps = TRUE; } else if (strcmp(argv[arg], "-h") == 0) { AS_UTL_decodeRange(argv[++arg], Lo_Hash_Frag, Hi_Hash_Frag); } else if (strcmp(argv[arg], "-H") == 0) { AS_UTL_decodeRange(argv[++arg], minLibToHash, maxLibToHash); } else if (strcmp(argv[arg], "-R") == 0) { AS_UTL_decodeRange(argv[++arg], minLibToRef, maxLibToRef); } else if (strcmp(argv[arg], "-k") == 0) { arg++; if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) || (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) { Kmer_Len = strtoull(argv[arg], NULL, 10); } else { errno = 0; Kmer_Skip_File = fopen(argv[arg], "r"); if (errno) fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1); } } else if (strcmp(argv[arg], "-l") == 0) { Frag_Olap_Limit = strtol(argv[++arg], NULL, 10); if (Frag_Olap_Limit < 1) Frag_Olap_Limit = INT_MAX; } else if (strcmp(argv[arg], "-m") == 0) { Unique_Olap_Per_Pair = FALSE; } else if (strcmp(argv[arg], "--hashbits") == 0) { Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashstrings") == 0) { Max_Hash_Strings = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashdatalen") == 0) { Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashload") == 0) { Max_Hash_Load = atof(argv[++arg]); } else if (strcmp(argv[arg], "--maxreadlen") == 0) { // Quite the gross way to do this, but simple. 
uint32 desired = strtoul(argv[++arg], NULL, 10); OFFSET_BITS = 1; while (((uint32)1 << OFFSET_BITS) < desired) OFFSET_BITS++; STRING_NUM_BITS = 30 - OFFSET_BITS; STRING_NUM_MASK = (1 << STRING_NUM_BITS) - 1; OFFSET_MASK = (1 << OFFSET_BITS) - 1; MAX_STRING_NUM = STRING_NUM_MASK; } else if (strcmp(argv[arg], "--readsperbatch") == 0) { Max_Reads_Per_Batch = strtoul(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--readsperthread") == 0) { Max_Reads_Per_Thread = strtoul(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-o") == 0) { strcpy(Outfile_Name, argv[++arg]); } else if (strcmp(argv[arg], "-r") == 0) { AS_UTL_decodeRange(argv[++arg], Lo_Old_Frag, Hi_Old_Frag); } else if (strcmp(argv[arg], "-t") == 0) { Num_PThreads = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-u") == 0) { Unique_Olap_Per_Pair = TRUE; } else if (strcmp(argv[arg], "-v") == 0) { Min_Olap_Len = (int) strtol (argv[++arg], & p, 10); } else if (strcmp(argv[arg], "-w") == 0) { Use_Window_Filter = TRUE; } else if (strcmp(argv[arg], "-x") == 0) { Ignore_Clear_Range = TRUE; } else if (strcmp(argv[arg], "-z") == 0) { Use_Hopeless_Check = FALSE; } else { if (Frag_Store_Path == NULL) { Frag_Store_Path = argv[arg]; } else { fprintf(stderr, "Unknown option '%s'\n", argv[arg]); err++; } } arg++; } // Fix up some flags if we're allowing high error rates. 
// if (AS_OVL_ERROR_RATE > 0.06) { if (Use_Window_Filter) fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n"); Use_Window_Filter = FALSE; Use_Hopeless_Check = FALSE; } if (Max_Hash_Strings == 0) fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++; if (Kmer_Len == 0) fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++; if (Max_Hash_Strings > MAX_STRING_NUM) fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++; if (Outfile_Name[0] == 0) fprintf (stderr, "ERROR: No output file name specified\n"), err++; if ((err) || (Frag_Store_Path == NULL)) { fprintf(stderr, "USAGE: %s [options] <gkpStorePath>\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "-b <fn> in contig mode, specify the output file\n"); fprintf(stderr, "-c contig mode. Use 2 frag stores. First is\n"); fprintf(stderr, " for reads; second is for contigs\n"); fprintf(stderr, "-G do partial overlaps\n"); fprintf(stderr, "-h <range> to specify fragments to put in hash table\n"); fprintf(stderr, " Implies LSF mode (no changes to frag store)\n"); fprintf(stderr, "-I designate a file of frag iids to limit olaps to\n"); fprintf(stderr, " (Contig mode only)\n"); fprintf(stderr, "-k if one or two digits, the length of a kmer, otherwise\n"); fprintf(stderr, " the filename containing a list of kmers to ignore in\n"); fprintf(stderr, " the hash table\n"); fprintf(stderr, "-l specify the maximum number of overlaps per\n"); fprintf(stderr, " fragment-end per batch of fragments.\n"); fprintf(stderr, "-m allow multiple overlaps per oriented fragment pair\n"); fprintf(stderr, "-M specify memory size. Valid values are '8GB', '4GB',\n"); fprintf(stderr, " '2GB', '1GB', '256MB'. 
(Not for Contig mode)\n"); fprintf(stderr, "-o specify output file name\n"); fprintf(stderr, "-P write protoIO output (if not -G)\n"); fprintf(stderr, "-r <range> specify old fragments to overlap\n"); fprintf(stderr, "-s ignore screen information with fragments\n"); fprintf(stderr, "-t <n> use <n> parallel threads\n"); fprintf(stderr, "-u allow only 1 overlap per oriented fragment pair\n"); fprintf(stderr, "-v <n> only output overlaps of <n> or more bases\n"); fprintf(stderr, "-w filter out overlaps with too many errors in a window\n"); fprintf(stderr, "-x ignore the clear ranges on reads and use the \n"); fprintf(stderr, " full sequence\n"); fprintf(stderr, "-z skip the hopeless check\n"); fprintf(stderr, "\n"); fprintf(stderr, "--hashbits n Use n bits for the hash mask.\n"); fprintf(stderr, "--hashstrings n Load at most n strings into the hash table at one time.\n"); fprintf(stderr, "--hashdatalen n Load at most n bytes into the hash table at one time.\n"); fprintf(stderr, "--hashload f Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n"); fprintf(stderr, "\n"); fprintf(stderr, "--maxreadlen n For batches with all short reads, pack bits differently to\n"); fprintf(stderr, " process more reads per batch.\n"); fprintf(stderr, " all reads must be shorter than n\n"); fprintf(stderr, " --hashstrings limited to 2^(30-m)\n"); fprintf(stderr, " Common values:\n"); fprintf(stderr, " maxreadlen 2048 -> hashstrings 524288 (default)\n"); fprintf(stderr, " maxreadlen 512 -> hashstrings 2097152\n"); fprintf(stderr, " maxreadlen 128 -> hashstrings 8388608\n"); fprintf(stderr, "\n"); fprintf(stderr, "--readsperbatch n Force batch size to n.\n"); fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n"); fprintf(stderr, "\n"); exit(1); } assert(NULL == Out_BOF); Out_BOF = AS_OVS_createBinaryOverlapFile(Outfile_Name, FALSE); // Adjust the number of reads to load into memory at once (for processing, not the hash table), if (Max_Reads_Per_Batch == 0) 
Max_Reads_Per_Batch = (Max_Hash_Strings < 100000) ? Max_Hash_Strings : 100000; //if (Max_Hash_Strings < Max_Reads_Per_Batch) // Max_Reads_Per_Batch = Max_Hash_Strings; // Adjust the number of reads processed per thread. Default to having four blocks per thread, // but make sure that (a) all threads have work to do, and (b) batches are not minuscule. if (Max_Reads_Per_Thread == 0) Max_Reads_Per_Thread = Max_Reads_Per_Batch / (4 * Num_PThreads); if (Max_Reads_Per_Thread * Num_PThreads > Max_Reads_Per_Batch) Max_Reads_Per_Thread = Max_Reads_Per_Batch / Num_PThreads + 1; if (Max_Reads_Per_Thread < 10) Max_Reads_Per_Thread = 10; // We know enough now to set the hash function variables, and some other random variables. HSF1 = Kmer_Len - (Hash_Mask_Bits / 2); HSF2 = 2 * Kmer_Len - Hash_Mask_Bits; SV1 = HSF1 + 2; SV2 = (HSF1 + HSF2) / 2; SV3 = HSF2 - 2; Branch_Match_Value = (Doing_Partial_Overlaps) ? PARTIAL_BRANCH_MATCH_VAL : DEFAULT_BRANCH_MATCH_VAL; Branch_Error_Value = Branch_Match_Value - 1.0; fprintf(stderr, "\n"); fprintf(stderr, "STRING_NUM_BITS "F_U32"\n", STRING_NUM_BITS); fprintf(stderr, "OFFSET_BITS "F_U32"\n", OFFSET_BITS); fprintf(stderr, "STRING_NUM_MASK "F_U64"\n", STRING_NUM_MASK); fprintf(stderr, "OFFSET_MASK "F_U64"\n", OFFSET_MASK); fprintf(stderr, "MAX_STRING_NUM "F_U64"\n", MAX_STRING_NUM); fprintf(stderr, "\n"); fprintf(stderr, "Hash_Mask_Bits "F_U32"\n", Hash_Mask_Bits); fprintf(stderr, "Max_Hash_Strings "F_U32"\n", Max_Hash_Strings); fprintf(stderr, "Max_Hash_Data_Len "F_U64"\n", Max_Hash_Data_Len); fprintf(stderr, "Max_Hash_Load %f\n", Max_Hash_Load); fprintf(stderr, "Kmer Length %d\n", (int)Kmer_Len); fprintf(stderr, "Min Overlap Length %d\n", Min_Olap_Len); fprintf(stderr, "MAX_ERRORS %d\n", MAX_ERRORS); fprintf(stderr, "ERRORS_FOR_FREE %d\n", ERRORS_FOR_FREE); fprintf(stderr, "\n"); fprintf(stderr, "Num_PThreads "F_U32"\n", Num_PThreads); fprintf(stderr, "Max_Reads_Per_Batch "F_U32"\n", Max_Reads_Per_Batch); fprintf(stderr, 
"Max_Reads_Per_Thread "F_U32"\n", Max_Reads_Per_Thread); assert (8 * sizeof (uint64) > 2 * Kmer_Len); Initialize_Globals (); OldFragStore = new gkStore(Frag_Store_Path, FALSE, FALSE); /****************************************/ OverlapDriver(); /****************************************/ fprintf (stderr, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct); fprintf (stderr, " Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct); fprintf (stderr, " Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct); fprintf (stderr, " Total overlaps produced = "F_S64"\n", Total_Overlaps); fprintf (stderr, " Contained overlaps = "F_S64"\n", Contained_Overlap_Ct); fprintf (stderr, " Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct); fprintf (stderr, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct); fprintf (stderr, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct); delete OldFragStore; AS_OVS_closeBinaryOverlapFile(Out_BOF); return(0); }
int main(int argc, char **argv) { char *gkpName = NULL; char *ovsName = NULL; char *finClrName = NULL; char *outClrName = NULL; double errorRate = 0.06; //uint32 minAlignLength = 40; uint32 minReadLength = 64; uint32 idMin = 1; uint32 idMax = UINT32_MAX; char *outputPrefix = NULL; char outputName[FILENAME_MAX]; FILE *staFile = NULL; FILE *reportFile = NULL; FILE *subreadFile = NULL; bool doSubreadLogging = true; bool doSubreadLoggingVerbose = false; // Statistics on the trimming - the second set are from the old logging, and don't really apply anymore. trimStat readsIn; // Read is eligible for trimming trimStat deletedIn; // Read was deleted already trimStat noTrimIn; // Read not requesting trimming trimStat noOverlaps; // no overlaps in store trimStat noCoverage; // no coverage after adjusting for trimming done trimStat readsProcChimera; // Read was processed for chimera signal trimStat readsProcSpur; // Read was processed for spur signal trimStat readsProcSubRead; // Read was processed for subread signal #if 0 trimStat badSpur5; trimStat badSpur3; trimStat badChimera; trimStat badSubread; #endif trimStat readsNoChange; trimStat readsBadSpur5, basesBadSpur5; trimStat readsBadSpur3, basesBadSpur3; trimStat readsBadChimera, basesBadChimera; trimStat readsBadSubread, basesBadSubread; trimStat readsTrimmed5; trimStat readsTrimmed3; #if 0 trimStat fullCoverage; // fully covered by overlaps trimStat noSignalNoGap; // no signal, no gaps trimStat noSignalButGap; // no signal, with gaps trimStat bothFixed; // both chimera and spur signal trimmed trimStat chimeraFixed; // only chimera signal trimmed trimStat spurFixed; // only spur signal trimmed trimStat bothDeletedSmall; // deleted because of both cimera and spur signals trimStat chimeraDeletedSmall; // deleted because of chimera signal trimStat spurDeletedSmall; // deleted because of spur signal trimStat spurDetectedNormal; // normal spur detected trimStat spurDetectedLinker; // linker spur detected trimStat 
chimeraDetectedInnie; // innpue-pair chimera detected trimStat chimeraDetectedOverhang; // overhanging chimera detected trimStat chimeraDetectedGap; // gap chimera detected trimStat chimeraDetectedLinker; // linker chimera detected #endif trimStat deletedOut; // Read was deleted by trimming argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { ovsName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { AS_UTL_decodeRange(argv[++arg], idMin, idMax); } else if (strcmp(argv[arg], "-Ci") == 0) { finClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Co") == 0) { outClrName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { errorRate = atof(argv[++arg]); //} else if (strcmp(argv[arg], "-l") == 0) { // minAlignLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (errorRate < 0.0) err++; if ((gkpName == 0L) || (ovsName == 0L) || (outputPrefix == NULL) || (err)) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Ci input.clearFile -Co output.clearFile -o outputPrefix]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G gkpStore path to read store\n"); fprintf(stderr, " -O ovlStore path to overlap store\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o name output prefix, for logging\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n"); fprintf(stderr, " -Co clearFile path to ouput clear ranges\n"); fprintf(stderr, "\n"); fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n"); //fprintf(stderr, " -l length 
ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n"); fprintf(stderr, "\n"); if (errorRate < 0.0) fprintf(stderr, "ERROR: Error rate (-e) value %f too small; must be 'fraction error' and above 0.0\n", errorRate); exit(1); } gkStore *gkp = gkStore::gkStore_open(gkpName); ovStore *ovs = new ovStore(ovsName, gkp); clearRangeFile *finClr = new clearRangeFile(finClrName, gkp); clearRangeFile *outClr = new clearRangeFile(outClrName, gkp); if (outClr) // If the outClr file exists, those clear ranges are loaded. We need to reset them // back to 'untrimmed' for now. outClr->reset(gkp); if (finClr && outClr) // A finClr file was supplied, so use those as the clear ranges. outClr->copy(finClr); sprintf(outputName, "%s.log", outputPrefix); errno = 0; reportFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); sprintf(outputName, "%s.subread.log", outputPrefix); errno = 0; subreadFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); uint32 ovlLen = 0; uint32 ovlMax = 64 * 1024; ovOverlap *ovl = ovOverlap::allocateOverlaps(gkp, ovlMax); memset(ovl, 0, sizeof(ovOverlap) * ovlMax); workUnit *w = new workUnit; if (idMin < 1) idMin = 1; if (idMax > gkp->gkStore_getNumReads()) idMax = gkp->gkStore_getNumReads(); fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n", idMin, idMax, gkp->gkStore_getNumReads(), errorRate); for (uint32 id=idMin; id<=idMax; id++) { gkRead *read = gkp->gkStore_getRead(id); gkLibrary *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID()); if (finClr->isDeleted(id)) { // Read already trashed. 
deletedIn += read->gkRead_sequenceLength(); continue; } if ((libr->gkLibrary_removeSpurReads() == false) && (libr->gkLibrary_removeChimericReads() == false) && (libr->gkLibrary_checkForSubReads() == false)) { // Nothing to do. noTrimIn += read->gkRead_sequenceLength(); continue; } readsIn += read->gkRead_sequenceLength(); uint32 nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax); //fprintf(stderr, "read %7u with %7u overlaps\r", id, nLoaded); if (nLoaded == 0) { // No overlaps, nothing to check! noOverlaps += read->gkRead_sequenceLength(); continue; } w->clear(id, finClr->bgn(id), finClr->end(id)); w->addAndFilterOverlaps(gkp, finClr, errorRate, ovl, ovlLen); if (w->adjLen == 0) { // All overlaps trimmed out! noCoverage += read->gkRead_sequenceLength(); continue; } // Find bad regions. //if (libr->gkLibrary_markBad() == true) // // From an external file, a list of known bad regions. If no overlaps span // // the region with sufficient coverage, mark the region as bad. This was // // motivated by the old 454 linker detection. // markBad(gkp, w, subreadFile, doSubreadLoggingVerbose); //if (libr->gkLibrary_removeSpurReads() == true) { // readsProcSpur += read->gkRead_sequenceLength(); // detectSpur(gkp, w, subreadFile, doSubreadLoggingVerbose); // Get stats on spur region detected - save the length of each region to the trimStats object. //} //if (libr->gkLibrary_removeChimericReads() == true) { // readsProcChimera += read->gkRead_sequenceLength(); // detectChimer(gkp, w, subreadFile, doSubreadLoggingVerbose); // Get stats on chimera region detected - save the length of each region to the trimStats object. //} if (libr->gkLibrary_checkForSubReads() == true) { readsProcSubRead += read->gkRead_sequenceLength(); detectSubReads(gkp, w, subreadFile, doSubreadLoggingVerbose); } // Get stats on the bad regions found. This kind of duplicates code in trimBadInterval(), but // I don't want to pass all the stats objects into there. 
if (w->blist.size() == 0) { readsNoChange += read->gkRead_sequenceLength(); } else { uint32 nSpur5 = 0, bSpur5 = 0; uint32 nSpur3 = 0, bSpur3 = 0; uint32 nChimera = 0, bChimera = 0; uint32 nSubread = 0, bSubread = 0; for (uint32 bb=0; bb<w->blist.size(); bb++) { switch (w->blist[bb].type) { case badType_5spur: nSpur5 += 1; basesBadSpur5 += w->blist[bb].end - w->blist[bb].bgn; break; case badType_3spur: nSpur3 += 1; basesBadSpur3 += w->blist[bb].end - w->blist[bb].bgn; break; case badType_chimera: nChimera += 1; basesBadChimera += w->blist[bb].end - w->blist[bb].bgn; break; case badType_subread: nSubread += 1; basesBadSubread += w->blist[bb].end - w->blist[bb].bgn; break; default: break; } } if (nSpur5 > 0) readsBadSpur5 += nSpur5; if (nSpur3 > 0) readsBadSpur3 += nSpur3; if (nChimera > 0) readsBadChimera += nChimera; if (nSubread > 0) readsBadSubread += nSubread; } // Find solution. This coalesces the list (in 'w') of all the bad regions found, picks out the // largest good region, generates a log of the bad regions that support this decision, and sets // the trim points. trimBadInterval(gkp, w, minReadLength, subreadFile, doSubreadLoggingVerbose); // Log the solution. AS_UTL_safeWrite(reportFile, w->logMsg, "logMsg", sizeof(char), strlen(w->logMsg)); // Save the solution.... outClr->setbgn(w->id) = w->clrBgn; outClr->setend(w->id) = w->clrEnd; // And maybe delete the read. if (w->isOK == false) { deletedOut += read->gkRead_sequenceLength(); outClr->setDeleted(w->id); } // Update stats on what was trimmed. The asserts say the clear range didn't expand, and the if // tests if the clear range changed. 
assert(w->clrBgn >= w->iniBgn); assert(w->iniEnd >= w->clrEnd); if (w->clrBgn > w->iniBgn) readsTrimmed5 += w->clrBgn - w->iniBgn; if (w->iniEnd > w->clrEnd) readsTrimmed3 += w->iniEnd - w->clrEnd; } delete [] ovl; delete w; gkp->gkStore_close(); delete finClr; delete outClr; // Close log files if (reportFile) fclose(reportFile); if (subreadFile) fclose(subreadFile); // Write the summary if (outputPrefix) { sprintf(outputName, "%s.stats", outputPrefix); errno = 0; staFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)); } if (staFile == NULL) staFile = stdout; // Would like to know number of subreads per read fprintf(staFile, "PARAMETERS:\n"); fprintf(staFile, "----------\n"); fprintf(staFile, "%7u (reads trimmed below this many bases are deleted)\n", minReadLength); fprintf(staFile, "%7.4f (use overlaps at or below this fraction error)\n", errorRate); //fprintf(staFile, "%7u (use only overlaps longer than this)\n", minAlignLength); // NOT SUPPORTED! 
fprintf(staFile, "INPUT READS:\n"); fprintf(staFile, "-----------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases); fprintf(staFile, "\n"); fprintf(staFile, "PROCESSED:\n"); fprintf(staFile, "--------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n", readsProcChimera.nReads, readsProcChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n", readsProcSpur.nReads, readsProcSpur.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases); fprintf(staFile, "\n"); fprintf(staFile, "READS WITH SIGNALS:\n"); fprintf(staFile, "------------------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads, readsBadSpur5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads, readsBadSpur3.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases); fprintf(staFile, "\n"); fprintf(staFile, "SIGNALS:\n"); fprintf(staFile, "-------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size 
of 5' spur signal)\n", basesBadSpur5.nReads, basesBadSpur5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads, basesBadSpur3.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases); fprintf(staFile, "\n"); fprintf(staFile, "TRIMMING:\n"); fprintf(staFile, "--------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases); #if 0 fprintf(staFile, "DELETED:\n"); fprintf(staFile, "-------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases); fprintf(staFile, "\n"); fprintf(staFile, "SPUR TYPES:\n"); fprintf(staFile, "----------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases); fprintf(staFile, "\n"); fprintf(staFile, "CHIMERA TYPES:\n"); fprintf(staFile, "-------------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" 
bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases); #endif // INPUT READS = ACCEPTED + TRIMMED + DELETED // SPUR TYPE = TRIMMED and DELETED spur and both categories // CHIMERA TYPE = TRIMMED and DELETED chimera and both categories if (staFile != stdout) fclose(staFile); exit(0); }
int main(int argc, char **argv) { char *gkpStoreName = NULL; char *outPrefix = NULL; AS_IID libToDump = 0; uint32 clrToDump = AS_READ_CLEAR_LATEST; AS_IID bgnIID = 1; AS_IID endIID = AS_IID_MAX; bool dumpAllBases = true; bool dumpAllReads = false; argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-l") == 0) { libToDump = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-b") == 0) { bgnIID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-e") == 0) { endIID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { clrToDump = gkStore_decodeClearRegionLabel(argv[++arg]); } else if (strcmp(argv[arg], "-g") == 0) { gkpStoreName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outPrefix = argv[++arg]; } else { err++; fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); } arg++; } if (gkpStoreName == NULL) err++; if (outPrefix == NULL) err++; if (clrToDump == AS_READ_CLEAR_ERROR) err++; if (err) { fprintf(stderr, "usage: %s [...] 
-o fastq-prefix -g gkpStore\n", argv[0]); fprintf(stderr, " -g gkpStore\n"); fprintf(stderr, " -o fastq-prefix write files fastq-prefix.1.fastq, fastq-prefix.2.fastq, fastq-prefix.paired.fastq, fastq-prefix.unmated.fastq\n"); fprintf(stderr, " \n"); fprintf(stderr, " -l libToDump output only fragments in library number libToDump (NOT IMPLEMENTED)\n"); fprintf(stderr, " -b iid output starting at fragment iid\n"); fprintf(stderr, " -e iid output stopping after fragment iid\n"); fprintf(stderr, " -c clrName output clear range 'clrName'\n"); fprintf(stderr, " \n"); if (gkpStoreName == NULL) fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n"); if (outPrefix == NULL) fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n"); if (clrToDump == AS_READ_CLEAR_ERROR) fprintf(stderr, "ERROR: clear range (-c) is not a valid clear range.\n"); exit(1); } gkStore *gkp = new gkStore(gkpStoreName, FALSE, FALSE); AS_IID numFrags = gkp->gkStore_getNumFragments(); AS_IID numLibs = gkp->gkStore_getNumLibraries(); libInfo **lib = new libInfo * [numLibs]; lib[0] = new libInfo(outPrefix, "legacy"); for (uint32 i=1; i<numLibs; i++) lib[i] = new libInfo(outPrefix, gkp->gkStore_getLibrary(i)->libraryName); if (bgnIID < 1) bgnIID = 1; if (numFrags < endIID) endIID = numFrags; //AS_IID streamBgn = AS_IID_MIN; //AS_IID streamEnd = AS_IID_MAX; gkStream *fs = new gkStream(gkp, bgnIID, endIID, GKFRAGMENT_QLT); gkFragment fr; while (fs->next(&fr)) { int32 lclr = fr.gkFragment_getClearRegionBegin(clrToDump); int32 rclr = fr.gkFragment_getClearRegionEnd (clrToDump); AS_IID id1 = fr.gkFragment_getReadIID(); AS_IID id2 = fr.gkFragment_getMateIID(); AS_IID libIID = fr.gkFragment_getLibraryIID(); if ((dumpAllReads == false) && (fr.gkFragment_getIsDeleted() == true)) // Fragment is deleted, don't dump. continue; if ((libToDump != 0) && (fr.gkFragment_getLibraryIID() == libToDump)) // Fragment isn't marked for dumping, don't dump. 
continue; if ((dumpAllBases == false) && (lclr >= rclr)) // Fragment has null or invalid clear range, don't dump. continue; if ((id2 != 0) && (id2 < id1)) // Mated, and the mate is the first frag. We've already reported this one. continue; char *seq = fr.gkFragment_getSequence() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0); char *qlt = fr.gkFragment_getQuality() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0); int32 len = (dumpAllBases == false) ? fr.gkFragment_getClearRegionLength(clrToDump) : fr.gkFragment_getSequenceLength(); seq[len] = 0; qlt[len] = 0; if (dumpAllBases == true) { for (uint32 i=0; i<lclr; i++) seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0; for (uint32 i=lclr; i<rclr; i++) seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a'; for (uint32 i=rclr; seq[i]; i++) seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0; } if (id2 == 0) { // Unmated read, dump to the unmated reads file. AS_UTL_writeFastQ(lib[libIID]->u, seq, len, qlt, len, "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n", AS_UID_toString(fr.gkFragment_getReadUID()), fr.gkFragment_getClearRegionBegin(clrToDump), fr.gkFragment_getClearRegionEnd (clrToDump), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_TNT), fr.gkFragment_getIsNonRandom() ? 'f' : 't'); continue; } // Write the first fragment (twice). 
AS_UTL_writeFastQ(lib[libIID]->a, seq, len, qlt, len, "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n", AS_UID_toString(fr.gkFragment_getReadUID()), fr.gkFragment_getClearRegionBegin(clrToDump), fr.gkFragment_getClearRegionEnd (clrToDump), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_TNT), fr.gkFragment_getIsNonRandom() ? 'f' : 't'); AS_UTL_writeFastQ(lib[libIID]->p, seq, len, qlt, len, "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n", AS_UID_toString(fr.gkFragment_getReadUID()), fr.gkFragment_getClearRegionBegin(clrToDump), fr.gkFragment_getClearRegionEnd (clrToDump), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_TNT), fr.gkFragment_getIsNonRandom() ? 'f' : 't'); // Grab the second fragment. gkp->gkStore_getFragment(id2, &fr, GKFRAGMENT_QLT); lclr = fr.gkFragment_getClearRegionBegin(clrToDump) + 1; rclr = fr.gkFragment_getClearRegionEnd (clrToDump); seq = fr.gkFragment_getSequence() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0); qlt = fr.gkFragment_getQuality() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0); len = (dumpAllBases == false) ? fr.gkFragment_getClearRegionLength(clrToDump) : fr.gkFragment_getSequenceLength(); seq[len] = 0; qlt[len] = 0; // Write the second fragment (twice). 
AS_UTL_writeFastQ(lib[libIID]->b, seq, len, qlt, len, "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n", AS_UID_toString(fr.gkFragment_getReadUID()), fr.gkFragment_getClearRegionBegin(clrToDump), fr.gkFragment_getClearRegionEnd (clrToDump), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_TNT), fr.gkFragment_getIsNonRandom() ? 'f' : 't'); AS_UTL_writeFastQ(lib[libIID]->p, seq, len, qlt, len, "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n", AS_UID_toString(fr.gkFragment_getReadUID()), fr.gkFragment_getClearRegionBegin(clrToDump), fr.gkFragment_getClearRegionEnd (clrToDump), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_VEC), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_MAX), fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT), fr.gkFragment_getClearRegionEnd (AS_READ_CLEAR_TNT), fr.gkFragment_getIsNonRandom() ? 'f' : 't'); } delete fs; delete gkp; exit(0); }
int main(int argc, char **argv) { // Options controlling main int generateOutput = 1; int preserveConsensus = 0; int preMergeRezLevel = -1; int repeatRezLevel = 0; int restartFromCheckpoint = -1; char *restartFromLogical = "ckp00-NUL"; bool recomputeLeastSquaresOnLoad = false; bool reloadMates = false; int doResolveSurrogates = 1; // resolveSurrogates int placeAllFragsInSinglePlacedSurros = 0; // resolveSurrogates double cutoffToInferSingleCopyStatus = 0.666; // resolveSurrogates int firstFileArg = 0; int32 outputFragsPerPartition = 0; #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) ContigOrientChecker * coc; coc = CreateContigOrientChecker(); assert(coc != NULL); #endif // temporary! fprintf(stderr, "Using up to %d OpenMP threads.\n", omp_get_max_threads()); GlobalData = new Globals_CGW(); argc = AS_configure(argc, argv); int arg = 1; int err = 0; int unk[64] = {0}; int unl = 0; while (arg < argc) { if (strcmp(argv[arg], "-C") == 0) { GlobalData->performCleanupScaffolds = 0; } else if (strcmp(argv[arg], "-D") == 0) { GlobalData->debugLevel = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-E") == 0) { GlobalData->outputOverlapOnlyContigEdges = 1; } else if (strcmp(argv[arg], "-F") == 0) { GlobalData->allowDemoteMarkedUnitigs = FALSE; } else if (strcmp(argv[arg], "-G") == 0) { generateOutput = 0; } else if (strcmp(argv[arg], "-GG") == 0) { preserveConsensus = 1; } else if (strcmp(argv[arg], "-g") == 0) { strcpy(GlobalData->gkpStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-t") == 0) { strcpy(GlobalData->tigStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-I") == 0) { GlobalData->ignoreChaffUnitigs = 1; } else if (strcmp(argv[arg], "-j") == 0) { GlobalData->cgbUniqueCutoff = atof(argv[++arg]); } else if (strcmp(argv[arg], "-K") == 0) { GlobalData->removeNonOverlapingContigsFromScaffold = 1; } else if (strcmp(argv[arg], "-k") == 0) { GlobalData->cgbDefinitelyUniqueCutoff = atof(argv[++arg]); } else if (strcmp(argv[arg], 
"-m") == 0) { GlobalData->minSamplesForOverride = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-N") == 0) { restartFromLogical = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { strcpy(GlobalData->outputPrefix, argv[++arg]); } else if (strcmp(argv[arg], "-B") == 0) { outputFragsPerPartition = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-P") == 0) { GlobalData->closurePlacement = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-p") == 0) { preMergeRezLevel = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-R") == 0) { restartFromCheckpoint = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-r") == 0) { repeatRezLevel = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-S") == 0) { doResolveSurrogates = 1; cutoffToInferSingleCopyStatus = atof(argv[++arg]); placeAllFragsInSinglePlacedSurros = 0; if (cutoffToInferSingleCopyStatus == 0.0) doResolveSurrogates = 0; if (cutoffToInferSingleCopyStatus < 0) { cutoffToInferSingleCopyStatus = 0.0; placeAllFragsInSinglePlacedSurros = 1; } } else if (strcmp(argv[arg], "-s") == 0) { GlobalData->stoneLevel = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-filter") == 0) { GlobalData->mergeFilterLevel = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-shatter") == 0) { GlobalData->shatterLevel = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-missingMate") == 0) { GlobalData->mergeScaffoldMissingMates = atof(argv[++arg]); // the value is a percentage between 0 and 1 so make sure it never goes out of those bounds if (GlobalData->mergeScaffoldMissingMates < 0) { GlobalData->mergeScaffoldMissingMates = -1; } else if (GlobalData->mergeScaffoldMissingMates > 1) { GlobalData->mergeScaffoldMissingMates = 1; } } else if (strcmp(argv[arg], "-U") == 0) { GlobalData->doUnjiggleWhenMerging = 1; } else if (strcmp(argv[arg], "-u") == 0) { fprintf(stderr, "Option -u is broken.\n"); exit(1); strcpy(GlobalData->unitigOverlaps, argv[++arg]); } else if (strcmp(argv[arg], "-Z") == 0) { GlobalData->demoteSingletonScaffolds = FALSE; 
} else if (strcmp(argv[arg], "-z") == 0) { GlobalData->checkRepeatBranchPattern = TRUE; } else if (strcmp(argv[arg], "-minmergeweight") == 0) { GlobalData->minWeightToMerge = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-recomputegaps") == 0) { recomputeLeastSquaresOnLoad = true; } else if (strcmp(argv[arg], "-reloadmates") == 0) { reloadMates = true; } else if ((argv[arg][0] != '-') && (firstFileArg == 0)) { firstFileArg = arg; arg = argc; } else { unk[unl++] = arg; err++; } arg++; } if (GlobalData->gkpStoreName[0] == 0) err++; if (GlobalData->outputPrefix[0] == 0) err++; if (cutoffToInferSingleCopyStatus > 1.0) err++; if (err) { fprintf(stderr, "usage: %s [options] -g <GatekeeperStoreName> -o <OutputPath> <unitigs*.cgb>\n", argv[0]); fprintf(stderr, " -C Don't cleanup scaffolds\n"); fprintf(stderr, " -D <lvl> Debug\n"); fprintf(stderr, " -E output overlap only contig edges\n"); fprintf(stderr, " -e <thresh> Microhet score probability cutoff\n"); fprintf(stderr, " -F strongly enforce unique/repeat flag set in unitig, default if not set is to still\n"); fprintf(stderr, " allow those marked unique to be demoted due to Repeat Branch Pattern or being\n"); fprintf(stderr, " too small\n"); fprintf(stderr, " -g gkp Store path (required)\n"); fprintf(stderr, " -G Don't generate output (cgw or cam)\n"); fprintf(stderr, " -GG Don't destroy consensus on output (ctgcns will do nothing)\n"); fprintf(stderr, " -I ignore chaff unitigs\n"); fprintf(stderr, " -i <thresh> Set max coverage stat for microhet determination of non-uniqueness (default -1)\n"); fprintf(stderr, " -j <thresh> Set min coverage stat for definite uniqueness\n"); fprintf(stderr, " -K Allow kicking out a contig placed in a scaffold by mate pairs that has no overlaps\n"); fprintf(stderr, " to both its left and right neighbor contigs.\n"); fprintf(stderr, " -k <thresh> Set max coverage stat for possible uniqueness\n"); fprintf(stderr, " -M don't do interleaved scaffold merging\n"); fprintf(stderr, " -m <min> 
Number of mate samples to recompute an insert size, default is 100\n"); fprintf(stderr, " -N <ckp> restart from checkpoint location 'ckp' (see the timing file)\n"); fprintf(stderr, " -o Output Name (required)\n"); fprintf(stderr, " -P <int> how to place closure reads.\n"); fprintf(stderr, " 0 - place at first location found\n"); fprintf(stderr, " 1 - place at best gap\n"); fprintf(stderr, " 2 - allow to be placed in multiple gaps\n"); fprintf(stderr, " -R <ckp> restart from checkpoint file number 'ckp'\n"); fprintf(stderr, " -r <lvl> repeat resolution level\n"); fprintf(stderr, " -S <t> place all frags in singly-placed surrogates if at least fraction <x> can be placed\n"); fprintf(stderr, " two special cases:\n"); fprintf(stderr, " if <t> = -1, place all frags in singly-placed surrogates aggressively\n"); fprintf(stderr, " (which really mean t = 0.0, but triggers a better algorithm)\n"); fprintf(stderr, " if <t> = 0, do not resolve surrogate fragments\n"); fprintf(stderr, " -s <lvl> stone throwing level\n"); fprintf(stderr, " -shatter <thresh> Set threshold for shattering scaffolds when loading from checkpoint. Any contigs\n"); fprintf(stderr, " connected to a scaffold only by edges with less weight than the threshold will be\n"); fprintf(stderr, " split into a new scaffold (default OFF)\n"); fprintf(stderr, " -missingMate <thresh> Set threshold (0-1) for the percentage of mates (out of total) that are allowed to be\n"); fprintf(stderr, " missing when attempting a scaffold merge (default 0). 
A value of -1 will ignore all\n"); fprintf(stderr, " missing mates\n"); fprintf(stderr, " -minmergeweight <w> Only use weight w or better edges for merging scaffolds.\n"); fprintf(stderr, " -recomputegaps if loading a checkpoint, recompute gaps, merging contigs and splitting low weight scaffolds.\n"); fprintf(stderr, " -reloadmates If loading a checkpoint, also load any new mates from gkpStore.\n"); fprintf(stderr, " -U after inserting rocks/stones try shifting contig positions back to their original location\n"); fprintf(stderr, " when computing overlaps to see if they overlap with the rock/stone and allow them to merge\n"); fprintf(stderr, " if they do\n"); fprintf(stderr, " -u <file> load these overlaps (from BOG) into the scaffold graph\n"); fprintf(stderr, " -v verbose\n"); fprintf(stderr, " -Z Don't demote singleton scaffolds\n"); fprintf(stderr, " -z Turn on Check for Repeat Branch Pattern (demotes some unique unitigs to repeat)\n"); fprintf(stderr, "\n"); if (GlobalData->gkpStoreName[0] == 0) fprintf(stderr, "ERROR: No gatekeeper (-g) supplied.\n"); if (GlobalData->outputPrefix[0] == 0) fprintf(stderr, "ERROR: No output prefix (-o) supplied.\n"); if (cutoffToInferSingleCopyStatus > 1.0) fprintf(stderr, "ERROR: surrogate fraction cutoff (-S) must be between 0.0 and 1.0.\n"); if (unl) { for (arg=0; arg<unl; arg++) fprintf(stderr, "ERROR: Unknown option '%s'\n", argv[unk[arg]]); } exit(1); } isValidCheckpointName(restartFromLogical); if(GlobalData->cgbDefinitelyUniqueCutoff < GlobalData->cgbUniqueCutoff) GlobalData->cgbDefinitelyUniqueCutoff = GlobalData->cgbUniqueCutoff; if (preMergeRezLevel >= 0) GlobalData->repeatRezLevel = preMergeRezLevel; else GlobalData->repeatRezLevel = repeatRezLevel; if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_LOADING) == true) { int ctme = time(0); // Create the checkpoint from scratch ScaffoldGraph = CreateScaffoldGraph(GlobalData->outputPrefix); ProcessInput(firstFileArg, argc, argv); // Insert sizes are set 
already, but we'll estimate again anyway. ComputeMatePairStatisticsRestricted(UNITIG_OPERATIONS, GlobalData->minSamplesForOverride, "unitig_initial"); if (time(0) - ctme > 60 * 60) CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_LOADING], "after loading"); } else if (isThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_LOADING) == true) { // Load the checkpoint if we are exactly after loading, otherwise, fall through to the // real load. LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix,restartFromCheckpoint, TRUE); } if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_EDGE_BUILDING) == true) { vector<CDS_CID_t> rawEdges; BuildGraphEdgesDirectly(ScaffoldGraph->CIGraph, rawEdges); // Broken, see comments in ChunkOverlap_CGW.c // //if (GlobalData->unitigOverlaps[0]) // AddUnitigOverlaps(ScaffoldGraph->CIGraph, GlobalData->unitigOverlaps, rawEdges); // Compute all overlaps implied by mate links between pairs of unique unitigs ComputeOverlaps(ScaffoldGraph->CIGraph, rawEdges); MergeAllGraphEdges(ScaffoldGraph->CIGraph, rawEdges, FALSE, FALSE); CheckEdgesAgainstOverlapper(ScaffoldGraph->CIGraph); CheckSurrogateUnitigs(); // Mark some Unitigs/Chunks/CIs as repeats based on overlaps GRANGER 2/2/07 // if (GlobalData->checkRepeatBranchPattern) DemoteUnitigsWithRBP(stderr, ScaffoldGraph->CIGraph); // At this Point we've constructed the CIGraph BuildInitialContigs(ScaffoldGraph); if(GlobalData->debugLevel > 0){ CheckEdgesAgainstOverlapper(ScaffoldGraph->ContigGraph); CheckSurrogateUnitigs(); } CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_EDGE_BUILDING], "after building edges"); } else { LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix,restartFromCheckpoint, TRUE); if (reloadMates) ReloadMatesFromGatekeeper(); // Dump stats on the loaded checkpoint //GeneratePlacedContigGraphStats(tmpBuffer,0); //GenerateScaffoldGraphStats(tmpBuffer,0); // shatter scaffolds if requested if (GlobalData->shatterLevel > 0) { 
ShatterScaffoldsConnectedByLowWeight(stderr, ScaffoldGraph, GlobalData->shatterLevel, TRUE); } // Useful for checking mate happiness on loading. Currently only checks one scaffold. if (0) { vector<instrumentLIB> libs; for (int32 i=0; i<GetNumDistTs(ScaffoldGraph->Dists); i++) { DistT *dptr = GetDistT(ScaffoldGraph->Dists, i); libs.push_back(instrumentLIB(i, dptr->mu, dptr->sigma, true)); } for (int32 sID=287340; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); fprintf(stderr, "ANALYZING SCAFFOLD %d\n", sID); if (scaffold->flags.bits.isDead == true) continue; instrumentSCF A(scaffold); A.analyze(libs); A.report(); exit(0); } } if (recomputeLeastSquaresOnLoad) { for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (scaffold->flags.bits.isDead == true) continue; if (true == LeastSquaresGapEstimates(ScaffoldGraph, GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID), LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } } } // We DO want to flush unused unitigs/contigs at this point. They're not in // a scaffold, and possibly will never be used again (except as rocks/stones). // ScaffoldGraph->tigStore->flushCache(); if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_DURING_INITIAL_SCAFFOLDING) == true) && (GlobalData->repeatRezLevel > 0)) { int ctme = time(0); if(GlobalData->debugLevel > 0) DumpContigs(stderr,ScaffoldGraph, FALSE); // Transitive reduction of ContigGraph followed by construction of SEdges // With markShakyBifurcations enabled. BuildUniqueCIScaffolds(ScaffoldGraph, TRUE, FALSE); CheckEdgesAgainstOverlapper(ScaffoldGraph->ContigGraph); // Equivalent to TidyUpScaffolds(). 
// for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } if (time(0) - ctme > 60 * 60) CheckpointScaffoldGraph(ckpNames[CHECKPOINT_DURING_INITIAL_SCAFFOLDING], "during initial scaffolding"); } if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_INITIAL_SCAFFOLDING) == true) && (GlobalData->repeatRezLevel > 0)) { //CheckAllTrustedEdges(ScaffoldGraph); { vector<CDS_CID_t> rawEdges; BuildSEdges(rawEdges, FALSE); MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, FALSE); } //ScaffoldSanity(ScaffoldGraph); // rocks is called inside of here // checkpoints are written inside of here int iter = 0; int iterMax = 10; // MAX_OUTPUT_REZ_ITERATIONS int ctme = time(0); int changed = TRUE; fprintf(stderr,"** Running Level 1 Repeat Rez **\n"); while ((changed) && (iter < iterMax)) { CheckEdgesAgainstOverlapper(ScaffoldGraph->ContigGraph); CheckCITypes(ScaffoldGraph); changed = RepeatRez(GlobalData->repeatRezLevel, GlobalData->outputPrefix); if (changed){ CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE); ScaffoldSanity(ScaffoldGraph); // With markShakyBifurcations disabled. BuildUniqueCIScaffolds(ScaffoldGraph, FALSE, FALSE); CheckEdgesAgainstOverlapper(ScaffoldGraph->ContigGraph); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } //CheckAllTrustedEdges(ScaffoldGraph); // This shouldn't be necessary (RepeatRez() calling TidyUpScaffolds() should be doing it), // but it is infrequent (at most iterMax=10 times). 
{ vector<CDS_CID_t> rawEdges; BuildSEdges(rawEdges, FALSE); MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, FALSE); } // If we've been running for 2 hours, AND we've not just // completed the last iteration, checkpoint. // if ((time(0) - ctme > 120 * 60) && (changed) && (iter+1 < iterMax)) { ctme = time(0); CheckpointScaffoldGraph(ckpNames[CHECKPOINT_DURING_INITIAL_SCAFFOLDING], "during initial scaffolding"); } iter++; } } #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) ResetContigOrientChecker(coc); AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc); #endif if(GlobalData->debugLevel > 0) DumpCIScaffolds(stderr,ScaffoldGraph, FALSE); CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_INITIAL_SCAFFOLDING], "after initial scaffolding"); } // else TidyUpScaffolds (ScaffoldGraph); // We DO want to flush unused unitigs/contigs at this point. They're not in // a scaffold, and possibly will never be used again (except as rocks/stones). // ScaffoldGraph->tigStore->flushCache(); if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_1ST_SCAFF_MERGE) == true) { CleanupScaffolds(ScaffoldGraph,FALSE, NULLINDEX, FALSE); ScaffoldSanity(ScaffoldGraph); /* First we try to merge Scaffolds agressively */ MergeScaffoldsAggressive(ScaffoldGraph, ckpNames[CHECKPOINT_DURING_1ST_SCAFF_MERGE], FALSE); CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE); #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) fprintf(stderr, "---Checking contig orders after MergeScaffoldsAggressive (1)\n\n"); CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE); #endif #ifdef CHECK_CONTIG_ORDERS_INCREMENTAL ResetContigOrientChecker(coc); AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc); #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_1ST_SCAFF_MERGE], "after 1st scaffold merge"); } // We DO want to flush unused unitigs/contigs at this point. 
They're not in // a scaffold, and possibly will never be used again (except as rocks/stones). // ScaffoldGraph->tigStore->flushCache(); /* now that we are done with initial scaffold merge, we want to use the standard/default repeatRezLevel. Up to now, the value of preMergeRezLevel was in use if set on the command line */ GlobalData->repeatRezLevel = repeatRezLevel; /* Now we throw stones */ if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_STONES) == true) && (GlobalData->stoneLevel > 0)) { // Convert single-contig scaffolds that are marginally unique back // to unplaced contigs so they might be placed as stones // // If we removed any scaffolds, rebuild all the edges. // if ((GlobalData->demoteSingletonScaffolds == true) && (DemoteSmallSingletonScaffolds() == true)) { vector<CDS_CID_t> rawEdges; BuildSEdges(rawEdges, TRUE); MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, TRUE); } ScaffoldSanity(ScaffoldGraph); Throw_Stones(GlobalData->outputPrefix, GlobalData->stoneLevel, FALSE); // Cleanup and split scaffolds. The cleanup shouldn't do anything, but it's cheap. 
CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } vector<CDS_CID_t> rawEdges; BuildSEdges(rawEdges, TRUE); MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, TRUE); ScaffoldSanity(ScaffoldGraph); #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) fprintf(stderr, "---Checking contig orders after Throw_Stones\n\n"); CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE); #endif #ifdef CHECK_CONTIG_ORDERS_INCREMENTAL ResetContigOrientChecker(coc); AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc); #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_STONES], "after stone throwing"); //GenerateLinkStats(ScaffoldGraph->CIGraph, "Stones", 0); //GeneratePlacedContigGraphStats("Stones", 0); //GenerateLinkStats(ScaffoldGraph->ContigGraph, "Stones", 0); //GenerateScaffoldGraphStats("Stones", 0); } if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_2ND_SCAFF_MERGE) == true) && (GlobalData->stoneLevel > 0)) { ScaffoldSanity(ScaffoldGraph); MergeScaffoldsAggressive(ScaffoldGraph, ckpNames[CHECKPOINT_DURING_2ND_SCAFF_MERGE], FALSE); CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE); #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) fprintf(stderr, "---Checking contig orders after MergeScaffoldsAggressive (2)\n\n"); CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE); #endif #ifdef CHECK_CONTIG_ORDERS_INCREMENTAL ResetContigOrientChecker(coc); AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc); #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_2ND_SCAFF_MERGE], "after 2nd scaffold merge"); } // We DO want to 
flush unused unitigs/contigs at this point. They're not in // a scaffold, and possibly will never be used again (except as rocks/stones). // ScaffoldGraph->tigStore->flushCache(); // The original rock throwing (above, RepeatRez()) calls TidyUpScaffolds() after each call to // Fill_Gaps(). This does CleanupAScaffold() and LeastSquaresGapEstimates(). The it rebuilds // scaffold edges (but not contig edges). It's not been tested here, so we don't do it yet. if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_FINAL_ROCKS) == true) && (GlobalData->repeatRezLevel > 0)) { int32 extra_rocks = 0; int32 iter = 0; do { // Zero means to rebuild the hopeless scaffold array - e.g., try all scaffolds again. // Before this, it was using iter, but iter was never changed from zero. extra_rocks = Fill_Gaps(GlobalData->outputPrefix, GlobalData->repeatRezLevel, 0); fprintf(stderr, "Threw additional %d rocks on iter %d\n", extra_rocks, iter++); #if 0 CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } vector<CDS_CID_t> rawEdges; BuildSEdges(rawEdges, FALSE); MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, FALSE); #endif //ScaffoldGraph->tigStore->flushCache(); } while (extra_rocks > 1); // // XXX do we need least squares here? 
// #if 1 fprintf(stderr, "Beta - LeastSquaresGapEstimates #1 after final rocks\n"); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_FINAL_ROCKS], "after final rocks"); } if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_PARTIAL_STONES) == true) && (GlobalData->stoneLevel > 0)) { ScaffoldSanity (ScaffoldGraph); int partial_stones = Throw_Stones(GlobalData->outputPrefix, GlobalData->stoneLevel, TRUE); // // XXX do we need least squares here? // #if 1 fprintf(stderr, "Beta - LeastSquaresGapEstimates #2 after partial stones\n"); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } #endif // If throw_stones splits scaffolds, rebuild edges { vector<CDS_CID_t> rawEdges; BuildSEdges(rawEdges, TRUE); MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, TRUE); } ScaffoldSanity (ScaffoldGraph); //ScaffoldGraph->tigStore->flushCache(); fprintf (stderr, "Threw %d partial stones\n", partial_stones); #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) fprintf(stderr, "---Checking contig orders after partial_stones\n\n"); CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE); #endif #ifdef CHECK_CONTIG_ORDERS_INCREMENTAL ResetContigOrientChecker(coc); AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc); #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_PARTIAL_STONES], "after partial stones"); //GenerateLinkStats (ScaffoldGraph->CIGraph, 
"PStones", 0); //GeneratePlacedContigGraphStats ("PStones", 0); //GenerateLinkStats(ScaffoldGraph->ContigGraph, "PStones", 0); //GenerateScaffoldGraphStats ("PStones", 0); } if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_FINAL_CONTAINED_STONES) == true) && (GlobalData->stoneLevel > 0)) { ScaffoldSanity (ScaffoldGraph); int contained_stones = Toss_Contained_Stones (GlobalData->outputPrefix, GlobalData->stoneLevel, 0); fprintf(stderr, "Threw %d contained stones\n", contained_stones); fprintf (stderr, "**** Finished Final Contained Stones level %d ****\n", GlobalData->stoneLevel); // Merge contigs before fiddling with gap sizes. CleanupScaffolds (ScaffoldGraph, FALSE, NULLINDEX, FALSE); // // XXX do we need least squares here? // #if 1 fprintf(stderr, "Beta - LeastSquaresGapEstimates #3 after contained stones\n"); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } #endif ScaffoldSanity (ScaffoldGraph); // Remove copies of surrogates which are placed multiple times in the same place in a contig RemoveSurrogateDuplicates(); #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) fprintf(stderr, "---Checking contig orders after contained_stones\n\n"); CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE); #endif #ifdef CHECK_CONTIG_ORDERS_INCREMENTAL ResetContigOrientChecker(coc); AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc); #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_FINAL_CONTAINED_STONES], "after final contained stones"); //GenerateLinkStats (ScaffoldGraph->CIGraph, "CStones", 0); //GeneratePlacedContigGraphStats ("CStones", 0); //GenerateLinkStats(ScaffoldGraph->ContigGraph, "CStones", 0); //GenerateScaffoldGraphStats ("CStones", 
0); } // We DO want to flush unused unitigs/contigs at this point. They're not in // a scaffold, and possibly will never be used again (except as rocks/stones). // ScaffoldGraph->tigStore->flushCache(); if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_FINAL_CLEANUP) == true) { // Try to cleanup failed merges, and if we do, generate a checkpoint if(CleanupFailedMergesInScaffolds(ScaffoldGraph)){ // This call deletes surrogate-only contigs that failed to merge if(CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, TRUE)){ #if defined(CHECK_CONTIG_ORDERS) || defined(CHECK_CONTIG_ORDERS_INCREMENTAL) fprintf(stderr, "---Checking contig orders after final cleanup\n\n"); CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE); #endif } // // XXX do we need least squares here? // #if 1 fprintf(stderr, "Beta - LeastSquaresGapEstimates #4 after final cleanup\n"); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_FINAL_CLEANUP], "after final cleanup"); } } if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_RESOLVE_SURROGATES) == true) && (doResolveSurrogates > 0)) { resolveSurrogates(placeAllFragsInSinglePlacedSurros, cutoffToInferSingleCopyStatus); // Call resolve surrogate twice, this is necessary for finishing (closure) reads. // Consider a closure read and its two bounding reads, named left and right: // If one (right) is placed in a unique region while the other (left) is in a surrogate itself, the closure read cannot be placed // However, once the surrogate bounding read is placed (and fully incorporated which happens at the very end of resolveSurrogates) // the closure read can be placed. 
// Therefore, we run resolve surrogates twice. // Note that is closure reads are themselves mated, it may be necessary to do a third round of placement. resolveSurrogates(placeAllFragsInSinglePlacedSurros, cutoffToInferSingleCopyStatus); // // XXX do we need least squares here? // #if 1 fprintf(stderr, "Beta - LeastSquaresGapEstimates #5 after resolve surrogates\n"); for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) { CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID); if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split)) ScaffoldSanity(ScaffoldGraph, scaffold); } #endif CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_RESOLVE_SURROGATES], "after resolve surrogates"); } // This generates the 'rezlog/gapreads' file. It's hugely // expensive, usually dies on a negative variance assert, and as // far as BPW knows, unused. // //Show_Reads_In_Gaps (GlobalData->outputPrefix); ComputeMatePairStatisticsRestricted(SCAFFOLD_OPERATIONS, GlobalData->minSamplesForOverride, "scaffold_final"); ComputeMatePairStatisticsRestricted(CONTIG_OPERATIONS, GlobalData->minSamplesForOverride, "contig_final"); GenerateCIGraph_U_Stats(); GenerateLinkStats(ScaffoldGraph->CIGraph,"final",0); GeneratePlacedContigGraphStats("final",0); GenerateLinkStats(ScaffoldGraph->ContigGraph,"final",0); GenerateScaffoldGraphStats("final",0); GenerateSurrogateStats("final"); #ifdef DEBUG int j = 0; for (j = 0; j < GetNumVA_CIFragT(ScaffoldGraph->CIFrags); j++) { CIFragT * frag = GetCIFragT(ScaffoldGraph->CIFrags, j); if (ScaffoldGraph->gkpStore->gkStore_getFRGtoPLC(frag->read_iid) != 0) { AS_UID uid = getGatekeeperIIDtoUID(ScaffoldGraph->gkpStore, frag->read_iid, AS_IID_FRG); if (frag->contigID != -1) { ChunkInstanceT * ctg = GetGraphNode(ScaffoldGraph->ContigGraph, frag->contigID); fprintf(stderr, "CLOSURE_READS: CLOSURE READ %s PLACED=%d CHAFF=%d SINGLETON=%d IN ASM type %c in SCF %d\n", 
AS_UID_toString(uid), frag->flags.bits.isPlaced, frag->flags.bits.isChaff, frag->flags.bits.isSingleton, frag->type, ctg->scaffoldID); } } } #endif // We DO want to flush unused unitigs/contigs at this point. They're not in // a scaffold, and possibly will never be used again (except as rocks/stones). // // (This assumes that output doesn't load unitigs/contigs again) // ScaffoldGraph->tigStore->flushCache(); SetCIScaffoldTLengths(ScaffoldGraph); if(generateOutput){ CelamyAssembly(GlobalData->outputPrefix); MarkContigEdges(); ComputeMatePairDetailedStatus(); // Note that OutputContigs partitions the tigStore, and closes ScaffoldGraph->tigStore. The // only operation valid after this function is CheckpointScaffoldGraph(). OutputUnitigsFromMultiAligns(); OutputContigsFromMultiAligns(outputFragsPerPartition, preserveConsensus); CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_OUTPUT], "after output"); } DestroyScaffoldGraph(ScaffoldGraph); delete GlobalData; fprintf(stderr,"* Bye *\n"); exit(0); }
int main (int argc, char *argv[]) { FILE *asmFile = NULL; char *outputPrefix = NULL; char outputName[FILENAME_MAX] = {0}; int32 checkpointVers = 0; int32 tigStoreVers = 0; uint64 uidStart = 0; int32 outputScaffolds = FALSE; GlobalData = new Globals_CGW(); argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-g") == 0) { strcpy(GlobalData->gkpStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-t") == 0) { strcpy(GlobalData->tigStoreName, argv[++arg]); tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { strcpy(GlobalData->outputPrefix, argv[++arg]); checkpointVers = atoi(argv[++arg]); outputScaffolds = TRUE; } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-s") == 0) { uidStart = strtoul(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-n") == 0) { SYS_UIDset_euid_namespace(argv[++arg]); } else if (strcmp(argv[arg], "-E") == 0) { SYS_UIDset_euid_server(argv[++arg]); } else if (strcmp(argv[arg], "-h") == 0) { err++; } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((GlobalData->gkpStoreName[0] == 0) || (GlobalData->tigStoreName[0] == 0) || (err)) { fprintf(stderr, "usage: %s -g gkpStore [-o prefix] [-s firstUID] [-n namespace] [-E server] [-h]\n", argv[0]); fprintf(stderr, " -g gkpStore mandatory path to the gkpStore\n"); fprintf(stderr, " -t tigStore version mandatory path to the tigStore and version\n"); fprintf(stderr, " -c checkpoint version optional path to a checkpoint and version\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o prefix write the output here\n"); fprintf(stderr, "\n"); fprintf(stderr, " -s firstUID don't use real UIDs, but start counting from here\n"); fprintf(stderr, " -n namespace use this UID namespace\n"); fprintf(stderr, " -E server use this UID server\n"); exit(1); } sprintf(outputName, "%s.asm", outputPrefix); errno = 0; asmFile = fopen(outputName, "w"); if 
(errno) fprintf(stderr, "%s: Couldn't open '%s' for write: %s\n", argv[0], outputName, strerror(errno)), exit(1); // if we have contigs if (outputScaffolds) { LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, checkpointVers, FALSE); } else { ScaffoldGraph = CreateScaffoldGraph(outputPrefix); } // Reopen the tigStore used for consensus. delete ScaffoldGraph->tigStore; ScaffoldGraph->tigStore = new MultiAlignStore(GlobalData->tigStoreName, tigStoreVers, 0, 0, FALSE, FALSE); fprintf(stderr, "Writing assembly file\n"); writeMDI(asmFile, true); writeAFG(asmFile, true, outputScaffolds); // If uidStart is zero, use the UID server; otherwise, initialize the 'fake uid server' to start // there, or after the last UID used by a fragment. // uidServer = UIDserverInitialize(256, (uidStart == 0) ? 0 : MAX(uidMin, uidStart)); writeAMP(asmFile, true); writeUTG(asmFile, true, outputScaffolds); writeULK(asmFile, true); writeCCO(asmFile, true); writeCLK(asmFile, true); writeSCF(asmFile, true); writeSLK(asmFile, true); fclose(asmFile); fprintf(stderr, "Assembly file complete.\n"); fprintf(stderr, "Writing IID to UID mapping files.\n"); sprintf(outputName, "%s.iidtouid", outputPrefix); errno = 0; asmFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "%s: Couldn't open '%s' for write: %s\n", argv[0], outputName, strerror(errno)), exit(1); FRGmap.dump("FRG", asmFile); UTGmap.dump("UTG", asmFile); CCOmap.dump("CTG", asmFile); SCFmap.dump("SCF", asmFile); fclose(asmFile); fprintf(stderr, "IID to UID mapping files complete.\n"); DestroyScaffoldGraph(ScaffoldGraph); return(0); }
int main( int argc, char **argv) { int ckptNum = NULLINDEX; int makeMiniScaffolds = 1; uint64 uidStart = 1230000; UIDserver *uids = NULL; GlobalData = new Globals_CGW(); argc = AS_configure(argc, argv); int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-p") == 0) { ckptNum = GlobalData->setPrefix(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { strcpy(GlobalData->outputPrefix, argv[++arg]); } else if (strcmp(argv[arg], "-g") == 0) { strcpy(GlobalData->gkpStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-t") == 0) { strcpy(GlobalData->tigStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { ckptNum = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-U") == 0) { uidStart = 0; } else if (strcmp(argv[arg], "-S") == 0) { makeMiniScaffolds = 0; } else { fprintf(stderr, "unknown option '%s'\n", argv[arg]); err = 1; } arg++; } if ((GlobalData->outputPrefix[0] == 0) || (GlobalData->gkpStoreName[0] == 0)) { fprintf(stderr, "usage: %s [[-p prefix] | [-c name -g gkpstore -n ckptNum]] [-U] [-S]\n", argv[0]); fprintf(stderr, " -p Attempt to locate the last checkpoint in directory 7-CGW.\n"); fprintf(stderr, " -c Look for checkpoints in 'name'\n"); fprintf(stderr, " -g Path to gkpStore\n"); fprintf(stderr, " -n Checkpoint number to load\n"); fprintf(stderr, " -U Use real UIDs for miniscaffolds, otherwise, UIDs start at 1230000\n"); fprintf(stderr, " -S Do NOT make mini scaffolds.\n"); exit(1); } uids = UIDserverInitialize(256, uidStart); char *toprint = (char *)safe_malloc(sizeof(char) * (AS_READ_MAX_NORMAL_LEN + 51 + AS_READ_MAX_NORMAL_LEN + 2)); LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, ckptNum, FALSE); int ifrag; for (ifrag=0; ifrag < GetNumVA_CIFragT(ScaffoldGraph->CIFrags); ifrag++) { CIFragT *frag = GetCIFragT(ScaffoldGraph->CIFrags, ifrag); CIFragT *mate = NULL; if (frag->flags.bits.isDeleted) continue; assert(frag->cid != NULLINDEX); assert((frag->flags.bits.hasMate == 0) || (frag->mate_iid != 0)); // Fix for 
missing mates -- OBT used to not delete mate links, leaving // dangling mates. Somebody else seems to be doing this too. // if (frag->flags.bits.hasMate) { mate = GetCIFragT(ScaffoldGraph->CIFrags, frag->mate_iid); if (mate == NULL) frag->flags.bits.hasMate = 0; } // If this fragment is not chaff, we have nothing to do here. // if (GetGraphNode(ScaffoldGraph->CIGraph,frag->cid)->flags.bits.isChaff == 0) continue; // Print a singleton if there is no mate, the mate isn't chaff, // or we were told to not make miniscaffolds. // if ((mate == NULL) || (mate->flags.bits.isChaff == 0) || (makeMiniScaffolds == 0)) { AS_UID fUID = getFragmentClear(frag->read_iid, 0, toprint); AS_UTL_writeFastA(stdout, toprint, strlen(toprint), 0, ">%s /type=singleton\n", AS_UID_toString(fUID)); } else if ((mate != NULL) && (mate->flags.bits.isChaff == 1) && (makeMiniScaffolds == 1) && (frag->read_iid < mate->read_iid)) { // make sure the following chain of Ns is divisible by three; // the exact length is arbitrary but Doug Rusch points out that // by making it divisible by 3, we can get lucky and maintain // the phase of a protein ... which helps in the // auto-annotation of environmental samples AS_UID fUID = getFragmentClear(frag->read_iid, 0, toprint); strcat(toprint, "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"); AS_UID mUID = getFragmentClear(mate->read_iid, 1, toprint + strlen(toprint)); AS_UTL_writeFastA(stdout, toprint, strlen(toprint), 0, ">"F_U64" /type=mini_scaffold /frgs=(%s,%s)\n", getUID(uids), AS_UID_toString(fUID), AS_UID_toString(mUID)); } } delete GlobalData; exit(0); }
int main(int argc, char **argv) { char *gkpPath = 0L; char *merCountsFile = 0L; merylStreamReader *MF = 0L; uint32 maxCount = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-m") == 0) { merCountsFile = argv[++arg]; } else { fprintf(stderr, "unknown option '%s'\n", argv[arg]); err++; } arg++; } if ((merCountsFile == 0L) || (err)) { fprintf(stderr, "usage: %s -m mercounts\n", argv[0]); fprintf(stderr, " -m mercounts file of mercounts\n"); exit(1); } MF = new merylStreamReader(merCountsFile); // Examine the counts, pick a reasonable upper limit. uint64 totalUsefulDistinct = MF->numberOfDistinctMers() - MF->numberOfUniqueMers(); uint64 totalUsefulAll = MF->numberOfTotalMers() - MF->numberOfUniqueMers(); uint64 distinct = 0; uint64 total = 0; uint32 Xcoverage = 8; fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers()); fprintf(stderr, "unique: "F_U64"\n", MF->numberOfUniqueMers()); fprintf(stderr, "total: "F_U64"\n", MF->numberOfTotalMers()); // Pass 0: try to deduce the X coverage we have. The // pattern we should see in mer counts is an initial spike // for unique mers (these contain errors), then a drop into // a valley, and a bump at the X coverage. // // . // . ... // .. .......... // ................. // // If this pattern is not found, we fallback to the default // guess of 8x coverage. 
// uint32 i = 0; uint32 iX = 0; fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers()); fprintf(stderr, "unique: "F_U64"\n", MF->numberOfUniqueMers()); fprintf(stderr, "total: "F_U64"\n", MF->numberOfTotalMers()); fprintf(stderr, "Xcoverage zero 1 0 "F_U64"\n", MF->histogram(1)); for (i=2; (i < MF->histogramLength()) && (MF->histogram(i-1) > MF->histogram(i)); i++) fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(i-1), MF->histogram(i)); iX = i - 1; for (; i < MF->histogramLength(); i++) { if (MF->histogram(iX) < MF->histogram(i)) { fprintf(stderr, "Xcoverage incr "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i)); iX = i; } else { //fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i)); } } fprintf(stderr, "Guessed X coverage is "F_U32"\n", iX); Xcoverage = iX; // Pass 1: look for a reasonable limit, using %distinct and %total. // for (i=2; (i < MF->histogramLength()) && (maxCount == 0); i++) { distinct += MF->histogram(i); total += MF->histogram(i) * i; // If we cover 99% of all the distinct mers, that's reasonable. // if ((distinct / (double)totalUsefulDistinct) > 0.99) maxCount = i; // If we're a somewhat high count, and we're covering 2/3 // of the total mers, assume that there are lots of // errors (or polymorphism) that are preventing us from // covering many distinct mers. // if ((i > 25 * Xcoverage) && ((total / (double)totalUsefulAll) > (2.0 / 3.0))) maxCount = i; } fprintf(stderr, "Set maxCount to "F_U32", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n", i, 100.0 * distinct / totalUsefulDistinct, 100.0 * total / totalUsefulAll); // Pass 2: if the limit is relatively small compared to our // guessed Xcoverage, and %total is high, keep going to // close 75% of the gap in total coverage. So if the TC is // 90%, we'd keep going until TC is 97.5%. // // If we're WAY low compared to X coverage, close the gap // too, but not as much. 
This only happens if we're // covering 99% of the distinct, so we're already in good // shape. The genome doesn't appear to be very repetitive. // if (((maxCount < 5 * Xcoverage)) || ((maxCount < 50 * Xcoverage) && (total / (double)totalUsefulAll > 0.90))) { double closeAmount = 0.75; if (total / (double)totalUsefulAll <= 0.90) closeAmount = 0.5; // No, really. This is just 0.75 * (1-TC) + TC double desiredTC = closeAmount + (1 - closeAmount) * total / (double)totalUsefulAll; for (; (i < MF->histogramLength()) && (total / (double)totalUsefulAll < desiredTC); i++) { distinct += MF->histogram(i); total += MF->histogram(i) * i; } maxCount = i; fprintf(stderr, "Reset maxCount to "F_U32", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n", maxCount, 100.0 * distinct / totalUsefulDistinct, 100.0 * total / totalUsefulAll); } fprintf(stdout, F_U32"\n", maxCount); return(0); }
int main(int argc, char **argv) { int illegal; argc = AS_configure(argc, argv); G.initialize(); int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { G.Doing_Partial_Overlaps = TRUE; } else if (strcmp(argv[arg], "-h") == 0) { AS_UTL_decodeRange(argv[++arg], G.bgnHashID, G.endHashID); } else if (strcmp(argv[arg], "-H") == 0) { AS_UTL_decodeRange(argv[++arg], G.minLibToHash, G.maxLibToHash); } else if (strcmp(argv[arg], "-r") == 0) { AS_UTL_decodeRange(argv[++arg], G.bgnRefID, G.endRefID); } else if (strcmp(argv[arg], "-R") == 0) { AS_UTL_decodeRange(argv[++arg], G.minLibToRef, G.maxLibToRef); } else if (strcmp(argv[arg], "-k") == 0) { arg++; if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) || (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) { G.Kmer_Len = strtoull(argv[arg], NULL, 10); } else { errno = 0; G.Kmer_Skip_File = fopen(argv[arg], "r"); if (errno) fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1); } } else if (strcmp(argv[arg], "-l") == 0) { G.Frag_Olap_Limit = strtol(argv[++arg], NULL, 10); if (G.Frag_Olap_Limit < 1) G.Frag_Olap_Limit = UINT64_MAX; } else if (strcmp(argv[arg], "-m") == 0) { G.Unique_Olap_Per_Pair = FALSE; } else if (strcmp(argv[arg], "-u") == 0) { G.Unique_Olap_Per_Pair = TRUE; } else if (strcmp(argv[arg], "--hashbits") == 0) { G.Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashstrings") == 0) { G.Max_Hash_Strings = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashdatalen") == 0) { G.Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashload") == 0) { G.Max_Hash_Load = atof(argv[++arg]); } else if (strcmp(argv[arg], "--maxreadlen") == 0) { // Quite the gross way to do this, but simple. 
uint32 desired = strtoul(argv[++arg], NULL, 10); OFFSET_BITS = 1; while (((uint32)1 << OFFSET_BITS) < desired) OFFSET_BITS++; STRING_NUM_BITS = 30 - OFFSET_BITS; STRING_NUM_MASK = (1 << STRING_NUM_BITS) - 1; OFFSET_MASK = (1 << OFFSET_BITS) - 1; MAX_STRING_NUM = STRING_NUM_MASK; } else if (strcmp(argv[arg], "-o") == 0) { G.Outfile_Name = argv[++arg]; } else if (strcmp(argv[arg], "-s") == 0) { G.Outstat_Name = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { G.Num_PThreads = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--minlength") == 0) { G.Min_Olap_Len = strtol (argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--maxerate") == 0) { G.maxErate = ceil(strtof(argv[++arg], NULL) * 100) / 100; } else if (strcmp(argv[arg], "-w") == 0) { G.Use_Window_Filter = TRUE; } else if (strcmp(argv[arg], "-z") == 0) { G.Use_Hopeless_Check = FALSE; } else { if (G.Frag_Store_Path == NULL) { G.Frag_Store_Path = argv[arg]; } else { fprintf(stderr, "Unknown option '%s'\n", argv[arg]); err++; } } arg++; } // Fix up some flags if we're allowing high error rates. // if (G.maxErate > 0.06) { if (G.Use_Window_Filter) fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n"); G.Use_Window_Filter = FALSE; G.Use_Hopeless_Check = FALSE; } if (G.Max_Hash_Strings == 0) fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++; if (G.Kmer_Len == 0) fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++; if (G.Max_Hash_Strings > MAX_STRING_NUM) fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++; if (G.Outfile_Name == NULL) fprintf (stderr, "ERROR: No output file name specified\n"), err++; if ((err) || (G.Frag_Store_Path == NULL)) { fprintf(stderr, "USAGE: %s [options] <gkpStorePath>\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "-b <fn> in contig mode, specify the output file\n"); fprintf(stderr, "-c contig mode. Use 2 frag stores. 
First is\n"); fprintf(stderr, " for reads; second is for contigs\n"); fprintf(stderr, "-G do partial overlaps\n"); fprintf(stderr, "-h <range> to specify fragments to put in hash table\n"); fprintf(stderr, " Implies LSF mode (no changes to frag store)\n"); fprintf(stderr, "-I designate a file of frag iids to limit olaps to\n"); fprintf(stderr, " (Contig mode only)\n"); fprintf(stderr, "-k if one or two digits, the length of a kmer, otherwise\n"); fprintf(stderr, " the filename containing a list of kmers to ignore in\n"); fprintf(stderr, " the hash table\n"); fprintf(stderr, "-l specify the maximum number of overlaps per\n"); fprintf(stderr, " fragment-end per batch of fragments.\n"); fprintf(stderr, "-m allow multiple overlaps per oriented fragment pair\n"); fprintf(stderr, "-M specify memory size. Valid values are '8GB', '4GB',\n"); fprintf(stderr, " '2GB', '1GB', '256MB'. (Not for Contig mode)\n"); fprintf(stderr, "-o specify output file name\n"); fprintf(stderr, "-P write protoIO output (if not -G)\n"); fprintf(stderr, "-r <range> specify old fragments to overlap\n"); fprintf(stderr, "-t <n> use <n> parallel threads\n"); fprintf(stderr, "-u allow only 1 overlap per oriented fragment pair\n"); fprintf(stderr, "-w filter out overlaps with too many errors in a window\n"); fprintf(stderr, "-z skip the hopeless check\n"); fprintf(stderr, "\n"); fprintf(stderr, "--maxerate <n> only output overlaps with fraction <n> or less error (e.g., 0.06 == 6%%)\n"); fprintf(stderr, "--minlength <n> only output overlaps of <n> or more bases\n"); fprintf(stderr, "\n"); fprintf(stderr, "--hashbits n Use n bits for the hash mask.\n"); fprintf(stderr, "--hashstrings n Load at most n strings into the hash table at one time.\n"); fprintf(stderr, "--hashdatalen n Load at most n bytes into the hash table at one time.\n"); fprintf(stderr, "--hashload f Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n"); fprintf(stderr, "\n"); fprintf(stderr, "--maxreadlen n For batches with all short 
reads, pack bits differently to\n"); fprintf(stderr, " process more reads per batch.\n"); fprintf(stderr, " all reads must be shorter than n\n"); fprintf(stderr, " --hashstrings limited to 2^(30-m)\n"); fprintf(stderr, " Common values:\n"); fprintf(stderr, " maxreadlen 2048->hashstrings 524288 (default)\n"); fprintf(stderr, " maxreadlen 512->hashstrings 2097152\n"); fprintf(stderr, " maxreadlen 128->hashstrings 8388608\n"); fprintf(stderr, "\n"); fprintf(stderr, "--readsperbatch n Force batch size to n.\n"); fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n"); fprintf(stderr, "\n"); exit(1); } Out_BOF = new ovFile(G.Outfile_Name, ovFileFullWrite); // We know enough now to set the hash function variables, and some other random variables. HSF1 = G.Kmer_Len - (G.Hash_Mask_Bits / 2); HSF2 = 2 * G.Kmer_Len - G.Hash_Mask_Bits; SV1 = HSF1 + 2; SV2 = (HSF1 + HSF2) / 2; SV3 = HSF2 - 2; // Log parameters. fprintf(stderr, "\n"); fprintf(stderr, "STRING_NUM_BITS "F_U32"\n", STRING_NUM_BITS); fprintf(stderr, "OFFSET_BITS "F_U32"\n", OFFSET_BITS); fprintf(stderr, "STRING_NUM_MASK "F_U64"\n", STRING_NUM_MASK); fprintf(stderr, "OFFSET_MASK "F_U64"\n", OFFSET_MASK); fprintf(stderr, "MAX_STRING_NUM "F_U64"\n", MAX_STRING_NUM); fprintf(stderr, "\n"); fprintf(stderr, "Hash_Mask_Bits "F_U32"\n", G.Hash_Mask_Bits); fprintf(stderr, "Max_Hash_Strings "F_U32"\n", G.Max_Hash_Strings); fprintf(stderr, "Max_Hash_Data_Len "F_U64"\n", G.Max_Hash_Data_Len); fprintf(stderr, "Max_Hash_Load %f\n", G.Max_Hash_Load); fprintf(stderr, "Kmer Length "F_U64"\n", G.Kmer_Len); fprintf(stderr, "Min Overlap Length %d\n", G.Min_Olap_Len); fprintf(stderr, "Max Error Rate %f\n", G.maxErate); fprintf(stderr, "\n"); fprintf(stderr, "Num_PThreads "F_U32"\n", G.Num_PThreads); assert (8 * sizeof (uint64) > 2 * G.Kmer_Len); Bit_Equivalent['a'] = Bit_Equivalent['A'] = 0; Bit_Equivalent['c'] = Bit_Equivalent['C'] = 1; Bit_Equivalent['g'] = Bit_Equivalent['G'] = 2; Bit_Equivalent['t'] = 
Bit_Equivalent['T'] = 3; for (int i = 0; i < 256; i ++) { char ch = tolower ((char) i); if (ch == 'a' || ch == 'c' || ch == 'g' || ch == 't') Char_Is_Bad[i] = 0; else Char_Is_Bad[i] = 1; } fprintf(stderr, "\n"); fprintf(stderr, "HASH_TABLE_SIZE "F_U32"\n", HASH_TABLE_SIZE); fprintf(stderr, "sizeof(Hash_Bucket_t) "F_SIZE_T"\n", sizeof(Hash_Bucket_t)); fprintf(stderr, "hash table size: "F_SIZE_T" MB\n", (HASH_TABLE_SIZE * sizeof(Hash_Bucket_t)) >> 20); fprintf(stderr, "\n"); Hash_Table = new Hash_Bucket_t [HASH_TABLE_SIZE]; fprintf(stderr, "check "F_SIZE_T" MB\n", (HASH_TABLE_SIZE * sizeof (Check_Vector_t) >> 20)); fprintf(stderr, "info "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (Hash_Frag_Info_t) >> 20)); fprintf(stderr, "start "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (int64) >> 20)); fprintf(stderr, "\n"); Hash_Check_Array = new Check_Vector_t [HASH_TABLE_SIZE]; String_Info = new Hash_Frag_Info_t [G.Max_Hash_Strings]; String_Start = new int64 [G.Max_Hash_Strings]; String_Start_Size = G.Max_Hash_Strings; memset(Hash_Check_Array, 0, sizeof(Check_Vector_t) * HASH_TABLE_SIZE); memset(String_Info, 0, sizeof(Hash_Frag_Info_t) * G.Max_Hash_Strings); memset(String_Start, 0, sizeof(int64) * G.Max_Hash_Strings); OverlapDriver(); delete [] basesData; delete [] qualsData; delete [] nextRef; delete [] String_Start; delete [] String_Info; delete [] Hash_Check_Array; delete [] Hash_Table; delete Out_BOF; FILE *stats = stderr; if (G.Outstat_Name != NULL) { errno = 0; stats = fopen(G.Outstat_Name, "w"); if (errno) { fprintf(stderr, "WARNING: failed to open '%s' for writing: %s\n", G.Outstat_Name, strerror(errno)); stats = stderr; } } fprintf(stats, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct); fprintf(stats, " Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct); fprintf(stats, " Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct); fprintf(stats, " Total overlaps produced = "F_S64"\n", Total_Overlaps); fprintf(stats, " Contained overlaps = 
"F_S64"\n", Contained_Overlap_Ct); fprintf(stats, " Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct); fprintf(stats, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct); fprintf(stats, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct); if (stats != stderr) fclose(stats); return(0); }
int main (int argc, char **argv) { char tmpName[FILENAME_MAX] = {0}; char *gkpName = NULL; char *tigName = NULL; int32 tigVers = -1; int32 tigPart = -1; int64 ctgBgn = -1; int64 ctgEnd = -1; char *ctgName = NULL; char *outName = NULL; char *inName = NULL; bool forceCompute = false; int32 numFailures = 0; int32 numSkipped = 0; bool useUnitig = false; bool showResult = false; CNS_Options options = { CNS_OPTIONS_SPLIT_ALLELES_DEFAULT, CNS_OPTIONS_MIN_ANCHOR_DEFAULT, CNS_OPTIONS_DO_PHASING_DEFAULT }; // Comminucate to MultiAlignment_CNS.c that we are doing consensus and not cgw. thisIsConsensus = 1; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-g") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); tigPart = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { AS_UTL_decodeRange(argv[++arg], ctgBgn, ctgEnd); } else if (strcmp(argv[arg], "-T") == 0) { ctgName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { outName = argv[++arg]; } else if (strcmp(argv[arg], "-I") == 0) { inName = argv[++arg]; } else if (strcmp(argv[arg], "-f") == 0) { forceCompute = true; } else if (strcmp(argv[arg], "-U") == 0) { useUnitig = true; } else if (strcmp(argv[arg], "-v") == 0) { showResult = true; } else if (strcmp(argv[arg], "-V") == 0) { VERBOSE_MULTIALIGN_OUTPUT++; } else if (strcmp(argv[arg], "-w") == 0) { options.smooth_win = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-P") == 0) { options.do_phasing = atoi(argv[++arg]); } else { fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((err) || (gkpName == NULL) || (tigName == NULL)) { fprintf(stderr, "usage: %s -g gkpStore -t tigStore version partition [opts]\n", argv[0]); fprintf(stderr, " -c b Compute only contig ID 'b' (must be in the correct partition!)\n"); fprintf(stderr, " -c b-e Compute only contigs from ID 'b' to ID 'e'\n"); fprintf(stderr, 
"\n"); fprintf(stderr, " -T file Test the computation of the contig layout in 'file'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -f Recompute contigs that already have a multialignment\n"); fprintf(stderr, "\n"); fprintf(stderr, " -U Reuse the unitig consensus for contigs with only a single\n"); fprintf(stderr, " unitig (EXPERIMENTAL!)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -O file Don't update tigStore, dump a binary file instead.\n"); fprintf(stderr, " -I file Import binary file into tigStore\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v Show multialigns.\n"); fprintf(stderr, " -V Enable debugging option 'verbosemultialign'.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -w ws Smoothing window size\n"); fprintf(stderr, "\n"); exit(1); } // Open both stores for read only. gkpStore = new gkStore(gkpName, false, false); tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, false, false, false); gkpStore->gkStore_loadPartition(tigPart); // Decide on what to compute. Either all contigs, or a single contig, or a special case test. uint32 b = 0; uint32 e = tigStore->numContigs(); if (ctgBgn != -1) { b = ctgBgn; e = ctgEnd + 1; } FORCE_UNITIG_ABUT = 1; if (ctgName != NULL) { errno = 0; FILE *F = fopen(ctgName, "r"); if (errno) fprintf(stderr, "Failed to open input contig file '%s': %s\n", ctgName, strerror(errno)), exit(1); MultiAlignT *ma = CreateEmptyMultiAlignT(); bool isUnitig = false; while (LoadMultiAlignFromHuman(ma, isUnitig, F) == true) { if (ma->maID < 0) ma->maID = (isUnitig) ? tigStore->numUnitigs() : tigStore->numContigs(); if (MultiAlignContig(ma, gkpStore, &options)) { if (showResult) PrintMultiAlignT(stdout, ma, gkpStore, false, false, AS_READ_CLEAR_LATEST); } else { fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", ma->maID); numFailures++; } } DeleteMultiAlignT(ma); b = e = 0; } // Reopen for writing, if we have work to do. 
if (((inName) || (b < e)) && (outName == NULL)) { delete tigStore; tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, true, false, true); } if (inName) { importFromFile(inName, tigPart); b = e = 0; } // Now the usual case. Iterate over all contigs, compute and update. for (uint32 i=b; i<e; i++) { MultiAlignT *cma = tigStore->loadMultiAlign(i, false); if (cma == NULL) { // Not in our partition, or deleted. continue; } bool exists = (cma->consensus != NULL) && (GetNumchars(cma->consensus) > 1); if ((forceCompute == false) && (exists == true)) { // Already finished contig consensus. fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - already computed, skipped\n", cma->maID, cma->data.num_unitigs, cma->data.num_frags); numSkipped++; tigStore->unloadMultiAlign(cma->maID, false); continue; } int32 uID = GetIntUnitigPos(cma->u_list, 0)->ident; // If this is a surrogate, we CANNOT reuse the unitig. We need to process the contig so that // the unplaced reads are stripped out. A surrogate should have different contig and unitig // IDs; we could also check the contig status. if ((cma->data.num_unitigs == 1) && (cma->maID == uID) && (useUnitig == true)) { fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - reusing unitig %d consensus\n", cma->maID, cma->data.num_unitigs, cma->data.num_frags, uID); MultiAlignT *uma = tigStore->loadMultiAlign(uID, true); uma->data = cma->data; tigStore->unloadMultiAlign(cma->maID, false); if (outName) writeToOutFile(outName, tigPart, uma); else tigStore->insertMultiAlign(uma, false, false); tigStore->unloadMultiAlign(uma->maID, true); continue; } fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments)%s\n", cma->maID, cma->data.num_unitigs, cma->data.num_frags, (exists) ? 
" - already computed, recomputing" : ""); if (MultiAlignContig(cma, gkpStore, &options)) { if (outName) writeToOutFile(outName, tigPart, cma); else tigStore->insertMultiAlign(cma, false, true); if (showResult) PrintMultiAlignT(stdout, cma, gkpStore, false, false, AS_READ_CLEAR_LATEST); tigStore->unloadMultiAlign(cma->maID, false); } else { fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", cma->maID); numFailures++; } } delete tigStore; fprintf(stderr, "\n"); fprintf(stderr, "NumColumnsInUnitigs = %d\n", NumColumnsInUnitigs); fprintf(stderr, "NumGapsInUnitigs = %d\n", NumGapsInUnitigs); fprintf(stderr, "NumRunsOfGapsInUnitigReads = %d\n", NumRunsOfGapsInUnitigReads); fprintf(stderr, "NumColumnsInContigs = %d\n", NumColumnsInContigs); fprintf(stderr, "NumGapsInContigs = %d\n", NumGapsInContigs); fprintf(stderr, "NumRunsOfGapsInContigReads = %d\n", NumRunsOfGapsInContigReads); fprintf(stderr, "NumAAMismatches = %d\n", NumAAMismatches); fprintf(stderr, "NumVARRecords = %d\n", NumVARRecords); fprintf(stderr, "NumVARStringsWithFlankingGaps = %d\n", NumVARStringsWithFlankingGaps); fprintf(stderr, "NumUnitigRetrySuccess = %d\n", NumUnitigRetrySuccess); fprintf(stderr, "\n"); if (numFailures) { fprintf(stderr, "WARNING: Total number of contig failures = %d\n", numFailures); fprintf(stderr, "\n"); fprintf(stderr, "Consensus did NOT finish successfully.\n"); return(1); } fprintf(stderr, "Consensus finished successfully. Bye.\n"); return(0); }
int main (int argc, char **argv) { char *gkpName = NULL; char *tigName = NULL; int32 tigVers = -1; vector<char *> tigInputs; tgStoreType tigType = tgStoreModify; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-T") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-L") == 0) { AS_UTL_loadFileList(argv[++arg], tigInputs); } else if (strcmp(argv[arg], "-n") == 0) { tigType = tgStoreReadOnly; } else if (AS_UTL_fileExists(argv[arg])) { tigInputs.push_back(argv[arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((err) || (gkpName == NULL) || (tigName == NULL) || (tigInputs.size() == 0)) { fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v> [input.cns]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G <gkpStore> Path to the gatekeeper store\n"); fprintf(stderr, " -T <tigStore> <v> Path to the tigStore and version to add tigs to\n"); fprintf(stderr, "\n"); fprintf(stderr, " -L <file-of-files> Load the tig(s) from files listed in 'file-of-files'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -n Don't replace, just report what would have happened\n"); fprintf(stderr, "\n"); fprintf(stderr, " The primary operation is to replace tigs in the store with ones in a set of input files.\n"); fprintf(stderr, " The input files can be either supplied directly on the command line or listed in\n"); fprintf(stderr, " a text file (-L).\n"); fprintf(stderr, "\n"); fprintf(stderr, " A new store is created if one doesn't exist, otherwise, whatever tigs are there are\n"); fprintf(stderr, " replaced with those in the -R file. If version 'v' doesn't exist, it is created.\n"); fprintf(stderr, "\n"); fprintf(stderr, " Even if -n is supplied, a new store is created if one doesn't exist.\n"); fprintf(stderr, "\n"); fprintf(stderr, " To add a new tig, give it a tig id of -1. 
New tigs must be added to the latest version.\n"); fprintf(stderr, " To delete a tig, remove all children, and set the number of them to zero.\n"); fprintf(stderr, "\n"); if (gkpName == NULL) fprintf(stderr, "ERROR: no gatekeeper store (-G) supplied.\n"); if (tigName == NULL) fprintf(stderr, "ERROR: no tig store (-T) supplied.\n"); if (tigInputs.size() == 0) fprintf(stderr, "ERROR: no input tigs (-R) supplied.\n"); exit(1); } // If the store doesn't exist, create one, and make a bunch of versions if (AS_UTL_fileExists(tigName, true, false) == false) { fprintf(stderr, "Creating tig store '%s' version %d\n", tigName, tigVers); tgStore *tigStore = new tgStore(tigName); for (int32 vv=1; vv<tigVers; vv++) tigStore->nextVersion(); delete tigStore; } gkStore *gkpStore = gkStore::gkStore_open(gkpName); tgStore *tigStore = new tgStore(tigName, tigVers, tigType); tgTig *tig = new tgTig; for (uint32 ff=0; ff<tigInputs.size(); ff++) { errno = 0; FILE *TI = fopen(tigInputs[ff], "r"); if (errno) fprintf(stderr, "Failed to open '%s': %s\n", tigInputs[ff], strerror(errno)), exit(1); fprintf(stderr, "Reading layouts from '%s'.\n", tigInputs[ff]); while (tig->loadFromStreamOrLayout(TI) == true) { // Handle insertion. if (tig->numberOfChildren() > 0) { //fprintf(stderr, "INSERTING tig %d\n", tig->tigID()); tigStore->insertTig(tig, false); continue; } // Deleted already? if (tigStore->isDeleted(tig->tigID()) == true) { //fprintf(stderr, "DELETING tig %d -- ALREADY DELETED\n", tig->tigID()); continue; } // Really delete it then. //fprintf(stderr, "DELETING tig %d\n", tig->tigID()); tigStore->deleteTig(tig->tigID()); } fclose(TI); fprintf(stderr, "Reading layouts from '%s' completed.\n", tigInputs[ff]); } delete tig; delete tigStore; gkpStore->gkStore_close(); exit(0); }
int main(int argc, char **argv) { char *gkpStoreName = NULL; char *outPrefix = NULL; char *clrName = NULL; uint32 libToDump = 0; uint32 bgnID = 1; uint32 endID = UINT32_MAX; bool dumpAllReads = false; bool dumpAllBases = false; bool dumpOnlyDeleted = false; bool dumpFASTQ = true; bool dumpFASTA = false; bool withLibName = true; argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpStoreName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-c") == 0) { clrName = argv[++arg]; } else if (strcmp(argv[arg], "-l") == 0) { libToDump = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-b") == 0) { bgnID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-e") == 0) { endID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-r") == 0) { bgnID = atoi(argv[++arg]); endID = bgnID; } else if (strcmp(argv[arg], "-allreads") == 0) { dumpAllReads = true; } else if (strcmp(argv[arg], "-allbases") == 0) { dumpAllBases = true; } else if (strcmp(argv[arg], "-onlydeleted") == 0) { dumpOnlyDeleted = true; dumpAllReads = true; // Otherwise we won't report the deleted reads! } else if (strcmp(argv[arg], "-fastq") == 0) { dumpFASTQ = true; dumpFASTA = false; } else if (strcmp(argv[arg], "-fasta") == 0) { dumpFASTQ = false; dumpFASTA = true; } else if (strcmp(argv[arg], "-nolibname") == 0) { withLibName = false; } else { err++; fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); } arg++; } if (gkpStoreName == NULL) err++; if (outPrefix == NULL) err++; if (err) { fprintf(stderr, "usage: %s [...] 
-o fastq-prefix -g gkpStore\n", argv[0]); fprintf(stderr, " -G gkpStore\n"); fprintf(stderr, " -o fastq-prefix write files fastq-prefix.(libname).fastq, ...\n"); fprintf(stderr, "\n"); fprintf(stderr, " -l libToDump output only read in library number libToDump (NOT IMPLEMENTED)\n"); fprintf(stderr, " -b id output starting at read 'id'\n"); fprintf(stderr, " -e id output stopping after read 'id'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -c clearFile clear range file from OBT modules\n"); fprintf(stderr, " -allreads if a clear range file, lower case mask the deleted reads\n"); fprintf(stderr, " -allbases if a clear range file, lower case mask the non-clear bases\n"); fprintf(stderr, " -onlydeleted if a clear range file, only output deleted reads (the entire read)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -r id output only the single read 'id'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -fastq output is FASTQ format (with extension .fastq, default)\n"); fprintf(stderr, " -fasta output is FASTA format (with extension .fasta)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -nolibname don't include the library name in the output file name\n"); fprintf(stderr, "\n"); if (gkpStoreName == NULL) fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n"); if (outPrefix == NULL) fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n"); exit(1); } gkStore *gkpStore = new gkStore(gkpStoreName); uint32 numReads = gkpStore->gkStore_getNumReads(); uint32 numLibs = gkpStore->gkStore_getNumLibraries(); clearRangeFile *clrRange = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore); if (bgnID < 1) bgnID = 1; if (numReads < endID) endID = numReads; if (endID < bgnID) fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID); fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID); libOutput **out = new libOutput * [numLibs + 1]; // Allocate outputs. 
If withLibName == false, all reads will artificially be in lib zero, the // other files won't ever be created. Otherwise, the zeroth file won't ever be created. out[0] = new libOutput(outPrefix, NULL); for (uint32 i=1; i<=numLibs; i++) out[i] = new libOutput(outPrefix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName()); // Grab a new readData, and iterate through reads to dump. gkReadData *readData = new gkReadData; for (uint32 rid=bgnID; rid<=endID; rid++) { gkRead *read = gkpStore->gkStore_getRead(rid); uint32 libID = (withLibName == false) ? 0 : read->gkRead_libraryID(); uint32 lclr = 0; uint32 rclr = read->gkRead_sequenceLength(); bool ignore = false; //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID); // If a clear range file is supplied, grab the clear range. If it hasn't been set, the default // is the entire read. if (clrRange) { lclr = clrRange->bgn(rid); rclr = clrRange->end(rid); ignore = clrRange->isDeleted(rid); } // Abort if we're not dumping anything from this read // - not in a library we care about // - deleted, and not dumping all reads // - not deleted, but only reporting deleted reads if (((libToDump != 0) && (libID == libToDump)) || ((dumpAllReads == false) && (ignore == true)) || ((dumpOnlyDeleted == true) && (ignore == false))) continue; // And if we're told to ignore the read, and here, then the read was deleted and we're printing // all reads. Reset the clear range to the whole read, the clear range is invalid. if (ignore) { rclr = read->gkRead_sequenceLength(); lclr = 0; } // Grab the sequence and quality. gkpStore->gkStore_loadReadData(read, readData); char *seq = readData->gkReadData_getSequence(); char *qlt = readData->gkReadData_getQualities(); uint32 len = rclr - lclr; // Soft mask not-clear bases if (dumpAllBases == true) { for (uint32 i=0; i<lclr; i++) seq[i] += (seq[i] >= 'A') ? 
'a' - 'A' : 0; for (uint32 i=lclr; i<rclr; i++) seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a'; for (uint32 i=rclr; seq[i]; i++) seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0; rclr = read->gkRead_sequenceLength(); lclr = 0; } // Chop off the ends we're not printing. seq += lclr; seq[len] = 0; qlt[len] = 0; // And print the read. if (dumpFASTQ) AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, len, qlt, len, "@"F_U32" clr="F_U32","F_U32"\n", rid, lclr, rclr); if (dumpFASTA) AS_UTL_writeFastA(out[libID]->getFASTA(), seq, len, 0, ">"F_U32" clr="F_U32","F_U32"\n", rid, lclr, rclr); } delete readData; for (uint32 i=1; i<=numLibs; i++) delete out[i]; delete [] out; delete gkpStore; exit(0); }
int main(int argc, char **argv) { argc = AS_configure(argc, argv); merylArgs *args = new merylArgs(argc, argv); gkpStoreFile::registerFile(); gkpStoreChain::registerFile(); switch (args->personality) { case 'P': estimate(args); break; case 'B': build(args); break; case 'd': dumpDistanceBetweenMers(args); break; case 't': dumpThreshold(args); break; case 'p': dumpPositions(args); break; case 'c': countUnique(args); break; case 'h': plotHistogram(args); break; case PERSONALITY_MIN: case PERSONALITY_MINEXIST: case PERSONALITY_MAX: case PERSONALITY_MAXEXIST: case PERSONALITY_ADD: case PERSONALITY_AND: case PERSONALITY_NAND: case PERSONALITY_OR: case PERSONALITY_XOR: multipleOperations(args); break; case PERSONALITY_SUB: case PERSONALITY_ABS: case PERSONALITY_DIVIDE: binaryOperations(args); break; case PERSONALITY_LEQ: case PERSONALITY_GEQ: case PERSONALITY_EQ: unaryOperations(args); break; default: args->usage(); fprintf(stderr, "%s: unknown personality. Specify -P, -B, -S or -M!\n", args->execName); exit(1); break; } delete args; return(0); }
int main(int argc, char **argv) { coParameters *G = new coParameters(); argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { G->gkpStorePath = argv[++arg]; } else if (strcmp(argv[arg], "-R") == 0) { G->bgnID = atoi(argv[++arg]); G->endID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-O") == 0) { // -F? -S Olap_Path G->ovlStorePath = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { G->errorRate = atof(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { G->minOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { // For 'corrections' file input G->correctionsName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { // For 'erates' output G->eratesName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { // But we're not threaded! G->numThreads = atoi(argv[++arg]); } else { err++; } arg++; } if (G->gkpStorePath == NULL) fprintf(stderr, "ERROR: no input gatekeeper store (-G) supplied.\n"), err++; if (G->ovlStorePath == NULL) fprintf(stderr, "ERROR: no input overlap store (-O) supplied.\n"), err++; if (G->correctionsName == NULL) fprintf(stderr, "ERROR: no input read corrections file (-c) supplied.\n"), err++; if (G->eratesName == NULL) fprintf(stderr, "ERROR: no output erates file (-o) supplied.\n"), err++; if (err) { fprintf(stderr, "USAGE: %s [-d <dna-file>] [-o <ovl_file>] [-q <quality>]\n", argv[0]); fprintf(stderr, " [-x <del_file>] [-F OlapFile] [-S OlapStore]\n"); fprintf(stderr, " [-c <cgb_file>] [-e <erate_file>\n"); fprintf(stderr, " <gkpStore> <CorrectFile> <lo> <hi>\n"); fprintf(stderr, "\n"); fprintf(stderr, "Recalculates overlaps for frags <lo> .. 
<hi> in\n"); fprintf(stderr, " <gkpStore> using corrections in <CorrectFile> \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "-e <erate-file> specifies binary file to dump corrected erates to\n"); fprintf(stderr, " for later updating of olap store by update-erates \n"); fprintf(stderr, "-F specify file of sorted overlaps to use (in the format\n"); fprintf(stderr, " produced by get-olaps\n"); fprintf(stderr, "-o <ovl_file> specifies name of file to which OVL messages go\n"); fprintf(stderr, "-q <quality> overlaps less than this error rate are\n"); fprintf(stderr, " automatically output\n"); fprintf(stderr, "-S specify the binary overlap store containing overlaps to use\n"); exit(1); } //fprintf (stderr, "Quality Threshold = %.2f%%\n", 100.0 * Quality_Threshold); // // Initialize Globals // fprintf(stderr, "Initializing.\n"); double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN); Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS); for (int32 i=0; i <= AS_MAX_READLEN; i++) G->Error_Bound[i] = (int)ceil(i * G->errorRate); // // // fprintf(stderr, "Opening gkpStore '%s'.\n", G->gkpStorePath); gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath); if (G->bgnID < 1) G->bgnID = 1; if (gkpStore->gkStore_getNumReads() < G->endID) G->endID = gkpStore->gkStore_getNumReads(); // Load the reads for the overlaps we are going to be correcting, and apply corrections to them fprintf(stderr, "Correcting reads "F_U32" to "F_U32".\n", G->bgnID, G->endID); Correct_Frags(G, gkpStore); // Load overlaps we're going to correct fprintf(stderr, "Loading overlaps.\n"); Read_Olaps(G, gkpStore); // Now sort them on the B iid. 
fprintf(stderr, "Sorting overlaps.\n"); #ifdef _GLIBCXX_PARALLEL __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID()); #else sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID()); #endif // Recompute overlaps fprintf(stderr, "Recomputing overlaps.\n"); Redo_Olaps(G, gkpStore); gkpStore->gkStore_close(); gkpStore = NULL; // Sort the overlaps back into the original order fprintf(stderr, "Sorting overlaps.\n"); #ifdef _GLIBCXX_PARALLEL __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order()); #else sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order()); #endif // Dump the new erates fprintf (stderr, "Saving corrected error rates to file %s\n", G->eratesName); { errno = 0; FILE *fp = fopen(G->eratesName, "w"); if (errno) fprintf(stderr, "Failed to open '%s': %s\n", G->eratesName, strerror(errno)), exit(1); AS_UTL_safeWrite(fp, &G->bgnID, "loid", sizeof(int32), 1); AS_UTL_safeWrite(fp, &G->endID, "hiid", sizeof(int32), 1); AS_UTL_safeWrite(fp, &G->olapsLen, "num", sizeof(uint64), 1); fprintf(stderr, "--Allocate "F_U64" MB for output error rates.\n", (sizeof(uint16) * G->olapsLen) >> 20); uint16 *evalue = new uint16 [G->olapsLen]; for (int32 i=0; i<G->olapsLen; i++) evalue[i] = G->olaps[i].evalue; AS_UTL_safeWrite(fp, evalue, "evalue", sizeof(uint16), G->olapsLen); delete [] evalue; fclose(fp); } // Finished. //fprintf (stderr, "%d/%d failed/total alignments (%.1f%%)\n", // Failed_Alignments_Ct, Total_Alignments_Ct, // Total_Alignments_Ct == 0 ? 0.0 : (100.0 * Failed_Alignments_Ct) / Total_Alignments_Ct); delete G; fprintf(stderr, "DONE.\n"); exit(0); }
int main(int argc, char **argv) { char *gkpStoreName = NULL; char *outPrefix = NULL; uint32 minReadLength = 0; uint32 firstFileArg = 0; char errorLogName[FILENAME_MAX]; char htmlLogName[FILENAME_MAX]; char nameMapName[FILENAME_MAX]; argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-o") == 0) { gkpStoreName = argv[++arg]; } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "--") == 0) { firstFileArg = arg++; break; } else if (argv[arg][0] == '-') { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); err++; } else { firstFileArg = arg; break; } arg++; } if (gkpStoreName == NULL) err++; if (firstFileArg == 0) err++; if (err) { fprintf(stderr, "usage: %s [...] -o gkpStore\n", argv[0]); fprintf(stderr, " -o gkpStore create this gkpStore\n"); fprintf(stderr, " \n"); fprintf(stderr, " -minlength L discard reads shorter than L\n"); fprintf(stderr, " \n"); fprintf(stderr, " \n"); if (gkpStoreName == NULL) fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n"); if (firstFileArg == 0) fprintf(stderr, "ERROR: no input files supplied.\n"); exit(1); } gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName, gkStore_extend); gkRead *gkpRead = NULL; gkLibrary *gkpLibrary = NULL; uint32 gkpFileID = 0; // Used for HTML output, an ID for each file loaded. 
uint32 inLineLen = 1024; char inLine[1024] = { 0 }; validSeq['a'] = validSeq['c'] = validSeq['g'] = validSeq['t'] = validSeq['n'] = 1; validSeq['A'] = validSeq['C'] = validSeq['G'] = validSeq['T'] = validSeq['N'] = 1; errno = 0; sprintf(errorLogName, "%s/errorLog", gkpStoreName); FILE *errorLog = fopen(errorLogName, "w"); if (errno) fprintf(stderr, "ERROR: cannot open error file '%s': %s\n", errorLogName, strerror(errno)), exit(1); sprintf(htmlLogName, "%s/load.dat", gkpStoreName); FILE *htmlLog = fopen(htmlLogName, "w"); if (errno) fprintf(stderr, "ERROR: cannot open uid map file '%s': %s\n", htmlLogName, strerror(errno)), exit(1); sprintf(nameMapName, "%s/readNames.txt", gkpStoreName); FILE *nameMap = fopen(nameMapName, "w"); if (errno) fprintf(stderr, "ERROR: cannot open uid map file '%s': %s\n", nameMapName, strerror(errno)), exit(1); uint32 nERROR = 0; // There aren't any errors, we just exit fatally if encountered. uint32 nWARNS = 0; uint32 nLOADED = 0; // Reads loaded uint64 bLOADED = 0; // Bases loaded uint32 nSKIPPED = 0; uint64 bSKIPPED = 0; // Bases not loaded, too short #if 0 fprintf(htmlLog, "<!DOCTYPE html>\n"); fprintf(htmlLog, "<html>\n"); fprintf(htmlLog, "<head>\n"); fprintf(htmlLog, "<title>gatekeeper load statistics</title>\n"); fprintf(htmlLog, "<style type='text/css'>\n"); fprintf(htmlLog, "body { font-family: Helvetica, Verdana, sans-serif; }\n"); fprintf(htmlLog, "h1, h2 { color: #ee3e80; }\n"); fprintf(htmlLog, "p { color: #665544; }\n"); fprintf(htmlLog, "th, td { border: 1px solid #111111; padding: 2px 2px 2px 2px; }\n"); fprintf(htmlLog, "td:hover { background-color: #e4e4e4; }\n"); fprintf(htmlLog, "th:hover { background-color: #d4d4d4; }\n"); fprintf(htmlLog, "tr.details { visibility: collapse; }\n"); fprintf(htmlLog, "</style>\n"); fprintf(htmlLog, "</head>\n"); fprintf(htmlLog, "<body>\n"); fprintf(htmlLog, "<h2>Input Files</h2>\n"); fprintf(htmlLog, "<table>\n"); #endif for (; firstFileArg < argc; firstFileArg++) { fprintf(stderr, 
"\n"); fprintf(stderr, "Starting file '%s'.\n", argv[firstFileArg]); compressedFileReader *inFile = new compressedFileReader(argv[firstFileArg]); char *line = new char [10240]; KeyAndValue keyval; while (fgets(line, 10240, inFile->file()) != NULL) { chomp(line); keyval.find(line); if (keyval.key() == NULL) { // No key, so must be a comment or blank line continue; } if (strcasecmp(keyval.key(), "name") == 0) { gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value()); continue; } // We'd better have a gkpLibrary defined, if not, the .gkp input file is incorrect. if (gkpLibrary == NULL) { fprintf(stderr, "WARNING: no 'name' tag in gkp input; creating library with name 'DEFAULT'.\n"); gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value()); nWARNS++; } if (strcasecmp(keyval.key(), "preset") == 0) { gkpLibrary->gkLibrary_parsePreset(keyval.value()); } else if (strcasecmp(keyval.key(), "qv") == 0) { gkpLibrary->gkLibrary_setDefaultQV(keyval.value_double()); } else if (strcasecmp(keyval.key(), "isNonRandom") == 0) { gkpLibrary->gkLibrary_setIsNonRandom(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "trustHomopolymerRuns") == 0) { gkpLibrary->gkLibrary_setTrustHomopolymerRuns(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "removeDuplicateReads") == 0) { gkpLibrary->gkLibrary_setRemoveDuplicateReads(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "finalTrim") == 0) { gkpLibrary->gkLibrary_setFinalTrim(keyval.value()); } else if (strcasecmp(keyval.key(), "removeSpurReads") == 0) { gkpLibrary->gkLibrary_setRemoveSpurReads(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "removeChimericReads") == 0) { gkpLibrary->gkLibrary_setRemoveChimericReads(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "checkForSubReads") == 0) { gkpLibrary->gkLibrary_setCheckForSubReads(keyval.value_bool()); } else if (AS_UTL_fileExists(keyval.key(), false, false)) { loadReads(gkpStore, gkpLibrary, gkpFileID++, minReadLength, nameMap, 
htmlLog, errorLog, keyval.key(), nWARNS, nLOADED, bLOADED, nSKIPPED, bSKIPPED); } else { fprintf(stderr, "ERROR: option '%s' not recognized, and not a file of reads.\n", line); exit(1); } } delete inFile; delete [] line; } #if 0 fprintf(htmlLog, "</table>\n"); #endif gkpStore->gkStore_close(); fclose(nameMap); fclose(errorLog); fprintf(stderr, "\n"); fprintf(stderr, "Finished with:\n"); fprintf(stderr, " "F_U32" warnings (bad base or qv)\n", nWARNS); fprintf(stderr, "\n"); fprintf(stderr, "Read from inputs:\n"); fprintf(stderr, " "F_U64" bp.\n", bLOADED); fprintf(stderr, " "F_U32" reads.\n", nLOADED); fprintf(stderr, "\n"); fprintf(stderr, "Loaded into store:\n"); fprintf(stderr, " "F_U64" bp.\n", bLOADED); fprintf(stderr, " "F_U32" reads.\n", nLOADED); fprintf(stderr, "\n"); fprintf(stderr, "Skipped (too short):\n"); fprintf(stderr, " "F_U64" bp (%.4f%%).\n", bSKIPPED, 100.0 * bSKIPPED / (bSKIPPED + bLOADED)); fprintf(stderr, " "F_U32" reads (%.4f%%).\n", nSKIPPED, 100.0 * nSKIPPED / (nSKIPPED + nLOADED)); fprintf(stderr, "\n"); fprintf(stderr, "\n"); #if 0 fprintf(htmlLog, "\n"); fprintf(htmlLog, "<h2>Final Store</h2>\n"); fprintf(htmlLog, "<table>\n"); fprintf(htmlLog, "<tr><td colspan='2'>%s</td></tr>\n", gkpStoreName); fprintf(htmlLog, "<tr><td>readsLoaded</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADED, bLOADED); fprintf(htmlLog, "<tr><td>readsSkipped</td><td>"F_U32" reads ("F_U64" bp) (read was too short)</td></tr>\n", nSKIPPED, bSKIPPED); fprintf(htmlLog, "<tr><td>warnings</td><td>"F_U32" warnings (invalid base or quality value)</td></tr>\n", nWARNS); fprintf(htmlLog, "</table>\n"); fprintf(htmlLog, "\n"); fprintf(htmlLog, "<script type='text/javascript'>\n"); fprintf(htmlLog, "var toggleOne = function() {\n"); fprintf(htmlLog, " var table = this.closest('table');\n"); fprintf(htmlLog, " var elts = table.querySelectorAll('.details');\n"); fprintf(htmlLog, "\n"); fprintf(htmlLog, " for (var i=0; i<elts.length; i++) {\n"); fprintf(htmlLog, " if 
(!elts[i].enabled) {\n"); fprintf(htmlLog, " elts[i].enabled = true;\n"); fprintf(htmlLog, " elts[i].style.visibility = 'visible';\n"); fprintf(htmlLog, " } else {\n"); fprintf(htmlLog, " elts[i].enabled = false;\n"); fprintf(htmlLog, " elts[i].style.visibility = 'collapse';\n"); fprintf(htmlLog, " }\n"); fprintf(htmlLog, " }\n"); fprintf(htmlLog, "}\n"); fprintf(htmlLog, "\n"); for (uint32 ii=0; ii<gkpFileID; ii++) { fprintf(htmlLog, "document.getElementById('gkpload%u').onclick = toggleOne;\n", ii); fprintf(htmlLog, "document.getElementById('gkpload%u').style = 'cursor: pointer;';\n", ii); } fprintf(htmlLog, "</script>\n"); fprintf(htmlLog, "\n"); fprintf(htmlLog, "</body>\n"); fprintf(htmlLog, "</html>\n"); #else fprintf(htmlLog, "sum "F_U32" "F_U64" "F_U32" "F_U64" "F_U32"\n", nLOADED, bLOADED, nSKIPPED, bSKIPPED, nWARNS); #endif fclose(htmlLog); if (nERROR > 0) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many errors.\n"); if (bSKIPPED > 0.25 * (bSKIPPED + bLOADED)) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many bases skipped. Check your reads.\n"); if (nWARNS > 0.25 * (nLOADED)) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many warnings. Check your reads.\n"); if (nSKIPPED > 0.50 * (nLOADED)) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many short reads. Check your reads!\n"); if ((nERROR > 0) || (bSKIPPED > 0.25 * (bSKIPPED + bLOADED)) || (nWARNS > 0.25 * (nSKIPPED + nLOADED)) || (nSKIPPED > 0.50 * (nSKIPPED + nLOADED))) exit(1); fprintf(stderr, "gatekeeperCreate finished successfully.\n"); exit(0); }
int main (int argc, char *argv[]) { char *asmFileName = NULL; char *tigStoreName = NULL; uint32 tigStoreVers = 2; int minLength = DEFAULT_UNITIG_LENGTH; int numInstances = DEFAULT_NUM_INSTANCES; int distanceToEnds = DEFAULT_DISTANCE_TO_ENDS; uint32 numToggled = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-a") == 0) { asmFileName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigStoreName = argv[++arg]; tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { numInstances = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-d") == 0) { distanceToEnds = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (minLength <= 0) err++; if (numInstances < 0) err++; if (distanceToEnds <= 0) err++; if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) { fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -a asmFile path to the assembly .asm file\n"); fprintf(stderr, " -t tigStore version path to the tigStore and version to modify\n"); fprintf(stderr, " -l minLength minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH); fprintf(stderr, " -n numInstances number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES); fprintf(stderr, " -d distanceToEnd max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS); fprintf(stderr, "\n"); fprintf(stderr, " Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n"); fprintf(stderr, " 1. the unitig meets all the -l, -n and -d conditions\n"); fprintf(stderr, " 2. When -n = 0, all surrogate unitigs with more than one read\n"); fprintf(stderr, " 3. 
the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n"); exit(1); } HashTable_AS *UIDtoIID = CreateScalarHashTable_AS(); HashTable_AS *CTGtoFirstUTG = CreateScalarHashTable_AS(); HashTable_AS *CTGtoLastUTG = CreateScalarHashTable_AS(); VA_TYPE(int32) *unitigLength = CreateVA_int32(8192); VA_TYPE(uint32) *surrogateCount = CreateVA_uint32(8192); VA_TYPE(uint32) *surrogateAtScaffoldEnds = CreateVA_uint32(8192); GenericMesg *pmesg; FILE *infp = fopen(asmFileName, "r"); while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) { SnapUnitigMesg *utg = NULL; SnapConConMesg *ctg = NULL; SnapScaffoldMesg *scf = NULL; uint32 count = 0; uint32 forward = TRUE; uint32 lastCtg = 0; switch(pmesg->t) { case MESG_UTG: utg = (SnapUnitigMesg*)(pmesg->m); Setint32(unitigLength, utg->iaccession, &utg->length); if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) { // store the mapping for this unitig's UID to IID and initialize it's instance counter at 0 count = 0; InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0); Setuint32(surrogateCount, utg->iaccession, &count); } break; case MESG_CCO: ctg = (SnapConConMesg *)(pmesg->m); for (int32 i = 0; i < ctg->num_unitigs; i++) { // increment the surrogate unitigs instance counter if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) { uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)); assert(ret != NULL); (*ret)++; // store first surrogate in a contig if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) { InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } // also store the last if ((ctg->length - 
MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) { ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } } } break; case MESG_SCF: scf = (SnapScaffoldMesg *)(pmesg->m); count = scf->iaccession; if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) { forward = FALSE; } lastCtg = MAX(scf->num_contig_pairs - 1, 0); // All four cases below follow the same pattern // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX) // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds) // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 2. 
Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 4. 
Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } break; default: break; } } fclose(infp); uint32 *ret = NULL; uint32 *atScfEnd = NULL; // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially) // this also removes any partitioning MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE); for (uint32 i = 0; i < tigStore->numUnitigs(); i++) { uint32 *ret = Getuint32(surrogateCount, i); uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i); uint32 *length = Getuint32(unitigLength, i); bool toggled = false; if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) { toggled = TRUE; } // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) { toggled = TRUE; } // special case, mark non-singleton unitigs as unique if we are given no instances else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) { toggled = TRUE; } if (toggled) { tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE); numToggled++; } } DeleteHashTable_AS(UIDtoIID); DeleteHashTable_AS(CTGtoFirstUTG); DeleteHashTable_AS(CTGtoLastUTG); 
delete tigStore; fprintf(stderr, "Toggled %d\n", numToggled); return 0; }