int main(int argc, char **argv) { char *gkpStoreName = NULL; char *outPrefix = NULL; char *clrName = NULL; uint32 libToDump = 0; uint32 bgnID = 1; uint32 endID = UINT32_MAX; bool dumpAllReads = false; bool dumpAllBases = false; bool dumpOnlyDeleted = false; bool dumpFASTQ = true; bool dumpFASTA = false; bool withLibName = true; argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpStoreName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-c") == 0) { clrName = argv[++arg]; } else if (strcmp(argv[arg], "-l") == 0) { libToDump = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-b") == 0) { bgnID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-e") == 0) { endID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-r") == 0) { bgnID = atoi(argv[++arg]); endID = bgnID; } else if (strcmp(argv[arg], "-allreads") == 0) { dumpAllReads = true; } else if (strcmp(argv[arg], "-allbases") == 0) { dumpAllBases = true; } else if (strcmp(argv[arg], "-onlydeleted") == 0) { dumpOnlyDeleted = true; dumpAllReads = true; // Otherwise we won't report the deleted reads! } else if (strcmp(argv[arg], "-fastq") == 0) { dumpFASTQ = true; dumpFASTA = false; } else if (strcmp(argv[arg], "-fasta") == 0) { dumpFASTQ = false; dumpFASTA = true; } else if (strcmp(argv[arg], "-nolibname") == 0) { withLibName = false; } else { err++; fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); } arg++; } if (gkpStoreName == NULL) err++; if (outPrefix == NULL) err++; if (err) { fprintf(stderr, "usage: %s [...] 
-o fastq-prefix -g gkpStore\n", argv[0]); fprintf(stderr, " -G gkpStore\n"); fprintf(stderr, " -o fastq-prefix write files fastq-prefix.(libname).fastq, ...\n"); fprintf(stderr, "\n"); fprintf(stderr, " -l libToDump output only read in library number libToDump (NOT IMPLEMENTED)\n"); fprintf(stderr, " -b id output starting at read 'id'\n"); fprintf(stderr, " -e id output stopping after read 'id'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -c clearFile clear range file from OBT modules\n"); fprintf(stderr, " -allreads if a clear range file, lower case mask the deleted reads\n"); fprintf(stderr, " -allbases if a clear range file, lower case mask the non-clear bases\n"); fprintf(stderr, " -onlydeleted if a clear range file, only output deleted reads (the entire read)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -r id output only the single read 'id'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -fastq output is FASTQ format (with extension .fastq, default)\n"); fprintf(stderr, " -fasta output is FASTA format (with extension .fasta)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -nolibname don't include the library name in the output file name\n"); fprintf(stderr, "\n"); if (gkpStoreName == NULL) fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n"); if (outPrefix == NULL) fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n"); exit(1); } gkStore *gkpStore = new gkStore(gkpStoreName); uint32 numReads = gkpStore->gkStore_getNumReads(); uint32 numLibs = gkpStore->gkStore_getNumLibraries(); clearRangeFile *clrRange = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore); if (bgnID < 1) bgnID = 1; if (numReads < endID) endID = numReads; if (endID < bgnID) fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID); fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID); libOutput **out = new libOutput * [numLibs + 1]; // Allocate outputs. 
If withLibName == false, all reads will artificially be in lib zero, the // other files won't ever be created. Otherwise, the zeroth file won't ever be created. out[0] = new libOutput(outPrefix, NULL); for (uint32 i=1; i<=numLibs; i++) out[i] = new libOutput(outPrefix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName()); // Grab a new readData, and iterate through reads to dump. gkReadData *readData = new gkReadData; for (uint32 rid=bgnID; rid<=endID; rid++) { gkRead *read = gkpStore->gkStore_getRead(rid); uint32 libID = (withLibName == false) ? 0 : read->gkRead_libraryID(); uint32 lclr = 0; uint32 rclr = read->gkRead_sequenceLength(); bool ignore = false; //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID); // If a clear range file is supplied, grab the clear range. If it hasn't been set, the default // is the entire read. if (clrRange) { lclr = clrRange->bgn(rid); rclr = clrRange->end(rid); ignore = clrRange->isDeleted(rid); } // Abort if we're not dumping anything from this read // - not in a library we care about // - deleted, and not dumping all reads // - not deleted, but only reporting deleted reads if (((libToDump != 0) && (libID == libToDump)) || ((dumpAllReads == false) && (ignore == true)) || ((dumpOnlyDeleted == true) && (ignore == false))) continue; // And if we're told to ignore the read, and here, then the read was deleted and we're printing // all reads. Reset the clear range to the whole read, the clear range is invalid. if (ignore) { rclr = read->gkRead_sequenceLength(); lclr = 0; } // Grab the sequence and quality. gkpStore->gkStore_loadReadData(read, readData); char *seq = readData->gkReadData_getSequence(); char *qlt = readData->gkReadData_getQualities(); uint32 len = rclr - lclr; // Soft mask not-clear bases if (dumpAllBases == true) { for (uint32 i=0; i<lclr; i++) seq[i] += (seq[i] >= 'A') ? 
'a' - 'A' : 0; for (uint32 i=lclr; i<rclr; i++) seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a'; for (uint32 i=rclr; seq[i]; i++) seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0; rclr = read->gkRead_sequenceLength(); lclr = 0; } // Chop off the ends we're not printing. seq += lclr; seq[len] = 0; qlt[len] = 0; // And print the read. if (dumpFASTQ) AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, len, qlt, len, "@"F_U32" clr="F_U32","F_U32"\n", rid, lclr, rclr); if (dumpFASTA) AS_UTL_writeFastA(out[libID]->getFASTA(), seq, len, 0, ">"F_U32" clr="F_U32","F_U32"\n", rid, lclr, rclr); } delete readData; for (uint32 i=1; i<=numLibs; i++) delete out[i]; delete [] out; delete gkpStore; exit(0); }
int main( int argc, char **argv) { int ckptNum = NULLINDEX; int makeMiniScaffolds = 1; uint64 uidStart = 1230000; UIDserver *uids = NULL; GlobalData = new Globals_CGW(); argc = AS_configure(argc, argv); int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-p") == 0) { ckptNum = GlobalData->setPrefix(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { strcpy(GlobalData->outputPrefix, argv[++arg]); } else if (strcmp(argv[arg], "-g") == 0) { strcpy(GlobalData->gkpStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-t") == 0) { strcpy(GlobalData->tigStoreName, argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { ckptNum = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-U") == 0) { uidStart = 0; } else if (strcmp(argv[arg], "-S") == 0) { makeMiniScaffolds = 0; } else { fprintf(stderr, "unknown option '%s'\n", argv[arg]); err = 1; } arg++; } if ((GlobalData->outputPrefix[0] == 0) || (GlobalData->gkpStoreName[0] == 0)) { fprintf(stderr, "usage: %s [[-p prefix] | [-c name -g gkpstore -n ckptNum]] [-U] [-S]\n", argv[0]); fprintf(stderr, " -p Attempt to locate the last checkpoint in directory 7-CGW.\n"); fprintf(stderr, " -c Look for checkpoints in 'name'\n"); fprintf(stderr, " -g Path to gkpStore\n"); fprintf(stderr, " -n Checkpoint number to load\n"); fprintf(stderr, " -U Use real UIDs for miniscaffolds, otherwise, UIDs start at 1230000\n"); fprintf(stderr, " -S Do NOT make mini scaffolds.\n"); exit(1); } uids = UIDserverInitialize(256, uidStart); char *toprint = (char *)safe_malloc(sizeof(char) * (AS_READ_MAX_NORMAL_LEN + 51 + AS_READ_MAX_NORMAL_LEN + 2)); LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, ckptNum, FALSE); int ifrag; for (ifrag=0; ifrag < GetNumVA_CIFragT(ScaffoldGraph->CIFrags); ifrag++) { CIFragT *frag = GetCIFragT(ScaffoldGraph->CIFrags, ifrag); CIFragT *mate = NULL; if (frag->flags.bits.isDeleted) continue; assert(frag->cid != NULLINDEX); assert((frag->flags.bits.hasMate == 0) || (frag->mate_iid != 0)); // Fix for 
missing mates -- OBT used to not delete mate links, leaving // dangling mates. Somebody else seems to be doing this too. // if (frag->flags.bits.hasMate) { mate = GetCIFragT(ScaffoldGraph->CIFrags, frag->mate_iid); if (mate == NULL) frag->flags.bits.hasMate = 0; } // If this fragment is not chaff, we have nothing to do here. // if (GetGraphNode(ScaffoldGraph->CIGraph,frag->cid)->flags.bits.isChaff == 0) continue; // Print a singleton if there is no mate, the mate isn't chaff, // or we were told to not make miniscaffolds. // if ((mate == NULL) || (mate->flags.bits.isChaff == 0) || (makeMiniScaffolds == 0)) { AS_UID fUID = getFragmentClear(frag->read_iid, 0, toprint); AS_UTL_writeFastA(stdout, toprint, strlen(toprint), 0, ">%s /type=singleton\n", AS_UID_toString(fUID)); } else if ((mate != NULL) && (mate->flags.bits.isChaff == 1) && (makeMiniScaffolds == 1) && (frag->read_iid < mate->read_iid)) { // make sure the following chain of Ns is divisible by three; // the exact length is arbitrary but Doug Rusch points out that // by making it divisible by 3, we can get lucky and maintain // the phase of a protein ... which helps in the // auto-annotation of environmental samples AS_UID fUID = getFragmentClear(frag->read_iid, 0, toprint); strcat(toprint, "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"); AS_UID mUID = getFragmentClear(mate->read_iid, 1, toprint + strlen(toprint)); AS_UTL_writeFastA(stdout, toprint, strlen(toprint), 0, ">"F_U64" /type=mini_scaffold /frgs=(%s,%s)\n", getUID(uids), AS_UID_toString(fUID), AS_UID_toString(mUID)); } } delete GlobalData; exit(0); }
//  Copy a stream of assembler messages from the -c input to the -o output, filling in
//  consensus for unitig (IUM) and contig (ICM) messages that arrive with an empty one.
//
//  For each such message, the clear range of every fragment (and, for contigs, the ungapped
//  consensus of every unitig) is written as FASTA to a temporary reads file, which is then
//  handed to updateRecord() / updateICMRecord() together with the SeqAn executable name.
//  Messages of any other type are read and dropped.
//
//  Exits with status 1 on a bad command line, if a file cannot be opened, or if an ICM
//  message is seen without -u; returns 0 otherwise.
//
//  NOTE(review): options that take a value do not check that a value is actually present.
//
int32
main(int32 argc, char **argv) {
  int32  arg            = 1;
  int32  err            = 0;

  char  *gkpStoreName   = NULL;
  int32  gkpStorePart   = 0;
  char  *msgFile        = NULL;
  char  *outputFileName = NULL;
  char  *seqAn          = NULL;
  char  *wrkDir         = NULL;
  char  *seqStoreName   = NULL;
  int32  seqStoreVer    = 0;
  int32  seqStorePart   = 0;

  argc = AS_configure(argc, argv);

  while (arg < argc) {
    if        (strcmp(argv[arg], "-c") == 0) {
      msgFile = argv[++arg];

    } else if (strcmp(argv[arg], "-G") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-S") == 0) {
      gkpStorePart = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputFileName = argv[++arg];

    } else if (strcmp(argv[arg], "-s") == 0) {
      seqAn = argv[++arg];

    } else if (strcmp(argv[arg], "-w") == 0) {
      wrkDir = argv[++arg];

    } else if (strcmp(argv[arg], "-u") == 0) {
      seqStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-V") == 0) {
      seqStoreVer = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-p") == 0) {
      seqStorePart = atoi(argv[++arg]);

    } else {
      err++;
    }

    arg++;
  }

  if ((err) || (gkpStoreName == NULL) || (msgFile == NULL) || (outputFileName == NULL) || seqAn == NULL) {
    fprintf(stderr, "USAGE: SeqAn_CNS -G <gkpStore> -c <input.cgb> -o <output.cgi> -s <seqan_executable> [-u seqstore, required for contig consensus] [-w working directory]\n");
    exit(1);
  }

  gkStore *gkpStore = new gkStore(gkpStoreName, FALSE, FALSE);

  gkpStore->gkStore_loadPartition(gkpStorePart);

  gkFragment    fr;
  GenericMesg  *pmesg;
  tSequenceDB  *sequenceDB = NULL;

  //  Fail up front if either stream cannot be opened, rather than crashing in the
  //  message reader / writer on a NULL FILE pointer.

  FILE *infp = fopen(msgFile, "r");
  if (infp == NULL) {
    fprintf(stderr, "ERROR: failed to open input '%s'\n", msgFile);
    exit(1);
  }

  FILE *outfp = fopen(outputFileName, "w");
  if (outfp == NULL) {
    fprintf(stderr, "ERROR: failed to open output '%s'\n", outputFileName);
    exit(1);
  }

  FILE *tempReads;

  char  fileName[AS_SEQAN_MAX_BUFFER_LENGTH];
  char *prefix = outputFileName;

  getFileName(prefix, wrkDir, AS_SEQAN_INPUT_NAME, fileName);

  int32 i = 0;

  while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) {
    //  Set only when this program regenerated the consensus, in which case the
    //  consensus and quality strings are released after the message is written.
    int32 freeMem = 0;

    if (pmesg->t == MESG_IUM) {
      IntUnitigMesg *ium_mesg = (IntUnitigMesg *)pmesg->m;

      if (strlen(ium_mesg->consensus) == 0) {
        tempReads = fopen(fileName, "w");
        if (tempReads == NULL) {
          fprintf(stderr, "ERROR: failed to open temporary reads file '%s'\n", fileName);
          exit(1);
        }

        for (i=0; i < ium_mesg->num_frags; i++) {
          //  get the fragment sequence
          gkpStore->gkStore_getFragment(ium_mesg->f_list[i].ident, &fr, GKFRAGMENT_QLT);

          uint32  clrBeg   = fr.gkFragment_getClearRegionBegin();
          uint32  clrEnd   = fr.gkFragment_getClearRegionEnd  ();
          char   *seqStart = fr.gkFragment_getSequence();
          char   *seq      = seqStart + clrBeg;

          //  seq already points at clrBeg, so the clear range ends clrEnd-clrBeg bases
          //  later; indexing seq with clrEnd would land clrBeg bases too far.
          seq[clrEnd - clrBeg] = 0;

          AS_UTL_writeFastA(tempReads,
                            seq, clrEnd-clrBeg,
                            ">" F_IID "," F_IID "\n",
                            ium_mesg->f_list[i].position.bgn,
                            ium_mesg->f_list[i].position.end);
        }

        fclose(tempReads);

        updateRecord(ium_mesg, fileName, seqAn, prefix, wrkDir);
        freeMem = 1;
      }

      WriteProtoMesg_AS(outfp, pmesg);

      if (freeMem) {
        safe_free(ium_mesg->consensus);
        safe_free(ium_mesg->quality);
      }

    } else if (pmesg->t == MESG_ICM) {
      IntConConMesg *icm_mesg = (IntConConMesg *)pmesg->m;

      if (seqStoreName == NULL) {
        fprintf(stderr, "USAGE: The -u option is required for contig consensus\n");
        exit(1);
      }

      //  The sequence store is opened lazily, on the first contig message.
      if (sequenceDB == NULL) {
        sequenceDB = openSequenceDB(seqStoreName, FALSE, seqStoreVer);
        openSequenceDBPartition(sequenceDB, seqStorePart);
      }

      if (strlen(icm_mesg->consensus) == 0) {
        tempReads = fopen(fileName, "w");
        if (tempReads == NULL) {
          fprintf(stderr, "ERROR: failed to open temporary reads file '%s'\n", fileName);
          exit(1);
        }

        for (i=0; i < icm_mesg->num_pieces; i++) {
          //  get the fragment sequence
          gkpStore->gkStore_getFragment(icm_mesg->pieces[i].ident, &fr, GKFRAGMENT_QLT);

          uint32  clrBeg   = fr.gkFragment_getClearRegionBegin();
          uint32  clrEnd   = fr.gkFragment_getClearRegionEnd  ();
          char   *seqStart = fr.gkFragment_getSequence();
          char   *seq      = seqStart + clrBeg;

          //  Terminate relative to seq (which starts at clrBeg), not relative to seqStart.
          seq[clrEnd - clrBeg] = 0;

          AS_UTL_writeFastA(tempReads,
                            seq, clrEnd-clrBeg,
                            ">" F_IID "," F_IID "\n",
                            icm_mesg->pieces[i].position.bgn,
                            icm_mesg->pieces[i].position.end);
        }

        //  now handle the unitig messages
        //
        //  NOTE(review): the two arrays created per unitig are never released here.
        for (i=0; i < icm_mesg->num_unitigs; i++) {
          VA_TYPE(char) *ungappedSequence = CreateVA_char(0);
          VA_TYPE(char) *ungappedQuality  = CreateVA_char(0);

          MultiAlignT *uma = loadMultiAlignTFromSequenceDB(sequenceDB, icm_mesg->unitigs[i].ident, 1);
          assert(uma != NULL);

          GetMultiAlignUngappedConsensus(uma, ungappedSequence, ungappedQuality);

          char *seq = Getchar(ungappedSequence, 0);

          AS_UTL_writeFastA(tempReads,
                            seq, strlen(seq),
                            ">" F_IID "," F_IID "\n",
                            icm_mesg->unitigs[i].position.bgn,
                            icm_mesg->unitigs[i].position.end);
        }

        fclose(tempReads);

        updateICMRecord(icm_mesg, fileName, seqAn, prefix, wrkDir);
        freeMem = 1;
      }

      WriteProtoMesg_AS(outfp, pmesg);

      if (freeMem) {
        safe_free(icm_mesg->consensus);
        safe_free(icm_mesg->quality);
      }
    }
  }

  fclose(infp);
  fclose(outfp);

  return 0;
}