void splitNcbiFa(char *ncbiIn, char *outDir) /* splitNcbiFa - Split up NCBI format fa file into UCSC formatted ones.. */ { struct lineFile *lf = lineFileOpen(ncbiIn, TRUE); static struct dnaSeq seq; ZeroVar(&seq); makeDir(outDir); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { FILE *f; char fileName[512]; char *row[5]; int wordCount; char ourName[129]; char cloneName[128]; wordCount = chopByChar(seq.name, '|', row, ArraySize(row)); if (wordCount != 5) errAbort("Expecting 5 | separated fields line %d of %s", lf->lineIx, lf->fileName); strcpy(cloneName, row[3]); chopSuffix(cloneName); sprintf(fileName, "%s/%s.fa", outDir, cloneName); sprintf(ourName, "%s_1", row[3]); faWrite(fileName, ourName, seq.dna, seq.size); } }
int main(int argc, char *argv[])
/* Split one fa sequence into consecutive pieces.
 *   argv[1] - source fa file, read with faReadDna.
 *   argv[2] - maximum number of bases per piece; must be at least 1.
 *   argv[3] - root of the output names; piece N goes to <root>NN.fa.
 * Each piece is written under the sequence name <original name>.<N>, with N
 * counting from 1.  Calls usage() on a bad command line.  Returns 0 on
 * success, 1 if a generated name does not fit its buffer. */
{
char *sourceName, *destRootName;
int maxSize;
char destName[512];
char faName[512];
int destIx;
int size, start;
struct dnaSeq *seq;

if (argc != 4)
    usage();
sourceName = argv[1];
maxSize = atoi(argv[2]);
if (maxSize < 1)
    usage();
destRootName = argv[3];

printf("reading %s\n", sourceName);
seq = faReadDna(sourceName);
for (start = 0, destIx = 1; start < seq->size; start += size, ++destIx)
    {
    int len;

    /* The last piece holds whatever is left over. */
    size = seq->size - start;
    if (size > maxSize)
        size = maxSize;

    len = snprintf(destName, sizeof(destName), "%s%02d.fa", destRootName, destIx);
    if (len < 0 || len >= (int)sizeof(destName))
        {
        fprintf(stderr, "Output file name for piece %d of %s is too long\n", destIx, sourceName);
        return 1;
        }
    len = snprintf(faName, sizeof(faName), "%s.%d", seq->name, destIx);
    if (len < 0 || len >= (int)sizeof(faName))
        {
        fprintf(stderr, "Sequence name for piece %d of %s is too long\n", destIx, sourceName);
        return 1;
        }

    printf("writing %s\n", destName);
    faWrite(destName, faName, seq->dna+start, size);
    }
return 0;
}
void snpMaskChrom(char *tableName, char *nibFile, char *outFile) /* snpMaskChrom - Print a nib file as a fasta file, using IUPAC codes for single base substitutions. */ { struct dnaSeq *seq; char *ptr; struct snpSimple *snps = NULL; struct snpSimple *snp = NULL; boolean inRep = FALSE; seq = nibLoadAllMasked(NIB_MASK_MIXED, nibFile); ptr = seq->dna; snps = readSnpsFromChrom(tableName, chromName); /* do all substitutions */ for (snp = snps; snp != NULL; snp = snp->next) { if (islower(ptr[snp->chromStart])) inRep = TRUE; else inRep = FALSE; ptr[snp->chromStart] = iupac(snp->name, snp->observed, ptr[snp->chromStart]); if (inRep) ptr[snp->chromStart] = tolower(ptr[snp->chromStart]); } faWrite(outFile, chromName, seq->dna, seq->size); snpSimpleFreeList(&snps); dnaSeqFree(&seq); }
void writeChromFaFile(char *chromName, char *dna, int dnaSize, char *destDir)
/* Writes the contents of a single chromosome out to a file in FASTA format.
 * param chromName - The name of the chromosome for which we are writing the fa file.
 * param dna - Pointer to the dna array.
 * param dnaSize - The size of the dna array.
 * param destDir - The directory in which the file <chromName>.fa is written.
 * If destDir/chromName.fa does not fit in DEFAULT_PATH_SIZE bytes, a message
 * is printed on stderr and no file is written. */
{
char filename[DEFAULT_PATH_SIZE];
int len = snprintf(filename, sizeof(filename), "%s/%s.fa", destDir, chromName);

if (len < 0 || len >= (int)sizeof(filename))
    {
    /* A truncated path would name the wrong file, so refuse to write. */
    fprintf(stderr, "Path %s/%s.fa is too long, not writing chromosome %s\n",
            destDir, chromName, chromName);
    return;
    }

printf("Writing fa file %s for chromosome %s\n", filename, chromName);
faWrite(filename, chromName, dna, dnaSize);
}
void snpMask(char *nibFile, char *outFile) /* snpMask - Print a nib file, using IUPAC codes for single base substitutions. */ { struct dnaSeq *seq; char *ptr; struct snp *snps = NULL; struct snp *snp = NULL; seq = nibLoadAllMasked(NIB_MASK_MIXED, nibFile); ptr = seq->dna; snps = readSnpsFromChrom(chromName); printf("got all snps in %s\n", chromName); /* do all substitutions */ for (snp = snps; snp != NULL; snp = snp->next) { ptr[snp->chromStart] = iupac(snp->name, snp->observed, ptr[snp->chromStart]); } if (printSnps) { for (snp = snps; snp != NULL; snp = snp->next) { printSnpSeq(snp, seq); } } if (printChrom) faWrite(outFile, chromName, seq->dna, seq->size); snpFreeList(&snps); if (printGenes) doPrintGenes(chromName, seq); dnaSeqFree(&seq); }
void writeSuperContigFaFile(DNA *dna, struct agpData *startData, struct agpData *endData, char *filename, int sequenceNum)
/* Creates a fasta file containing the contents of a supercontig in FASTA format.
 * param dna - Pointer to the dna array.
 * param startData - Pointer to the dna gap or fragment at which we are starting
 *                   to write data.  The data will include the contents of this gap/frag.
 * param endData - Pointer to the dna gap or fragment at which we are stopping
 *                 to write data.  The data will include the contents of this gap/frag.
 * param filename - The file name to which to write.
 * param sequenceNum - The 1-based number of this clone supercontig in the chromosome.
 * The sequence is named "<chrom>_<sequenceNum> <start>-<end>" and covers
 * dna[start] up to but not including dna[end]. */
{
char sequenceName[BUF_SIZE];
int startOffset = startData->data.pGap->chromStart;
int endOffset = endData->data.pGap->chromEnd;

printf("Writing supercontig fa file %s\n", filename);

sprintf(sequenceName, "%s_%d %d-%d", startData->data.pGap->chrom, sequenceNum, startOffset, endOffset);
faWrite(filename, sequenceName, dna + startOffset, endOffset - startOffset);
}
void gsBig(char *faName, char *gtfName, char *suboptName, char *transName, char *exeName, char *parName, char *tmpDirName) /* gsBig - Run Genscan on big input and produce GTF files. */ { struct dnaSeq seq; struct lineFile *lf = lineFileOpen(faName, TRUE); FILE *gtfFile = mustOpen(gtfName, "w"); FILE *subFile = NULL; FILE *transFile = NULL; ZeroVar(&seq); if (suboptName != NULL) subFile = mustOpen(suboptName, "w"); if (transName != NULL) transFile = mustOpen(transName, "w"); if (exeName != NULL) exePath = cloneString(exeName); if (parName != NULL) parPath = cloneString(parName); if (tmpDirName != NULL) tmpDir = cloneString(tmpDirName); if (optionExists("prerun")) { char *preFileName = optionVal("prerun", NULL); char seqName[128]; struct segment *seg = parseSegment(preFileName, 0, 100000000, seqName); writeSeg(seqName, seg, gtfFile, subFile, transFile); } else { struct dyString *dy = newDyString(1024); char tempFa[512], tempGs[512]; char dir1[256], root1[128], ext1[64]; int myPid = (int)getpid(); splitPath(faName, dir1, root1, ext1); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { int offset, sizeOne; struct segment *segList = NULL, *seg; char *seqName = cloneString(seq.name); int chunkNum = 0; for (offset = 0; offset < seq.size; offset += stepSize) { boolean allN = TRUE; int i; safef(tempFa, sizeof(tempFa), "%s/temp_gsBig_%d_%s_%d.fa", tmpDir, myPid, seqName, chunkNum); safef(tempGs, sizeof(tempGs), "%s/temp_gsBig_%d_%s_%d.genscan", tmpDir, myPid, seqName, chunkNum); sizeOne = seq.size - offset; if (sizeOne > winSize) sizeOne = winSize; /* Genscan hangs forever if a chunk is all-N's... if so, * then skip this chunk. 
*/ for (i=offset; i < (offset+sizeOne); i++) { if (seq.dna[i] != 'N' && seq.dna[i] != 'n') { allN = FALSE; break; } } if (allN) { printf("\ngsBig: skipping %s[%d:%d] -- it's all N's.\n\n", seqName, offset, (offset+sizeOne-1)); } else { faWrite(tempFa, "split", seq.dna + offset, sizeOne); dyStringClear(dy); dyStringPrintf(dy, "%s %s %s", exePath, parPath, tempFa); if (suboptName != NULL) dyStringPrintf(dy, " -subopt"); dyStringPrintf(dy, " > %s", tempGs); verbose(3, "%s\n", dy->string); mustSystem(dy->string); seg = parseSegment(tempGs, offset, offset+sizeOne, NULL); slAddHead(&segList, seg); } chunkNum++; } slReverse(&segList); seg = mergeSegs(segList); writeSeg(seqName, seg, gtfFile, subFile, transFile); freez(&seqName); } if (! optionExists("noRemove")) { remove(tempFa); remove(tempGs); } } }
void fakeFinContigs(char *agpName, char *faName, char *finDir, char *rootName, char *finFaDir, char *ooVer) /* fakeFinContigs - Fake up contigs for a finished chromosome. */ { struct contig *contigList = NULL, *contig = NULL; struct agpFrag *agp; struct lineFile *lf = lineFileOpen(agpName, TRUE); char *line, *words[16]; int lineSize, wordCount; int contigIx = 0; char liftDir[512], contigDir[512], path[512]; char chrom[128]; FILE *f; struct dnaSeq *seq; int fragIx; /* Build up contig list by scanning agp file. */ printf("Reading %s\n", lf->fileName); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '#' || line[0] == 0) continue; wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Expecting at least 5 words line %d of %s", lf->lineIx, lf->fileName); if (words[4][0] == 'N' || words[4][0] == 'U') { contig = NULL; continue; } lineFileExpectWords(lf, 9, wordCount); agp = agpFragLoad(words); // file is 1-based but agpFragLoad() now assumes 0-based: agp->chromStart -= 1; agp->fragStart -= 1; if (contig == NULL) { AllocVar(contig); sprintf(contig->name, "%s%d", rootName, ++contigIx); contig->startOffset = agp->chromStart; slAddHead(&contigList, contig); } else { if (contig->agpList != NULL && contig->agpList->chromEnd != agp->chromStart) errAbort("Start doesn't match previous end line %d of %s", lf->lineIx, lf->fileName); } if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Chrom and frag size mismatch line %d of %s", lf->lineIx, lf->fileName); slAddHead(&contig->agpList, agp); contig->endOffset = agp->chromEnd; } slReverse(&contigList); for (contig = contigList; contig != NULL; contig = contig->next) slReverse(&contig->agpList); lineFileClose(&lf); /* Load up chromosome sequence and make sure it is in one piece. */ printf("Reading %s\n", faName); seq = faReadAllDna(faName); if (slCount(seq) != 1) errAbort("Got %d sequences in %s, can only handle one.", slCount(seq), faName); /* Fix up agp coordinates. 
Make a directory for each contig. Fill it with * .fa .agp barge.NN files for that contig. */ printf("Writing contig dirs\n"); for (contig = contigList; contig != NULL; contig = contig->next) { /* Make Contig dir. */ sprintf(contigDir, "%s/%s", finDir, contig->name); makeDir(contigDir); /* Make contig.agp file. */ sprintf(path, "%s/%s.agp", contigDir, contig->name); f = mustOpen(path, "w"); fragIx = 0; for (agp = contig->agpList; agp != NULL; agp = agp->next) { char buf[128]; sprintf(buf, "%s/%s", skipChr(agp->chrom), contig->name); freez(&agp->chrom); agp->chrom = cloneString(buf); agp->chromStart -= contig->startOffset; agp->chromEnd -= contig->startOffset; agp->ix = ++fragIx; agpFragTabOut(agp, f); } carefulClose(&f); /* Make ooGreedy.NN.gl file */ sprintf(path, "%s/%s.%s.gl", contigDir, "ooGreedy", ooVer); f = mustOpen(path, "w"); for (agp = contig->agpList; agp != NULL; agp = agp->next) { if (agp->type[0] != 'N' && agp->type[0] != 'U') { fprintf(f, "%s_1\t%d\t%d\t%s\n", agp->frag, agp->chromStart, agp->chromEnd, agp->strand); } } carefulClose(&f); /* Make contig.fa file. */ sprintf(path, "%s/%s.fa", contigDir, contig->name); faWrite(path, contig->name, seq->dna + contig->startOffset, contig->endOffset - contig->startOffset); /* Make contig/barge file. */ sprintf(path, "%s/barge.%s", contigDir, ooVer); f = mustOpen(path, "w"); fprintf(f, "Barge (Connected Clone) File ooGreedy Version %s\n", ooVer); fprintf(f, "\n"); fprintf(f, "start accession size overlap maxClone maxOverlap\n"); fprintf(f, "------------------------------------------------------------\n"); for (agp = contig->agpList; agp != NULL; agp = agp->next) { char clone[128]; strcpy(clone, agp->frag); chopSuffix(clone); fprintf(f, "%d\t%s\t%d\t100\tn/a\t0\n", agp->chromStart, clone, agp->chromEnd); } carefulClose(&f); /* Make contig/gold file. 
*/ sprintf(path, "%s/gold.%s", contigDir, ooVer); f = mustOpen(path, "w"); fragIx = 0; for (agp = contig->agpList; agp != NULL; agp = agp->next) { char fragName[128]; struct agpFrag frag = *agp; sprintf(fragName, "%s_1", agp->frag); frag.frag = fragName; frag.type[0] = '0'; agpFragTabOut(&frag, f); } carefulClose(&f); } /* Create lift subdirectory. */ printf("Creating lift files\n"); sprintf(liftDir, "%s/lift", finDir); makeDir(liftDir); /* Create lift/oOut.lst file (just a list of contigs). */ sprintf(path, "%s/oOut.lst", liftDir); f = mustOpen(path, "w"); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%s/%s.fa.out\n", contig->name, contig->name); carefulClose(&f); /* Create lift/ordered.lst file (just a list of contigs). */ sprintf(path, "%s/ordered.lst", liftDir); f = mustOpen(path, "w"); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%s\n", contig->name); carefulClose(&f); /* Create lift/ordered.lft file. */ sprintf(path, "%s/ordered.lft", liftDir); f = mustOpen(path, "w"); splitPath(faName, NULL, chrom, NULL); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%d\t%s/%s\t%d\t%s\t%d\n", contig->startOffset, skipChr(chrom), contig->name, contig->endOffset - contig->startOffset, chrom, seq->size); carefulClose(&f); }