void writeAgpFile(char *chromName, struct agpData *startAgpData, char *filename)
/* Write the AGP file for a single chromosome.
 *   chromName    - name of the chromosome (not referenced by this function).
 *   startAgpData - first gap or fragment entry to write.  This entry and
 *                  every entry reachable from it through ->next is written.
 *   filename     - path of the file to create (opened in "w" mode).
 * The file is opened with mustOpen(), as the other writers in this file do,
 * so a file that cannot be opened is reported instead of handing a NULL
 * FILE pointer to the output routines. */
{
struct agpData *curData;
FILE *fp = mustOpen(filename, "w");

for (curData = startAgpData; curData != NULL; curData = curData->next)
    {
    if (curData->isGap)
        {
        /* Undo the decrement we did earlier.  This was done in
         * nextAgpEntryToSplitOn() in order to be compatible with the
         * 0-based frag addressing scheme. */
        curData->data.pGap->chromStart++;
        agpGapOutput(curData->data.pGap, fp, '\t', '\n');
        /* Redo the decrement - the caller's list must be left unchanged. */
        curData->data.pGap->chromStart--;
        }
    else
        {
        agpFragOutput(curData->data.pFrag, fp, '\t', '\n');
        }
    }
carefulClose(&fp);
}
void scaffoldFaToAgp(char *scaffoldFile) /* scaffoldFaToAgp - create AGP file, gap file and lift file * from scaffold FA file */ { DNA *scaffoldSeq; char *name; int size; struct agpFrag frag; struct agpGap scaffoldGap, fragGap; struct lineFile *lf = lineFileOpen(scaffoldFile, TRUE); char outDir[256], outFile[128], ext[64], outPath[512]; FILE *agpFile = NULL, *gapFile = NULL, *liftFile = NULL; int fileNumber = 1; /* sequence number in AGP file */ int start = 0, end = 0; int chromSize = 0; int scaffoldCount = 0; int fragSize = 0, gapSize = 0; char *seq; int seqStart = 0; /* determine size of "unordered chromosome" that will be constructed. * This is needed for the lift file. */ while (faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name)) { chromSize += size; chromSize += scaffoldGapSize; scaffoldCount++; } /* do not need the final useless gap */ chromSize -= scaffoldGapSize; printf("scaffold gap size is %d, total scaffolds: %d\n", scaffoldGapSize, scaffoldCount); printf("chrom size is %d\n", chromSize); /* initialize fixed fields in AGP frag */ ZeroVar(&frag); frag.chrom = CHROM_NAME; frag.type[0] = 'D'; /* draft */ frag.fragStart = 0; /* always start at beginning of scaffold */ frag.strand[0] = '+'; /* initialize fixed fields in scaffold gap */ ZeroVar(&scaffoldGap); scaffoldGap.chrom = CHROM_NAME; scaffoldGap.n[0] = 'N'; scaffoldGap.size = scaffoldGapSize; scaffoldGap.type = SCAFFOLD_GAP_TYPE; scaffoldGap.bridge = "no"; /* initialize fixed fields in frag gap */ ZeroVar(&fragGap); fragGap.chrom = CHROM_NAME; fragGap.n[0] = 'N'; fragGap.type = FRAGMENT_GAP_TYPE; fragGap.bridge = "yes"; /* munge file paths */ splitPath(scaffoldFile, outDir, outFile, ext); sprintf(outPath, "%s%s.agp", outDir, outFile); agpFile = mustOpen(outPath, "w"); printf("writing %s\n", outPath); sprintf(outPath, "%s%s.gap", outDir, outFile); gapFile = mustOpen(outPath, "w"); printf("writing %s\n", outPath); sprintf(outPath, "%s%s.lft", outDir, outFile); liftFile = mustOpen(outPath, "w"); 
printf("writing %s\n", outPath); /* read in scaffolds from fasta file, and generate * the three files */ lineFileSeek(lf, 0, SEEK_SET); boolean allDone = FALSE; allDone = ! faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name); while (! allDone) { end = start + size; /* setup AGP frag for the scaffold and write to AGP file */ frag.frag = name; frag.ix = fileNumber++; frag.chromStart = start; frag.chromEnd = end; frag.fragEnd = size; agpFragOutput(&frag, agpFile, '\t', '\n'); /* write lift file entry for this scaffold */ fprintf(liftFile, "%d\t%s\t%d\t%s\t%d\n", start, name, size, CHROM_NAME, chromSize); /* write gap file entries for this scaffold */ seq = scaffoldSeq; seqStart = start; while (seqGetGap(seq, &fragSize, &gapSize)) { if (gapSize > minGapSize) { fragGap.size = gapSize; fragGap.chromStart = seqStart + fragSize + 1; fragGap.chromEnd = fragGap.chromStart + gapSize - 1; agpGapOutput(&fragGap, gapFile, '\t', '\n'); } seqStart = seqStart + fragSize + gapSize; seq = seq + fragSize + gapSize; } /* setup AGP gap to separate scaffolds and write to AGP and gap files */ /* Note: may want to suppress final gap -- not needed as separator */ start = end + 1; end = start + scaffoldGapSize - 1; /* Avoid an extra gap on the end - not needed */ allDone = ! faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name); if (allDone) break; scaffoldGap.ix = fileNumber++; scaffoldGap.chromStart = start; scaffoldGap.chromEnd = end; agpGapOutput(&scaffoldGap, agpFile, '\t', '\n'); agpGapOutput(&scaffoldGap, gapFile, '\t', '\n'); /* write lift file entry for this gap */ fprintf(liftFile, "%d\t%s\t%d\t%s\t%d\n", start-1, "gap", scaffoldGapSize, CHROM_NAME, chromSize); start = end; //freeMem(seq); } carefulClose(&agpFile); carefulClose(&liftFile); carefulClose(&gapFile); lineFileClose(&lf); }
static void agpSangerUnfinished(char *agpFile, char *contigFasta, char *agpOut) /* Fix agp to match unfinished contigs in fasta */ { struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *line, *words[16]; int lineSize, wordCount; unsigned lastPos = 0; struct agpFrag *agp; struct agpGap *gap; FILE *f; char *lastObj = NULL; f = mustOpen(agpOut, "w"); char *newChrom = NULL; struct hash *hash = hashFasta(contigFasta); verbose(2,"#\tprocessing AGP file: %s\n", agpFile); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == 0 || line[0] == '#' || line[0] == '\n') continue; //verbose(2,"#\tline: %d\n", lf->lineIx); wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Bad line %d of %s: need at least 5 words, got %d\n", lf->lineIx, lf->fileName, wordCount); if (!lastObj || !sameString(words[0],lastObj)) { freez(&newChrom); newChrom = cloneString(words[0]); lastPos = 0; } if (words[4][0] != 'N') { lineFileExpectAtLeast(lf, 9, wordCount); agp = agpFragLoad(words); /* agp is 1-based but agp loaders do not adjust for 0-based: */ agp->chromStart -= 1; agp->fragStart -= 1; if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agp->chrom, agp->frag, lf->lineIx, lf->fileName); char *root = cloneString(agp->frag); chopSuffixAt(root, '.'); struct hashEl *e, *elist = hashLookup(hash, root); for (e = elist; e; e = hashLookupNext(e)) { struct unfinishedContig *u = e->val; if ((u->fragStart <= agp->fragStart) && (u->fragEnd >= agp->fragEnd)) { agp->frag = cloneString(u->frag); agp->fragEnd -= u->fragStart; agp->fragStart -= u->fragStart; } } freeMem(root); } else { lineFileExpectAtLeast(lf, 8, wordCount); gap = agpGapLoad(words); /* to be consistent with agpFrag */ gap->chromStart -= 1; agp = (struct agpFrag*)gap; } if (agp->chromStart != lastPos) errAbort("Start doesn't match previous end line %d of %s\n" "agp->chromStart: %u\n" "agp->chromEnd: %u\n" "lastPos: %u\n" ,lf->lineIx, lf->fileName 
,agp->chromStart ,agp->chromEnd ,lastPos ); lastPos = agp->chromEnd; freez(&lastObj); lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */ if (words[4][0] != 'N') { /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */ agpFragOutput(agp, f, '\t', '\n'); agpFragFree(&agp); } else { /* restore back to 1-based for agp * because agpGapOutput doesn't compensate */ gap->chromStart += 1; agpGapOutput(gap, f, '\t', '\n'); agpGapFree(&gap); } } carefulClose(&f); }
static void agpMergeChromScaf(char *agpFile, char *agpOut, boolean filtering)
/* Create a combined agp file from the chrom.agp and scaffold.agp,
 * merging in only scaffolds from scaffold.agp
 * that are not already in chroms.
 *   agpFile   - AGP file to read.
 *   agpOut    - output AGP file; opened in "w" mode when filtering is
 *               FALSE and in "a" (append) mode when it is TRUE.
 *   filtering - FALSE: every line is written, and the root of each
 *               fragment name (suffix chopped at '.') is recorded in a
 *               hash.  TRUE: lines whose object name (column 1) is found
 *               in that hash are validated but not written.
 * The hash is a function-level static, so names recorded by a
 * non-filtering call stay available to later filtering calls.
 * Each line is checked to start where the previous line of the same
 * object ended; any inconsistency aborts via errAbort(). */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp = NULL;
struct agpGap *gap = NULL;
FILE *f;
char *lastObj = NULL;   /* object name (column 1) of the previous line */
f = mustOpen(agpOut, filtering ? "a" : "w");
static struct hash *hash = NULL;   /* fragment roots seen so far */
boolean skipping = FALSE;

if (!hash)
    hash = hashNew(0);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
                 lf->lineIx, lf->fileName, wordCount);

    /* a new object restarts the coordinate continuity check */
    if (!lastObj || !sameString(words[0],lastObj))
        lastPos = 0;

    /* when filtering, drop objects already recorded in the hash */
    skipping = FALSE;
    if (filtering)
        {
        if (hashLookup(hash, words[0]))
            skipping = TRUE;
        }

    if (words[4][0] != 'N')
        {
        lineFileExpectAtLeast(lf, 9, wordCount);
        agp = agpFragLoad(words);
        /* agp is 1-based but agp loaders do not adjust for 0-based: */
        agp->chromStart -= 1;
        agp->fragStart -= 1;
        if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n",
                     agp->chrom, agp->frag, lf->lineIx, lf->fileName);
        if (!filtering)
            {
            /* remember this fragment's root name for later filtering */
            char *root = cloneString(agp->frag);
            chopSuffixAt(root, '.');
            hashStore(hash, root);
            freeMem(root);
            }
        }
    else
        {
        lineFileExpectAtLeast(lf, 8, wordCount);
        gap = agpGapLoad(words);
        /* to be consistent with agpFrag */
        gap->chromStart -= 1;
        /* viewed as a frag only to read chromStart/chromEnd below */
        agp = (struct agpFrag*)gap;
        }

    if (agp->chromStart != lastPos)
        errAbort("Start doesn't match previous end line %d of %s\n"
                 "agp->chromStart: %u\n"
                 "agp->chromEnd: %u\n"
                 "lastPos: %u\n"
                 ,lf->lineIx, lf->fileName
                 ,agp->chromStart
                 ,agp->chromEnd
                 ,lastPos
                 );
    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */

    if (words[4][0] != 'N')
        {
        /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
        if (!skipping)
            agpFragOutput(agp, f, '\t', '\n');
        agpFragFree(&agp);
        }
    else
        {
        /* restore back to 1-based for agp
         * because agpGapOutput doesn't compensate */
        gap->chromStart += 1;
        if (!skipping)
            agpGapOutput(gap, f, '\t', '\n');
        agpGapFree(&gap);
        }
    }
carefulClose(&f);
lineFileClose(&lf);
freez(&lastObj);
}