void fakeFinContigs(char *agpName, char *faName, char *finDir, char *rootName, char *finFaDir, char *ooVer) /* fakeFinContigs - Fake up contigs for a finished chromosome. */ { struct contig *contigList = NULL, *contig = NULL; struct agpFrag *agp; struct lineFile *lf = lineFileOpen(agpName, TRUE); char *line, *words[16]; int lineSize, wordCount; int contigIx = 0; char liftDir[512], contigDir[512], path[512]; char chrom[128]; FILE *f; struct dnaSeq *seq; int fragIx; /* Build up contig list by scanning agp file. */ printf("Reading %s\n", lf->fileName); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '#' || line[0] == 0) continue; wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Expecting at least 5 words line %d of %s", lf->lineIx, lf->fileName); if (words[4][0] == 'N' || words[4][0] == 'U') { contig = NULL; continue; } lineFileExpectWords(lf, 9, wordCount); agp = agpFragLoad(words); // file is 1-based but agpFragLoad() now assumes 0-based: agp->chromStart -= 1; agp->fragStart -= 1; if (contig == NULL) { AllocVar(contig); sprintf(contig->name, "%s%d", rootName, ++contigIx); contig->startOffset = agp->chromStart; slAddHead(&contigList, contig); } else { if (contig->agpList != NULL && contig->agpList->chromEnd != agp->chromStart) errAbort("Start doesn't match previous end line %d of %s", lf->lineIx, lf->fileName); } if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Chrom and frag size mismatch line %d of %s", lf->lineIx, lf->fileName); slAddHead(&contig->agpList, agp); contig->endOffset = agp->chromEnd; } slReverse(&contigList); for (contig = contigList; contig != NULL; contig = contig->next) slReverse(&contig->agpList); lineFileClose(&lf); /* Load up chromosome sequence and make sure it is in one piece. */ printf("Reading %s\n", faName); seq = faReadAllDna(faName); if (slCount(seq) != 1) errAbort("Got %d sequences in %s, can only handle one.", slCount(seq), faName); /* Fix up agp coordinates. Make a directory for each contig. Fill it with * .fa .agp barge.NN files for that contig. */ printf("Writing contig dirs\n"); for (contig = contigList; contig != NULL; contig = contig->next) { /* Make Contig dir. */ sprintf(contigDir, "%s/%s", finDir, contig->name); makeDir(contigDir); /* Make contig.agp file. */ sprintf(path, "%s/%s.agp", contigDir, contig->name); f = mustOpen(path, "w"); fragIx = 0; for (agp = contig->agpList; agp != NULL; agp = agp->next) { char buf[128]; sprintf(buf, "%s/%s", skipChr(agp->chrom), contig->name); freez(&agp->chrom); agp->chrom = cloneString(buf); agp->chromStart -= contig->startOffset; agp->chromEnd -= contig->startOffset; agp->ix = ++fragIx; agpFragTabOut(agp, f); } carefulClose(&f); /* Make ooGreedy.NN.gl file */ sprintf(path, "%s/%s.%s.gl", contigDir, "ooGreedy", ooVer); f = mustOpen(path, "w"); for (agp = contig->agpList; agp != NULL; agp = agp->next) { if (agp->type[0] != 'N' && agp->type[0] != 'U') { fprintf(f, "%s_1\t%d\t%d\t%s\n", agp->frag, agp->chromStart, agp->chromEnd, agp->strand); } } carefulClose(&f); /* Make contig.fa file. */ sprintf(path, "%s/%s.fa", contigDir, contig->name); faWrite(path, contig->name, seq->dna + contig->startOffset, contig->endOffset - contig->startOffset); /* Make contig/barge file. */ sprintf(path, "%s/barge.%s", contigDir, ooVer); f = mustOpen(path, "w"); fprintf(f, "Barge (Connected Clone) File ooGreedy Version %s\n", ooVer); fprintf(f, "\n"); fprintf(f, "start accession size overlap maxClone maxOverlap\n"); fprintf(f, "------------------------------------------------------------\n"); for (agp = contig->agpList; agp != NULL; agp = agp->next) { char clone[128]; strcpy(clone, agp->frag); chopSuffix(clone); fprintf(f, "%d\t%s\t%d\t100\tn/a\t0\n", agp->chromStart, clone, agp->chromEnd); } carefulClose(&f); /* Make contig/gold file. */ sprintf(path, "%s/gold.%s", contigDir, ooVer); f = mustOpen(path, "w"); fragIx = 0; for (agp = contig->agpList; agp != NULL; agp = agp->next) { char fragName[128]; struct agpFrag frag = *agp; sprintf(fragName, "%s_1", agp->frag); frag.frag = fragName; frag.type[0] = '0'; agpFragTabOut(&frag, f); } carefulClose(&f); } /* Create lift subdirectory. */ printf("Creating lift files\n"); sprintf(liftDir, "%s/lift", finDir); makeDir(liftDir); /* Create lift/oOut.lst file (just a list of contigs). */ sprintf(path, "%s/oOut.lst", liftDir); f = mustOpen(path, "w"); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%s/%s.fa.out\n", contig->name, contig->name); carefulClose(&f); /* Create lift/ordered.lst file (just a list of contigs). */ sprintf(path, "%s/ordered.lst", liftDir); f = mustOpen(path, "w"); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%s\n", contig->name); carefulClose(&f); /* Create lift/ordered.lft file. */ sprintf(path, "%s/ordered.lft", liftDir); f = mustOpen(path, "w"); splitPath(faName, NULL, chrom, NULL); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%d\t%s/%s\t%d\t%s\t%d\n", contig->startOffset, skipChr(chrom), contig->name, contig->endOffset - contig->startOffset, chrom, seq->size); carefulClose(&f); }
void splitAgp(char *agpName, char *goldFileName, char *gapFileName) /* Split up agp file into gold and gap files. */ { struct lineFile *lf; char *words[16]; int wordCount; FILE *goldTab, *gapTab; /* Scan through .agp file splitting it into gold * and gap components. */ goldTab = mustOpen(goldFileName, "w"); gapTab = mustOpen(gapFileName, "w"); lf = lineFileOpen(agpName, TRUE); while ((wordCount = lineFileChop(lf, words)) > 0) { int start, end; if (wordCount < 5) errAbort("Short line %d of %s", lf->lineIx, lf->fileName); int len = strlen(words[0]); if (len > maxChromNameSize) { maxChromNameSize = len; if (maxChromNameSize > 254) errAbort("ERROR: chrom name size is over 254(%d) characters: " "'%s'", maxChromNameSize, words[0]); } start = sqlUnsigned(words[1])-1; end = sqlUnsigned(words[2]); if (words[4][0] == 'N' || words[4][0] == 'U') { struct agpGap gap; agpGapStaticLoad(words, &gap); gap.chromStart -= 1; fprintf(gapTab, "%u\t", hFindBin(start, end)); agpGapTabOut(&gap, gapTab); verbose(3,"#GAP\t%s:%d-%d\n", gap.chrom, gap.chromStart, gap.chromEnd); } else { struct agpFrag gold; agpFragStaticLoad(words, &gold); agpFragValidate(&gold); len = strlen(words[5]); if (len > maxFragNameSize) { maxFragNameSize = len; if (maxFragNameSize > 254) errAbort("ERROR: fragment name size is over 254(%d) " "characters: '%s'", maxFragNameSize, words[5]); } // file is 1-based. agpFragLoad() now assumes 0-based. // and agpFragTabOut() will assume 1-based, but we will load // the generated file straight into the database, so // subtract 2: gold.chromStart -= 2; gold.fragStart -= 2; fprintf(goldTab, "%u\t", hFindBin(start, end)); agpFragTabOut(&gold, goldTab); } } lineFileClose(&lf); carefulClose(&goldTab); carefulClose(&gapTab); }