static void goldLoad(struct track *tg) /* Load up golden path from database table to track items. */ { struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr = NULL; char **row; struct agpFrag *fragList = NULL, *frag; struct agpGap *gapList = NULL, *gap; int rowOffset; /* Get the frags and load into tg->items. */ sr = hRangeQuery(conn, "gold", chromName, winStart, winEnd, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { frag = agpFragLoad(row+rowOffset); slAddHead(&fragList, frag); } slSort(&fragList, cmpAgpFrag); sqlFreeResult(&sr); tg->items = fragList; /* Get the gaps into tg->customPt. */ sr = hRangeQuery(conn, "gap", chromName, winStart, winEnd, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { gap = agpGapLoad(row+rowOffset); slAddHead(&gapList, gap); } slReverse(&gapList); sqlFreeResult(&sr); tg->customPt = gapList; hFreeConn(&conn); }
struct agp *agpLoad(char **row, int ct) /* Load an AGP entry from array of strings. Dispose with agpFree */ { struct agp *agp; struct agpFrag *agpFrag; struct agpGap *agpGap; if (ct < 8) errAbort("Expecting >= 8 words in AGP file, got %d\n", ct); AllocVar(agp); if (row[4][0] != 'N' && row[4][0] != 'U') { /* not a gap */ if (ct != 9) errAbort("Expecting 9 words in AGP fragment line, got %d\n", ct); agpFrag = agpFragLoad(row); agp->entry = agpFrag; agp->isFrag = TRUE; } else { /* gap */ agpGap = agpGapLoad(row); agp->entry = agpGap; agp->isFrag = FALSE; } return agp; }
static struct agpGap *loadAllGaps(struct sqlConnection *conn, char *db, struct chromInfo *cInfoList) /* fetch all gaps, returns list of gaps */ { struct agpGap *gapList = NULL; struct chromInfo *cInfo; int gapCount = 0; for (cInfo = cInfoList; cInfo; cInfo = cInfo->next) { char **row; int rowOffset; struct sqlResult *sr = hRangeQuery(conn, "gap", cInfo->chrom, 0, cInfo->size, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct agpGap *gap = agpGapLoad(row+rowOffset); if (minGap) { if (gap->size >= minGap) { slAddHead(&gapList, gap); ++gapCount; } } else { slAddHead(&gapList, gap); ++gapCount; } } sqlFreeResult(&sr); } slSort(&gapList, bedCmp); if (! insane) gapSanityCheck(gapList); verbose(2,"#\tfound %d gaps of size >= %d\n", gapCount, minGap); return (gapList); }
struct agpData* nextAgpEntryToSplitOn(struct lineFile *lfAgpFile, int dnaSize, struct agpData **retStartData) /* Finds the next agp entry in the agp file at which to split on. param lfAgpFile - The .agp file we are examining. param dnaSize - The total size of the chromsome's dna sequence we are splitting up. Used to prevent overrun of the algorithm that looks at agp entries. param retStartData - An out param returning the starting(inclusive) gap that we will start to split on. return struct agpData* - The ending (inclusive) agp data we are to split on. */ { int startIndex = 0; int numBasesRead = 0; char *line = NULL; char *words[9]; int lineSize = 0; struct agpGap *agpGap = NULL; struct agpFrag *agpFrag = NULL; struct agpData *curAgpData = NULL; struct agpData *prevAgpData = NULL; boolean splitPointFound = FALSE; int splitSize = _nSize; do { lineFileNext(lfAgpFile, &line, &lineSize); if (line[0] == '#' || line[0] == '\n') { continue; } AllocVar(curAgpData); curAgpData->endOfContig = FALSE; curAgpData->isGap = FALSE; curAgpData->prev = NULL; curAgpData->next = NULL; chopLine(line, words); if (words[4][0] == 'N' || words[4][0] == 'U') { agpGap = agpGapLoad(words); /* Decrement the chromStart index since that's how the agpFrags do it and we want to use 0-based addressing */ --(agpGap->chromStart); if (0 == startIndex) { startIndex = agpGap->chromStart; } if (numBasesRead >= _bSize) { splitPointFound = TRUE; } else { /* Split points are made after non-bridged contigs only here */ splitPointFound = (0 == strcasecmp(agpGap->bridge, NO)); } curAgpData->isGap = TRUE; curAgpData->data.pGap = agpGap; } else { agpFrag = agpFragLoad(words); // file is 1-based but agpFragLoad() now assumes 0-based: agpFrag->chromStart -= 1; agpFrag->fragStart -= 1; /* If we find a fragment and not a gap */ if (0 == startIndex) { startIndex = agpFrag->chromStart; } if (numBasesRead >= _aSize) { splitPointFound = TRUE; } else { splitPointFound = FALSE; } curAgpData->isGap = FALSE; 
curAgpData->data.pFrag = agpFrag; } /* Since this our first loop iteration, save the start gap as the beginning of the section to write out */ if (NULL == prevAgpData) { *retStartData = curAgpData; /* Save the pointer to the head of the list */ } else { /* Build a doubly linked list for use elsewhere */ prevAgpData->next = curAgpData; curAgpData->prev = prevAgpData; } prevAgpData = curAgpData; numBasesRead = curAgpData->data.pGap->chromEnd - startIndex; } while ((numBasesRead < splitSize || !splitPointFound) && curAgpData->data.pGap->chromEnd < dnaSize); curAgpData->next = NULL; /* Terminate the linked list */ curAgpData->endOfContig = TRUE; return curAgpData; }
static void agpSangerUnfinished(char *agpFile, char *contigFasta, char *agpOut) /* Fix agp to match unfinished contigs in fasta */ { struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *line, *words[16]; int lineSize, wordCount; unsigned lastPos = 0; struct agpFrag *agp; struct agpGap *gap; FILE *f; char *lastObj = NULL; f = mustOpen(agpOut, "w"); char *newChrom = NULL; struct hash *hash = hashFasta(contigFasta); verbose(2,"#\tprocessing AGP file: %s\n", agpFile); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == 0 || line[0] == '#' || line[0] == '\n') continue; //verbose(2,"#\tline: %d\n", lf->lineIx); wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Bad line %d of %s: need at least 5 words, got %d\n", lf->lineIx, lf->fileName, wordCount); if (!lastObj || !sameString(words[0],lastObj)) { freez(&newChrom); newChrom = cloneString(words[0]); lastPos = 0; } if (words[4][0] != 'N') { lineFileExpectAtLeast(lf, 9, wordCount); agp = agpFragLoad(words); /* agp is 1-based but agp loaders do not adjust for 0-based: */ agp->chromStart -= 1; agp->fragStart -= 1; if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agp->chrom, agp->frag, lf->lineIx, lf->fileName); char *root = cloneString(agp->frag); chopSuffixAt(root, '.'); struct hashEl *e, *elist = hashLookup(hash, root); for (e = elist; e; e = hashLookupNext(e)) { struct unfinishedContig *u = e->val; if ((u->fragStart <= agp->fragStart) && (u->fragEnd >= agp->fragEnd)) { agp->frag = cloneString(u->frag); agp->fragEnd -= u->fragStart; agp->fragStart -= u->fragStart; } } freeMem(root); } else { lineFileExpectAtLeast(lf, 8, wordCount); gap = agpGapLoad(words); /* to be consistent with agpFrag */ gap->chromStart -= 1; agp = (struct agpFrag*)gap; } if (agp->chromStart != lastPos) errAbort("Start doesn't match previous end line %d of %s\n" "agp->chromStart: %u\n" "agp->chromEnd: %u\n" "lastPos: %u\n" ,lf->lineIx, lf->fileName 
,agp->chromStart ,agp->chromEnd ,lastPos ); lastPos = agp->chromEnd; freez(&lastObj); lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */ if (words[4][0] != 'N') { /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */ agpFragOutput(agp, f, '\t', '\n'); agpFragFree(&agp); } else { /* restore back to 1-based for agp * because agpGapOutput doesn't compensate */ gap->chromStart += 1; agpGapOutput(gap, f, '\t', '\n'); agpGapFree(&gap); } } carefulClose(&f); }
static void agpMergeChromScaf(char *agpFile, char *agpOut, boolean filtering)
/* Create a combined agp file from the chrom.agp and scaffold.agp,
 * merging in only scaffolds from scaffold.agp
 * that are not already in chroms.
 *   agpFile   - input AGP file
 *   agpOut    - output AGP file; truncated when filtering is FALSE,
 *               appended to when filtering is TRUE
 *   filtering - FALSE: write every line and remember each fragment name
 *               (after chopSuffixAt at '.');
 *               TRUE: suppress lines whose object name (column 1) was
 *               remembered by an earlier non-filtering call
 * The name hash is static so that names stored by one call are still
 * visible to later calls.  Gap lines ('N' or 'U') are written unchanged.
 * Aborts on short lines, on fragment/chrom size mismatch, and when an entry
 * does not start where the previous entry of the same object ended. */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp = NULL;
struct agpGap *gap = NULL;
FILE *f = mustOpen(agpOut, filtering ? "a" : "w");
char *lastObj = NULL;
static struct hash *hash = NULL;
boolean skipping = FALSE;

if (!hash)
    hash = hashNew(0);
verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    unsigned chromStart, chromEnd;   /* 0-based half-open span of this entry */
    boolean isGap;

    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
            lf->lineIx, lf->fileName, wordCount);

    /* A new object restarts the position bookkeeping. */
    if (!lastObj || !sameString(words[0],lastObj))
        lastPos = 0;

    skipping = FALSE;
    if (filtering)
        {
        if (hashLookup(hash, words[0]))
            skipping = TRUE;
        }

    /* Both 'N' and 'U' component types are gaps. */
    isGap = (words[4][0] == 'N' || words[4][0] == 'U');

    if (!isGap)
        {
        lineFileExpectAtLeast(lf, 9, wordCount);
        agp = agpFragLoad(words);
        /* agp is 1-based but agp loaders do not adjust for 0-based: */
        agp->chromStart -= 1;
        agp->fragStart -= 1;
        if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n",
                agp->chrom, agp->frag, lf->lineIx, lf->fileName);
        if (!filtering)
            {
            char *root = cloneString(agp->frag);
            chopSuffixAt(root, '.');
            hashStore(hash, root);
            freeMem(root);
            }
        chromStart = agp->chromStart;
        chromEnd = agp->chromEnd;
        }
    else
        {
        lineFileExpectAtLeast(lf, 8, wordCount);
        gap = agpGapLoad(words);
        /* to be consistent with agpFrag, compare on a 0-based start; the
         * record itself stays 1-based because agpGapOutput doesn't
         * compensate */
        chromStart = gap->chromStart - 1;
        chromEnd = gap->chromEnd;
        }

    /* Continuity is checked for skipped lines too. */
    if (chromStart != lastPos)
        errAbort("Start doesn't match previous end line %d of %s\n"
             "agp->chromStart: %u\n"
             "agp->chromEnd: %u\n"
             "lastPos: %u\n"
            ,lf->lineIx, lf->fileName
            ,chromStart
            ,chromEnd
            ,lastPos
            );
    lastPos = chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */

    if (!isGap)
        {
        /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
        if (!skipping)
            agpFragOutput(agp, f, '\t', '\n');
        agpFragFree(&agp);
        }
    else
        {
        if (!skipping)
            agpGapOutput(gap, f, '\t', '\n');
        agpGapFree(&gap);
        }
    }

freez(&lastObj);
lineFileClose(&lf);
carefulClose(&f);
}
struct hash *agpLoadAll(char *agpFile) /* load AGP entries into a hash of AGP lists, one per chromosome */ { struct hash *agpHash = newHash(0); struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *words[9]; int lastPos = 0; int wordCount; struct agpFrag *agpFrag; struct agpGap *agpGap; char *chrom; struct agp *agp; struct hashEl *hel; while ((wordCount = lineFileChopNext(lf, words, ArraySize(words))) != 0) { lineFileExpectAtLeast(lf, 8, wordCount); chrom = words[0]; if (!hashFindVal(agpHash, chrom)) lastPos = 1; AllocVar(agp); if (words[4][0] != 'N' && words[4][0] != 'U') { /* not a gap */ lineFileExpectWords(lf, 9, wordCount); agpFrag = agpFragLoad(words); if (agpFrag->chromStart != lastPos) errAbort( "Frag start (%d, %d) doesn't match previous end line %d of %s\n", agpFrag->chromStart, lastPos, lf->lineIx, lf->fileName); if (agpFrag->chromEnd - agpFrag->chromStart != agpFrag->fragEnd - agpFrag->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agpFrag->chrom, agpFrag->frag, lf->lineIx, lf->fileName); lastPos = agpFrag->chromEnd + 1; agp->entry = agpFrag; agp->isFrag = TRUE; } else { /* gap */ lineFileExpectWords(lf, 8, wordCount); agpGap = agpGapLoad(words); if (agpGap->chromStart != lastPos) errAbort("Gap start (%d, %d) doesn't match previous end line %d of %s\n", agpGap->chromStart, lastPos, lf->lineIx, lf->fileName); lastPos = agpGap->chromEnd + 1; agp->entry = agpGap; agp->isFrag = FALSE; } if ((hel = hashLookup(agpHash, chrom)) == NULL) hashAdd(agpHash, chrom, agp); else slAddHead(&(hel->val), agp); } #ifndef DEBUG { struct hashCookie cookie; struct hashEl *hel; cookie = hashFirst(agpHash); while ((hel = hashNext(&cookie)) != NULL) { struct agp *agpList; agpList = (struct agp *)hel->val; /* for (agp = agpList; agp != NULL; agp = agp->next) printf("isFrag: %d\n", agp->isFrag); */ } } #endif /* reverse AGP lists */ //hashTraverseVals(agpHash, slReverse); #ifndef DEBUG { struct hashCookie cookie; struct hashEl *hel; cookie = 
hashFirst(agpHash); while ((hel = hashNext(&cookie)) != NULL) { struct agp *agpList; slReverse(&hel->val); agpList = hel->val; /* agpList = (struct agp *)hel->val; slReverse(&agpList); hashRemove(agpHash, hel->name); hashAdd(agpHash, hel->name, agpList); */ /* for (agp = agpList; agp != NULL; agp = agp->next) printf("isFrag: %d\n", agp->isFrag); */ } } #endif return agpHash; }
struct chrom *readChromScaffoldsFromAgp(char *fileName) /* Read in agp file and return as list of chroms. */ { struct hash *chromHash = newHash(17); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[9]; int wordCount; struct chrom *chromList = NULL, *chrom; struct agpFrag *frag = NULL; struct agpGap *gap = NULL; char *chromName; int chromSize = 0; for (;;) { wordCount = lineFileChop(lf, row); if (wordCount <= 0) break; if (wordCount < 8) lineFileShort(lf); if (row[4][0] == 'N' || row[4][0] == 'U') { /* need to get chromEnd from gaps to determine chrom size * if the chrom ends with a gap */ gap = agpGapLoad(row); chromName = gap->chrom; chromSize = gap->chromEnd; frag = NULL; } else { if (wordCount < 9) lineFileShort(lf); frag = agpFragLoad(row); chromName = frag->chrom; chromSize = frag->chromEnd; frag->chromStart -= 1; frag->fragStart -= 1; if (frag->fragEnd - frag->fragStart != frag->chromEnd - frag->chromStart) errAbort("chrom/scaffold size mismatch line %d of %s", lf->lineIx, lf->fileName); } chrom = hashFindVal(chromHash, chromName); if (chrom == NULL) { AllocVar(chrom); slAddHead(&chromList, chrom); hashAdd(chromHash, chromName, chrom); } chrom->size = max(chromSize, chrom->size); if (frag != NULL) slAddHead(&chrom->list, frag); } slReverse(&chromList); for (chrom = chromList; chrom != NULL; chrom = chrom->next) slReverse(&chrom->list); verbose(1, "Got %d chroms in %s\n", slCount(chromList), lf->fileName); lineFileClose(&lf); hashFree(&chromHash); return chromList; }