struct hash *allChainsHash(char *fileName) /* Hash all the chains in a given file by their ids. */ { struct hash *chainHash = newHash(18); struct lineFile *lf = lineFileOpen(fileName, TRUE); struct chain *chain; char chainId[20]; struct lm *lm = chainHash->lm; struct rbTreeNode **stack; lmAllocArray(lm, stack, 128); while ((chain = chainRead(lf)) != NULL) { struct indexedChain *ixc; lmAllocVar(lm, ixc); ixc->chain = chain; #ifdef SOON #endif /* SOON */ ixc->blockTree = rangeTreeNewDetailed(lm, stack); struct cBlock *block; for (block = chain->blockList; block != NULL; block = block->next) { struct range *r = rangeTreeAdd(ixc->blockTree, block->tStart, block->tEnd); r->val = block; } safef(chainId, sizeof(chainId), "%x", chain->id); hashAddUnique(chainHash, chainId, ixc); } lineFileClose(&lf); return chainHash; }
struct hash *readChainToBinKeeper(char *sizeFileName, char *fileName) { struct binKeeper *bk; struct chain *chain; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct lineFile *sf = lineFileOpen(sizeFileName, TRUE); struct hash *hash = newHash(0); char *chromRow[2]; while (lineFileRow(sf, chromRow)) { char *name = chromRow[0]; int size = lineFileNeedNum(sf, chromRow, 1); if (hashLookup(hash, name) != NULL) warn("Duplicate %s, ignoring all but first\n", name); else { bk = binKeeperNew(0, size); assert(size > 1); hashAdd(hash, name, bk); } } while ((chain = chainRead(lf)) != NULL) { bk = hashMustFindVal(hash, chain->tName); binKeeperAdd(bk, chain->tStart, chain->tEnd, chain); } lineFileClose(&lf); return hash; }
void chainToPsl(char *inName, char *tSizeFile, char *qSizeFile, char *targetList, char *queryList, char *outName) /* chainToPsl - Convert chain file to psl format. */ { struct hash *tSizeHash = readSizes(tSizeFile); struct hash *qSizeHash = readSizes(qSizeFile); struct lineFile *lf = lineFileOpen(inName, TRUE); FILE *f = mustOpen(outName, "w"); struct hash *fileHash = newHash(0); /* No value. */ struct hash *tHash = newHash(20); /* seqFilePos value. */ struct hash *qHash = newHash(20); /* seqFilePos value. */ struct dlList *fileCache = newDlList(); struct chain *chain; int q,t; verbose(1, "Scanning %s\n", targetList); hashFileList(targetList, fileHash, tHash); verbose(1, "Scanning %s\n", queryList); hashFileList(queryList, fileHash, qHash); verbose(1, "Converting %s\n", inName); while ((chain = chainRead(lf)) != NULL) { //uglyf("chain %s %s \n",chain->tName,chain->qName); q = findSize(qSizeHash, chain->qName); t = findSize(tSizeHash, chain->tName); aliStringToPsl(lf, chain->qName, chain->tName, chain->qSize, chain->tSize, min(chain->tEnd-chain->tStart, chain->qEnd-chain->qStart), chain->qStart, chain->qEnd, chain->tStart, chain->tEnd, chain->qStrand, f, chain, tHash, qHash, fileCache); chainFree(&chain); } lineFileClose(&lf); carefulClose(&f); }
struct hash *chainReadUsedSwapLf(char *fileName, boolean swapQ, Bits *bits, struct lineFile *lf)
/* Read chains from the already-open lf into a hash keyed by chain id (the
 * id formatted with "%x").  When bits is non-NULL only chains whose id bit
 * is set are kept; the others are freed.  A repeated id aborts with a
 * message giving the line and file.  If swapQ is set each kept chain is
 * passed through chainSwap before being stored.  fileName is not used in
 * the body, and lf is left open for the caller. */
{
struct hash *byId = hashNew(18);
struct chain *cur;
char key[16];
int kept = 0, seen = 0;

for (cur = chainRead(lf); cur != NULL; cur = chainRead(lf))
    {
    ++seen;
    if (bits != NULL && !bitReadOne(bits, cur->id))
        {
        chainFree(&cur);
        continue;
        }
    safef(key, sizeof(key), "%x", cur->id);
    if (hashLookup(byId, key))
        errAbort("Duplicate chain %d ending line %d of %s", cur->id, lf->lineIx, lf->fileName);
    if (swapQ)
        chainSwap(cur);
    hashAdd(byId, key, cur);
    ++kept;
    }
return byId;
}
void chainIndex(char *inChain, char *outIndex) /* chainIndex - Create simple two column file index for chain. */ { struct lineFile *lf = lineFileOpen(inChain, TRUE); FILE *f = mustOpen(outIndex, "w"); struct chain *chain, *lastChain = NULL; long pos = 0; struct hash *uniqHash = hashNew(16); while ((chain = chainRead(lf)) != NULL) { if (lastChain == NULL || !sameString(chain->tName, lastChain->tName)) { if (hashLookup(uniqHash, chain->tName)) { errAbort("%s is not sorted, %s repeated with intervening %s", inChain, chain->tName, lastChain->tName); } hashAddInt(uniqHash, chain->tName, pos); fprintf(f, "%lx\t%s\n", pos, chain->tName); } chainFree(&lastChain); lastChain = chain; pos = lineFileTell(lf); } }
void chainMergeSort(int fileCount, char *files[], FILE *out, int level) /* chainMergeSort - Combine sorted files into larger sorted file. */ { int i; struct chainFile *cf; int id = 0; struct quickHeap *h = NULL; h = newQuickHeap(fileCount, &cmpChainScores); /* Open up all input files and read first chain. */ for (i=0; i<fileCount; ++i) { AllocVar(cf); cf->lf = lineFileOpen(files[i], TRUE); lineFileSetMetaDataOutput(cf->lf, out); cf->chain = chainRead(cf->lf); if (cf->chain) addToQuickHeap(h, cf); else cfEof(&cf,level); /* deal with EOF */ } while (!quickHeapEmpty(h)) { cf = peekQuickHeapTop(h); if (!saveId) cf->chain->id = ++id; /* We reset id's here. */ chainWrite(cf->chain, out); chainFree(&cf->chain); if ((cf->chain = chainRead(cf->lf))) { quickHeapTopChanged(h); } else { /* deal with EOF */ if (!removeFromQuickHeapByElem(h, cf)) errAbort("unexpected error: chainFile not found on heap"); cfEof(&cf,level); } } freeQuickHeap(&h); }
static struct mappingCnts *cntChains(char *chainFile)
/* Pass every chain in chainFile to cntChain and return the resulting
 * mappingCnts.  The chains are not freed in this function. */
{
struct mappingCnts *totals = mappingCntsNew();
struct lineFile *in = lineFileOpen(chainFile, TRUE);
struct chain *cur;

for (cur = chainRead(in); cur != NULL; cur = chainRead(in))
    cntChain(totals, cur);
lineFileClose(&in);
return totals;
}
void chainStitchId(char *inChain, char *outChain) /* chainStitchId - Join chain fragments with the same chain ID into a single chain per ID. */ { struct lineFile *lf = lineFileOpen(inChain, TRUE); struct chain *chain = NULL, *chainList = NULL; FILE *f = mustOpen(outChain, "w"); int idArrLen = 64 * 1024 * 1024; struct chain **idArr = needLargeZeroedMem(idArrLen * sizeof(struct chain *)); int i=0; /* Build up an array of chains, indexed by IDs. Agglomerate chains with same * ID as we go. */ while ((chain = chainRead(lf)) != NULL) { while (chain->id >= idArrLen) { idArr = needMoreMem(idArr, idArrLen, idArrLen*2*sizeof(idArr[0])); idArrLen *= 2; } if (idArr[chain->id] == NULL) idArr[chain->id] = chain; else { tackOnFrag(idArr[chain->id], chain); chainFree(&chain); } } lineFileClose(&lf); /* Clean up each agglomerated chain and add to head of list (but step * backwards so the resulting list is in order by chain id). */ for (i = idArrLen-1; i >= 0; i--) { chain = idArr[i]; if (chain != NULL) { slSort(&(chain->blockList), cBlockCmpTarget); slAddHead(&chainList, chain); } } /* Ordering by original chain id gets us most of the way to sorting by * score, but not all the way: sort and finally write out the chains. */ slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { chainWrite(chain, f); /* could free here, but program is about to end so why waste the time. */ } carefulClose(&f); }
static struct chromBins* loadMapChains(char *chainFile) /* read a chain file, convert to mapAln object and chromBins by query locations. */ { struct chromBins* mapAlns = chromBinsNew((chromBinsFreeFunc*)pslFree); struct chain *ch; struct lineFile *chLf = lineFileOpen(chainFile, TRUE); while ((ch = chainRead(chLf)) != NULL) { struct mapAln *mapAln = chainToPsl(ch); chromBinsAdd(mapAlns, mapAln->psl->qName, mapAln->psl->qStart, mapAln->psl->qEnd, mapAln); chainFree(&ch); } lineFileClose(&chLf); return mapAlns; }
void chainStats(char *chains) { int lastChainId = -1; struct lineFile *chainsLf = lineFileOpen(chains, TRUE); struct cseqPair *cspList = NULL, *csp; struct dyString *dy = newDyString(512); struct hash *chainHash = newHash(0); /* Hash keyed by qSeq<strand>tSeq */ struct chain *chain; struct cBlock *block; int count; count = 0; while ((chain = chainRead(chainsLf)) != NULL) { if (chain->id > lastChainId) lastChainId = chain->id; dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", chain->qName, chain->qStrand, chain->tName); csp = hashFindVal(chainHash, dy->string); if (csp == NULL) { AllocVar(csp); slAddHead(&cspList, csp); hashAddSaveName(chainHash, dy->string, csp, &csp->name); csp->qName = cloneString(chain->qName); csp->tName = cloneString(chain->tName); csp->qStrand = chain->qStrand; } slAddHead(&csp->chain, chain); count++; } lineFileClose(&chainsLf); printf("read in %d chains\n",count); for(csp = cspList; csp; csp = csp->next) { slSort(&csp->chain, chainCmpTarget); gapChains(csp->chain); for(chain = csp->chain ; chain ; chain = chain->next) { for(block = chain->blockList; block; block = block->next) { } } } dyStringFree(&dy); }
struct hash *allChainsHash(char *fileName)
/* Read every chain in fileName and return a hash mapping the chain id
 * (formatted as a decimal string) to the chain.  Entries are added with
 * hashAddUnique, so a repeated id is handled however that helper handles
 * duplicates. */
{
struct hash *byId = newHash(0);
struct lineFile *in = lineFileOpen(fileName, TRUE);
struct chain *cur;
char key[128];

for (cur = chainRead(in); cur != NULL; cur = chainRead(in))
    {
    safef(key, sizeof(key), "%d", cur->id);
    hashAddUnique(byId, key, cur);
    }
lineFileClose(&in);
return byId;
}
void checkIds(char *inputFileName, char *outputFileName) /* report if duplicate ID found */ /* put all ids in idHash */ { struct chain *chainEl; struct lineFile *lf = lineFileOpen(inputFileName, TRUE); FILE *outputFileHandle = NULL; char idString[64]; char *idString2 = NULL; struct hashEl *hel = NULL; struct hashEl *hel2 = NULL; int chainCount = 0; int dupCount = 0; struct hashCookie cookie; idHash = newHash(0); duplicateHash = newHash(0); while ((chainEl = chainRead(lf)) != NULL) { chainCount++; safef(idString, sizeof(idString), "%d", chainEl->id); hel = hashLookup(idHash, idString); if (hel == NULL) hashAdd(idHash, cloneString(idString), NULL); else { hel2 = hashLookup(duplicateHash, idString); if (hel2 == NULL) hashAdd(duplicateHash, cloneString(idString), NULL); } } verbose(1, "chain count = %d\n", chainCount); // freeHash(&idHash); /* print contents of duplicateHash */ outputFileHandle = mustOpen(outputFileName, "w"); cookie = hashFirst(duplicateHash); while ((idString2 = hashNextName(&cookie)) != NULL) { dupCount++; fprintf(outputFileHandle, "%s\n", idString2); } verbose(1, "count of duplicate IDs = %d\n", dupCount); carefulClose(&outputFileHandle); // freeHash(&duplicateHash); }
struct hash *qSizeHash(char *chainfile)
/* Scan the chain file and return a hash mapping each query sequence name to
 * its size, taken from the qSize of the first chain seen for that name.
 * Chains are freed as they are scanned. */
{
struct hash *sizes = hashNew(10);
struct lineFile *in = lineFileOpen(chainfile, TRUE);
struct chain *cur;

for (cur = chainRead(in); cur != NULL; cur = chainRead(in))
    {
    if (hashLookup(sizes, cur->qName) == NULL)
        hashAddInt(sizes, cur->qName, cur->qSize);
    chainFree(&cur);
    }
lineFileClose(&in);
return sizes;
}
void doIt(char *inName, char *tNibDirOr2bit, char *qNibDirOr2bit, char *outName) /* chainToAxt - Convert from chain to axt file. */ { struct lineFile *lf = lineFileOpen(inName, TRUE); struct nibTwoCache *tSeqCache = nibTwoCacheNew(tNibDirOr2bit); struct nibTwoCache *qSeqCache = nibTwoCacheNew(qNibDirOr2bit); struct chain *chain = NULL; FILE *f = mustOpen(outName, "w"); while ((chain = chainRead(lf)) != NULL) { if (chain->score >= minScore) doAChain(chain, tSeqCache, qSeqCache, f); chainFree(&chain); } lineFileClose(&lf); carefulClose(&f); }
void chainSplit(char *outDir, int inCount, char *inFiles[]) /* chainSplit - Split chains up by target or query sequence. */ { struct hash *hash = newHash(0); int inIx; char tpath[512]; FILE *meta ; bool metaOpen = TRUE; makeDir(outDir); safef(tpath, sizeof(tpath), "%s/meta.tmp", outDir); meta = mustOpen(tpath,"w"); for (inIx = 0; inIx < inCount; ++inIx) { struct lineFile *lf = lineFileOpen(inFiles[inIx], TRUE); struct chain *chain; FILE *f; lineFileSetMetaDataOutput(lf, meta); while ((chain = chainRead(lf)) != NULL) { char *name = (splitOnQ ? chain->qName : chain->tName); if (lump > 0) name = lumpName(name); if ((f = hashFindVal(hash, name)) == NULL) { char path[512], cmd[512]; safef(path, sizeof(path),"%s/%s.chain", outDir, name); if (metaOpen) fclose(meta); metaOpen = FALSE; safef(cmd,sizeof(cmd), "cat %s | sort -u > %s", tpath, path); mustSystem(cmd); f = mustOpen(path, "a"); hashAdd(hash, name, f); } chainWrite(chain, f); chainFree(&chain); } lineFileClose(&lf); } }
struct hash *readLiftOverMapChainHash(char *fileName) /* taken from kent/src/hg/lib/liftOver.c */ /* Read map file into hashes. */ { struct hash *chainHash = hashNew(10); struct lineFile *lf = lineFileOpen(fileName, TRUE); struct chain *chain; struct liftOverChromMap *map; while ((chain = chainRead(lf)) != NULL) { if ((map = hashFindVal(chainHash, chain->tName)) == NULL) { AllocVar(map); map->bk = binKeeperNew(0, chain->tSize); hashAddSaveName(chainHash, chain->tName, map, &map->name); } binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain); } lineFileClose(&lf); return chainHash; }
static struct chromAnn* chromAnnChainReaderRead(struct chromAnnReader *car) /* read a chromAnn object from a tab file or table */ { struct chromAnnChainReader *carr = car->data; struct chain *chain = chainRead(carr->lf); if (chain == NULL) return NULL; struct chromAnn* ca; if (car->opts & chromAnnUseQSide) ca = chromAnnNew(chain->qName, '+', chain->tName, ((car->opts & chromAnnSaveLines) ? chain : NULL), chainRecWrite, chainRecFree); else ca = chromAnnNew(chain->tName, chain->qStrand, chain->qName, ((car->opts & chromAnnSaveLines) ? chain : NULL), chainRecWrite, chainRecFree); if (car->opts & chromAnnRange) { if (car->opts & chromAnnUseQSide) chromAnnBlkNew(ca, chain->qStart, chain->qEnd); else chromAnnBlkNew(ca, chain->tStart, chain->tEnd); } else { if (car->opts & chromAnnUseQSide) addChainQBlocks(ca, car->opts, chain); else addChainTBlocks(ca, car->opts, chain); } chromAnnFinish(ca); if (!(car->opts & chromAnnSaveLines)) chainFree(&chain); return ca; }
void chainPreNet(char *inFile, char *targetSizes, char *querySizes, char *outFile) /* chainPreNet - Remove chains that don't have a chance of being netted. */ { struct hash *tHash = setupChroms(targetSizes); struct hash *qHash = setupChroms(querySizes); struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); struct chain *chain; double score, lastScore = 9e99; struct chrom *qChrom, *tChrom; lineFileSetMetaDataOutput(lf, f); while ((chain = chainRead(lf)) != NULL) { /* Report progress. */ dotOut(); /* Check to make sure it really is sorted by score. */ score = chain->score; if (score > lastScore) { errAbort("%s not sorted by score line %d", lf->fileName, lf->lineIx); } lastScore = score; /* Output chain if necessary and then free it. */ qChrom = hashMustFindVal(qHash, chain->qName); tChrom = hashMustFindVal(tHash, chain->tName); if (chainUsed(chain, qChrom, tChrom) && inclQuery(chain)) { chainWrite(chain, f); } chainFree(&chain); } }
void fbOrChain(Bits *acc, char *track, char *chrom, int chromSize)
/* Or in a chain file.
 * The file name is derived from track and chrom by chromFileName(); if no
 * such file exists the function returns without touching acc.  Otherwise the
 * target range of every block of every chain is set in acc.  Blocks starting
 * below 0 or ending past chromSize are reported through outOfRange().  The
 * input is closed before returning; previously one handle leaked per call. */
{
struct lineFile *lf;
char fileName[512];
struct chain *chain;
struct cBlock *b;

chromFileName(track, chrom, fileName);
if (!fileExists(fileName))
    return;
lf = lineFileOpen(fileName, TRUE);
while ((chain = chainRead(lf)) != NULL)
    {
    for (b = chain->blockList; b != NULL; b = b->next)
        {
        int s = b->tStart, e = b->tEnd;
        if (s < 0)
            outOfRange(lf, chrom, chromSize);
        if (e > chromSize)
            outOfRange(lf, chrom, chromSize);
        bitSetRange(acc, b->tStart, b->tEnd - b->tStart);
        }
    chainFree(&chain);
    }
lineFileClose(&lf);
}
void liftChain(char *destFile, struct hash *liftHash, int sourceCount, char *sources[], boolean querySide)
/* Lift up coordinates in .chain file.
 * Every chain in every source file is looked up in liftHash by its query
 * name (querySide) or target name (otherwise).  A chain with no lift spec is
 * dropped unless 'how' is carryMissing, in which case it is written
 * unchanged.  A chain with a spec has the chosen side renamed, resized and
 * shifted by the spec's offset; for a '-' strand spec the coordinates are
 * reversed as described in the comments below.  Lifted chains are written to
 * destFile and freed.  Meta data lines from the sources are forwarded to the
 * output.  The output is closed before returning; previously it never was. */
{
FILE *f = mustOpen(destFile, "w");
int sourceIx;
int dotMod = dots;

for (sourceIx = 0; sourceIx < sourceCount; ++sourceIx)
    {
    char *source = sources[sourceIx];
    struct lineFile *lf = lineFileOpen(source, TRUE);
    struct chain *chain;

    lineFileSetMetaDataOutput(lf, f);
    verbose(1, "Lifting %s\n", source);
    while ((chain = chainRead(lf)) != NULL)
        {
        struct liftSpec *spec;
        char *seqName = querySide ? chain->qName : chain->tName;

        spec = findLift(liftHash, seqName, lf);
        if (spec == NULL)
            {
            if (how != carryMissing)
                {
                chainFree(&chain);
                continue;
                }
            }
        else
            {
            struct cBlock *b = NULL;
            int offset = spec->offset;

            if (spec->strand == '-')
                {
                if (querySide)
                    {
                    int qSpan = chain->qEnd - chain->qStart;
                    if (chain->qStrand == '-')
                        chain->qStart += spec->offset;
                    else
                        {
                        chain->qStart = spec->newSize - spec->offset
                                        - (chain->qSize - chain->qStart);
                        }
                    chain->qEnd = chain->qStart + qSpan;
                    chain->qStrand = flipStrand(chain->qStrand);
                    freeMem(chain->qName);
                    chain->qName = cloneString(spec->newName);
                    chain->qSize = spec->newSize;
                    /* We don't need to mess with the blocks here
                     * since they are all relative to the start. */
                    }
                else
                    {
                    /* We try and keep the target strand positive, so we end up
                     * flipping in both target and query and flipping the target
                     * strand. */
                    reverseIntRange(&chain->qStart, &chain->qEnd, chain->qSize);
                    reverseIntRange(&chain->tStart, &chain->tEnd, chain->tSize);
                    chain->qStrand = flipStrand(chain->qStrand);

                    /* Flip around blocks and add offset. */
                    for (b=chain->blockList; b != NULL; b=b->next)
                        {
                        reverseIntRange(&b->qStart, &b->qEnd, chain->qSize);
                        reverseIntRange(&b->tStart, &b->tEnd, chain->tSize);
                        b->tStart += offset;
                        b->tEnd += offset;
                        }
                    slReverse(&chain->blockList);

                    /* On target side add offset as well and update name and size. */
                    chain->tStart += offset;
                    chain->tEnd += offset;
                    freeMem(chain->tName);
                    chain->tName = cloneString(spec->newName);
                    chain->tSize = spec->newSize;
                    }
                }
            else
                {
                if (querySide)
                    {
                    /* For a reverse-strand query the shift is measured from
                     * the far end of the new sequence. */
                    if (chain->qStrand == '-')
                        offset = spec->newSize - (spec->offset + spec->oldSize);
                    freeMem(chain->qName);
                    chain->qName = cloneString(spec->newName);
                    chain->qSize = spec->newSize;
                    chain->qStart += offset;
                    chain->qEnd += offset;
                    for (b=chain->blockList; b != NULL; b=b->next)
                        {
                        b->qStart += offset;
                        b->qEnd += offset;
                        }
                    }
                else
                    {
                    freeMem(chain->tName);
                    chain->tName = cloneString(spec->newName);
                    chain->tSize = spec->newSize;
                    chain->tStart += offset;
                    chain->tEnd += offset;
                    for (b=chain->blockList; b != NULL; b=b->next)
                        {
                        b->tStart += offset;
                        b->tEnd += offset;
                        }
                    }
                }
            }
        chainWrite(chain, f);
        chainFree(&chain);
        doDots(&dotMod);
        }
    lineFileClose(&lf);
    if (dots)
        verbose(1, "\n");
    }
carefulClose(&f);
}
void doChainScore(char *chainIn, char *tNibDir, char *qNibDir, char *chainOut) { char qStrand = 0, tStrand = 0; struct dnaSeq *qSeq = NULL, *tSeq = NULL; char *qName = "", *tName = ""; FILE *f = mustOpen(chainOut, "w"); struct chain *chainList = NULL, *chain; struct chain *inputChains, *next; FILE *details = NULL; struct lineFile *lf = NULL; struct dnaSeq *seq, *seqList = NULL; struct hash *faHash = newHash(0); struct hash *chainHash = newHash(0); char comment[1024]; FILE *faF; struct seqPair *spList = NULL, *sp; struct dyString *dy = newDyString(512); struct lineFile *chainsLf = lineFileOpen(chainIn, TRUE); while ((chain = chainRead(chainsLf)) != NULL) { dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", chain->qName, chain->qStrand, chain->tName); sp = hashFindVal(chainHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(chainHash, dy->string, sp, &sp->name); sp->qName = cloneString(chain->qName); sp->tName = cloneString(chain->tName); sp->qStrand = chain->qStrand; } slAddHead(&sp->chain, chain); } slSort(&spList, seqPairCmp); lineFileClose(&chainsLf); if (optionExists("faQ")) { faF = mustOpen(qNibDir, "r"); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) { hashAdd(faHash, seq->name, seq); slAddHead(&seqList, seq); } fclose(faF); } for (sp = spList; sp != NULL; sp = sp->next) { if (optionExists("faQ")) { assert (faHash != NULL); loadFaSeq(faHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } else loadIfNewSeq(qNibDir, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); loadIfNewSeq(tNibDir, sp->tName, '+', &tName, &tSeq, &tStrand); scorePair(sp, qSeq, tSeq, &chainList, sp->chain); } slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { assert(chain->qStart == chain->blockList->qStart && chain->tStart == chain->blockList->tStart); chainWrite(chain, f); } carefulClose(&f); }
int main(int argc, char *argv[]) { FILE *f; struct chain *Chain; struct chain *SubChain, *chainToFree; struct chain *ch_p, *next_p; char buf[NUM_CHARS]; struct lineFile *lf; int i = 0; int b = 0, e = 0; bool is_null = true; struct exons_list *homologs; int num_chains = 0; int num_homologs = 0; struct exons_list *repeats; int num_repeats = 0; char chr[LEN_NAME]; strcpy(chr, ""); if( argc == 3 ) { if( (f = ckopen(argv[2], "r")) ) { if( fgets(buf, NUM_CHARS, f) ) { if( sscanf(buf, "%s %d %d", chr, &b, &e) != 3 ) { fatalf("format errors: chr beg end in %s", buf); } } else { fatalf("%s is empty\n", argv[2]); } } fclose(f); } else if( argc != 4 ) { fatal("args: chain_file interval_text features_gff_file\n"); } else { if( (f = ckopen(argv[2], "r")) ) { if( fgets(buf, NUM_CHARS, f) ) { if( sscanf(buf, "%s %d %d", chr, &b, &e) != 3 ) { fatalf("format errors: chr beg end in %s", buf); } } else { fatalf("%s is empty\n", argv[2]); } } fclose(f); if( (f = ckopen(argv[3], "r")) ) { while(fgets(buf, NUM_CHARS, f)) { i++; } num_repeats = i; repeats = (struct exons_list *) ckalloc(num_repeats * sizeof(struct exons_list)); init_exons(repeats, 0, num_repeats-1); fseek(f, 0, SEEK_SET); assign_gff_exons_chr(f, repeats, num_repeats, chr); quick_sort_inc_exons(repeats, 0, num_repeats-1, POS_BASE); } else { fatalf("file %s invalid\n", argv[4]); } fclose(f); } lf = lineFileOpen(argv[1], true); Chain = chainRead(lf); ch_p = Chain; while( (ch_p != NULL) && ((next_p = chainRead(lf)) != NULL) ) { ch_p->next = next_p; ch_p = ch_p->next; i++; } // printf("Number of chains: %d\n", i); i = 0; ch_p = Chain; // while( (i < NUM_LOOPS) && (ch_p != NULL) ) { while( ch_p != NULL ) { // printf("chain %d: %d-%d\n", ch_p->id, ch_p->tStart, ch_p->tEnd); ch_p = ch_p->next; i++; } num_chains = i; homologs = (struct exons_list *) ckalloc(num_chains * sizeof(struct exons_list)); i = 0; f = ckopen(argv[2], "r"); while( fgets(buf, NUM_CHARS, f) ) { if( sscanf(buf, "%*s %d %d", &b, &e) != 2 ) { fatalf("format 
errors: chr beg end in %s", buf); } else { ch_p = Chain; if( ch_p != NULL ) { while( (ch_p != NULL) && (is_null == true) ) { chainSubsetOnT(ch_p, b, e, &SubChain, &chainToFree); if( SubChain != NULL ) is_null = false; ch_p = ch_p->next; } } if( is_null == false ) { if( (num_repeats == 0 ) || (is_repeats(repeats, num_repeats, SubChain->tName, SubChain->tStart, SubChain->tEnd) == false) ) { homologs[i].reg = assign_I(SubChain->qStart, SubChain->qEnd); homologs[i].dir = SubChain->qStrand; strcpy(homologs[i].chr, SubChain->qName); i++; } // printf("query: %s %d %d\n", SubChain->qName, SubChain->qStart, SubChain->qEnd); if( chainToFree != NULL ) { chainFree(&chainToFree); } while( ch_p != NULL ) { chainSubsetOnT(ch_p, b, e, &SubChain, &chainToFree); ch_p = ch_p->next; if( SubChain != NULL ) { if( (num_repeats == 0 ) || ( is_repeats(repeats, num_repeats, SubChain->tName, SubChain->tStart, SubChain->tEnd) == false )) { if( SubChain->qStrand == '-' ) { homologs[i].reg = assign_I(SubChain->qSize - SubChain->qEnd, SubChain->qSize - SubChain->qStart); } else { homologs[i].reg = assign_I(SubChain->qStart, SubChain->qEnd); } homologs[i].dir = SubChain->qStrand; strcpy(homologs[i].chr, SubChain->qName); i++; } // printf("query: %s %d %d\n", SubChain->qName, SubChain->qStart, SubChain->qEnd); if( chainToFree != NULL ) { chainFree(&chainToFree); } } } } } } num_homologs = i; selection_sort_exons(homologs, num_homologs); // print_exons_list(homologs, num_homologs); num_homologs = remove_redundant_intervals(homologs, num_homologs); print_exons_list(homologs, num_homologs); free(homologs); free(repeats); chainFreeList(&Chain); fclose(f); lineFileClose(&lf); return EXIT_SUCCESS; }
void chainStitch(char *psls, char *chains, char *outChainName, char *outFoundName, char *outNotFoundName) /* chainStitch - Stitch psls into chains. */ { int lastChainId = -1; struct psl *prevPsl, *nextPsl; struct psl *fakePslList; int jj; int deletedBases, addedBases; FILE *outFound = mustOpen(outFoundName, "w"); FILE *outNotFound = mustOpen(outNotFoundName, "w"); FILE *outChains = mustOpen(outChainName, "w"); struct lineFile *chainsLf = lineFileOpen(chains, TRUE); struct cseqPair *cspList = NULL, *csp; struct seqPair *spList = NULL, *sp; struct lineFile *pslLf = pslFileOpen(psls); struct dyString *dy = newDyString(512); struct psl *psl; struct hash *pslHash = newHash(0); /* Hash keyed by qSeq<strand>tSeq */ struct hash *chainHash = newHash(0); /* Hash keyed by qSeq<strand>tSeq */ struct chain *chain, *chainList = NULL; struct cBlock *block , *nextBlock = NULL, *prevBlock = NULL; int count; count = 0; while ((psl = pslNext(pslLf)) != NULL) { assert((psl->strand[1] == 0) || (psl->strand[1] == '+')); count++; dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", psl->qName, psl->strand[0], psl->tName); sp = hashFindVal(pslHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(pslHash, dy->string, sp, &sp->name); sp->qName = cloneString(psl->qName); sp->tName = cloneString(psl->tName); sp->qStrand = psl->strand[0]; } slAddHead(&sp->psl, psl); } lineFileClose(&pslLf); printf("read in %d psls\n",count); for(sp = spList; sp; sp = sp->next) slReverse(&sp->psl); //slSort(&sp->psl, pslCmpTarget); count = 0; while ((chain = chainRead(chainsLf)) != NULL) { if (chain->id > lastChainId) lastChainId = chain->id; dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", chain->qName, chain->qStrand, chain->tName); csp = hashFindVal(chainHash, dy->string); if (csp == NULL) { AllocVar(csp); slAddHead(&cspList, csp); hashAddSaveName(chainHash, dy->string, csp, &csp->name); csp->qName = cloneString(chain->qName); csp->tName = cloneString(chain->tName); 
csp->qStrand = chain->qStrand; } slAddHead(&csp->chain, chain); count++; } lineFileClose(&chainsLf); printf("read in %d chains\n",count); for(csp = cspList; csp; csp = csp->next) { slSort(&csp->chain, chainCmpTarget); // csp->chain = aggregateChains(csp->chain); } addedBases = deletedBases = 0; for(sp = spList; sp; sp = sp->next) { #ifdef NOTNOW /* find the chains associated with this strand */ if ((csp = hashFindVal(chainHash, sp->name)) != NULL) { /* first check to see if psl blocks are in any chains */ checkInChains(&sp->psl, &csp->chain, outFound, &addedBases); /* now extend chains to the right */ checkAfterChains(&sp->psl, &csp->chain, outFound, &addedBases); /* now extend chains to the left */ slReverse(&sp->psl); checkBeforeChains(&sp->psl, &csp->chain, outFound, &addedBases); } #endif /* do we still have psl's */ if (sp->psl != NULL) { /* make sure we have a chainList */ chainList = NULL; if (csp == NULL) { AllocVar(csp); slAddHead(&cspList, csp); csp->qName = cloneString(sp->psl->qName); csp->tName = cloneString(sp->psl->tName); csp->qStrand = sp->psl->strand[0]; dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", csp->qName, csp->qStrand, csp->tName); hashAddSaveName(chainHash, dy->string, csp, &csp->name); } for(psl = sp->psl; psl ; psl = nextPsl) { /* this psl will either fit a chain or make a new one */ nextPsl = psl->next; sp->psl = nextPsl; psl->next = NULL; fakePslList = psl; if (chainList) checkInChains(&fakePslList, &chainList, outFound, &addedBases); if (fakePslList == NULL) { //freez(&psl); continue; } // if (chainList) // checkAfterChains(&fakePslList, &chainList, outFound, &addedBases); if (fakePslList == NULL) { //freez(&psl); continue; } if (chainList) checkBeforeChains(&fakePslList, &chainList, outFound, &addedBases); if (fakePslList == NULL) { //freez(&psl); continue; } AllocVar(chain); chain->tStart = psl->tStarts[0]; chain->qStart = psl->qStarts[0]; chain->tEnd = psl->tStarts[psl->blockCount - 1] + psl->blockSizes[psl->blockCount - 1]; 
chain->qEnd = psl->qStarts[psl->blockCount - 1] + psl->blockSizes[psl->blockCount - 1]; chain->tSize = psl->tSize; chain->qSize = psl->qSize; chain->qName = cloneString(psl->qName); chain->tName = cloneString(psl->tName); chain->qStrand = psl->strand[0]; chain->id = ++lastChainId; if (!addPslToChain(chain, psl, &addedBases)) errAbort("new "); slAddHead(&chainList, chain); pslTabOut(psl, outFound); freez(&psl); } csp->chain = slCat(csp->chain, chainList); } slSort(&csp->chain, chainCmpTarget); csp->chain = aggregateChains(csp->chain); } fclose(outFound); printf("deleted %d bases\n",deletedBases); printf("added %d bases\n",addedBases); count = 0; for(sp = spList; sp; sp = sp->next) for(psl = sp->psl ; psl ; psl = psl->next) { pslTabOut(psl, outNotFound); count++; } fclose(outNotFound); printf("%d psls remain\n",count); for(csp = cspList; csp; csp = csp->next) for(chain = csp->chain ; chain ; chain = chain->next) { //slSort(&chain->blockList, boxInCmpBoth); chain->tStart = chain->blockList->tStart; chain->qStart = chain->blockList->qStart; for(block = chain->blockList; block; block = block->next) { chain->tEnd = block->tEnd; chain->qEnd = block->qEnd; } chainWrite(chain, outChains); } fclose(outChains); dyStringFree(&dy); }
void chainNet(char *chainFile, char *tSizes, char *qSizes, char *tNet, char *qNet) /* chainNet - Make alignment nets out of chains. */ { struct lineFile *lf = lineFileOpen(chainFile, TRUE); struct hash *qHash, *tHash; struct chrom *qChromList, *tChromList, *tChrom, *qChrom; struct chain *chain; double lastScore = -1; struct lm *lm = lmInit(0); struct rbTreeNode **rbStack; FILE *tNetFile = mustOpen(tNet, "w"); FILE *qNetFile = mustOpen(qNet, "w"); lmAllocArray(lm, rbStack, 256); makeChroms(qSizes, lm, rbStack, &qHash, &qChromList); makeChroms(tSizes, lm, rbStack, &tHash, &tChromList); verbose(1, "Got %d chroms in %s, %d in %s\n", slCount(tChromList), tSizes, slCount(qChromList), qSizes); lineFileSetMetaDataOutput(lf, tNetFile); lineFileSetMetaDataOutput(lf, qNetFile); /* Loop through chain file building up net. */ while ((chain = chainRead(lf)) != NULL) { /* Make sure that input is really sorted. */ if (lastScore >= 0 && chain->score > lastScore) errAbort("%s must be sorted in order of score", chainFile); lastScore = chain->score; if (chain->score < minScore) { break; } verbose(2, "chain %f (%d els) %s %d-%d %c %s %d-%d\n", chain->score, slCount(chain->blockList), chain->tName, chain->tStart, chain->tEnd, chain->qStrand, chain->qName, chain->qStart, chain->qEnd); qChrom = hashMustFindVal(qHash, chain->qName); if (qChrom->size != chain->qSize) errAbort("%s is %d in %s but %d in %s", chain->qName, chain->qSize, chainFile, qChrom->size, qSizes); tChrom = hashMustFindVal(tHash, chain->tName); if (tChrom->size != chain->tSize) errAbort("%s is %d in %s but %d in %s", chain->tName, chain->tSize, chainFile, tChrom->size, tSizes); if (!inclQuery(chain)) verbose(2, "skipping chain on query %s\n", chain->qName); else { addChain(qChrom, tChrom, chain); verbose(2, "%s has %d inserts, %s has %d\n", tChrom->name, tChrom->spaces->n, qChrom->name, qChrom->spaces->n); } } /* Build up other side of fills. It's just for historical * reasons this is not done during the main build up. 
* It's a little less efficient this way, but to change it * some hard reverse strand issues would have to be juggled. */ verbose(1, "Finishing nets\n"); finishNet(qChromList, TRUE); finishNet(tChromList, FALSE); /* Write out basic net files. */ verbose(1, "writing %s\n", tNet); outputNetSide(tChromList, tNetFile, FALSE); verbose(1, "writing %s\n", qNet); outputNetSide(qChromList, qNetFile, TRUE); /* prevent SIGPIPE in preceding process if input is a pipe, consume remainder * of input file since we stop before EOF. */ if (isPipe(lf->fd)) { char *line; while(lineFileNext(lf, &line, NULL)) continue; } lineFileClose(&lf); if (verboseLevel() > 1) printMem(stderr); }