void hashFileList(char *fileList, struct hash *fileHash, struct hash *seqHash)
/* Load the sequence file(s) named by fileList into fileHash and seqHash.
 * fileList is either a single .2bit, .nib or fasta file, or else a text
 * file that is read row by row, each row naming one sequence file. */
{
struct lineFile *lf;
char *words[1];

/* A single sequence file was given directly. */
if (twoBitIsFile(fileList))
    {
    addTwoBit(fileList, fileHash, seqHash);
    return;
    }
if (endsWith(fileList, ".nib"))
    {
    addNib(fileList, fileHash, seqHash);
    return;
    }
if (isFa(fileList))
    {
    addFa(fileList, fileHash, seqHash);
    return;
    }

/* Otherwise treat fileList as a list of sequence file names. */
lf = lineFileOpen(fileList, TRUE);
while (lineFileRow(lf, words))
    {
    char *name = words[0];
    if (twoBitIsFile(name))
        addTwoBit(name, fileHash, seqHash);
    else if (endsWith(name, ".nib"))
        addNib(name, fileHash, seqHash);
    else
        /* Inside a list anything that is not .2bit or .nib is handed to
         * addFa without a further isFa() check. */
        addFa(name, fileHash, seqHash);
    }
lineFileClose(&lf);
}
struct nibTwoCache *nibTwoCacheNew(char *pathName)
/* Create a cache object that fetches sequence more or less transparently
 * from either nib files or a .2bit file.  pathName is copied.  When it names
 * a .2bit file that file is opened right away; otherwise an empty hash is
 * set up in nibHash. */
{
struct nibTwoCache *cache;

AllocVar(cache);
cache->pathName = cloneString(pathName);
cache->isTwoBit = twoBitIsFile(pathName);
if (!cache->isTwoBit)
    cache->nibHash = newHash(10);
else
    cache->tbf = twoBitOpen(pathName);
return cache;
}
struct dnaSeq *nibTwoLoadOne(char *pathName, char *seqName)
/* Return sequence seqName from pathName, which is either a .2bit file or
 * a directory full of nibs.  The sequence will have repeats in lower case. */
{
char nibPath[512];
struct twoBitFile *tbf;
struct dnaSeq *result;

if (!twoBitIsFile(pathName))
    {
    /* Nib case: the sequence lives in <pathName>/<seqName>.nib */
    safef(nibPath, sizeof(nibPath), "%s/%s.nib", pathName, seqName);
    return nibLoadAllMasked(NIB_MASK_MIXED, nibPath);
    }

/* .2bit case: open, read the fragment (called with 0,0 as the range
 * arguments), and close again. */
tbf = twoBitOpen(pathName);
result = twoBitReadSeqFrag(tbf, seqName, 0, 0);
twoBitClose(&tbf);
return result;
}
void twoBitMask(char *inName, char *maskName, char *outName) /* twoBitMask - apply masking to a .2bit file, creating a new .2bit file. */ { struct hash *tbHash = hashNew(20); struct hash *bitmapHash = hashNew(20); struct twoBit *twoBitList = NULL; struct twoBit *twoBit = NULL; FILE *f = NULL; if (! twoBitIsFile(inName)) { if (twoBitIsSpec(inName)) errAbort("Sorry, this works only on whole .2bit files, not specs."); else errAbort("Input %s does not look like a proper .2bit file.", inName); } twoBitList = slurpInput(inName, tbHash, bitmapHash); /* Read mask data into bitmapHash, store it in twoBits: */ if ((type && endsWith(type, "bed")) || endsWith(maskName, ".bed")) maskWithBed(maskName, tbHash, bitmapHash); else if ((type && endsWith(type, "out")) || endsWith(maskName, ".out")) maskWithOut(maskName, tbHash, bitmapHash); else errAbort("Sorry, maskFile must end in \".bed\" or \".out\"."); /* Create a new .2bit file, write it out from twoBits. */ f = mustOpen(outName, "wb"); twoBitWriteHeader(twoBitList, f); for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next) { twoBitWriteOne(twoBit, f); } carefulClose(&f); /* Don't bother freeing twoBitList and hashes here -- just exit. */ }
struct dnaLoadStack *dnaLoadStackNew(char *fileName)
/* Create new dnaLoadStack on composite file.  A .2bit file is opened and
 * tbi is set to its indexList.  Anything else is opened as a text file,
 * and textIsFa is set when its first real line, after trimming spaces,
 * starts with '>'. */
{
struct dnaLoadStack *stack;
char *firstLine;

AllocVar(stack);

if (twoBitIsFile(fileName))
    {
    stack->twoBit = twoBitOpen(fileName);
    stack->tbi = stack->twoBit->indexList;
    return stack;
    }

stack->textFile = lineFileOpen(fileName, TRUE);
if (!lineFileNextReal(stack->textFile, &firstLine))
    return stack;	/* No real lines at all: leave textIsFa unset. */

firstLine = trimSpaces(firstLine);
if (firstLine[0] == '>')
    stack->textIsFa = TRUE;
/* Push the peeked line back so later readers see it again. */
lineFileReuse(stack->textFile);
return stack;
}
void axtChain(char *axtIn, char *tNibDir, char *qNibDir, char *chainOut) /* axtChain - Chain together axt alignments.. */ { struct hash *pairHash = newHash(0); /* Hash keyed by qSeq<strand>tSeq */ struct seqPair *spList = NULL, *sp; FILE *f = mustOpen(chainOut, "w"); char *qName = "", *tName = ""; struct dnaSeq *qSeq = NULL, *tSeq = NULL; char qStrand = 0, tStrand = 0; struct chain *chainList = NULL, *chain; FILE *details = NULL; struct dnaSeq *seq, *seqList = NULL; struct hash *faHash = newHash(0); struct hash *tFaHash = newHash(0); FILE *faF; boolean qIsTwoBit = twoBitIsFile(qNibDir); boolean tIsTwoBit = twoBitIsFile(tNibDir); axtScoreSchemeDnaWrite(scoreScheme, f, "axtChain"); if (detailsName != NULL) details = mustOpen(detailsName, "w"); /* Read input file and divide alignments into various parts. */ if (optionExists("psl")) spList = readPslBlocks(axtIn, pairHash, f); else spList = readAxtBlocks(axtIn, pairHash, f); if (optionExists("faQ")) { faF = mustOpen(qNibDir, "r"); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) { hashAdd(faHash, seq->name, seq); slAddHead(&seqList, seq); } fclose(faF); } if (optionExists("faT")) { faF = mustOpen(tNibDir, "r"); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) { hashAdd(tFaHash, seq->name, seq); slAddHead(&seqList, seq); } fclose(faF); } for (sp = spList; sp != NULL; sp = sp->next) { slReverse(&sp->blockList); removeExactOverlaps(&sp->blockList); verbose(1, "%d blocks after duplicate removal\n", slCount(sp->blockList)); if (optionExists("faQ")) { assert (faHash != NULL); loadFaSeq(faHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } else { loadIfNewSeq(qNibDir, qIsTwoBit, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } if (optionExists("faT")) { assert (tFaHash != NULL); loadFaSeq(tFaHash, sp->tName, '+', &tName, &tSeq, &tStrand); } else { loadIfNewSeq(tNibDir, tIsTwoBit, sp->tName, '+', &tName, &tSeq, &tStrand); } chainPair(sp, qSeq, tSeq, &chainList, details); } slSort(&chainList, 
chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { assert(chain->qStart == chain->blockList->qStart && chain->tStart == chain->blockList->tStart); chainWrite(chain, f); } carefulClose(&f); }
void outputBlocks(struct lineFile *lf, struct block *blockList, int score, FILE *f, boolean isRc, char *qName, int qSize, char *qNibDir, struct dlList *qCache, char *tName, int tSize, char *tNibDir, struct dlList *tCache, boolean rescore) /* Output block list as an axt to file f. */ { int qStart = BIGNUM, qEnd = 0, tStart = BIGNUM, tEnd = 0; struct block *lastBlock = NULL; struct block *block; struct dyString *qSym = newDyString(16*1024); struct dyString *tSym = newDyString(16*1024); struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seq = NULL; struct axt axt; boolean qIsTwoBit = twoBitIsFile(qNibDir); boolean tIsTwoBit = twoBitIsFile(tNibDir); if (blockList == NULL) return; /* Figure overall dimensions. */ for (block = blockList; block != NULL; block = block->next) { if (qStart > block->qStart) qStart = block->qStart; if (qEnd < block->qEnd) qEnd = block->qEnd; if (tStart > block->tStart) tStart = block->tStart; if (tEnd < block->tEnd) tEnd = block->tEnd; } /* Load sequence covering alignment from nib files. 
*/ if (isRc) { reverseIntRange(&qStart, &qEnd, qSize); if (qIsFa) { for (seq = qFaList ; seq != NULL ; seq = seq->next) if (sameString(qName, seq->name)) break; if (seq != NULL) { AllocVar(qSeq); qSeq->size = qEnd - qStart; qSeq->name = cloneString(qName); qSeq->dna = cloneMem((seq->dna)+qStart, qSeq->size); } else errAbort("sequence not found %s\n",qName); } else qSeq = readFromCache(qCache, qNibDir, qName, qStart, qEnd - qStart, qSize, qIsTwoBit); reverseIntRange(&qStart, &qEnd, qSize); reverseComplement(qSeq->dna, qSeq->size); } else { if (qIsFa) { for (seq = qFaList ; seq != NULL ; seq = seq->next) { if (sameString(qName, seq->name)) break; } if (seq != NULL) { AllocVar(qSeq); qSeq->size = qEnd - qStart; qSeq->name = cloneString(qName); qSeq->dna = (seq->dna)+qStart; } else errAbort("sequence not found %s\n",qName); } else qSeq = readFromCache(qCache, qNibDir, qName, qStart, qEnd - qStart, qSize, qIsTwoBit); } if (tIsFa) { for (seq = tFaList ; seq != NULL ; seq = seq->next) if (sameString(tName, seq->name)) break; if (seq != NULL) { AllocVar(tSeq); tSeq->size = tEnd - tStart; tSeq->name = cloneString(tName); tSeq->dna = cloneMem((seq->dna)+tStart, tSeq->size); } else errAbort("sequence not found %s\n",tName); } else tSeq = readFromCache(tCache, tNibDir, tName, tStart, tEnd - tStart, tSize, tIsTwoBit); /* Loop through blocks copying sequence into dynamic strings. 
*/ for (block = blockList; block != NULL; block = block->next) { if (lastBlock != NULL) { int qGap = block->qStart - lastBlock->qEnd; int tGap = block->tStart - lastBlock->tEnd; if (qGap != 0 && tGap != 0) { errAbort("Gaps in both strand on alignment ending line %d of %s", lf->lineIx, lf->fileName); } if (qGap > 0) { dyStringAppendMultiC(tSym, '-', qGap); dyStringAppendN(qSym, qSeq->dna + lastBlock->qEnd - qStart, qGap); } if (tGap > 0) { dyStringAppendMultiC(qSym, '-', tGap); dyStringAppendN(tSym, tSeq->dna + lastBlock->tEnd - tStart, tGap); } } if (qSeq->size < block->qStart - qStart) { errAbort("read past end of sequence %s size =%d block->qStart-qstart=%d block->qStart=%d qEnd=%d \n", qName, qSeq->size, block->qStart-qStart,block->qStart, block->qEnd ); } dyStringAppendN(qSym, qSeq->dna + block->qStart - qStart, block->qEnd - block->qStart); if (tSeq->size < block->tStart - tStart) { errAbort("read past end of sequence %s size =%d block->tStart-tstart=%d\n", tName, tSeq->size, block->tStart-tStart); } dyStringAppendN(tSym, tSeq->dna + block->tStart - tStart, block->tEnd - block->tStart); lastBlock = block; } if (qSym->stringSize != tSym->stringSize) errAbort("qSize and tSize don't agree in alignment ending line %d of %s", lf->lineIx, lf->fileName); if (rescore) score = axtScoreSym(scoreScheme, qSym->stringSize, qSym->string, tSym->string); /* Fill in an axt and write it to output. */ ZeroVar(&axt); axt.qName = qName; axt.qStart = qStart; axt.qEnd = qEnd; axt.qStrand = (isRc ? '-' : '+'); axt.tName = tName; axt.tStart = tStart; axt.tEnd = tEnd; axt.tStrand = '+'; axt.score = score; axt.symCount = qSym->stringSize; axt.qSym = qSym->string; axt.tSym = tSym->string; axtWrite(&axt, f); /* Clean up. */ if (!qIsFa) freeDnaSeq(&qSeq); freeDnaSeq(&tSeq); dyStringFree(&qSym); dyStringFree(&tSym); }