int main(int argc, char *argv[]) /* Process command line. */ { optionHash(&argc, argv); if (argc != 9) usage(); fileCache = newDlList(); maxGap = optionInt("maxGap", maxGap); verboseSetLogFile("stdout"); ss = axtScoreSchemeDefault(); verbose(1,"Reading alignments from %s\n",argv[3]); mrnaHash = readPslToBinKeeper(argv[2], argv[3]); twoBitFile = twoBitOpen(argv[5]); //verbose(1,"Reading alignments from %s\n",argv[]); //pseudoHash = readPslToBinKeeper(argv[3], argv[]); //verbose(1,"Reading mRNA sequences from %s\n",argv[5]); //mrnaList = faReadAllMixed(argv[5]); //if (mrnaList == NULL) //errAbort("could not open %s\n",argv[5]); //faHash = newHash(0); //for (el = mrnaList; el != NULL ; el = el->next) //hashAdd(faHash, el->name, el); verbose(1,"Reading chains from %s\n",argv[6]); chainHash = readChainToBinKeeper(argv[2], argv[6]); outFile = fopen(argv[8],"w"); verbose(1,"Scoring %s\n",argv[1]); checkExp(argv[1], argv[7], argv[4]); fclose(outFile); freeDnaSeqList(&mrnaList); return(0); }
void mafAddIRows(char *mafIn, char *twoBitIn, char *mafOut, char *nBedFile) /* mafAddIRows - Filter out maf files. */ { FILE *f = mustOpen(mafOut, "w"); struct twoBitFile *twoBit = twoBitOpen(twoBitIn); struct mafAli *mafList, *maf; struct mafFile *mf = mafOpen(mafIn); struct hash *bedHash = newHash(6); if (nBedFile != NULL) { struct lineFile *lf = lineFileOpen(nBedFile, TRUE); char *row[1]; while (lineFileRow(lf, row)) { addBed(row[0], bedHash); } lineFileClose(&lf); } speciesHash = newHash(6); mafList = readMafs(mf); mafWriteStart(f, mf->scoring); mafFileFree(&mf); chainStrands(strandHeads, bedHash); bridgeSpecies(mafList, speciesList); fillHoles(mafList, speciesList, twoBit); for(maf = mafList; maf ; maf = maf->next) mafWrite(f, maf); }
void twoBitDup(char *filename) /* twoBitDup - check to see if a twobit file has any identical sequences in it. */ { struct twoBitFile *tbf; tbf = twoBitOpen(filename); struct twoBitIndex *index; int seqCount = slCount(tbf->indexList); int hashSize = log2(seqCount) + 2; // +2 for luck struct hash *seqHash = newHash(hashSize); verbose(2, "hash size is %d\n", hashSize); for (index = tbf->indexList; index != NULL; index = index->next) { verbose(2,"grabbing seq %s\n", index->name); int size; struct dnaSeq *seq = twoBitReadSeqFragExt(tbf, index->name, 0, 0, FALSE, &size); struct hashEl *hel; if ((hel = hashLookup(seqHash, seq->dna)) != NULL) printf("%s and %s are identical\n", index->name, (char *)hel->val); else hashAdd(seqHash, seq->dna, index->name); freeDnaSeq(&seq); } }
struct dnaSeq *gfiExpandAndLoadCached(struct gfRange *range, struct hash *tFileCache, char *tSeqDir, int querySize, int *retTotalSeqSize, boolean respectFrame, boolean isRc, int expansion)
/* Expand range to cover an additional expansion bases on either side.
 * Load up target sequence and return.  (Done together because don't
 * know target size before loading.)
 * The target is located at tSeqDir/range->tName.  Open nib or 2bit handles
 * are kept in tFileCache keyed by file name.  For a 2bit target the name
 * must have the form file:seqName.  *retTotalSeqSize receives the full size
 * of the target sequence.  When isRc is set, the range is flipped before
 * expansion, the loaded DNA is reverse-complemented, and the range is
 * flipped back afterwards. */
{
    char path[PATH_LEN+256];
    struct dnaSeq *loaded = NULL;

    safef(path, sizeof(path), "%s/%s", tSeqDir, range->tName);

    if (!nibIsFile(path))
        {
        /* 2bit target: split "file:seqName" in place at the colon. */
        struct twoBitFile *tbf = NULL;
        char *seqName = strchr(path, ':');
        int seqSize = 0;

        if (seqName == NULL)
            errAbort("No colon in .2bit response from gfServer");
        *seqName++ = 0;

        tbf = hashFindVal(tFileCache, path);
        if (tbf == NULL)
            {
            tbf = twoBitOpen(path);
            hashAdd(tFileCache, path, tbf);
            }

        seqSize = twoBitSeqSize(tbf, seqName);
        if (isRc)
            reverseIntRange(&range->tStart, &range->tEnd, seqSize);
        gfiExpandRange(range, querySize, seqSize, respectFrame, isRc, expansion);
        loaded = twoBitReadSeqFragLower(tbf, seqName, range->tStart, range->tEnd);
        if (isRc)
            {
            reverseComplement(loaded->dna, loaded->size);
            reverseIntRange(&range->tStart, &range->tEnd, seqSize);
            }
        *retTotalSeqSize = seqSize;
        }
    else
        {
        /* nib target: one cached nibInfo per file. */
        struct nibInfo *info = hashFindVal(tFileCache, path);

        if (info == NULL)
            {
            info = nibInfoNew(path);
            hashAdd(tFileCache, path, info);
            }

        if (isRc)
            reverseIntRange(&range->tStart, &range->tEnd, info->size);
        gfiExpandRange(range, querySize, info->size, respectFrame, isRc, expansion);
        loaded = nibLdPart(path, info->f, info->size, range->tStart,
                           range->tEnd - range->tStart);
        if (isRc)
            {
            reverseComplement(loaded->dna, loaded->size);
            reverseIntRange(&range->tStart, &range->tEnd, info->size);
            }
        *retTotalSeqSize = info->size;
        }

    return loaded;
}
struct annoAssembly *annoAssemblyNew(char *name, char *twoBitPath)
/* Return an annoAssembly with open twoBitFile.
 * The returned struct holds its own copies of name and twoBitPath, plus
 * the handle obtained from twoBitOpen(twoBitPath). */
{
    struct annoAssembly *assembly;

    AllocVar(assembly);
    assembly->name = cloneString(name);
    assembly->tbf = twoBitOpen(twoBitPath);
    assembly->twoBitPath = cloneString(twoBitPath);
    return assembly;
}
FILE *openFromCache(struct dlList *cache, struct seqFilePos *sfp) /* Return open file handle via cache. The simple logic here * depends on not more than N files being returned at once. */ { static int maxCacheSize=16; int cacheSize = 0; struct dlNode *node; struct cachedFile *cf; int size; /* First loop through trying to find it in cache, counting * cache size as we go. */ for (node = cache->head; !dlEnd(node); node = node->next) { ++cacheSize; cf = node->val; if (sameString(sfp->file, cf->name)) { dlRemove(node); dlAddHead(cache, node); return cf->f; } } /* If cache has reached max size free least recently used. */ if (cacheSize >= maxCacheSize) { node = dlPopTail(cache); cf = node->val; carefulClose(&cf->f); freeMem(cf->name); freeMem(cf); freeMem(node); } /* Cache new file. */ AllocVar(cf); cf->name = cloneString(sfp->file); if (sfp->isTwoBit) { cf->f = (FILE *)twoBitOpen(sfp->file); } else if (sfp->isNib) { nibOpenVerify(sfp->file, &cf->f, &size); if (cf->f == NULL) errAbort("can't open nibfile %s\n",sfp->file); sfp->pos = size; } else cf->f = mustOpen(sfp->file, "rb"); dlAddValHead(cache, cf); return cf->f; }
struct dnaSeq *dnaLoadSingle(char *fileName, int *retStart, int *retEnd, int *retParentSize) /* Return sequence if it's a nib file or 2bit part, NULL otherwise. */ { struct dnaSeq *seq = NULL; unsigned start = 0, end = 0; int parentSize = 0; if (nibIsFile(fileName)) { /* Save offset out of fileName for auto-lifting */ char filePath[PATH_LEN]; char name[PATH_LEN]; nibParseName(0, fileName, filePath, name, &start, &end); if (end != 0) /* It's just a range. */ { FILE *f; int size; nibOpenVerify(filePath, &f, &size); parentSize = size; } seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName); if (end == 0) parentSize = end = seq->size; freez(&seq->name); seq->name = cloneString(name); } else if (twoBitIsRange(fileName)) { /* Save offset out of fileName for auto-lifting */ char *rangeSpec = cloneString(fileName); int start, end; char *file, *seqName; twoBitParseRange(rangeSpec, &file, &seqName, &start, &end); /* Load sequence. */ { struct twoBitFile *tbf = twoBitOpen(file); parentSize = twoBitSeqSize(tbf, seqName); seq = twoBitReadSeqFrag(tbf, seqName, start, end); twoBitClose(&tbf); } if (end == 0) end = seq->size; freez(&rangeSpec); } if (retStart != NULL) *retStart = start; if (retEnd != NULL) *retEnd = end; if (retParentSize != NULL) *retParentSize = parentSize; return seq; }
struct nibTwoCache *nibTwoCacheNew(char *pathName)
/* Get something that will more or less transparently get sequence from
 * nib files or .2bit.
 * If twoBitIsFile(pathName) is true the 2bit file is opened right away;
 * otherwise an empty hash is created in nibHash. */
{
    struct nibTwoCache *cache;

    AllocVar(cache);
    cache->pathName = cloneString(pathName);
    cache->isTwoBit = twoBitIsFile(pathName);
    if (!cache->isTwoBit)
        cache->nibHash = newHash(10);
    else
        cache->tbf = twoBitOpen(pathName);
    return cache;
}
void mafStats(char *twoBitFile, char *mafDir, char *outFile) /* mafStats - Calculate basic stats on maf file including species-by-species * coverage and percent ID. */ { struct twoBitFile *tbf = twoBitOpen(twoBitFile); FILE *f = mustOpen(outFile, "w"); struct twoBitIndex *ix; long genomeSize = 0; struct hash *speciesHash = hashNew(0); struct speciesAcc *speciesList = NULL, *species; for (ix = tbf->indexList; ix != NULL; ix = ix->next) { unsigned chromSize = twoBitSeqSizeNoNs(tbf, ix->name); genomeSize += chromSize; char mafFileName[PATH_LEN]; safef(mafFileName, sizeof(mafFileName), "%s/%s.maf", mafDir, ix->name); struct mafFile *mf = mafMayOpen(mafFileName); verbose(1, "processing %s\n", ix->name); if (mf == NULL) { warn("%s doesn't exist", mafFileName); continue; } struct mafAli *maf; while ((maf = mafNext(mf)) != NULL) { struct mafComp *mc; for (mc = maf->components; mc != NULL; mc = mc->next) { if (mc->text != NULL) toUpperN(mc->text, maf->textSize); } addCounts(maf, speciesHash, &speciesList); mafAliFree(&maf); } mafFileFree(&mf); } slReverse(&speciesList); for (species = speciesList; species != NULL; species = species->next) { fprintf(f, "counts: %s\t%ld\t%ld\t%ld\n", species->name, species->covCount, species->aliCount, species->idCount); fprintf(f, "precents: %s\t%4.2f%%\t%4.2f%%\t%4.2f%%\n", species->name, 100.0 * species->covCount/genomeSize, 100.0 * species->aliCount/genomeSize, 100.0 * species->idCount/species->aliCount); } carefulClose(&f); }
void doFetch(char *inputFileName, char *sequenceFileName, char *outputFileName) /* lookup sequence for each line */ { struct lineFile *lf = NULL; char *line; char *row[6]; int elementCount; struct twoBitFile *tbf; char *fileChrom = NULL; int start = 0; int end = 0; char *name = NULL; int score = 0; char *strand = NULL; struct dnaSeq *chunk = NULL; FILE *outputFileHandle = mustOpen(outputFileName, "w"); tbf = twoBitOpen(sequenceFileName); lf = lineFileOpen(inputFileName, TRUE); while (lineFileNext(lf, &line, NULL)) { elementCount = chopString(line, "\t", row, ArraySize(row)); if (elementCount != 6) continue; fileChrom = cloneString(row[0]); start = sqlUnsigned(row[1]); end = sqlUnsigned(row[2]); name = cloneString(row[3]); score = sqlUnsigned(row[4]); strand = cloneString(row[5]); if (start == end) continue; assert (end > start); chunk = twoBitReadSeqFrag(tbf, fileChrom, start, end); touppers(chunk->dna); if (sameString(strand, "-")) reverseComplement(chunk->dna, chunk->size); fprintf(outputFileHandle, "%s\t%d\t%d\t%s\t%d\t%s\t%s\n", fileChrom, start, end, name, score, strand, chunk->dna); dnaSeqFree(&chunk); } lineFileClose(&lf); carefulClose(&outputFileHandle); }
struct twoBitFile *twoBitOpenCached(char *path)
/* Return open two bit file associated with path.
 * Handles are remembered in a function-local static hash keyed by path, so
 * a second call with the same path returns the handle opened by the first. */
{
    static struct hash *openFiles = NULL;
    struct twoBitFile *handle;

    if (openFiles == NULL)
        openFiles = newHash(8);

    handle = hashFindVal(openFiles, path);
    if (handle != NULL)
        return handle;

    handle = twoBitOpen(path);
    hashAdd(openFiles, path, handle);
    return handle;
}
void addTwoBit(char *file, struct hash *fileHash, struct hash *seqHash) /* Add a nib file to hashes. */ { struct seqFilePos *sfp; struct twoBitFile *tbf = twoBitOpen(file); struct twoBitIndex *index; for (index = tbf->indexList; index != NULL; index = index->next) { AllocVar(sfp); hashAddSaveName(seqHash, index->name, sfp, &sfp->name); sfp->file = hashStoreName(fileHash, file); sfp->isTwoBit = TRUE; sfp->tbf = tbf; } }
struct cachedSeqFile *openTwoBitFromCache(struct dlList *cache, char *fileName)
/* Return open file handle via cache.  In this case it's just a cache of one.
 * If the cache already holds an entry, its head entry is returned without
 * looking at fileName; otherwise fileName is opened and becomes that entry. */
{
    struct cachedSeqFile *entry;

    if (!dlEmpty(cache))
        {
        entry = cache->head->val;
        return entry;
        }

    AllocVar(entry);
    entry->fileName = cloneString(fileName);
    entry->tbf = twoBitOpen(fileName);
    dlAddValHead(cache, entry);
    return entry;
}
void twoBitInfo(char *inName, char *outName) /* twoBitInfo - get information about sequences in a .2bit file. */ { struct twoBitFile *tbf; FILE *outFile; char *seqName = NULL; twoBitParseRange(inName, &inName, &seqName, NULL, NULL); tbf = twoBitOpen(inName); outFile = mustOpen(outName, "w"); if (seqName != NULL) { char *seqArray[1023]; int i; int seqCount = chopString(seqName, ",", seqArray, ArraySize(seqArray)); for (i = 0 ; i < seqCount ; i++) { if (optionExists("maskBed")) twoBitOutMaskBeds(tbf, seqArray[i], outFile); else if (optionExists("nBed")) twoBitOutNBeds(tbf, seqArray[i], outFile); else if(optionExists("noNs")) fprintf(outFile, "%s\t%d\n", seqArray[i], twoBitSeqSizeNoNs(tbf, seqArray[i])); else fprintf(outFile, "%s\t%d\n", seqArray[i], twoBitSeqSize(tbf, seqArray[i])); } } else { struct twoBitIndex *index; for (index = tbf->indexList; index != NULL; index = index->next) { if (optionExists("maskBed")) twoBitOutMaskBeds(tbf, index->name, outFile); else if (optionExists("nBed")) twoBitOutNBeds(tbf, index->name, outFile); else if(optionExists("noNs")) fprintf(outFile, "%s\t%d\n", index->name, twoBitSeqSizeNoNs(tbf, index->name)); else fprintf(outFile, "%s\t%d\n", index->name, twoBitSeqSize(tbf, index->name)); } } twoBitClose(&tbf); carefulClose(&outFile); }
void edwFixTargetSeq(char *when) /* edwFixTargetSeq - Fill in new fields about target seq to edwBamFile and edwAssembly.. */ { struct sqlConnection *conn = edwConnectReadWrite(); struct edwAssembly *as, *asList = edwAssemblyLoadByQuery(conn, "select * from edwAssembly"); char query[512]; for (as = asList; as != NULL; as = as->next) { char *twoBitFileName = edwPathForFileId(conn, as->twoBitId); struct twoBitFile *tbf = twoBitOpen(twoBitFileName); safef(query, sizeof(query), "update edwAssembly set seqCount=%u where id=%u", tbf->seqCount, as->id); sqlUpdate(conn, query); freez(&twoBitFileName); twoBitClose(&tbf); } edwAssemblyFreeList(&asList); struct edwBamFile *bam, *bamList = edwBamFileLoadByQuery(conn, "select * from edwBamFile"); for (bam = bamList; bam != NULL; bam = bam->next) { char *fileName = edwPathForFileId(conn, bam->fileId); samfile_t *sf = samopen(fileName, "rb", NULL); if (sf == NULL) errnoAbort("Couldn't open %s.\n", fileName); bam_header_t *head = sf->header; if (head == NULL) errAbort("Aborting ... Bad BAM header in file: %s", fileName); /* Sum up some target sizes. */ long long targetBaseCount = 0; /* Total size of all bases in target seq */ int i; for (i=0; i<head->n_targets; ++i) targetBaseCount += head->target_len[i]; safef(query, sizeof(query), "update edwBamFile set targetBaseCount=%lld,targetSeqCount=%u where id=%u", targetBaseCount, (unsigned)head->n_targets, bam->id); sqlUpdate(conn, query); samclose(sf); freez(&fileName); } }
void twoBitToFa(char *inName, char *outName) /* twoBitToFa - Convert all or part of twoBit file to fasta. */ { struct twoBitFile *tbf; FILE *outFile = mustOpen(outName, "w"); struct twoBitSpec *tbs; if (clSeq != NULL) { char seqSpec[2*PATH_LEN]; if (clEnd > clStart) safef(seqSpec, sizeof(seqSpec), "%s:%s:%d-%d", inName, clSeq, clStart, clEnd); else safef(seqSpec, sizeof(seqSpec), "%s:%s", inName, clSeq); tbs = twoBitSpecNew(seqSpec); } else if (clSeqList != NULL) tbs = twoBitSpecNewFile(inName, clSeqList); else tbs = twoBitSpecNew(inName); if (tbs == NULL) errAbort("%s is not a twoBit file", inName); if (tbs->seqs != NULL && clBpt != NULL) tbf = twoBitOpenExternalBptIndex(tbs->fileName, clBpt); else tbf = twoBitOpen(tbs->fileName); if (clBed != NULL) { processSeqsFromBed(tbf, clBed, outFile); } else { if (tbs->seqs == NULL) processAllSeqs(tbf, outFile); else processSeqSpecs(tbf, tbs->seqs, outFile); } twoBitSpecFree(&tbs); carefulClose(&outFile); twoBitClose(&tbf); }
struct dnaSeq *nibTwoLoadOne(char *pathName, char *seqName)
/* Return sequence from a directory full of nibs or a .2bit file.
 * The sequence will have repeats in lower case.
 * If pathName is a 2bit file, it is opened, seqName is read from it, and it
 * is closed again.  Otherwise pathName/seqName.nib is loaded. */
{
    char nibPath[512];

    if (twoBitIsFile(pathName))
        {
        struct twoBitFile *tbf = twoBitOpen(pathName);
        struct dnaSeq *fromTwoBit = twoBitReadSeqFrag(tbf, seqName, 0, 0);
        twoBitClose(&tbf);
        return fromTwoBit;
        }

    safef(nibPath, sizeof(nibPath), "%s/%s.nib", pathName, seqName);
    return nibLoadAllMasked(NIB_MASK_MIXED, nibPath);
}
void addTwoBit(char *file, struct hash *fileHash, struct hash *seqHash) /* Add a 2bit file to hashes. */ { struct twoBitFile *lf = twoBitOpen(file); char *rFile = hashStoreName(fileHash, file); struct slName *names = twoBitSeqNames(file); struct slName *name; for(name = names;name;name = name->next) { struct seqFilePos *sfp; AllocVar(sfp); hashAddSaveName(seqHash, name->name, sfp, &sfp->name); sfp->file = rFile; sfp->isTwoBit = TRUE; sfp->pos = 0; } slFreeList(&names); twoBitClose(&lf); }
void seqFromPsl(char *inPsl, char *inTwoBit, char *outFa) /* seqFromPsl - Extract masked sequence from database corresponding to psl file. */ { struct twoBitFile *tbf = twoBitOpen(inTwoBit); struct lineFile *lf = pslFileOpen(inPsl); FILE *f = mustOpen(outFa, "w"); struct psl *psl; while ((psl = pslNext(lf)) != NULL) { char faHead[512]; struct dnaSeq *seq = twoBitReadSeqFrag(tbf, psl->tName, psl->tStart, psl->tEnd); if (psl->strand[0] == '-') reverseComplement(seq->dna, seq->size); safef(faHead, sizeof(faHead), "%s (%s:%d-%d)", psl->qName, psl->tName, psl->tStart+1, psl->tEnd); if (hardMask) lowerToN(seq->dna, seq->size); faWriteNext(f, faHead, seq->dna, seq->size); } carefulClose(&f); }
struct dnaLoadStack *dnaLoadStackNew(char *fileName)
/* Create new dnaLoadStack on composite file.
 * A 2bit file is opened and tbi is pointed at the start of its index list.
 * Anything else is opened as a text file; textIsFa is set when the first
 * line returned by lineFileNextReal(), after trimming, starts with '>'.
 * That line is pushed back so the next read sees it again. */
{
    struct dnaLoadStack *stack;

    AllocVar(stack);
    if (!twoBitIsFile(fileName))
        {
        char *firstLine;

        stack->textFile = lineFileOpen(fileName, TRUE);
        if (lineFileNextReal(stack->textFile, &firstLine))
            {
            firstLine = trimSpaces(firstLine);
            if (firstLine[0] == '>')
                stack->textIsFa = TRUE;
            lineFileReuse(stack->textFile);
            }
        return stack;
        }

    stack->twoBit = twoBitOpen(fileName);
    stack->tbi = stack->twoBit->indexList;
    return stack;
}
void searchOneIndex(int fileCount, char *files[], struct genoFind *gf, char *outName, boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
/* Search all sequences in all files against single genoFind index.
 * Each entry of files[] is handled by kind:
 *   nib file   - loaded whole and searched under the file name; aborts if
 *                isProt is set.
 *   2bit spec  - either the listed sequences/ranges or every sequence in the
 *                file's index is searched; aborts if isProt is set.
 *   otherwise  - read record by record with faMixedSpeedReadNext().
 * outFile is closed before returning.  With showStatus a summary of the
 * bases and sequences counted by searchOneMaskTrim() is printed to stdout.
 * The outName parameter is not used here. */
{
    int fileIx;
    int seqTotal = 0;
    long long baseTotal = 0;

    gfOutputHead(gvo, outFile);

    for (fileIx = 0; fileIx < fileCount; ++fileIx)
        {
        char *path = files[fileIx];

        if (nibIsFile(path))
            {
            struct dnaSeq *nibSeq;

            if (isProt)
                errAbort("%s: Can't use .nib files with -prot or d=prot option\n", path);
            nibSeq = nibLoadAllMasked(NIB_MASK_MIXED, path);
            freez(&nibSeq->name);
            nibSeq->name = cloneString(path);
            searchOneMaskTrim(nibSeq, isProt, gf, outFile, maskHash, &baseTotal, &seqTotal);
            freeDnaSeq(&nibSeq);
            }
        else if (twoBitIsSpec(path))
            {
            struct twoBitSpec *spec = twoBitSpecNew(path);
            struct twoBitFile *tbf = twoBitOpen(spec->fileName);

            if (isProt)
                errAbort("%s is a two bit file, which doesn't work for proteins.", path);

            if (spec->seqs == NULL)
                {
                /* No sequences named: search the whole file. */
                struct twoBitIndex *entry = tbf->indexList;
                while (entry != NULL)
                    {
                    struct dnaSeq *whole = twoBitReadSeqFrag(tbf, entry->name, 0, 0);
                    searchOneMaskTrim(whole, isProt, gf, outFile, maskHash, &baseTotal, &seqTotal);
                    dnaSeqFree(&whole);
                    entry = entry->next;
                    }
                }
            else
                {
                struct twoBitSeqSpec *part = spec->seqs;
                while (part != NULL)
                    {
                    struct dnaSeq *frag = twoBitReadSeqFrag(tbf, part->name, part->start, part->end);
                    searchOneMaskTrim(frag, isProt, gf, outFile, maskHash, &baseTotal, &seqTotal);
                    dnaSeqFree(&frag);
                    part = part->next;
                    }
                }
            twoBitClose(&tbf);
            }
        else
            {
            /* Kept static exactly as before; the reader fills in its
             * dna, size and name fields for each record. */
            static struct dnaSeq seq;
            struct lineFile *lf = lineFileOpen(path, TRUE);

            while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
                searchOneMaskTrim(&seq, isProt, gf, outFile, maskHash, &baseTotal, &seqTotal);
            lineFileClose(&lf);
            }
        }

    carefulClose(&outFile);
    if (showStatus)
        printf("Searched %lld bases in %d sequences\n", baseTotal, seqTotal);
}