struct hash *loadGeno(char *genoFile) /* load genome sequences into a hash. This supports the multi-sequence * specs of twoBitLoadAll */ { struct dnaSeq *genos = NULL, *geno; struct hash *genoHash = hashNew(0); if (nibIsFile(genoFile)) genos = nibLoadAllMasked(NIB_MASK_MIXED|NIB_BASE_NAME, genoFile); else if (twoBitIsSpec(genoFile)) genos = twoBitLoadAll(genoFile); else genos = faReadDna(genoFile); while ((geno = slPopHead(&genos)) != NULL) { tolowers(geno->dna); hashAdd(genoHash, geno->name, geno); } return genoHash; }
void twoBitMask(char *inName, char *maskName, char *outName) /* twoBitMask - apply masking to a .2bit file, creating a new .2bit file. */ { struct hash *tbHash = hashNew(20); struct hash *bitmapHash = hashNew(20); struct twoBit *twoBitList = NULL; struct twoBit *twoBit = NULL; FILE *f = NULL; if (! twoBitIsFile(inName)) { if (twoBitIsSpec(inName)) errAbort("Sorry, this works only on whole .2bit files, not specs."); else errAbort("Input %s does not look like a proper .2bit file.", inName); } twoBitList = slurpInput(inName, tbHash, bitmapHash); /* Read mask data into bitmapHash, store it in twoBits: */ if ((type && endsWith(type, "bed")) || endsWith(maskName, ".bed")) maskWithBed(maskName, tbHash, bitmapHash); else if ((type && endsWith(type, "out")) || endsWith(maskName, ".out")) maskWithOut(maskName, tbHash, bitmapHash); else errAbort("Sorry, maskFile must end in \".bed\" or \".out\"."); /* Create a new .2bit file, write it out from twoBits. */ f = mustOpen(outName, "wb"); twoBitWriteHeader(twoBitList, f); for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next) { twoBitWriteOne(twoBit, f); } carefulClose(&f); /* Don't bother freeing twoBitList and hashes here -- just exit. */ }
void searchOneIndex(int fileCount, char *files[], struct genoFind *gf, char *outName,
	boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
/* Search all sequences in all files against single genoFind index.
 *   fileCount, files - the query files; each may be a .nib file, a .2bit
 *                      file/spec, or anything else, which is read with
 *                      faMixedSpeedReadNext().
 *   gf               - index passed through to searchOneMaskTrim().
 *   outName          - not used by this function.
 *   isProt           - protein mode; .nib and .2bit inputs abort in this mode.
 *   maskHash         - passed through to searchOneMaskTrim().
 *   outFile          - receives the output header and the search results;
 *                      it is closed with carefulClose() before returning.
 *   showStatus       - when set, print a one-line summary to stdout.
 * The output header is written using the `gvo` variable declared outside
 * this function. */
{
int i;
int count = 0;
long long totalSize = 0;

gfOutputHead(gvo, outFile);
for (i = 0; i < fileCount; ++i)
    {
    char *fileName = files[i];

    if (nibIsFile(fileName))
        {
        struct dnaSeq *seq;

        if (isProt)
            errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName);
        seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
        /* Replace the loaded name with the file name as given by the caller. */
        freez(&seq->name);
        seq->name = cloneString(fileName);
        searchOneMaskTrim(seq, isProt, gf, outFile, maskHash, &totalSize, &count);
        freeDnaSeq(&seq);
        }
    else if (twoBitIsSpec(fileName))
        {
        struct twoBitSpec *tbs = NULL;
        struct twoBitFile *tbf = NULL;

        /* Reject protein mode before parsing the spec or opening the file,
         * so nothing is left allocated or open when we abort. */
        if (isProt)
            errAbort("%s is a two bit file, which doesn't work for proteins.", fileName);
        tbs = twoBitSpecNew(fileName);
        tbf = twoBitOpen(tbs->fileName);
        if (tbs->seqs != NULL)
            {
            /* The spec names particular sequences (with ranges): search
             * just those. */
            struct twoBitSeqSpec *ss = NULL;
            for (ss = tbs->seqs; ss != NULL; ss = ss->next)
                {
                struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name, ss->start, ss->end);
                searchOneMaskTrim(seq, isProt, gf, outFile, maskHash, &totalSize, &count);
                dnaSeqFree(&seq);
                }
            }
        else
            {
            /* No sequences named in the spec: walk every entry in the
             * file's index. */
            struct twoBitIndex *index = NULL;
            for (index = tbf->indexList; index != NULL; index = index->next)
                {
                struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0);
                searchOneMaskTrim(seq, isProt, gf, outFile, maskHash, &totalSize, &count);
                dnaSeqFree(&seq);
                }
            }
        twoBitClose(&tbf);
        /* The parsed spec was previously never released, leaking once per
         * .2bit input. */
        twoBitSpecFree(&tbs);
        }
    else
        {
        /* A single static record is refilled by the reader for every
         * sequence in the file. */
        static struct dnaSeq seq;
        struct lineFile *lf = lineFileOpen(fileName, TRUE);
        while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
            {
            searchOneMaskTrim(&seq, isProt, gf, outFile, maskHash, &totalSize, &count);
            }
        lineFileClose(&lf);
        }
    }
carefulClose(&outFile);
/* totalSize and count were handed by address to every searchOneMaskTrim()
 * call above; report whatever they hold now. */
if (showStatus)
    printf("Searched %lld bases in %d sequences\n", totalSize, count);
}