struct twoBit *slurpInput(char *inName, struct hash *tbHash, struct hash *bitmapHash)
/* Read .2bit file inName into memory and return list of twoBit items.
 * Populate tbHash with twoBit items, and bitmapHash with bitmaps for
 * easy masking.  Both are hashed by twoBit sequence name.
 * Caller owns the returned list; the bitmaps live in bitmapHash. */
{
struct twoBit *twoBitList = NULL;
struct twoBit *twoBit = NULL;
twoBitList = twoBitFromFile(inName);
/* Free and clear the masking data (unless -add). Hash twoBits by name. */
for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next)
    {
    /* One bit per base, all clear unless we carry over old masking below. */
    Bits *bits = bitAlloc(twoBit->size);
    if (add)   /* 'add' is presumably the file-scope -add option flag -- confirm. */
        {
        /* Store the currently masked bits: */
        int i;
        for (i = 0; i < twoBit->maskBlockCount; i++)
            {
            bitSetRange(bits, twoBit->maskStarts[i], twoBit->maskSizes[i]);
            }
        }
    /* Free the current representation of masking -- it will be replaced. */
    twoBit->maskBlockCount = 0;
    freez(&(twoBit->maskStarts));
    freez(&(twoBit->maskSizes));
    /* Hash twoBit and our new bitmap by sequence name.
     * hashAddUnique aborts on a duplicate sequence name. */
    hashAddUnique(tbHash, twoBit->name, twoBit);
    hashAddUnique(bitmapHash, twoBit->name, bits);
    }
return twoBitList;
}
struct hash *hashPsls(char *pslFileName) { struct psl *pslList = NULL, *psl = NULL, *pslSubList = NULL, *pslNext = NULL; struct hash *pslHash = newHash(15); char *last = NULL; char key[128]; char *tmp = NULL; pslList = pslLoadAll(pslFileName); /* Fix psl names */ for(psl = pslList; psl != NULL; psl = psl->next) { tmp = strrchr(psl->qName, ';'); *tmp = '\0'; tmp = strstr(psl->qName,prefix); assert(tmp); /* checks if there are 2 occurrences of ":" in probe name as in full name */ /* if probe name is shortened to fit in the seq table, there is only 1 ":"*/ /* e.g. full: consensus:HG-U133A:212933_x_at; short:HG-U133A:212933_x_at;*/ if (countChars(psl->qName, *prefix) == 2) { tmp = strstr(tmp+1,prefix); assert(tmp); } tmp = tmp + strlen(prefix); safef(psl->qName, strlen(psl->qName), "%s", tmp); } /* Sort based on query name. */ slSort(&pslList, pslCmpQuery); /* For each psl, if it is has the same query name add it to the sublist. Otherwise store the sublist in the hash and start another. */ for(psl = pslList; psl != NULL; psl = pslNext) { pslNext = psl->next; if(last != NULL && differentWord(last, psl->qName)) { hashAddUnique(pslHash, last, pslSubList); pslSubList = NULL; } slAddTail(&pslSubList, psl); last = psl->qName; } /* Add the last sublist */ hashAddUnique(pslHash, last, pslSubList); return pslHash; }
struct ntContig *readNtFile(char *fileName, struct hash *ntContigHash, struct hash *ntCloneHash)
/* Read in NT contig info. (NT contigs are contigs of finished clones.)
 * Each line: contigName cloneName pos orientation size.
 * Fills ntContigHash (keyed by contig name) and ntCloneHash (keyed by
 * clone name); returns the list of contigs in file order. */
{
struct lineFile *lf;
int lineSize, wordCount;
char *line, *words[8];
struct ntContig *contigList = NULL, *contig = NULL;
struct ntClonePos *pos;
char *contigName;
struct hashEl *hel;

/* Parse file into ntContig/ntClonePos data structures. */
lf = lineFileOpen(fileName, TRUE);
while (lineFileNext(lf, &line, &lineSize))
    {
    wordCount = chopLine(line, words);
    if (wordCount == 0)    /* Skip blank lines. */
        continue;
    if (wordCount != 5)
        errAbort("Expecting 5 words line %d of %s", lf->lineIx, lf->fileName);
    contigName = words[0];
    /* Lines for a contig are assumed adjacent: start a new contig record
     * whenever the name changes from the previous line. */
    if (contig == NULL || !sameString(contigName, contig->name))
        {
        AllocVar(contig);
        /* hashAddUnique aborts on duplicate contig name.  contig->name is
         * pointed at the hash's own copy of the key, so it outlives 'line'. */
        hel = hashAddUnique(ntContigHash, contigName, contig);
        contig->name = hel->name;
        slAddHead(&contigList, contig);
        }
    AllocVar(pos);
    /* Same key-ownership idiom for clone names. */
    hel = hashAddUnique(ntCloneHash, words[1], pos);
    pos->name = hel->name;
    pos->ntContig = contig;
    pos->pos = atoi(words[2]);
    pos->orientation = ((words[3][0] == '-') ? -1 : 1);
    pos->size = atoi(words[4]);
    slAddHead(&contig->cloneList, pos);
    }
lineFileClose(&lf);

/* Make sure everything is nicely sorted and sized. */
for (contig = contigList; contig != NULL; contig = contig->next)
    {
    slSort(&contig->cloneList, cmpNtClonePos);
    /* Contig size = end coordinate of the last clone after sorting. */
    pos = slLastEl(contig->cloneList);
    contig->size = pos->pos + pos->size;
    }
slReverse(&contigList);
return contigList;
}
struct hash *allChainsHash(char *fileName)
/* Hash all the chains in a given file by their ids.  Values are
 * indexedChain wrappers whose blockTree gives fast range lookup of
 * blocks by target coordinate.  NOTE: keys are chain ids formatted in
 * HEX ("%x"); lookups must use the same format. */
{
struct hash *chainHash = newHash(18);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct chain *chain;
char chainId[20];   /* Ample for a 32-bit id in hex. */
/* Allocate index structures from the hash's own local memory pool so
 * they are all freed together with the hash. */
struct lm *lm = chainHash->lm;
struct rbTreeNode **stack;
lmAllocArray(lm, stack, 128);
while ((chain = chainRead(lf)) != NULL)
    {
    struct indexedChain *ixc;
    lmAllocVar(lm, ixc);
    ixc->chain = chain;
#ifdef SOON
#endif /* SOON */
    /* All trees share the one preallocated traversal stack. */
    ixc->blockTree = rangeTreeNewDetailed(lm, stack);
    struct cBlock *block;
    for (block = chain->blockList; block != NULL; block = block->next)
        {
        /* Index each block by its target range. */
        struct range *r = rangeTreeAdd(ixc->blockTree, block->tStart, block->tEnd);
        r->val = block;
        }
    safef(chainId, sizeof(chainId), "%x", chain->id);
    hashAddUnique(chainHash, chainId, ixc);
    }
lineFileClose(&lf);
return chainHash;
}
void wordStoreLoadMonomerOrder(struct wordStore *store, char *readsFile, char *fileName)
/* Read in a file with one line for each monomer type, containing a word for each
 * monomer variant. Requires all variants already be in store. The readsFile is passed
 * just for nicer error reporting.
 * Fills store->typeList (file order) and store->typeHash (word -> type). */
{
/* Stuff for processing file a line at a time. */
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line, *word;

/* Set up variables we'll put results in in store. */
store->typeHash = hashNew(0);
store->typeList = NULL;

while (lineFileNextReal(lf, &line))
    {
    /* Each non-blank, non-comment line defines one monomer type. */
    struct wordType *type;
    AllocVar(type);
    slAddHead(&store->typeList, type);
    while ((word = nextWord(&line)) != NULL)
        {
        /* Every word on the line must already be a known variant. */
        struct wordInfo *info = hashFindVal(store->infoHash, word);
        if (info == NULL)
            errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
        struct wordInfoRef *ref;
        AllocVar(ref);
        ref->val = info;
        slAddHead(&type->list, ref);
        /* A word may belong to only one type -- aborts on duplicates. */
        hashAddUnique(store->typeHash, word, type);
        }
    }
slReverse(&store->typeList);
lineFileClose(&lf);
verbose(2, "Added %d types containing %d words from %s\n", slCount(store->typeList), store->typeHash->elCount, fileName);
}
void fillInBioHash(char *fileName, struct hash *bioHash)
/* Fill in the bioHash with key/value pairs from file.
 * File is boulder IO format: one "key=value" per line; a line whose
 * first character is '=' ends the record.  Values are cloned, so the
 * hash owns them. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line = NULL;
while(lineFileNextReal(lf, &line))
    {
    char *key = NULL;
    char *val = NULL;
    char *mark = NULL;
    mark = strchr(line, '=');
    if(mark == NULL) /* Error: not in boulder IO format. */
        /* errAbort is printf-style: the old call passed two comma-separated
         * string literals, so the real message and its %s args were silently
         * ignored.  Use a single format string. */
        errAbort("pickCassettePcrPrimers::fillInBioHash() - Couldn't find '=' in line %s. File %s doesn't appear to be in boulderIO format.",
                 line, fileName);
    if(mark == line) /* First character is '=' means end of record. */
        break;
    key = line;
    val = mark+1;
    *mark = '\0';   /* Split "key=value" in place. */
    hashAddUnique(bioHash, key, cloneString(val));
    }
lineFileClose(&lf);
}
struct trans3 *seqListToTrans3List(struct dnaSeq *seqList, aaSeq *transLists[3], struct hash **retHash)
/* Convert sequence list to a trans3 list and lists for each of three frames.
 * Returns the trans3 list; *retHash maps sequence name -> trans3. */
{
struct trans3 *resultList = NULL;
struct hash *t3Hash = newHash(0);
struct dnaSeq *dna;
int f;

for (dna = seqList; dna != NULL; dna = dna->next)
    {
    struct trans3 *t3 = trans3New(dna);
    hashAddUnique(t3Hash, t3->name, t3);
    slAddHead(&resultList, t3);
    /* Distribute the three frame translations into the per-frame lists. */
    for (f = 0; f < 3; ++f)
        slAddHead(&transLists[f], t3->trans[f]);
    }
/* Restore input order everywhere (slAddHead reversed it). */
slReverse(&resultList);
for (f = 0; f < 3; ++f)
    slReverse(&transLists[f]);
*retHash = t3Hash;
return resultList;
}
struct groupSizeInfo *readSizes(char *fileName, struct hash *gsiHash) /* Read in file of format: * groupName guessedMin guessedMax * and save in hash and as list. */ { struct groupSizeInfo *gsiList = NULL, *gsi; struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; char *words[8]; struct hashEl *hel; while ((wordCount = lineFileChop(lf, words)) != 0) { lineFileExpectWords(lf, 3, wordCount); AllocVar(gsi); hel = hashAddUnique(gsiHash, words[0], gsi); gsi->name = hel->name; gsi->guessedMin = atoi(words[1]); gsi->guessedMax = atoi(words[2]); slAddHead(&gsiList, gsi); } lineFileClose(&lf); slReverse(&gsiList); return gsiList; }
struct hash *createBedHash(struct bed *bedList) /** takes a list of beds and puts them in a hash with duplicates numbered as name_1, name_2 */ { struct hash *bedHash = newHash(5); struct bed *bed = NULL; struct dyString *ds = newDyString(1024); char *name = NULL; for(bed = bedList; bed != NULL; bed = bed->next) { int count = 0; char *targetName = NULL; struct bed *tmp = NULL; dyStringClear(ds); dyStringPrintf(ds,"%s_%d", bed->name, count); /* since we may have duplications, look for an empty slot in the hash */ while(TRUE && (count < 1000)) { tmp = hashFindVal(bedHash, ds->string); if(tmp == NULL) { hashAddUnique(bedHash, ds->string, bed); break; } else { dyStringClear(ds); dyStringPrintf(ds, "%s_%d", bed->name, ++count); } } } return bedHash; }
void loadAoHash(struct hash *aoHash, struct affyOffset *aoList)
/* Put the aoList into the hash, keyed by piece name.
 * Aborts (via hashAddUnique) on duplicate piece names. */
{
struct affyOffset *offset;
for (offset = aoList; offset != NULL; offset = offset->next)
    hashAddUnique(aoHash, offset->piece, offset);
}
static void rHashMetaList(struct hash *hash, struct meta *list)
/* Add list, and any children of list, to hash keyed by meta name.
 * Recurses depth-first into children. */
{
struct meta *m = list;
while (m != NULL)
    {
    hashAddUnique(hash, m->name, m);
    if (m->children != NULL)
        rHashMetaList(hash, m->children);
    m = m->next;
    }
}
struct hash *dnaSeqHash(struct dnaSeq *seqList)
/* Return hash of sequences keyed by name.  Hash is sized to roughly
 * match the list length. */
{
int elCount = slCount(seqList) + 1;
struct hash *seqHash = hashNew(digitsBaseTwo(elCount));
struct dnaSeq *s = seqList;
while (s != NULL)
    {
    hashAddUnique(seqHash, s->name, s);
    s = s->next;
    }
return seqHash;
}
void loadTagHash(struct hash *h, struct sageCounts *scList)
/* Hash each sageCounts record by its tag, printing a progress tic
 * every 10000 entries.  Aborts on duplicate tags. */
{
struct sageCounts *counts;
int done = 0;
for (counts = scList; counts != NULL; counts = counts->next)
    {
    if (done % 10000 == 0)
        putTic();
    done++;
    hashAddUnique(h, counts->tag, counts);
    }
printf("\tDone.\n");
}
void checkDupe(char *fileName)
/* checkDupe - Check for dupes in HUGO names.  Aborts (via hashAddUnique)
 * on the first duplicate in column 7 of rows whose column 4 is "hugo". */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[7];
struct hash *uniqHash = newHash(0);
while (lineFileRow(lf, words))
    {
    if (sameString(words[3], "hugo"))
        {
        hashAddUnique(uniqHash, words[6], NULL);
        }
    }
lineFileClose(&lf);   /* Was leaked before. */
freeHash(&uniqHash);  /* Values are NULL, so a plain freeHash suffices. */
}
struct hash *allChainsHash(char *fileName)
/* Create a hash of all the chains in a file by their id.
 * Keys are the decimal id strings; aborts on duplicate ids. */
{
struct hash *byId = newHash(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char key[128];
struct chain *chain;

for (;;)
    {
    chain = chainRead(lf);
    if (chain == NULL)
        break;
    safef(key, sizeof(key), "%d", chain->id);
    hashAddUnique(byId, key, chain);
    }
lineFileClose(&lf);
return byId;
}
struct sangRange *readRanges(char *fileName, struct hash *hash)
/* Read range file into list/hash.  Returns list in file order; hash is
 * keyed by range name and aborts on duplicates. */
{
struct sangRange *rangeList = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[3];

printf("Reading %s\n", fileName);
while (lineFileNextRow(lf, row, 3))
    {
    struct sangRange *range = sangRangeLoad(row);
    slAddHead(&rangeList, range);
    hashAddUnique(hash, range->name, range);
    }
lineFileClose(&lf);
slReverse(&rangeList);
return rangeList;
}
struct hash *hashNmerFile(char *file)
/* Load nmer alignments from a 6-column tab-separated file into a hash of
 * lists keyed by "<seq>-<name>".  The first nmer seen for a key becomes
 * the stored list head; later ones are appended at the tail. */
{
struct lineFile *lf = lineFileOpen(file, TRUE);
struct hash *nmerHash = newHash(15);
struct nmerAlign *nmerList = NULL, *nmer;
char key[256];
char *words[6];
while(lineFileNextRowTab(lf, words, 6))
    {
    nmer = parseNmerAlignRow(words);
    snprintf(key, sizeof(key), "%s-%s", nmer->seq, nmer->name);
    nmerList = hashFindVal(nmerHash, key);
    if(nmerList == NULL)
        hashAddUnique(nmerHash, key, nmer);
    else
        /* slAddTail walks from the head we fetched and appends at the end,
         * so the head pointer stored in the hash stays valid. */
        slAddTail(&nmerList, nmer);
    }
lineFileClose(&lf);
return nmerHash;
}
struct chromInfo *readChroms(struct hash *chromHash, struct sqlConnection *conn) /* Return chromosomes in list/hash. */ { struct chromInfo *chrom, *chromList = NULL; char query[512]; char **row; struct sqlResult *sr; sqlSafef(query, sizeof query, "select * from chromInfo"); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { chrom = chromInfoLoad(row); hashAddUnique(chromHash, chrom->chrom, chrom); slAddHead(&chromList, chrom); } sqlFreeResult(&sr); slReverse(&chromList); return chromList; }
struct clone *readCloneList(char *fileName, struct hash *cloneHash)
/* Read clone list from sequence.inf file and save it in list/hash.
 * Clone name is column 1 with its version suffix chopped off;
 * size and phase come from columns 3 and 4 (0-based 2 and 3). */
{
struct clone *cloneList = NULL, *clone;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int wordCount;
char *words[8];
struct hashEl *hel;

while (lineFileRow(lf, words))
    {
    AllocVar(clone);
    chopSuffix(words[0]);   /* Strip ".version" from the accession. */
    /* Aborts on duplicate clone; clone->name points at the hash's copy
     * of the key so it outlives the line buffer. */
    hel = hashAddUnique(cloneHash, words[0], clone);
    clone->name = hel->name;
    clone->size = lineFileNeedNum(lf, words, 2);
    clone->phase = lineFileNeedNum(lf, words, 3);
    slAddHead(&cloneList, clone);
    }
lineFileClose(&lf);
slReverse(&cloneList);
return cloneList;
}
struct sangPair *readPairs(char *fileName, struct hash *pairHash, struct hash *rangeHash)
/* Read in pair file and connect pairs to relevant range.
 * Each line is a sangInsert (id, name); the name must already be in
 * rangeHash or hashMustFindVal aborts. */
{
struct sangPair *list = NULL, *el;
struct hashEl *hel;
struct sangInsert si;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[2];
int wordCount;

printf("Reading %s\n", fileName);
while (lineFileNextRow(lf, words, 2))
    {
    /* si just borrows the words[] memory for this iteration. */
    sangInsertStaticLoad(words, &si);
    AllocVar(el);
    /* Aborts on duplicate id; el->name points at the hash's own copy. */
    hel = hashAddUnique(pairHash, si.id, el);
    el->name = hel->name;
    el->range = hashMustFindVal(rangeHash, si.name);
    slAddHead(&list, el);
    }
slReverse(&list);
lineFileClose(&lf);
return list;
}
void createBeds(struct hash *bedHash, struct hash *pslHash, char *file, int numExps)
/* Load stanMad records from file, find each clone's psl by clone id,
 * and add a bed keyed "clid-prow-pcol" to bedHash, with room allocated
 * for numExps experiment ids/scores.  Records with no psl are skipped. */
{
struct stanMad *smList=NULL, *sm=NULL;
struct psl *psl = NULL;
struct bed *bed = NULL;
char buff[256];
warn("File is %s", file);
smList = stanMadLoadAll(file);
for(sm=smList; sm != NULL; sm = sm->next)
    {
    /* buff first holds the clone id for the psl lookup... */
    sprintf(buff, "%d", sm->clid);
    psl = hashFindVal(pslHash, buff);
    if(psl != NULL)
        {
        /* ...then is reused for the composite hash key.
         * NOTE(review): formats prow with %s and pcol with %d -- assumes
         * prow is a string and pcol an int in stanMad; confirm field types. */
        snprintf(buff,sizeof(buff), "%d-%s-%d", sm->clid, sm->prow, sm->pcol);
        bed = pslToBed(psl);
        bed->expCount = numExps;
        bed->expIds = needMem(sizeof(int) * numExps);
        bed->expScores = needMem(sizeof(float) * numExps);
        hashAddUnique(bedHash, buff, bed);
        }
    }
/* smList is not freed here -- presumably the program exits soon after. */
}
void ctgToChromFa(char *chromName, char *insertFile, char *chromDir, char *orderLst, char *outName, struct hash *liftHash)
/* ctgToChromFa - convert contig level fa files to chromosome level.
 * Writes one fa record for chromName built from the contigs listed in
 * orderLst, inserting N-gaps from liftHash (if given) or the inserts
 * file otherwise. */
{
struct hash *uniq = newHash(0);   /* Guards against a contig listed twice. */
struct bigInsert *bi;
struct chromInserts *chromInserts;
struct hash *insertHash = newHash(9);
struct lineFile *lf = lineFileOpen(orderLst, TRUE);
FILE *f = mustOpen(outName, "w");
char ctgFaName[512];
char *words[2];
int liftChromSize = 0;
int actualChromSize = 0;
boolean isFirst = TRUE;
chromInsertsRead(insertFile, insertHash);
chromInserts = hashFindVal(insertHash, chromName);
fprintf(f, ">%s\n", chromName);
while (lineFileNextRow(lf, words, 1))
    {
    char *contig = words[0];
    int nSize;   /* Size of N-gap to emit before this contig. */
    if (liftHash != NULL)
        {
        struct lift *lift = hashMustFindVal(liftHash, contig);
        nSize = lift->nBefore;
        liftChromSize = lift->chromSize;
        }
    else
        nSize = chromInsertsGapSize(chromInserts, rmChromPrefix(contig), isFirst);
    hashAddUnique(uniq, contig, NULL);   /* Abort on duplicate contig. */
    addN(f, nSize);
    actualChromSize += nSize;
    isFirst = FALSE;
    sprintf(ctgFaName, "%s/%s/%s.fa", chromDir, contig, contig);
    if (fileExists(ctgFaName))
        {
        actualChromSize += addFa(f, ctgFaName);
        }
    else
        {
        warn("%s does not exist\n", ctgFaName);
        /* NOTE(review): cgiVarExists appears to test a command-line option
         * spoofed into the CGI variable store -- confirm. */
        if (!cgiVarExists("missOk"))
            noWarnAbort();
        }
    }
lineFileClose(&lf);
/* Append any terminal insert for the chromosome. */
if (chromInserts != NULL)
    if ((bi = chromInserts->terminal) != NULL)
        {
        addN(f, bi->size);
        actualChromSize += bi->size;
        }
/* Reconcile the written size against the lift file's expectation:
 * too big is an error, too small is padded with N's. */
if (liftHash != NULL)
    {
    if (actualChromSize > liftChromSize)
        errAbort("Error: chromosome size from lift file is %d, but actual fa size is %d. Possible inconsistency between lift and inserts?", liftChromSize, actualChromSize);
    else if (actualChromSize < liftChromSize)
        addN(f, (liftChromSize - actualChromSize));
    }
/* linePos is presumably a file-scope output-column counter maintained by
 * addN/addFa -- finish any partial line.  TODO confirm. */
if (linePos != 0)
    fputc('\n', f);
fclose(f);
}
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table..
 * Loads several per-transcript data sets into hashes, then streams the
 * bed-12 transcripts in txBedFile, writing one txInfo row per gene. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
    hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls -- plain hashAdd here since one accession may have
 * several alignments. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
        {
        info.sourceAcc = cloneString(bed->name);
        }
    else
        {
        info.sourceAcc = txAccFromTempName(bed->name);
        }
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc)
        || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
        || stringIn("tRNA", info.sourceAcc) != NULL)
        {
        /* Fake up some things for antibody frag and CCDS that don't have alignments. */
        info.sourceSize = bedTotalBlockSize(bed);
        info.aliCoverage = 1.0;
        info.aliIdRatio = 1.0;
        info. genoMapCount = 1;
        }
    else
        {
        /* Loop through all psl's associated with our RNA.  Figure out
         * our overlap with each, and pick best one. */
        struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
        if (firstPslHel == NULL)
            errAbort("%s is not in %s", info.sourceAcc, pslFile);
        int mapCount = 0;
        struct psl *psl, *bestPsl = NULL;   /* Shadows outer psl on purpose. */
        int coverage, bestCoverage = 0;
        boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
        for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
            {
            psl = hel->val;
            mapCount += 1;
            coverage = pslBedOverlap(psl, bed);
            if (coverage > bestCoverage)
                {
                bestCoverage = coverage;
                bestPsl = psl;
                }
            /* If we flipped it, try it on the opposite strand too.
             * Strand is toggled, measured, then toggled back in place. */
            if (isFlipped)
                {
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                coverage = pslBedOverlap(psl, bed);
                if (coverage > bestCoverage)
                    {
                    bestCoverage = coverage;
                    bestPsl = psl;
                    }
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                }
            }
        if (bestPsl == NULL)
            errAbort("%s has no overlapping alignments with %s in %s",
                bed->name, info.sourceAcc, pslFile);

        /* Figure out and save alignment statistics. */
        int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
        info.sourceSize = bestPsl->qSize - polyA;   /* Exclude polyA tail. */
        info.aliCoverage = (double)bestCoverage / info.sourceSize;
        info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
                        (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
        info. genoMapCount = mapCount;
        }

    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
        {
        cdsEv = hashFindVal(cdsEvHash, bed->name);
        if (cdsEv != NULL)
            {
            info.orfSize = cdsEv->end - cdsEv->start;
            info.startComplete = cdsEv->startComplete;
            info.endComplete = cdsEv->endComplete;
            }
        }

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * Gap of 1-2 bases -> frame shift; exactly 3 -> genomic stop;
     * anything larger counts as a real intron/exon boundary. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
        int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
        int gapEnd = bed->chromStarts[i+1];
        int gapSize = gapEnd - gapStart;
        switch (gapSize)
            {
            case 1:
            case 2:
                info.genomicFrameShift = TRUE;
                break;
            case 3:
                info.genomicStop = TRUE;
                break;
            default:
                exonCount += 1;
                break;
            }
        }
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions,
	char *geneLevels, char *pairsIn, char *outDir)
/* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell
 * lines.  Writes one bed file per cell line into outDir, keeping only
 * pairs whose enhancer/gene levels clear the thresholds. */
{
/* Load up cell descriptions into cell array */
struct expRecord *cell, *cellList = expRecordLoadAll(cellDescriptions);
int cellCount = slCount(cellList);
struct expRecord **cellArray;
AllocArray(cellArray, cellCount);
int i;
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    cellArray[i] = cell;
verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions);

/* Load up enhBed into a hash keyed by name */
struct bed *enh, *enhList;
int fieldCount;
bedLoadAllReturnFieldCount(enhBed, &enhList, &fieldCount);
if (fieldCount != 15)
    errAbort("Expecting bed 15 format in %s", enhBed);
struct hash *enhHash = hashNew(16);
for (enh = enhList; enh != NULL; enh = enh->next)
    {
    /* Every enhancer must carry one expression score per cell line. */
    if (enh->expCount != cellCount)
        errAbort("Inconsistent input: %d cells in %s, but %d in %s\n",
        	cellCount, cellDescriptions, enh->expCount, enhBed);
    hashAddUnique(enhHash, enh->name, enh);
    }
verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed);

/* Get a hash with key of gene name and value an array of expression values. */
struct hash *geneHash = hashGeneLevels(geneLevels, cellCount);
verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels);

/* Open inPairs.bed, just to make sure it's there before we do any output. */
struct lineFile *lf = lineFileOpen(pairsIn, TRUE);

/* Remove trailing slash from output dir if any */
if (lastChar(outDir) == '/')
    {
    int len = strlen(outDir);
    outDir[len-1] = 0;
    }

/* Make output directory and open all output files. */
makeDirsOnPath(outDir);
FILE *outFiles[cellCount];   /* VLA: one output stream per cell line. */
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    {
    char path[PATH_LEN];
    safef(path, sizeof(path), "%s/%s.bed", outDir, cell->description);
    outFiles[i] = mustOpen(path, "w");
    }

/* Stream through input file and copy to appropriate outputs. */
char *words[bedKnownFields*2];	/* Make a little bigger than any known bed */
int wordCount, wordsRequired = 0;
char *separator = "->";
int separatorSize = strlen(separator);
int pairCount = 0;
while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* Make sure all lines have same # of fields, and at least 4. */
    if (wordsRequired == 0)
        {
        wordsRequired = wordCount;
        lineFileExpectAtLeast(lf, 4, wordCount);
        }
    else
        lineFileExpectWords(lf, wordsRequired, wordCount);
    ++pairCount;

    /* Parse out name field of form "enhName->geneName". */
    char *name = words[3];
    char *sepPos = stringIn(separator, name);
    if (sepPos == NULL)
        errAbort("Expecting %s in %s line %d of %s",
        	separator, name, lf->lineIx, lf->fileName);
    char *enhName = cloneStringZ(name, sepPos-name);
    char *geneName = sepPos + separatorSize;

    /* Look up enhancer and gene.  (Local geneLevels deliberately shadows
     * the geneLevels file-name parameter.) */
    enh = hashMustFindVal(enhHash, enhName);
    double *geneLevels = hashMustFindVal(geneHash, geneName);
    freez(&enhName);

    /* Output ones over minimum levels.  minAct/minExp are presumably
     * file-scope command-line thresholds -- confirm. */
    for (i=0; i < cellCount; ++i)
        {
        double enhLevel = enh->expScores[i];
        double geneLevel = geneLevels[i];
        if (enhLevel >= minAct && geneLevel >= minExp)
            {
            int j;
            FILE *f = outFiles[i];
            fprintf(f, "%s", words[0]);
            for (j=1; j<wordCount; ++j)
                fprintf(f, "\t%s", words[j]);
            fprintf(f, "\n");
            }
        }
    }
verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn);

/* Clean up. */
lineFileClose(&lf);
for (i=0; i<cellCount; ++i)
    carefulClose(&outFiles[i]);
}
void trimUniq(bioSeq *seqList)
/* Check that all seq's in list have a unique name.  Try and
 * abbreviate longer sequence names.  Aborts (via hashAddUnique)
 * if two sequences end up with the same name. */
{
struct hash *hash = newHash(0);
bioSeq *seq;

for (seq = seqList; seq != NULL; seq = seq->next)
    {
    char *saferString = needMem(strlen(seq->name)+1);
    char *c, *s;

    /* Some chars are safe to allow through, other chars cause
     * problems.  It isn't necessarily a URL safe string that is
     * being calculated here.  The original problem was a user had
     * the fasta header line of:
     * chr8|59823648:59825047|+
     * The plus sign was being taken as the query name and this
     * created problems as that name was passed on to hgc via
     * the ss cart variable.  The + sign became part of a URL
     * eventually.  This loop allows only isalnum and =_/.:;_|
     * to get through as part of the header name.  These characters
     * all proved to be safe as single character names, or all
     * together.
     * (NOTE(review): the filter below also admits '-', which the
     * list above omits.) */
    s = saferString;
    for (c = seq->name; *c != '\0'; ++c)
        {
        /* NOTE(review): this inner check is redundant -- the loop
         * condition already guarantees c is valid and *c != '\0'. */
        if (c && (*c != '\0'))
            {
            if ( isalnum(*c) || (*c == '=') || (*c == '-') || (*c == '/') ||
                (*c == '.') || (*c == ':') || (*c == ';') || (*c == '_') ||
                    (*c == '|') )
                *s++ = *c;
            }
        }
    *s = '\0';
    freeMem(seq->name);
    /* Everything was filtered out -- fall back to a placeholder name. */
    if (*saferString == '\0')
        {
        freeMem(saferString);
        saferString = cloneString("YourSeq");
        }
    seq->name = saferString;

    if (strlen(seq->name) > 14)	/* Try and get rid of long NCBI .fa cruft. */
        {
        char *nameClone = NULL;
        char *abbrv = NULL;
        char *words[32];
        int wordCount;
        boolean isEns = (stringIn("ENSEMBL:", seq->name) != NULL);

        nameClone = cloneString(seq->name);
        wordCount = chopString(nameClone, "|", words, ArraySize(words));
        if (wordCount > 1)	/* Looks like it's an Ensembl/NCBI
                                 * long name alright. */
            {
            if (isEns)
                {
                /* Ensembl: prefer the first non-empty field. */
                abbrv = words[0];
                if (abbrv[0] == 0)
                    abbrv = words[1];
                }
            else if (sameString(words[1], "dbSNP"))
                {
                if (wordCount > 2)
                    abbrv = words[2];
                else
                    abbrv = nameClone;
                }
            else
                {
                /* NCBI style: prefer the last non-empty field. */
                abbrv = words[wordCount-1];
                if (abbrv[0] == 0)
                    abbrv = words[wordCount-2];
                }
            /* Only adopt the abbreviation if it's not already taken;
             * otherwise keep the long (unique) name. */
            if (hashLookup(hash, abbrv) == NULL)
                {
                freeMem(seq->name);
                seq->name = cloneString(abbrv);
                }
            freez(&nameClone);
            }
        }
    /* The hash pointer itself serves as a dummy non-NULL value. */
    hashAddUnique(hash, seq->name, hash);
    }
freeHash(&hash);
}
void secondPass(char *inName, char *outName) /* Do second pass - pair HMM between homologous regions specified in * input. */ { struct lineFile *lf = lineFileOpen(inName, TRUE); char *line; int lineSize; char *words[16]; int wordCount; struct wabaCrude *wcList = NULL, *wc; char qFileName[512]; struct dnaSeq *qSeqList = NULL, *seq; struct hash *tFileHash = newHash(8); struct hash *qSeqHash = NULL; FILE *out = mustOpen(outName, "w"); FILE *dynFile; printf("Second pass (HMM) input %s output %s\n", inName, outName); /* Load up alignments from file and sort. */ while (lineFileNext(lf, &line, &lineSize)) { wordCount = chopLine(line, words); if (wordCount != 10) errAbort("line %d of %s doesn't look like a waba first pass file", lf->lineIx, lf->fileName); wc = wabaCrudeLoad(words); slAddHead(&wcList, wc); } lineFileClose(&lf); slSort(&wcList, wcCmpQposScore); /* Go through alignments one by one, loading DNA as need be. */ qFileName[0] = 0; for (wc = wcList; wc != NULL; wc = wc->next) { struct hashEl *hel; struct dnaSeq *tSeqList, *tSeq, *qSeq; int qSize; DNA *qStart; int tMaxSize = 5000; int tMin, tMax, tMid, tSize; int score; /* Get target sequence. */ hel = hashLookup(tFileHash, wc->tFile); if (hel == NULL) { printf("Loading %s\n", wc->tFile); tSeqList = faReadAllDna(wc->tFile); hel = hashAdd(tFileHash, wc->tFile, tSeqList); } else { tSeqList = hel->val; } tSeq = findSeq(tSeqList, wc->tSeq); /* Get query sequence. */ if (!sameString(qFileName, wc->qFile)) { strcpy(qFileName, wc->qFile); printf("Loading %s\n", wc->qFile); freeDnaSeqList(&qSeqList); qSeqList = faReadAllDna(wc->qFile); freeHash(&qSeqHash); qSeqHash = newHash(0); for (qSeq = qSeqList; qSeq != NULL; qSeq = qSeq->next) hashAddUnique(qSeqHash, qSeq->name, qSeq); } qSeq = hashMustFindVal(qSeqHash, wc->qSeq); /* Do fine alignment. 
*/ qSize = wc->qEnd - wc->qStart; qStart = qSeq->dna + wc->qStart; if (wc->strand < 0) reverseComplement(qStart, qSize); tMid = (wc->tStart + wc->tEnd)/2; tMin = tMid-tMaxSize/2; tMax = tMin + tMaxSize; if (tMin < 0) tMin = 0; if (tMax > tSeq->size) tMax = tSeq->size; printf("Aligning %s %s:%d-%d %c to %s.%s:%d-%d +\n", wc->qFile, qSeq->name, wc->qStart, wc->qEnd, (wc->strand < 0 ? '-' : '+'), wc->tFile, tSeq->name, tMin, tMax); fprintf(out, "Aligning %s %s:%d-%d %c to %s.%s:%d-%d +\n", wc->qFile, qSeq->name, wc->qStart, wc->qEnd, (wc->strand < 0 ? '-' : '+'), wc->tFile, tSeq->name, tMin, tMax); score = xenAlignSmall(qStart, qSize, tSeq->dna + tMin, tMax-tMin, out, FALSE); fprintf(out, "best score %d\n", score); if (wc->strand < 0) reverseComplement(qStart, qSize); } freeDnaSeqList(&qSeqList); hashTraverseVals(tFileHash, htvFreeSeq); wabaCrudeFreeList(&wcList); freeHash(&tFileHash); fclose(out); }
void ctgFaToFa(char *ctgFa, char *ctgCoords, char *ntDir)
/* ctgFaToFa - Convert from one big file with all NT contigs to one contig per file..
 * First pass counts clones per contig from ctgCoords; second pass splits
 * the multi-record fa.  Only contigs with more than one clone get a file. */
{
struct lineFile *lf;
char fileName[512], *line;
char *ntName, *hsName;
char *parts[6];
int lineSize, partCount;
struct hash *uniqHash = newHash(0);   /* Tracks NT names already written. */
FILE *f = NULL;
int dotMod = 0;
struct hash *ntHash = newHash(0);     /* NT name -> ntContig. */
struct hash *hsHash = newHash(0);     /* Hs name -> ntContig. */
struct ntContig *nt;
char *words[8];

printf("Loading %s\n", ctgCoords);
lf = lineFileOpen(ctgCoords, TRUE);
while (lineFileRow(lf, words))
    {
    ntName = words[0];
    if ((nt = hashFindVal(ntHash, ntName)) != NULL)
        ++nt->cloneCount;
    else
        {
        AllocVar(nt);
        hashAddSaveName(ntHash, ntName, nt, &nt->name);
        hashAddSaveName(hsHash, words[1], nt, &nt->hsName);
        nt->cloneCount = 1;
        }
    }
lineFileClose(&lf);

lf = lineFileOpen(ctgFa, FALSE);
makeDir(ntDir);
while (lineFileNext(lf, &line, &lineSize))
    {
    /* Progress dot every 128k lines. */
    if ((++dotMod&0x1ffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }
    if (line[0] == '>')
        {
        carefulClose(&f);           /* Also resets f to NULL. */
        line[lineSize-1] = 0;       /* Strip trailing newline. */
        partCount = chopByChar(line, '|',parts,ArraySize(parts));
        if (partCount < 3)
            {
            uglyf("partCount = %d\n", partCount);
            errAbort("Expecting | separated header line %d of %s",
                     lf->lineIx, lf->fileName);
            }
        ntName = parts[1];
        nt = hashFindVal(ntHash, ntName);
        hsName = parts[2];
        /* Fall back to looking up by Hs name when the NT name is unknown. */
        if (nt == NULL)
            {
            hsName = firstWordInLine(ntName);
            nt = hashMustFindVal(hsHash, hsName);
            ntName = nt->name;
            }
        if (nt->cloneCount > 1)
            {
            if (!startsWith("Hs", hsName))
                errAbort("Expecting %s to start with 'Hs' line %d of %s",
                         hsName, lf->lineIx, lf->fileName);
            /* Synthesize a fresh NT name if this one was already used. */
            if (hashLookup(uniqHash, ntName))
                ntName = nextFakeNtName(hsName, ntName);
            hashAddUnique(uniqHash, ntName, NULL);
            if (!startsWith("NT_", ntName))
                errAbort("Expecting NT_ name line %d of %s",
                         lf->lineIx, lf->fileName);
            sprintf(fileName, "%s/%s.fa", ntDir, ntName);
            f = mustOpen(fileName, "w");
            fprintf(f, ">%s.1_1\n", ntName);
            }
        /* NOTE(review): when cloneCount <= 1 no file is opened, so the
         * sequence lines of single-clone contigs are dropped here --
         * presumably handled elsewhere; confirm. */
        }
    else
        {
        if (f != NULL)
            mustWrite(f, line, lineSize);
        }
    }
printf("\n");
carefulClose(&f);
lineFileClose(&lf);
}
void agpVsMap(char *agpName, char *infoName, char *gifName)
/* agpVsMap - Plot clones in agp vs. map coordinates.
 * Writes a 600x600 GIF scatterplot to gifName; point color encodes
 * clone phase (<=1 green, ==2 orange, else red). */
{
struct mapPos *mapList, *mp;
struct agpFrag *agpList, *bp;
struct hash *cloneHash = newHash(14);
struct hashEl *hel;
struct cloneInfo *cloneList = NULL, *clone;
struct memGfx *mg = NULL;
int pixWidth = 600;
int pixHeight = 600;
int rulerHeight = 20;
int maxMapPos = 0, maxAgpPos = 0;
double scaleMap, scaleAgp;
Color orange, green;

mapList = readInfoFile(infoName);
agpList = readAgpFile(agpName);

/* Build clone list/hash from map positions; only clones with phase > 0
 * participate.  Track maximum coordinates for axis scaling. */
for (mp = mapList; mp != NULL; mp = mp->next)
    {
    if (mp->phase > 0)
        {
        AllocVar(clone);
        hel = hashAddUnique(cloneHash, mp->cloneName, clone);
        clone->name = hel->name;
        clone->mp = mp;
        slAddHead(&cloneList, clone);
        if (mp->pos > maxMapPos) maxMapPos = mp->pos;
        }
    }
slReverse(&cloneList);
for (bp = agpList; bp != NULL; bp = bp->next)
    {
    if (bp->chromStart > maxAgpPos) maxAgpPos = bp->chromStart;
    }

/* Draw scatterplot on bitmap. */
mg = mgNew(pixWidth, pixHeight);
mgClearPixels(mg);
orange = mgFindColor(mg, 210, 150, 0);
green = mgFindColor(mg, 0, 200, 0);
mgDrawRuler(mg, 0, pixHeight-rulerHeight, rulerHeight, pixWidth, MG_BLACK,
            mgSmallFont(), 0, maxMapPos+1);
scaleMap = (double)pixWidth/(double)(maxMapPos+1.0);
scaleAgp = (double)(pixHeight)/(double)(maxAgpPos+1.0);
for (bp = agpList; bp != NULL; bp = bp->next)
    {
    char cloneName[128];
    fragToCloneName(bp->frag, cloneName);
    clone = hashFindVal(cloneHash, cloneName);
    if (clone == NULL)
        warn("%s is in %s but not %s", cloneName, agpName, infoName);
    else
        {
        /* x from map position, y from agp position (y axis inverted). */
        int x = round(scaleMap*clone->mp->pos);
        int y = pixHeight - round(scaleAgp*bp->chromStart);
        int phase = clone->mp->phase;
        int back;
        if (phase <= 1)
            back = green;
        else if (phase == 2)
            back = orange;
        else
            back = MG_RED;
        drawPlus(mg, x, y, back);
        }
    }
mgSaveGif(mg, gifName);
}