Example #1
struct twoBit *slurpInput(char *inName, struct hash *tbHash,
                          struct hash *bitmapHash)
/* Read .2bit file inName into memory and return list of twoBit items.
 * Populate tbHash with twoBit items, and bitmapHash with bitmaps for
 * easy masking.  Both are hashed by twoBit sequence name. */
{
    struct twoBit *twoBitList = NULL;
    struct twoBit *twoBit = NULL;
    twoBitList = twoBitFromFile(inName);
    /* Free and clear the masking data (unless -add).  Hash twoBits by name. */
    for (twoBit = twoBitList;  twoBit != NULL;  twoBit = twoBit->next)
    {
        Bits *bits = bitAlloc(twoBit->size);
        if (add)
        {
            /* Store the currently masked bits: */
            int i;
            for (i = 0;  i < twoBit->maskBlockCount;  i++)
            {
                bitSetRange(bits, twoBit->maskStarts[i], twoBit->maskSizes[i]);
            }
        }
        /* Free the current representation of masking -- it will be replaced. */
        twoBit->maskBlockCount = 0;
        freez(&(twoBit->maskStarts));
        freez(&(twoBit->maskSizes));
        /* Hash twoBit and our new bitmap by sequence name. */
        hashAddUnique(tbHash, twoBit->name, twoBit);
        hashAddUnique(bitmapHash, twoBit->name, bits);
    }
    return twoBitList;
}
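Usage sketch (not part of the original source): one way a caller might apply a new mask range through the two hashes that slurpInput fills in. maskOneRange is a hypothetical helper; hashMustFindVal, bitSetRange, and errAbort are standard kent-lib calls, and twoBit->size matches the field used above.

/* Hypothetical caller sketch, not from the original tool: record one new mask
 * range on a named sequence via the hashes populated by slurpInput. */
static void maskOneRange(struct hash *tbHash, struct hash *bitmapHash,
                         char *seqName, int start, int size)
{
struct twoBit *twoBit = hashMustFindVal(tbHash, seqName);  /* aborts on unknown name */
Bits *bits = hashMustFindVal(bitmapHash, seqName);
if (start + size > twoBit->size)
    errAbort("range %d-%d is off the end of %s (size %d)",
             start, start + size, seqName, twoBit->size);
bitSetRange(bits, start, size);   /* record the masking in the per-sequence bitmap */
}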
Example #2
struct hash *hashPsls(char *pslFileName)
/* Load psls from pslFileName, trim each query name down to its probe name,
 * and return a hash of psl sublists keyed by the trimmed name. */
{
struct psl *pslList = NULL, *psl = NULL, *pslSubList = NULL, *pslNext = NULL;
struct hash *pslHash = newHash(15);
char *last = NULL;

char *tmp = NULL;
pslList = pslLoadAll(pslFileName);

/* Fix psl names */
for(psl = pslList; psl != NULL; psl = psl->next)
    {
    tmp = strrchr(psl->qName, ';');
    assert(tmp);
    *tmp = '\0';
    tmp = strstr(psl->qName,prefix);
    assert(tmp);
    /* checks if there are 2 occurrences of ":" in probe name as in full name */
    /* if probe name is shortened to fit in the seq table, there is only 1 ":"*/
    /* e.g. full: consensus:HG-U133A:212933_x_at; short:HG-U133A:212933_x_at;*/

    if (countChars(psl->qName, *prefix) == 2) 
        {
        tmp = strstr(tmp+1,prefix);
        assert(tmp);
        }
    tmp = tmp + strlen(prefix);
    safef(psl->qName, strlen(psl->qName), "%s", tmp);
    }

/* Sort based on query name. */

slSort(&pslList, pslCmpQuery);
/* For each psl, if it is has the same query name add it to the
   sublist. Otherwise store the sublist in the hash and start
   another. */
for(psl = pslList; psl != NULL; psl = pslNext)
    {
    pslNext = psl->next;
    if(last != NULL && differentWord(last, psl->qName))
	{
	hashAddUnique(pslHash, last, pslSubList);
	pslSubList = NULL;
	}
    slAddTail(&pslSubList, psl);
    last = psl->qName;
    }
/* Add the last sublist */
hashAddUnique(pslHash, last, pslSubList);
return pslHash;
}
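Usage sketch (not from the original source): because hashPsls keys each sublist by the trimmed probe name, a caller would fetch alignments with hashFindVal and walk the returned list. pslsForProbe is a hypothetical helper; the psl fields it prints (qName, tName, tStart, tEnd) are standard.

struct psl *pslsForProbe(struct hash *pslHash, char *probeName)
/* Hypothetical helper: return the sublist of alignments for one probe, or NULL. */
{
struct psl *subList = hashFindVal(pslHash, probeName);   /* NULL if probe absent */
struct psl *psl;
for (psl = subList; psl != NULL; psl = psl->next)
    printf("%s hits %s:%d-%d\n", psl->qName, psl->tName, psl->tStart, psl->tEnd);
return subList;
}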
Example #3
struct ntContig *readNtFile(char *fileName, 
	struct hash *ntContigHash, struct hash *ntCloneHash)
/* Read in NT contig info. (NT contigs are contigs of finished clones.) */
{
struct lineFile *lf;
int lineSize, wordCount;
char *line, *words[8];
struct ntContig *contigList = NULL, *contig = NULL;
struct ntClonePos *pos;
char *contigName;
struct hashEl *hel;

/* Parse file into ntContig/ntClonePos data structures. */
lf = lineFileOpen(fileName, TRUE);
while (lineFileNext(lf, &line, &lineSize))
    {
    wordCount = chopLine(line, words);
    if (wordCount == 0)
        continue;
    if (wordCount != 5)
        errAbort("Expecting 5 words line %d of %s", lf->lineIx, lf->fileName);
    contigName = words[0];
    if (contig == NULL || !sameString(contigName, contig->name))
        {
	AllocVar(contig);
	hel = hashAddUnique(ntContigHash, contigName, contig);
	contig->name = hel->name;
	slAddHead(&contigList, contig);
	}
    AllocVar(pos);
    hel = hashAddUnique(ntCloneHash, words[1], pos);
    pos->name = hel->name;
    pos->ntContig = contig;
    pos->pos = atoi(words[2]);
    pos->orientation = ((words[3][0] == '-') ? -1 : 1);
    pos->size = atoi(words[4]);
    slAddHead(&contig->cloneList, pos);
    }
lineFileClose(&lf);

/* Make sure everything is nicely sorted and sized. */
for (contig = contigList; contig != NULL; contig = contig->next)
    {
    slSort(&contig->cloneList, cmpNtClonePos);
    pos = slLastEl(contig->cloneList);
    contig->size = pos->pos + pos->size;
    }

slReverse(&contigList);
return contigList;
}
Example #4
struct hash *allChainsHash(char *fileName)
/* Hash all the chains in a given file by their ids. */
{
struct hash *chainHash = newHash(18);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct chain *chain;
char chainId[20];
struct lm *lm = chainHash->lm;
struct rbTreeNode **stack;

lmAllocArray(lm, stack, 128);
while ((chain = chainRead(lf)) != NULL)
    {
    struct indexedChain *ixc;
    lmAllocVar(lm, ixc);
    ixc->chain = chain;
    ixc->blockTree = rangeTreeNewDetailed(lm, stack);
    struct cBlock *block;
    for (block = chain->blockList; block != NULL; block = block->next)
	{
        struct range *r = rangeTreeAdd(ixc->blockTree, block->tStart, block->tEnd);
	r->val = block;
	}
    safef(chainId, sizeof(chainId), "%x", chain->id);
    hashAddUnique(chainHash, chainId, ixc);
    }
lineFileClose(&lf);
return chainHash;
}
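Usage sketch (not from the original source): the point of wrapping each chain in an indexedChain with a range tree is fast lookup of the aligned block covering a target coordinate. This sketch assumes rangeTreeFindEnclosing from the kent rangeTree library and reuses the same hex-formatted id key as above; findBlockAt is a hypothetical helper.

struct cBlock *findBlockAt(struct hash *chainHash, int chainId, int tPos)
/* Hypothetical helper: find the block covering tPos in the given chain,
 * or return NULL if tPos falls in a gap or the chain id is unknown. */
{
char key[20];
safef(key, sizeof(key), "%x", chainId);   /* must match the key format used when hashing */
struct indexedChain *ixc = hashFindVal(chainHash, key);
if (ixc == NULL)
    return NULL;
struct range *r = rangeTreeFindEnclosing(ixc->blockTree, tPos, tPos+1);
return (r != NULL) ? r->val : NULL;
}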
Example #5
void wordStoreLoadMonomerOrder(struct wordStore *store, char *readsFile, char *fileName)
/* Read in a file with one line for each monomer type, containing a word for each
 * monomer variant.  Requires all variants already be in store.  The readsFile is passed
 * just for nicer error reporting. */
{
/* Stuff for processing file a line at a time. */
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line, *word;

/* Set up variables we'll put results in in store. */
store->typeHash = hashNew(0);
store->typeList = NULL;

while (lineFileNextReal(lf, &line))
    {
    struct wordType *type;
    AllocVar(type);
    slAddHead(&store->typeList, type);
    while ((word = nextWord(&line)) != NULL)
        {
	struct wordInfo *info = hashFindVal(store->infoHash, word);
	if (info == NULL)
	    errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
	struct wordInfoRef *ref;
	AllocVar(ref);
	ref->val = info;
	slAddHead(&type->list, ref);
	hashAddUnique(store->typeHash, word, type);
	}
    }
slReverse(&store->typeList);
lineFileClose(&lf);
verbose(2, "Added %d types containing %d words from %s\n", 
    slCount(store->typeList), store->typeHash->elCount, fileName);
}
Example #6
void fillInBioHash(char *fileName, struct hash *bioHash)
/* Fill in the bioHash with key/value pairs from file. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line = NULL;
int regSize = 0;
while(lineFileNextReal(lf, &line)) 
    {
    char *key = NULL;
    char *val = NULL;
    char *mark = NULL;
    mark = strchr(line, '=');
    if(mark == NULL) // Error: not in boulder IO format.
	errAbort("pickCassettePcrPrimers::fillInBioHash() - ",
		 "Couldn't find '=' in line %s. File %s doesn't appear to be in boulderIO format.",
		 line, fileName);
    if(mark == line) // First character is '=' means end of record.
	break;
    key = line;
    val = mark+1;
    *mark = '\0';
    hashAddUnique(bioHash, key, cloneString(val));
    }
lineFileClose(&lf);
}
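Usage sketch (not from the original source): fillInBioHash stores each boulder-IO value as a cloned string keyed by its tag, so lookups return char*. The "SEQUENCE" tag and showTemplateLength helper below are only illustrative, not taken from the original code.

void showTemplateLength(struct hash *bioHash)
/* Hypothetical helper: report the length of one record value if present. */
{
char *seq = hashFindVal(bioHash, "SEQUENCE");   /* illustrative key name */
if (seq != NULL)
    printf("template is %d bases\n", (int)strlen(seq));
}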
Example #7
struct trans3 *seqListToTrans3List(struct dnaSeq *seqList, aaSeq *transLists[3], struct hash **retHash)
/* Convert sequence list to a trans3 list and lists for each of three frames. */
{
int frame;
struct dnaSeq *seq;
struct trans3 *t3List = NULL, *t3;
struct hash *hash = newHash(0);

for (seq = seqList; seq != NULL; seq = seq->next)
    {
    t3 = trans3New(seq);
    hashAddUnique(hash, t3->name, t3);
    slAddHead(&t3List, t3);
    for (frame = 0; frame < 3; ++frame)
        {
	slAddHead(&transLists[frame], t3->trans[frame]);
	}
    }
slReverse(&t3List);
for (frame = 0; frame < 3; ++frame)
    {
    slReverse(&transLists[frame]);
    }
*retHash = hash;
return t3List;
}
Example #8
struct groupSizeInfo *readSizes(char *fileName, struct hash *gsiHash)
/* Read in file of format:
 *     groupName guessedMin guessedMax
 * and save in hash and as list. */
{
struct groupSizeInfo *gsiList = NULL, *gsi;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int wordCount;
char *words[8];
struct hashEl *hel;

while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    lineFileExpectWords(lf, 3, wordCount);
    AllocVar(gsi);
    hel = hashAddUnique(gsiHash, words[0], gsi);
    gsi->name = hel->name;
    gsi->guessedMin = atoi(words[1]);
    gsi->guessedMax = atoi(words[2]);
    slAddHead(&gsiList, gsi);
    }

lineFileClose(&lf);
slReverse(&gsiList);
return gsiList;
}
Example #9
struct hash *createBedHash(struct bed *bedList)
/** Takes a list of beds and puts them in a hash keyed by name_0, name_1, ...,
    where the numeric suffix disambiguates duplicate bed names. */
{
struct hash *bedHash = newHash(5);
struct bed *bed = NULL;
struct dyString *ds = newDyString(1024);
char *name = NULL;
for(bed = bedList; bed != NULL; bed = bed->next)
    {
    int count = 0;
    char *targetName = NULL;
    struct bed *tmp = NULL;

    dyStringClear(ds);
    dyStringPrintf(ds,"%s_%d", bed->name, count);
    /* since we may have duplications, look for an empty slot in the hash */
    while (count < 1000)
	{
	tmp = hashFindVal(bedHash, ds->string);
	if(tmp == NULL)
	    {
	    hashAddUnique(bedHash, ds->string, bed);
	    break;
	    }
	else 
	    {     
	    dyStringClear(ds);
	    dyStringPrintf(ds, "%s_%d", bed->name, ++count);
	    }
	}
    }
return bedHash;
}
Example #10
void loadAoHash(struct hash *aoHash, struct affyOffset *aoList)
/* put the aoList into the hash */
{
struct affyOffset *ao = NULL;
for(ao = aoList; ao != NULL; ao = ao->next)
    {
    hashAddUnique(aoHash, ao->piece, ao);
    }
}
Example #11
static void rHashMetaList(struct hash *hash, struct meta *list)
/* Add list, and any children of list to hash */
{
    struct meta *meta;
    for (meta = list; meta != NULL; meta = meta->next)
    {
        hashAddUnique(hash, meta->name, meta);
        if (meta->children)
            rHashMetaList(hash, meta->children);
    }
}
Example #12
struct hash *dnaSeqHash(struct dnaSeq *seqList)
/* Return hash of sequences keyed by name. */
{
int size = slCount(seqList)+1;
int sizeLog2 = digitsBaseTwo(size);
struct hash *hash = hashNew(sizeLog2);
struct dnaSeq *seq;
for (seq = seqList; seq != NULL; seq = seq->next)
    hashAddUnique(hash, seq->name, seq);
return hash;
}
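Usage sketch (not from the original source): the canonical pattern around dnaSeqHash is to load a fasta file, hash the sequences by name, and look one up. faReadAllDna, hashFindVal, freeHash, and freeDnaSeqList are standard kent-lib calls; demoDnaSeqHash and its arguments are placeholders.

void demoDnaSeqHash(char *faFile, char *seqName)
/* Hypothetical demo: hash sequences from faFile by name and look one up. */
{
struct dnaSeq *seqList = faReadAllDna(faFile);
struct hash *seqHash = dnaSeqHash(seqList);
struct dnaSeq *seq = hashFindVal(seqHash, seqName);   /* NULL if the name is absent */
if (seq != NULL)
    printf("%s has %d bases\n", seq->name, seq->size);
freeHash(&seqHash);
freeDnaSeqList(&seqList);
}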
Example #13
void loadTagHash(struct hash *h, struct sageCounts *scList)
/* Hash each sageCounts record by its tag, aborting on duplicate tags and
 * printing a tic every 10000 records. */
{
struct sageCounts *sc =NULL;
int count=0;
for(sc=scList;sc!=NULL;sc=sc->next)
    {
    if(count++ % 10000 == 0) 
	{
	putTic();
	}
    hashAddUnique(h,sc->tag,sc);
    }
printf("\tDone.\n");
}
Example #14
void checkDupe(char *fileName)
/* checkDupe - Check for dupes in HUGO names. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[7];
struct hash *uniqHash = newHash(0);

while (lineFileRow(lf, words))
    {
    if (sameString(words[3], "hugo"))
        {
	hashAddUnique(uniqHash, words[6], NULL);
	}
    }
lineFileClose(&lf);
hashFree(&uniqHash);
}
Example #15
struct hash *allChainsHash(char *fileName)
/* Create a hash of all the chains in a file by their id. */
{
struct hash *hash = newHash(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct chain *chain;
char chainId[128];

while ((chain = chainRead(lf)) != NULL)
    {
    safef(chainId, sizeof(chainId), "%d", chain->id);
    hashAddUnique(hash, chainId, chain);
    }
lineFileClose(&lf);
return hash;
}
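Usage sketch (not from the original source): because this version of allChainsHash keys chains by their id printed with "%d", a lookup has to format the id the same way. chainFromId is a hypothetical helper.

struct chain *chainFromId(struct hash *chainHash, int id)
/* Hypothetical helper: return the chain with the given id, or NULL. */
{
char chainId[128];
safef(chainId, sizeof(chainId), "%d", id);   /* same key format as allChainsHash */
return hashFindVal(chainHash, chainId);
}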
Example #16
struct sangRange *readRanges(char *fileName, struct hash *hash)
/* Read range file into list/hash. */
{
struct sangRange *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[3];

printf("Reading %s\n", fileName);
while (lineFileNextRow(lf, words, 3))
    {
    el = sangRangeLoad(words);
    slAddHead(&list, el);
    hashAddUnique(hash, el->name, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}
Example #17
struct hash *hashNmerFile(char *file)
/* Load nmerAlign rows from file into lists hashed by a "seq-name" key. */
{
struct lineFile *lf = lineFileOpen(file, TRUE);
struct hash *nmerHash = newHash(15);
struct nmerAlign *nmerList = NULL, *nmer;
char key[256];
char *words[6];
while(lineFileNextRowTab(lf, words, 6))
    {
    nmer = parseNmerAlignRow(words);
    snprintf(key, sizeof(key), "%s-%s", nmer->seq, nmer->name);
    nmerList = hashFindVal(nmerHash, key);
    if(nmerList == NULL) 
	hashAddUnique(nmerHash, key, nmer);
    else
	slAddTail(&nmerList, nmer);
    }
lineFileClose(&lf);
return nmerHash;
}
Example #18
struct chromInfo *readChroms(struct hash *chromHash, struct sqlConnection *conn)
/* Return chromosomes in list/hash. */
{
struct chromInfo *chrom, *chromList = NULL;
char query[512];
char **row;
struct sqlResult *sr;

sqlSafef(query, sizeof query, "select * from chromInfo");
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    chrom = chromInfoLoad(row);
    hashAddUnique(chromHash, chrom->chrom, chrom);
    slAddHead(&chromList, chrom);
    }
sqlFreeResult(&sr);
slReverse(&chromList);
return chromList;
}
Example #19
struct clone *readCloneList(char *fileName, struct hash *cloneHash)
/* Read clone list from sequence.inf file and save it in list/hash. */
{
struct clone *cloneList = NULL, *clone;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int wordCount;
char *words[8];
struct hashEl *hel;

while (lineFileRow(lf, words))
    {
    AllocVar(clone);
    chopSuffix(words[0]);
    hel = hashAddUnique(cloneHash, words[0], clone);
    clone->name = hel->name;
    clone->size = lineFileNeedNum(lf, words, 2);
    clone->phase = lineFileNeedNum(lf, words, 3);
    slAddHead(&cloneList, clone);
    }
lineFileClose(&lf);
slReverse(&cloneList);
return cloneList;
}
Example #20
struct sangPair *readPairs(char *fileName, struct hash *pairHash, struct hash *rangeHash)
/* Read in pair file and connect pairs to relevant range. */
{
struct sangPair *list = NULL, *el;
struct hashEl *hel;
struct sangInsert si;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[2];

printf("Reading %s\n", fileName);
while (lineFileNextRow(lf, words, 2))
    {
    sangInsertStaticLoad(words, &si);
    AllocVar(el);
    hel = hashAddUnique(pairHash, si.id, el);
    el->name = hel->name;
    el->range = hashMustFindVal(rangeHash, si.name);
    slAddHead(&list, el);
    }
slReverse(&list);
lineFileClose(&lf);
return list;
}
Example #21
void createBeds(struct hash *bedHash, struct hash *pslHash, char *file, int numExps)
/* Load Stanford microarray records from file, convert their psls to beds, and
 * hash the beds by a "clid-prow-pcol" key. */
{
struct stanMad *smList=NULL, *sm=NULL;
struct psl *psl = NULL;
struct bed *bed = NULL;
char buff[256];
warn("File is %s", file);
smList = stanMadLoadAll(file);

for(sm=smList; sm != NULL; sm = sm->next)
    {
    sprintf(buff, "%d", sm->clid);
    psl = hashFindVal(pslHash, buff);
    if(psl != NULL) 
	{
	snprintf(buff,sizeof(buff), "%d-%s-%d", sm->clid, sm->prow, sm->pcol);
	bed = pslToBed(psl);
	bed->expCount = numExps;
	bed->expIds = needMem(sizeof(int) * numExps);
	bed->expScores = needMem(sizeof(float) * numExps);
	hashAddUnique(bedHash, buff, bed);
	}
    }
}
Example #22
void ctgToChromFa(char *chromName, char *insertFile, char *chromDir,
	char *orderLst, char *outName, struct hash *liftHash)
/* ctgToChromFa - convert contig level fa files to chromosome level. */
{
struct hash *uniq = newHash(0);
struct bigInsert *bi;
struct chromInserts *chromInserts;
struct hash *insertHash = newHash(9);
struct lineFile *lf = lineFileOpen(orderLst, TRUE);
FILE *f = mustOpen(outName, "w");
char ctgFaName[512];
char *words[2];
int liftChromSize = 0;
int actualChromSize = 0;
boolean isFirst = TRUE;

chromInsertsRead(insertFile, insertHash);
chromInserts = hashFindVal(insertHash, chromName);
fprintf(f, ">%s\n", chromName);
while (lineFileNextRow(lf, words, 1))
    {
    char *contig = words[0];
    int nSize;
    
    if (liftHash != NULL)
        {
	struct lift *lift = hashMustFindVal(liftHash, contig);
	nSize = lift->nBefore;
	liftChromSize = lift->chromSize;
	}
    else
        nSize = chromInsertsGapSize(chromInserts, rmChromPrefix(contig), isFirst);
    hashAddUnique(uniq, contig, NULL);
    addN(f, nSize);
    actualChromSize += nSize;
    isFirst = FALSE;
    sprintf(ctgFaName, "%s/%s/%s.fa", chromDir, contig, contig);
    if (fileExists(ctgFaName))
        {
	actualChromSize += addFa(f, ctgFaName);
	}
    else
        {
	warn("%s does not exist\n", ctgFaName);
	if (!cgiVarExists("missOk"))
	    noWarnAbort();
	}
    }
lineFileClose(&lf);
if (chromInserts != NULL)
    if  ((bi = chromInserts->terminal) != NULL)
        {
	addN(f, bi->size);
	actualChromSize += bi->size;
	}
if (liftHash != NULL)
    {
    if (actualChromSize > liftChromSize)
	errAbort("Error: chromosome size from lift file is %d, but actual fa size is %d.  Possible inconsistency between lift and inserts?",
		 liftChromSize, actualChromSize);
    else if (actualChromSize < liftChromSize)
	addN(f, (liftChromSize - actualChromSize));
    }
if (linePos != 0)
   fputc('\n', f);
fclose(f);
}
Example #23
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	info.sourceAcc = cloneString(bed->name);
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info.genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info.genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
Example #24
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions, 
	char *geneLevels, char *pairsIn, char *outDir)
/* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell 
 * lines. */
{
/* Load up cell descriptions into cell array */
struct expRecord *cell, *cellList = expRecordLoadAll(cellDescriptions);
int cellCount = slCount(cellList);
struct expRecord **cellArray;
AllocArray(cellArray, cellCount);
int i;
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    cellArray[i] = cell;
verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions);

/* Load up enhBed into a hash keyed by name */
struct bed *enh, *enhList;
int fieldCount;
bedLoadAllReturnFieldCount(enhBed, &enhList, &fieldCount);
if (fieldCount != 15)
   errAbort("Expecting bed 15 format in %s", enhBed);
struct hash *enhHash = hashNew(16);
for (enh = enhList; enh != NULL; enh = enh->next)
    {
    if (enh->expCount != cellCount)
        errAbort("Inconsistent input: %d cells in %s, but %d in %s\n", 
		cellCount, cellDescriptions, enh->expCount, enhBed);
    hashAddUnique(enhHash, enh->name, enh);
    }
verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed);

/* Get a hash with key of gene name and value an array of expression values. */
struct hash *geneHash = hashGeneLevels(geneLevels, cellCount);
verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels);

/* Open inPairs.bed, just to make sure it's there before we do any output. */
struct lineFile *lf = lineFileOpen(pairsIn, TRUE);

/* Remove trailing slash from output dir if any */
if (lastChar(outDir) == '/')
    {
    int len = strlen(outDir);
    outDir[len-1] = 0;
    }

/* Make output directory and open all output files. */
makeDirsOnPath(outDir);
FILE *outFiles[cellCount];
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    {
    char path[PATH_LEN];
    safef(path, sizeof(path), "%s/%s.bed", outDir, cell->description);
    outFiles[i] = mustOpen(path, "w");
    }

/* Stream through input file and copy to appropriate outputs. */
char *words[bedKnownFields*2];	// Make a little bigger than any known bed
int wordCount, wordsRequired = 0;
char *separator = "->";
int separatorSize = strlen(separator);
int pairCount = 0;
while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* Make sure all lines have same # of fields, and at least 4. */
    if (wordsRequired == 0)
	{
        wordsRequired = wordCount;
	lineFileExpectAtLeast(lf, 4, wordCount);
	}
    else
	lineFileExpectWords(lf, wordsRequired, wordCount);
    ++pairCount;

    /* Parse out name field. */
    char *name = words[3];
    char *sepPos = stringIn(separator, name);
    if (sepPos == NULL)
        errAbort("Expecting %s in %s line %d of %s", separator, name, lf->lineIx, lf->fileName);
    char *enhName = cloneStringZ(name, sepPos-name);
    char *geneName = sepPos + separatorSize;

    /* Look up enhancer and gene. */
    enh = hashMustFindVal(enhHash, enhName);
    double *geneLevels = hashMustFindVal(geneHash, geneName);
    freez(&enhName);

    /* Output ones over minimum levels. */
    for (i=0; i < cellCount; ++i)
        {
	double enhLevel = enh->expScores[i];
	double geneLevel = geneLevels[i];
	if (enhLevel >= minAct && geneLevel >= minExp)
	    {
	    int j;
	    FILE *f = outFiles[i];
	    fprintf(f, "%s", words[0]);
	    for (j=1; j<wordCount; ++j)
		fprintf(f, "\t%s", words[j]);
	    fprintf(f, "\n");
	    }
	}
    }
verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn);

/* Clean up. */
lineFileClose(&lf);
for (i=0; i<cellCount; ++i)
    carefulClose(&outFiles[i]);
}
Example #25
void trimUniq(bioSeq *seqList)
/* Check that all seq's in list have a unique name.  Try and
 * abbreviate longer sequence names. */
{
struct hash *hash = newHash(0);
bioSeq *seq;

for (seq = seqList; seq != NULL; seq = seq->next)
    {
    char *saferString = needMem(strlen(seq->name)+1);
    char *c, *s;

    /*	Some chars are safe to allow through, other chars cause
     *	problems.  It isn't necessarily a URL safe string that is
     *	being calculated here.  The original problem was a user had
     *	the fasta header line of:
     *	chr8|59823648:59825047|+
     *	The plus sign was being taken as the query name and this
     *	created problems as that name was passed on to hgc via
     *	the ss cart variable.  The + sign became part of a URL
     *	eventually.  This loop allows only isalnum characters and
     *	= - / . : ; _ | to get through as part of the header name.  These characters
     *	all proved to be safe as single character names, or all
     *	together.
     */
    s = saferString;
    for (c = seq->name; *c != '\0'; ++c)
	{
	if ( isalnum(*c) || (*c == '=') || (*c == '-') || (*c == '/') ||
	    (*c == '.') || (*c == ':') || (*c == ';') || (*c == '_') ||
	    (*c == '|') )
	    *s++ = *c;
	}
    *s = '\0';
    freeMem(seq->name);
    if (*saferString == '\0')
	{
	freeMem(saferString);
	saferString = cloneString("YourSeq");
	}
    seq->name = saferString;

    if (strlen(seq->name) > 14)	/* Try and get rid of long NCBI .fa cruft. */
        {
	char *nameClone = NULL;
	char *abbrv = NULL;
	char *words[32];
	int wordCount;
	boolean isEns = (stringIn("ENSEMBL:", seq->name) != NULL);

	nameClone = cloneString(seq->name);
	wordCount = chopString(nameClone, "|", words, ArraySize(words));
	if (wordCount > 1)	/* Looks like it's an Ensembl/NCBI 
		                 * long name alright. */
	    {
	    if (isEns)
		{
	        abbrv = words[0];
		if (abbrv[0] == 0) abbrv = words[1];
		}
	    else if (sameString(words[1], "dbSNP"))
	        {
		if (wordCount > 2)
		    abbrv = words[2];
		else
		    abbrv = nameClone;
		}
	    else
		{
		abbrv = words[wordCount-1];
		if (abbrv[0] == 0) abbrv = words[wordCount-2];
		}
	    if (hashLookup(hash, abbrv) == NULL)
	        {
		freeMem(seq->name);
		seq->name = cloneString(abbrv);
		}
	    freez(&nameClone);
	    }
	}
    hashAddUnique(hash, seq->name, hash);
    }
freeHash(&hash);
}
Example #26
void secondPass(char *inName, char *outName)
/* Do second pass - pair HMM between homologous regions specified in
 * input. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
char *line;
int lineSize;
char *words[16];
int wordCount;
struct wabaCrude *wcList = NULL, *wc;
char qFileName[512];
struct dnaSeq *qSeqList = NULL, *seq;
struct hash *tFileHash = newHash(8);
struct hash *qSeqHash = NULL;
FILE *out = mustOpen(outName, "w");

printf("Second pass (HMM) input %s output %s\n", inName, outName);

/* Load up alignments from file and sort. */
while (lineFileNext(lf, &line, &lineSize))
    {
    wordCount = chopLine(line, words);
    if (wordCount != 10)
	errAbort("line %d of %s doesn't look like a waba first pass file",
	         lf->lineIx, lf->fileName);
    wc = wabaCrudeLoad(words);
    slAddHead(&wcList, wc);
    }
lineFileClose(&lf);
slSort(&wcList, wcCmpQposScore);


/* Go through alignments one by one, loading DNA as need be.  */
qFileName[0] = 0;
for (wc = wcList; wc != NULL; wc = wc->next)
    {
    struct hashEl *hel;
    struct dnaSeq *tSeqList, *tSeq, *qSeq;
    int qSize;
    DNA *qStart;
    int tMaxSize = 5000;
    int tMin, tMax, tMid, tSize;
    int score;

    /* Get target sequence. */
    hel = hashLookup(tFileHash, wc->tFile);
    if (hel == NULL)
	{
	printf("Loading %s\n", wc->tFile);
	tSeqList = faReadAllDna(wc->tFile);
	hel = hashAdd(tFileHash, wc->tFile, tSeqList);
	}
    else
	{
	tSeqList = hel->val;
	}
    tSeq = findSeq(tSeqList, wc->tSeq);

    /* Get query sequence. */
    if (!sameString(qFileName, wc->qFile))
	{
	strcpy(qFileName, wc->qFile);
	printf("Loading %s\n", wc->qFile);
	freeDnaSeqList(&qSeqList);
	qSeqList = faReadAllDna(wc->qFile);
	freeHash(&qSeqHash);
	qSeqHash = newHash(0);
	for (qSeq = qSeqList; qSeq != NULL; qSeq = qSeq->next)
	    hashAddUnique(qSeqHash, qSeq->name, qSeq);
	}
    qSeq = hashMustFindVal(qSeqHash, wc->qSeq);

    /* Do fine alignment. */
    qSize = wc->qEnd - wc->qStart;
    qStart = qSeq->dna + wc->qStart;
    if (wc->strand < 0)
	reverseComplement(qStart, qSize);

    tMid = (wc->tStart + wc->tEnd)/2;
    tMin = tMid-tMaxSize/2;
    tMax = tMin + tMaxSize;
    if (tMin < 0)
	tMin = 0;
    if (tMax > tSeq->size)
	tMax = tSeq->size;


    printf("Aligning %s %s:%d-%d %c to %s.%s:%d-%d +\n",
	wc->qFile, qSeq->name, wc->qStart, wc->qEnd, 
	(wc->strand < 0 ? '-' : '+'),
	wc->tFile, tSeq->name, tMin, tMax);

    fprintf(out, "Aligning %s %s:%d-%d %c to %s.%s:%d-%d +\n",
	wc->qFile, qSeq->name, wc->qStart, wc->qEnd, 
	(wc->strand < 0 ? '-' : '+'),
	wc->tFile, tSeq->name, tMin, tMax);

    score = xenAlignSmall(qStart, qSize, tSeq->dna + tMin, 
    	tMax-tMin, out, FALSE);        
    fprintf(out, "best score %d\n", score);

    if (wc->strand < 0)
	reverseComplement(qStart, qSize);
    }

freeDnaSeqList(&qSeqList);
hashTraverseVals(tFileHash, htvFreeSeq);
wabaCrudeFreeList(&wcList);
freeHash(&tFileHash);
fclose(out);
}
Example #27
void ctgFaToFa(char *ctgFa, char *ctgCoords, char *ntDir)
/* ctgFaToFa - Convert from one big file with all NT contigs to one contig per file.. */
{
struct lineFile *lf;
char fileName[512], *line;
char *ntName, *hsName;
char *parts[6];
int lineSize, partCount;
struct hash *uniqHash = newHash(0);
FILE *f = NULL;
int dotMod = 0;
struct hash *ntHash = newHash(0);
struct hash *hsHash = newHash(0);
struct ntContig *nt;
char *words[8];

printf("Loading %s\n", ctgCoords);
lf = lineFileOpen(ctgCoords, TRUE);
while (lineFileRow(lf, words))
    {
    ntName = words[0];
    if ((nt = hashFindVal(ntHash, ntName)) != NULL)
        ++nt->cloneCount;
    else
        {
	AllocVar(nt);
	hashAddSaveName(ntHash, ntName, nt, &nt->name);
	hashAddSaveName(hsHash, words[1], nt, &nt->hsName);
	nt->cloneCount = 1;
	}
    }
lineFileClose(&lf);


lf = lineFileOpen(ctgFa, FALSE);
makeDir(ntDir);
while (lineFileNext(lf, &line, &lineSize))
    {
    if ((++dotMod&0x1ffff) == 0)
        {
	printf(".");
	fflush(stdout);
	}
    if (line[0] == '>')
        {
	carefulClose(&f);
	line[lineSize-1] = 0;
	partCount = chopByChar(line, '|',parts,ArraySize(parts));
	if (partCount < 3)
	    {
	    uglyf("partCount = %d\n", partCount);
	    errAbort("Expecting | separated header line %d of %s", lf->lineIx, lf->fileName); 
	    }
	ntName = parts[1];
	nt = hashFindVal(ntHash, ntName);
	hsName = parts[2];
	if (nt == NULL)
	    {
	    hsName = firstWordInLine(ntName);
	    nt = hashMustFindVal(hsHash, hsName);
	    ntName = nt->name;
	    }
	if (nt->cloneCount > 1)
	    {
	    if (!startsWith("Hs", hsName))
	        errAbort("Expecting %s to start with 'Hs' line %d of %s",
			hsName, lf->lineIx, lf->fileName);
	    if (hashLookup(uniqHash, ntName))
	        ntName = nextFakeNtName(hsName, ntName);
	    hashAddUnique(uniqHash, ntName, NULL);
	    if (!startsWith("NT_", ntName))
		errAbort("Expecting NT_ name line %d of %s", lf->lineIx, lf->fileName); 
	    sprintf(fileName, "%s/%s.fa", ntDir, ntName);
	    f = mustOpen(fileName, "w");
	    fprintf(f, ">%s.1_1\n", ntName);
	    }
	}
    else
        {
	if (f != NULL)
	    mustWrite(f, line, lineSize);
	}
    }
printf("\n");
carefulClose(&f);
lineFileClose(&lf);
}
Example #28
void agpVsMap(char *agpName, char *infoName, char *gifName)
/* agpVsMap - Plot clones in agp vs. map coordinates. */
{
struct mapPos *mapList, *mp;
struct agpFrag *agpList, *bp;
struct hash *cloneHash = newHash(14);
struct hashEl *hel;
struct cloneInfo *cloneList = NULL, *clone;
struct memGfx *mg = NULL;
int pixWidth = 600;
int pixHeight = 600;
int rulerHeight = 20;
int maxMapPos = 0, maxAgpPos = 0;
double scaleMap, scaleAgp;
Color orange, green;

mapList = readInfoFile(infoName);
agpList = readAgpFile(agpName);

for (mp = mapList; mp != NULL; mp = mp->next)
    {
    if (mp->phase > 0)
        {
	AllocVar(clone);
	hel = hashAddUnique(cloneHash, mp->cloneName, clone);
	clone->name = hel->name;
	clone->mp = mp;
	slAddHead(&cloneList, clone);
	if (mp->pos > maxMapPos) maxMapPos = mp->pos;
	}
    }
slReverse(&cloneList);

for (bp = agpList; bp != NULL; bp = bp->next)
    {
    if (bp->chromStart > maxAgpPos) maxAgpPos = bp->chromStart;
    }

/* Draw scatterplot on bitmap. */
mg = mgNew(pixWidth, pixHeight);
mgClearPixels(mg);
orange = mgFindColor(mg, 210, 150, 0);
green = mgFindColor(mg, 0, 200, 0);
mgDrawRuler(mg, 0, pixHeight-rulerHeight, rulerHeight, pixWidth, MG_BLACK,
       mgSmallFont(), 0, maxMapPos+1);
scaleMap = (double)pixWidth/(double)(maxMapPos+1.0);
scaleAgp = (double)(pixHeight)/(double)(maxAgpPos+1.0);
for (bp = agpList; bp != NULL; bp = bp->next)
    {
    char cloneName[128];
    fragToCloneName(bp->frag, cloneName);
    clone = hashFindVal(cloneHash, cloneName);
    if (clone == NULL)
        warn("%s is in %s but not %s", cloneName, 
	    agpName, infoName);
    else
	{
	int x = round(scaleMap*clone->mp->pos);
	int y = pixHeight - round(scaleAgp*bp->chromStart);
	int phase = clone->mp->phase;
	int back;
	if (phase <= 1) back = green;
	else if (phase == 2) back = orange;
	else back = MG_RED;
	drawPlus(mg, x, y, back);
	}
    }

mgSaveGif(mg, gifName);
}