Example No. 1
struct prioritizedImage *bestImage(char *kgId, struct hash *kgToHash, struct hash *imageHash)
/* Return best image if possible, otherwise NULL. */
{
struct hashEl *extId = hashLookup(kgToHash, kgId);
struct prioritizedImage *best = NULL;

while (extId)
    {
    struct prioritizedImage *pi = hashFindVal(imageHash, extId->val);
    if (pi)
	{
	if (!best || pi->priority < best->priority)
	    {
	    best = pi;
	    }
	}
    extId = hashLookupNext(extId);
    }
return best;
}
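
Side note: every example on this page leans on the same kent hash idiom. hashAdd() may be
called repeatedly with the same key, and hashLookup()/hashLookupNext() then walk every element
stored under that key (hashFindVal() only returns the first).  A minimal sketch of the idiom,
assuming the standard kent common.h/hash.h interface; the key and value strings are invented
for illustration:

#include "common.h"
#include "hash.h"

static void demoMultiValuedKey()
/* Store two values under one key and visit both. */
{
struct hash *hash = hashNew(8);
hashAdd(hash, "kgId1", "image7");
hashAdd(hash, "kgId1", "image9");	/* same key again - both elements are kept */
struct hashEl *hel;
for (hel = hashLookup(hash, "kgId1"); hel != NULL; hel = hashLookupNext(hel))
    printf("kgId1 -> %s\n", (char *)hel->val);
hashFree(&hash);
}
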
void checkExceptions()
{
struct hashEl *hel = NULL;
struct coords *cel = NULL;
char *name;
char query[512];
struct sqlConnection *conn = hAllocConn();
struct sqlResult *sr;
char **row;
boolean matchFound = FALSE;
char *chrom;
int start = 0;
int end = 0;

verbose(1, "checking exceptions...\n");
safef(query, sizeof(query), "select name, chrom, chromStart, chromEnd from snp125Exceptions");
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    matchFound = FALSE;
    name = cloneString(row[0]);
    chrom = cloneString(row[1]);
    start = sqlUnsigned(row[2]);
    end = sqlUnsigned(row[3]);
    for (hel = hashLookup(coordHash, name); hel != NULL; hel = hashLookupNext(hel))
        {
	cel = (struct coords *)hel->val;
	if (sameString(cel->chrom, chrom) && cel->start == start && cel->end == end)
	    {
	    matchFound = TRUE;
	    break;
	    }
	}
    if (!matchFound)
        verbose(1, "no match found for %s at %s:%d-%d\n", name, chrom, start, end);
    }
sqlFreeResult(&sr);
hFreeConn(&conn);
}
struct raRecord *findRecordAtLevel(struct raLevel *level, char *key, char *release)
/* Find record of given key and release in level. */
{
/* Look up key in hash */
struct hashEl *firstEl = hashLookup(level->trackHash, key);

/* Loop through and return the first one that matches on both key (implicit in the hash
 * lookup) and release. */
struct hashEl *hel;
for (hel = firstEl; hel != NULL; hel = hashLookupNext(hel))
    {
    struct raRecord *r = hel->val;
    struct raTag *releaseTag = raRecordFindTag(r, "release");
    char *rRelease = (releaseTag == NULL ? NULL : releaseTag->val);
    if (sameOk(release, rRelease))
        return r;
    }

/* If no release was specified, return the first match regardless of release. */
if (release == NULL && firstEl != NULL)
    return firstEl->val;

return NULL;
}
struct raRecord *findClosestParent(struct raLevel *level, struct raRecord *record, char *parentKey)
/* Look up key in level hash, and return the one that is closest to self, but before self
 * in the same file. This disregards release.   It is in fact used to do the
 * inheritance of releases. */
{
struct raRecord *closestParent = NULL;
int closestDistance = BIGNUM;
struct hashEl *hel;
for (hel = hashLookup(level->trackHash, parentKey); hel != NULL; hel = hashLookupNext(hel))
    {
    struct raRecord *parent = hel->val;
    int distance = record->startLineIx - parent->startLineIx;
    if (distance < 0)
        distance = BIGNUM/4 - distance;	/* parent comes after record: push far down ranking */
    if (record->file != parent->file)
        distance = BIGNUM/2;		/* different file: worse still */
    if (distance < closestDistance)
        {
	closestDistance = distance;
	closestParent = parent;
	}
    }
return closestParent;
}
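
A quick worked example of the ranking above, with invented line numbers: if the record starts
at line 100, a parent at line 40 of the same file scores 60, a parent at line 150 of the same
file scores BIGNUM/4 + 50, and a parent in a different file scores BIGNUM/2, so the nearest
parent that precedes the record in the same file wins.
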
Example No. 5
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	info.sourceAcc = cloneString(bed->name);
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info.genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info.genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
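
A side note on the two flavors of add used above: hashAddUnique() errAborts if the key is
already present (one CDS record per transcript), while plain hashAdd() keeps duplicates, which
is what lets the psl loop walk several alignments per accession with hashLookupNext().  A small
sketch of the contrast, with invented keys and values:

#include "common.h"
#include "hash.h"

static void demoUniqueVersusMulti()
/* Contrast hashAddUnique with hashAdd. */
{
struct hash *uniq = hashNew(0);
hashAddUnique(uniq, "tx1", "cdsA");
/* hashAddUnique(uniq, "tx1", "cdsB"); would errAbort: tx1 is already in the hash */

struct hash *multi = hashNew(0);
hashAdd(multi, "tx1", "pslA");
hashAdd(multi, "tx1", "pslB");		/* fine - walk both with hashLookup/hashLookupNext */
hashFree(&uniq);
hashFree(&multi);
}
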
Example No. 6
static void agpSangerUnfinished(char *agpFile, char *contigFasta, char *agpOut)
/* Fix agp to match unfinished contigs in fasta */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp;
struct agpGap *gap;
FILE *f;
char *lastObj = NULL;
f = mustOpen(agpOut, "w");
char *newChrom = NULL;
struct hash *hash = hashFasta(contigFasta);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    //verbose(2,"#\tline: %d\n", lf->lineIx);
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
		 lf->lineIx, lf->fileName, wordCount);

    if (!lastObj || !sameString(words[0],lastObj))
	{
	freez(&newChrom);
	newChrom = cloneString(words[0]);
	lastPos = 0;
	}

    if (words[4][0] != 'N')
	{
	lineFileExpectAtLeast(lf, 9, wordCount);
	agp = agpFragLoad(words);
	/* agp is 1-based but agp loaders do not adjust for 0-based: */
    	agp->chromStart -= 1;
	agp->fragStart  -= 1;
	if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
	    errAbort("Sizes don't match in %s and %s line %d of %s\n",
		agp->chrom, agp->frag, lf->lineIx, lf->fileName);

	char *root = cloneString(agp->frag);
	chopSuffixAt(root, '.');

	struct hashEl *e, *elist = hashLookup(hash, root);
	for (e = elist; e; e = hashLookupNext(e))
	    {
	    struct unfinishedContig *u = e->val;
            if ((u->fragStart <= agp->fragStart) && (u->fragEnd >= agp->fragEnd))
		{
		agp->frag = cloneString(u->frag);
		agp->fragEnd -= u->fragStart;
		agp->fragStart -= u->fragStart;
		}
	    }
	freeMem(root);
	}
    else
        {
	lineFileExpectAtLeast(lf, 8, wordCount);
	gap = agpGapLoad(words);
	/* to be consistent with agpFrag */
	gap->chromStart -= 1;
	agp = (struct agpFrag*)gap;
	}

    if (agp->chromStart != lastPos)
	errAbort("Start doesn't match previous end line %d of %s\n"
	    "agp->chromStart: %u\n" 
	    "agp->chromEnd: %u\n" 
	    "lastPos: %u\n" 
	    ,lf->lineIx, lf->fileName
	    ,agp->chromStart
	    ,agp->chromEnd
	    ,lastPos
	    );

    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */
	
    if (words[4][0] != 'N')
	{
	/* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
	agpFragOutput(agp, f, '\t', '\n');
	agpFragFree(&agp);
	}
    else
        {
	/* restore back to 1-based for agp 
	 * because agpGapOutput doesn't compensate */
	gap->chromStart += 1;
	agpGapOutput(gap, f, '\t', '\n');
	agpGapFree(&gap);
	}
	
    }

carefulClose(&f);
}
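
To make the coordinate bookkeeping above concrete, with invented numbers: an AGP fragment line
with chromStart=1001 and chromEnd=2000 describes bases 1001..2000 inclusive, so after the -1
adjustment the in-memory range is the 0-based half-open [1000, 2000) and its size is still
chromEnd - chromStart = 1000.  agpFragOutput() writes 1-based coordinates again on output, but
agpGapOutput() does not compensate, which is why the gap branch restores chromStart += 1 before
printing.
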
struct bed *searchStrand(struct hash *sixers, struct cutter *ACGTo[], struct dnaSeq *seq, int startOffset, char strand)
/* Cheesy function that checks a strand for the enzymes after they're put into the hash/array
   structures.  This used to be part of the matchEnzymes function, but now it gets called twice. */
{
struct cutter *enz;
struct bed *bedList = NULL;
int seqPos;

if (ACGTo[0] || ACGTo[1] || ACGTo[2] || ACGTo[3] || (sixers->elCount > 0)) 
    {
    for (seqPos = 0; seqPos < seq->size; seqPos++)
	{
	struct cutter *enzList = NULL;
	char sixer[7];
	int bedPos = (strand == '-') ? (seq->size - seqPos) : seqPos;
	if (seq->size - seqPos >= 6)
	    {
	    struct hashEl *el = NULL;
	    sixer[6] = '\0';
	    memcpy(sixer, seq->dna+seqPos, 6);
	    el = hashLookup(sixers, sixer);
	    if (el)
		{	    
		struct bed *add;
		enz = el->val;
		add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand);
		slAddHead(&bedList, add);
		/* Just in case there's another one with the same sequence. */
		while ((el = hashLookupNext(el)))
		    {
		    enz = el->val;
		    add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand);
		    slAddHead(&bedList, add);
		    }
		}
	    }
	/* Use a certain list depending on which letter we're on in the sequence. */
	if (seq->dna[seqPos] == 'A')
	    enzList = ACGTo[0];
	else if (seq->dna[seqPos] == 'C')
	    enzList = ACGTo[1];
	else if (seq->dna[seqPos] == 'G')
	    enzList = ACGTo[2];
	else if (seq->dna[seqPos] == 'T')
	    enzList = ACGTo[3];
	for (enz = enzList; enz != NULL; enz = enz->next)
	    {
	    int enzPos = 0;
	    int seqCurPos = seqPos;	
	    while (enzPos < enz->size && seqCurPos < seq->size && matchingBase(enz->seq[enzPos],seq->dna[seqCurPos]))
		{
		enzPos++; 
		seqCurPos++;
		}
	    if (enzPos == enz->size)
		{
		struct bed *add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand);
		slAddHead(&bedList, add);
		}
	    }
	}
    }
return bedList;
}
Example No. 8
void doPastedIdentifiers(struct sqlConnection *conn)
/* Process submit in paste identifiers page. */
{
char *idText = trimSpaces(cartString(cart, hgtaPastedIdentifiers));
htmlOpen("Table Browser (Input Identifiers)");
if (isNotEmpty(idText))
    {
    /* Write terms to temp file, checking whether they have matches, and
     * save temp file name. */
    boolean saveIdText = (strlen(idText) < MAX_IDTEXT);
    char *idTextForLf = saveIdText ? cloneString(idText) : idText;
    struct lineFile *lf = lineFileOnString("idText", TRUE, idTextForLf);
    char *line, *word;
    struct tempName tn;
    FILE *f;
    int totalTerms = 0, foundTerms = 0;
    struct slName* missingTerms = NULL;
    struct dyString *exampleMissingIds = dyStringNew(256);
    char *actualDb = database;
    if (sameWord(curTable, WIKI_TRACK_TABLE))
	actualDb = wikiDbName();
    struct hTableInfo *hti = maybeGetHti(actualDb, curTable, conn);
    char *idField = getIdField(actualDb, curTrack, curTable, hti);
    if (idField == NULL)
	{
	warn("Sorry, I can't tell which field of table %s to treat as the "
	     "identifier field.", curTable);
	webNewSection("Table Browser");
	cartRemove(cart, hgtaIdentifierDb);
	cartRemove(cart, hgtaIdentifierTable);
	cartRemove(cart, hgtaIdentifierFile);
	mainPageAfterOpen(conn);
	htmlClose();
	return;
	}
    struct slName *allTerms = NULL, *term;
    while (lineFileNext(lf, &line, NULL))
	{
	while ((word = nextWord(&line)) != NULL)
	    {
	    term = slNameNew(word);
	    slAddHead(&allTerms, term);
	    totalTerms++;
	    }
	}
    slReverse(&allTerms);
    lineFileClose(&lf);
    char *extraWhere = NULL;
    int maxIdsInWhere = cartUsualInt(cart, "hgt_maxIdsInWhere", DEFAULT_MAX_IDS_IN_WHERE);
    if (totalTerms > 0 && totalTerms <= maxIdsInWhere)
	extraWhere = slNameToInExpression(idField, allTerms);

    struct lm *lm = lmInit(0);
    struct hash *matchHash = getAllPossibleIds(conn, lm, idField, extraWhere);
    trashDirFile(&tn, "hgtData", "identifiers", ".key");
    f = mustOpen(tn.forCgi, "w");
    for (term = allTerms;  term != NULL;  term = term->next)
	{
	struct slName *matchList = NULL, *match;
	if (matchHash == NULL)
	    {
	    matchList = slNameNew(term->name);
	    }
	else
	    {
	    /* Support multiple alias->id mappings: */
	    char upcased[1024];
	    safecpy(upcased, sizeof(upcased), term->name);
	    touppers(upcased);
	    struct hashEl *hel = hashLookup(matchHash, upcased);
	    if (hel != NULL)
		{
		matchList = slNameNew((char *)hel->val);
		while ((hel = hashLookupNext(hel)) != NULL)
		    {
		    match = slNameNew((char *)hel->val);
		    slAddHead(&matchList, match);
		    }
		}
	    }
	if (matchList != NULL)
	    {
	    foundTerms++;
	    for (match = matchList;  match != NULL;  match = match->next)
		{
		mustWrite(f, match->name, strlen(match->name));
		mustWrite(f, "\n", 1);
		}
	    }
	else 
	    {
	    slAddHead(&missingTerms, slNameNew(term->name));
	    }
	}
    slReverse(&missingTerms);
    carefulClose(&f);
    cartSetString(cart, hgtaIdentifierDb, database);
    cartSetString(cart, hgtaIdentifierTable, curTable);
    cartSetString(cart, hgtaIdentifierFile, tn.forCgi);
    if (saveIdText)
	freez(&idTextForLf);
    else
	cartRemove(cart, hgtaPastedIdentifiers);
    int missingCount = totalTerms - foundTerms;
    if (missingCount > 0)
	{
	char *xrefTable, *aliasField;
	getXrefInfo(conn, &xrefTable, NULL, &aliasField);
	boolean xrefIsSame = xrefTable && sameString(curTable, xrefTable);
	struct tempName tn;
	trashDirFile(&tn, "hgt/missingIds", cartSessionId(cart), ".tmp");
	FILE *f = mustOpen(tn.forCgi, "w");
	int exampleCount = 0;
	for (term = missingTerms;  term != NULL;  term = term->next)
	    {
	    if (exampleCount < 10)
		{
		++exampleCount;
		dyStringPrintf(exampleMissingIds, "%s\n", term->name);
		}
	    fprintf(f, "%s\n", term->name);
	    }
	carefulClose(&f);

	dyStringPrintf(exampleMissingIds, "\n<a href=%s>Complete list of missing identifiers</a>\n", tn.forHtml);

	warn("Note: %d of the %d given identifiers have no match in "
	     "table %s, field %s%s%s%s%s.  "
	     "Try the \"describe table schema\" button for more "
	     "information about the table and field.\n"
	     "%d %smissing identifier(s):\n"
	     "%s\n",
	     (totalTerms - foundTerms), totalTerms,
	     curTable, idField,
	     (xrefTable ? (xrefIsSame ? "" : " or in alias table ") : ""),
	     (xrefTable ? (xrefIsSame ? "" : xrefTable) : ""),
	     (xrefTable ? (xrefIsSame ? " or in field " : ", field ") : ""),
	     (xrefTable ? aliasField : ""),
	     exampleCount,
	     exampleCount < missingCount ? "example " : "",
	     exampleMissingIds->string
	    );
	webNewSection("Table Browser");
	}
    lmCleanup(&lm);
    hashFree(&matchHash);
    }
else
    {
    cartRemove(cart, hgtaIdentifierFile);
    }
mainPageAfterOpen(conn);
htmlClose();
}
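
The alias handling above assumes matchHash was built with upper-cased keys and one hashAdd()
per alias/id pair, so a single pasted term can fan out to several identifiers.  A small sketch
of that lookup shape, with an invented helper name (the real hash comes from
getAllPossibleIds()):

#include "common.h"
#include "hash.h"

static struct slName *lookupAliasMatches(struct hash *matchHash, char *term)
/* Return ids stored under the upper-cased term, or NULL if there are none. */
{
char upcased[1024];
safecpy(upcased, sizeof(upcased), term);
touppers(upcased);
struct slName *matchList = NULL;
struct hashEl *hel;
for (hel = hashLookup(matchHash, upcased); hel != NULL; hel = hashLookupNext(hel))
    slAddHead(&matchList, slNameNew((char *)hel->val));
return matchList;
}
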
void tjLoadSome(struct region *regionList,
                struct joinedTables *joined, int fieldOffset, int keyOffset,
                char *idField, struct hash *idHash,
                struct slName *chopBefore, struct slName *chopAfter,
                struct tableJoiner *tj, boolean isPositional, boolean isFirst)
/* Load up rows. */
{
    struct region *region;
    struct dyString *sqlFields = dyStringNew(0);
    struct joinerDtf *dtf;
    struct slRef *ref;
    struct joinerPair *jp;
    int fieldCount = 0, keyCount = 0;
    int idFieldIx = -1;
    struct sqlConnection *conn = hAllocConn(tj->database);
    char *identifierFilter = NULL;
    char *filter;
    boolean needUpdateFilter = FALSE;
    struct joinedRow *jr;

    if (isFirst)
        identifierFilter = identifierWhereClause(idField, idHash);
    filter = filterClause(tj->database, tj->table, regionList->chrom, identifierFilter);

    /* Record combined filter. */
// Show only the SQL filter built from filter page options, not identifierFilter,
// because identifierFilter can get enormous (like 126kB for 12,500 rsIDs).
    char *filterNoIds = filterClause(tj->database, tj->table, regionList->chrom, NULL);
    if (filterNoIds != NULL)
    {
        if (joined->filter == NULL)
            joined->filter = dyStringNew(0);
        else
            dyStringAppend(joined->filter, " AND ");
        dyStringAppend(joined->filter, filterNoIds);
        if (!isFirst)
        {
            needUpdateFilter = TRUE;
            for (jr = joined->rowList; jr != NULL; jr = jr->next)
                jr->hitThisTable = FALSE;
        }
    }

    /* Create field spec for sql - first fields user will see, and
     * second keys if any. */
    for (dtf = tj->fieldList; dtf != NULL; dtf = dtf->next)
    {
        struct joinerDtf *dupe = joinerDtfClone(dtf);
        slAddTail(&joined->fieldList, dupe);
        dyStringAddList(sqlFields, dtf->field);
        ++fieldCount;
    }
    for (ref = tj->keysOut; ref != NULL; ref = ref->next)
    {
        struct joinerDtf *dupe;
        jp = ref->val;
        dupe = joinerDtfClone(jp->a);
        slAddTail(&joined->keyList, dupe);
        dyStringAddList(sqlFields, jp->a->field);
        ++keyCount;
    }
    if (idHash != NULL)
    {
        if (idField == NULL)
            internalErr();
        idFieldIx = fieldCount + keyCount;
        dyStringAddList(sqlFields, idField);
    }

    for (region = regionList; region != NULL; region = region->next)
    {
        char **row;
        /* We free at end of loop so we get new one for each chromosome. */
        char *filter = filterClause(tj->database, tj->table, region->chrom, identifierFilter);
        struct sqlResult *sr = regionQuery(conn, tj->table,
                                           sqlFields->string, region, isPositional, filter);
        while (sr != NULL && (row = sqlNextRow(sr)) != NULL)
        {
            if (idFieldIx < 0)
            {
                if (jrRowAdd(joined, row, fieldCount, keyCount) == NULL)
                    break;
            }
            else
            {
                char *id = row[idFieldIx];
                if (isFirst)
                {
                    if (hashLookup(idHash, id))
                    {
                        if (jrRowAdd(joined, row, fieldCount, keyCount) == NULL)
                            break;
                    }
                }
                else
                {
                    struct hashEl *bucket;
                    id = chopKey(chopBefore, chopAfter, id);
                    for (bucket = hashLookup(idHash, id); bucket != NULL;
                            bucket = hashLookupNext(bucket))
                    {
                        jr = bucket->val;
                        jr->hitThisTable = TRUE;
                        jrRowExpand(joined, jr, row,
                                    fieldOffset, fieldCount, keyOffset, keyCount);
                    }
                }
            }
        }
        sqlFreeResult(&sr);
        freez(&filter);
        if (!isPositional)
            break;
    }
    if (isFirst)
        slReverse(&joined->rowList);
    if (needUpdateFilter)
    {
        for (jr = joined->rowList; jr != NULL; jr = jr->next)
        {
            jr->passedFilter &= jr->hitThisTable;
        }
    }
    tj->loaded = TRUE;
    hFreeConn(&conn);
}
static void associationCalcDistances(struct order *ord, 
	struct sqlConnection *conn, /* connection to main database. */
	struct genePos **pGeneList, struct hash *geneHash, int maxCount)
/* Fill in distance fields in geneList. */
{
struct sqlResult *sr;
char **row;
struct hash *curTerms = newHash(8);
struct hash *protHash = NULL;
struct hash *lookupHash = geneHash;
char query[512];
struct genePos *gp;
char *geneId = curGeneId->name;

if (ord->protKey)
    {
    /* Build up hash of genes keyed by protein names.  (The geneHash
     * passed in is keyed by the mRNA name.) */
    protHash = newHash(17);
    for (gp = *pGeneList; gp != NULL; gp = gp->next)
	{
	char *id = (ord->protKey 
	    ? (kgVersion == KG_III ? lookupProtein(conn, gp->name) : gp->protein)
	    : gp->name);
	hashAdd(protHash, id, gp);
	}

    /* Also switch current gene id and lookup hash to protein. */
    geneId = curGeneId->protein;
    lookupHash = protHash;
    }


/* Build up hash full of all go IDs associated with gene. */
if (geneId != NULL)
    {
    safef(query, sizeof(query), ord->queryOne, geneId);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
	{
	hashAdd(curTerms, row[0], NULL);
	}
    sqlFreeResult(&sr);
    }

/* Stream through association table counting matches. */
sr = sqlGetResult(conn, ord->queryAll);
while ((row = sqlNextRow(sr)) != NULL)
    {
    if (hashLookup(curTerms, row[1]))
	{
	struct hashEl *hel = hashLookup(lookupHash, row[0]);
	while (hel != NULL)
	    {
	    gp = hel->val;
	    gp->count += 1;
	    hel = hashLookupNext(hel);
	    }
	}
    }
sqlFreeResult(&sr);

/* Go through list translating non-zero counts to distances. */
for (gp = *pGeneList; gp != NULL; gp = gp->next)
    {
    if (gp->count > 0)
        {
	gp->distance = 1.0/gp->count;
	gp->count = 0;
	}
    if (sameString(gp->name, curGeneId->name))	/* Force self to top of list. */
        gp->distance = 0;
    }

hashFree(&protHash);
hashFree(&curTerms);
}
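
Worked example of the scoring above, with invented counts: a gene sharing four association
terms with the current gene ends up with distance 1/4 = 0.25, a gene sharing one term gets 1.0,
genes sharing none keep their default distance, and the current gene itself is forced to 0 so
it sorts to the top.
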
Example No. 11
void hgLoadNetDist(char *inTab, char *db, char *outTable)
/* Read gene/gene/distance triples from inTab, optionally remap the ids via aliasHash,
 * and load them into outTable in db. */
{
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);

struct sqlConnection *hConn = sqlConnect(db);

FILE *missingFile=NULL;
int missingCount=0;

struct lineFile *lf=NULL;
char *row[3];
int rowCount=3;

if (sqlRemap)
    {
    fetchRemapInfo(db);
    missingHash = newHash(16);  
    missingFile = mustOpen("missing.tab","w");
    }

/* read edges from file */


lf=lineFileOpen(inTab, TRUE);

/* print final values, remapping if needed */

while (lineFileNextRowTab(lf, row, rowCount))
    {
    char *geneI = row[0];
    char *geneJ = row[1];
    char *dij = row[2];
    char *gi=NULL, *gj=NULL;
    if (sqlRemap)
	{ /* it is possible for each id to have multiple remap values in hash */
	struct hashEl *hi=NULL, *hj=NULL, *hjSave=NULL;
	hi = hashLookup(aliasHash,geneI);
	hj = hashLookup(aliasHash,geneJ);
	missingCount += handleMissing(hi, geneI, missingHash, missingFile);
	missingCount += handleMissing(hj, geneJ, missingHash, missingFile);
	hjSave = hj;
	/* do all combinations of i and j */	
	for(;hi;hi=hashLookupNext(hi))
	    {
	    gi = (char *)hi->val;
	    for(;hj;hj=hashLookupNext(hj))
		{
		gj = (char *)hj->val;
		fprintf(f,"%s\t%s\t%s\n",gi,gj,dij);
		}
	    hj = hjSave; /* reset it */
	    }
	}
    else
	{
	gi=geneI;
	gj=geneJ;
	fprintf(f,"%s\t%s\t%s\n",gi,gj,dij);
	}
    }

lineFileClose(&lf);
carefulClose(&f);    

if (sqlRemap)
    {
    carefulClose(&missingFile);
    if (missingCount == 0)
	unlink("missing.tab");
    else	    
    	printf("hgLoadNetDist %d id-remapping misses, see missing.tab\n", missingCount);
    }

createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}
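
For the remapped case above, the nested loops emit one row per alias pair, so with invented
counts a geneI mapping to 2 aliases and a geneJ mapping to 3 produces 2 x 3 = 6 output lines,
all carrying the same dij; the hjSave pointer is what rewinds the inner loop for each new gi.
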