Esempio n. 1
0
void fbOrPsl(Bits *acc, char *track, char *chrom, int chromSize)
/* Or into acc the bits of every alignment whose target is chrom, reading
 * the psl file that chromFileName() names for (track, chrom).  Returns
 * without touching acc when that file does not exist. */
{
char pslPath[512];
struct lineFile *pslLf;
struct psl *aln;

chromFileName(track, chrom, pslPath);
if (fileExists(pslPath))
    {
    pslLf = pslFileOpen(pslPath);
    for (aln = pslNext(pslLf); aln != NULL; aln = pslNext(pslLf))
        {
        /* Records for other targets are read but ignored. */
        if (sameString(aln->tName, chrom))
            setPslBits(pslLf, acc, aln, 0, chromSize);
        pslFree(&aln);
        }
    lineFileClose(&pslLf);
    }
}
Esempio n. 2
0
static struct hash *collectQueryStats(char *pslFile, char *querySizeFile)
/* Accumulate per-query statistics over every alignment in pslFile and
 * return the table holding them.  The table comes from sumStatsLoad() when
 * querySizeFile is given, otherwise it starts as a new empty hash. */
{
struct hash *statsByQuery;
struct lineFile *lf;
struct psl *aln;

if (querySizeFile == NULL)
    statsByQuery = hashNew(queryHashPowTwo);
else
    statsByQuery = sumStatsLoad(querySizeFile);

lf = pslFileOpen(pslFile);
for (;;)
    {
    struct sumStats *queryStats;

    aln = pslNext(lf);
    if (aln == NULL)
        break;
    queryStats = sumStatsGetForQuery(statsByQuery, aln->qName, aln->qSize);
    sumStatsAccumulateQuery(queryStats, aln);
    pslFree(&aln);
    }
lineFileClose(&lf);
return statsByQuery;
}
void extractUsedPairs(struct hash *pairHash, char *inPslName, char *outPairName)
/* Extract pairs that are used in inPsl to outPair.
 *   pairHash    - looked up by psl query name; values are struct pair.
 *   inPslName   - psl file to scan.
 *   outPairName - file name handed to writePairs().
 * Every pair whose a or b name shows up as a query gets exactly one
 * pairRef, flagged with which of the two names were seen.  The refs are
 * passed to writePairs() in order of first appearance. */
{
struct hash *refHash = newHash(12);
struct pairRef *refList = NULL, *ref;
struct psl *psl;
struct lineFile *lf;

printf("Processing pairs from %s to %s\n", inPslName, outPairName);
lf = pslFileOpen(inPslName);
while ((psl = pslNext(lf)) != NULL)
    {
    char *name = psl->qName;
    struct hashEl *hel;
    struct pair *pair;
    if ((hel = hashLookup(pairHash, name)) != NULL)
        {
        pair = hel->val;
        if ((hel = hashLookup(refHash, name)) != NULL)
            {
            ref = hel->val;
            }
        else
            {
            /* First sighting of this pair: index the new ref under both
             * names so that either end finds it later. */
            AllocVar(ref);
            ref->pair = pair;
            slAddHead(&refList, ref);
            hashAdd(refHash, pair->a, ref);
            hashAdd(refHash, pair->b, ref);
            }
        if (sameString(name, pair->a))
            ref->gotA = TRUE;
        else
            ref->gotB = TRUE;
        }
    pslFree(&psl);
    }
/* Close the input; it used to be left open on return. */
lineFileClose(&lf);
/* slAddHead built the list backwards; restore first-seen order. */
slReverse(&refList);
writePairs(outPairName, refList);
slFreeList(&refList);
freeHash(&refHash);
}
Esempio n. 4
0
void loadPslsFromFile(char *pslFile, char *chrom, struct sqlConnection *conn)
/** Load the psls from pslFile rather than from the database.  Widens the
    minPslStart/maxPslEnd globals over every loaded psl, creates the
    chromPslBin and agxSeenBin keepers over that range, then stores the psls
    on chrom in chromPslBin and frees the rest.  conn is not used here. */
{
struct psl *allPsls = pslLoadAll(pslFile);
struct psl *cur = NULL, *following = NULL;

/* First pass: the range covers all targets, not just chrom. */
cur = allPsls;
while (cur != NULL)
    {
    minPslStart = min(cur->tStart, minPslStart);
    maxPslEnd = max(cur->tEnd, maxPslEnd);
    cur = cur->next;
    }
chromPslBin = binKeeperNew(minPslStart, maxPslEnd);
agxSeenBin = binKeeperNew(minPslStart, maxPslEnd);

/* Second pass: remember the successor first, since cur may be freed. */
cur = allPsls;
while (cur != NULL)
    {
    following = cur->next;
    if (!sameString(cur->tName, chrom))
        pslFree(&cur);
    else
        binKeeperAdd(chromPslBin, cur->tStart, cur->tEnd, cur);
    cur = following;
    }
}
Esempio n. 5
0
void blatFilter(char *outName, int inCount, char *inNames[])
/* blatFilter - filter blat alignments somewhat.
 * Reads each psl file in inNames[0..inCount-1] and writes to outName the
 * alignments that either have match + repMatch + nCount below 260 or pass
 * detailTest().  dotOut() is called once per alignment read. */
{
int i;
FILE *f = mustOpen(outName, "w");

for (i=0; i<inCount; ++i)
    {
    char *inName = inNames[i];
    struct lineFile *lf = pslFileOpen(inName);
    struct psl *psl;
    while ((psl = pslNext(lf)) != NULL)
        {
        dotOut();
        if (psl->match + psl->repMatch + psl->nCount < 260 || detailTest(psl))
            pslTabOut(psl, f);
        pslFree(&psl);
        }
    /* Release each input once exhausted so that a long list of inputs
     * does not accumulate open files. */
    lineFileClose(&lf);
    }
printf("\n");
/* Close the output explicitly instead of relying on process exit. */
carefulClose(&f);
}
void cloneAliPosTab(char *fileName, struct hash *cloneHash)
/* Write out clonePos.tab.  Scans every row of chr18_frags, folds the
 * alignments of each clone into one start/end span, sorts the spans with
 * cmpClonePos and hands them to writePosList(). */
{
struct sqlConnection *conn = hAllocConn();
struct clonePos *posList = NULL;
struct sqlResult *sr;
char query[256];
char **row;

sprintf(query, "select * from chr18_frags");
sr = sqlGetResult(conn, query);
for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    struct psl *psl = pslLoad(row);
    struct cloneInfo *info;
    struct clonePos *pos;

    /* The query name is rewritten in place before the clone lookup. */
    fragNameToCloneName(psl->qName);
    info = findClone(cloneHash, psl->qName);
    pos = info->aliPos;
    if (pos != NULL)
        {
        /* Clone already has a span: stretch it to cover this alignment. */
        if (pos->start > psl->tStart)
            pos->start = psl->tStart;
        if (pos->end < psl->tEnd)
            pos->end = psl->tEnd;
        }
    else
        {
        /* First alignment for this clone: start a new span. */
        AllocVar(pos);
        pos->info = info;
        info->aliPos = pos;
        pos->start = psl->tStart;
        pos->end = psl->tEnd;
        slAddHead(&posList, pos);
        }
    pslFree(&psl);
    }
sqlFreeResult(&sr);
hFreeConn(&conn);
slSort(&posList, cmpClonePos);
writePosList(fileName, posList, "chr18");
}
Esempio n. 7
0
void copyIntronPsls(struct gbSelect* select, FILE* outPslFh,
                    struct recCounts* recCounts)
/* If the intronPsl file for this selection exists under gWorkDir, pass
 * each of its alignments to copyIntronPsl(); otherwise do nothing. */
{
char srcPath[PATH_LEN];
struct lineFile* srcLf;
struct psl* aln;

gbAlignedGetPath(select, "intronPsl", gWorkDir, srcPath);
if (!fileExists(srcPath))
    return;

gbVerbEnter(2, "installing from %s", srcPath);
srcLf = gzLineFileOpen(srcPath);
for (aln = pslNext(srcLf); aln != NULL; aln = pslNext(srcLf))
    {
    copyIntronPsl(select, aln, srcPath, outPslFh, recCounts);
    pslFree(&aln);
    }
gzLineFileClose(&srcLf);
gbVerbLeave(2, "installing from %s", srcPath);
}
Esempio n. 8
0
static void processHspRec(struct ncbiBlastBlastOutput *outputRec, struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec,
                          struct ncbiBlastHsp *hspRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
/* process one HSP record, converting to a PSL */
{
int queryLen = (iterRec->ncbiBlastIterationQueryLen != NULL) 
    ? iterRec->ncbiBlastIterationQueryLen->text
    : outputRec->ncbiBlastBlastOutputQueryLen->text;
struct coords qUcsc = blastToUcsc(hspRec->ncbiBlastHspQueryFrom->text, hspRec->ncbiBlastHspQueryTo->text, queryLen,
                                  ((hspRec->ncbiBlastHspQueryFrame == NULL) ? 0 : hspRec->ncbiBlastHspQueryFrame->text));
struct coords tUcsc = blastToUcsc(hspRec->ncbiBlastHspHitFrom->text, hspRec->ncbiBlastHspHitTo->text, hitRec->ncbiBlastHitLen->text,
                                  ((hspRec->ncbiBlastHspHitFrame == NULL) ? 0 : hspRec->ncbiBlastHspHitFrame->text));
struct psl *psl = pslBuildFromHsp(getQName(outputRec, iterRec), qUcsc.size, qUcsc.start, qUcsc.end, qUcsc.strand, hspRec->ncbiBlastHspQseq->text,
                                  getTName(hitRec),  tUcsc.size, tUcsc.start, tUcsc.end, tUcsc.strand, hspRec->ncbiBlastHspHseq->text,
                                  flags);
if  ((psl->blockCount > 0) && ((hspRec->ncbiBlastHspEvalue->text <= eVal) || (eVal == -1)))
    {
    outputPsl(psl, pslFh);
    if (scoreFh != NULL)
        outputScore(psl, outputRec, iterRec, hitRec, hspRec, scoreFh);
    }
pslFree(&psl);
}
Esempio n. 9
0
void blatFlekFilter(char *outName, int inCount, char *inNames[])
/* blatFlekFilter - filter blat alignments somewhat.
 * Reads each psl file in inNames[0..inCount-1].  An alignment whose target
 * span is below 3 * (qEnd + qStart) is written unchanged to outName; any
 * other alignment is handed to writePslFrags().  dotOut() is called once
 * per alignment read. */
{
int i;
FILE *f = mustOpen(outName, "w");

for (i=0; i<inCount; ++i)
    {
    char *inName = inNames[i];
    struct lineFile *lf = pslFileOpen(inName);
    struct psl *psl;
    while ((psl = pslNext(lf)) != NULL)
        {
        dotOut();
        /* NOTE(review): (qEnd + qStart) is kept exactly as found; a query
         * span would normally be qEnd - qStart - confirm the intent. */
        if (psl->tEnd - psl->tStart < (psl->qEnd + psl->qStart) * 3)
            pslTabOut(psl, f);
        else
            writePslFrags(psl, f);
        pslFree(&psl);
        }
    /* Release each input once exhausted so that a long list of inputs
     * does not accumulate open files. */
    lineFileClose(&lf);
    }
printf("\n");
/* Close the output explicitly instead of relying on process exit. */
carefulClose(&f);
}
void readFile(char *pslFile)
/* Implements the readFile task: read up to gMaxRows alignments from
 * pslFile through a pslReader, copy them to gOutput when one is set, and
 * pass the number of rows read to checkNumRows(). */
{
struct pslReader* reader = pslReaderFile(pslFile, gChrom);
FILE *outFh = NULL;
struct psl* aln;
int numRows = 0;

if (gOutput != NULL)
    outFh = mustOpen(gOutput, "w");

for (;;)
    {
    /* Check the row limit first so no extra record is pulled. */
    if (!(numRows < gMaxRows))
        break;
    aln = pslReaderNext(reader);
    if (aln == NULL)
        break;
    if (outFh != NULL)
        pslTabOut(aln, outFh);
    pslFree(&aln);
    numRows++;
    }

carefulClose(&outFh);
pslReaderFree(&reader);
checkNumRows(pslFile, numRows);
}
Esempio n. 11
0
void copyPsls(struct gbSelect* select, unsigned pslFileType, FILE* outPslFh,
              struct gbEntryCnts* counts)
/* If the PSL file of the given type exists under gWorkDir for this
 * selection, pass each of its alignments to copyPsl(), which also receives
 * counts; otherwise do nothing. */
{
char srcPath[PATH_LEN];
struct lineFile* srcLf;
struct psl* aln;

gbAlignedGetPath(select, gPslFileExt[pslFileType], gWorkDir, srcPath);
if (!fileExists(srcPath))
    return;

gbVerbEnter(2, "installing from %s", srcPath);
srcLf = gzLineFileOpen(srcPath);
for (aln = pslNext(srcLf); aln != NULL; aln = pslNext(srcLf))
    {
    copyPsl(select, pslFileType, aln, srcPath, outPslFh, counts);
    pslFree(&aln);
    }
gzLineFileClose(&srcLf);
gbVerbLeave(2, "installing from %s", srcPath);
}
static void mapPslPair(struct psl *inPsl, struct mapAln *mapAln,
                       FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh)
/* Map inPsl through one mapping alignment and write the result.  Aborts
 * when the input's target size differs from the mapping's query size. */
{
struct psl* result;

if (inPsl->tSize != mapAln->psl->qSize)
    errAbort("Error: inPsl %s tSize (%d) != mapping alignment %s qSize (%d) (perhaps you need to specify -swapMap?)\n",
             inPsl->tName, inPsl->tSize, mapAln->psl->qName, mapAln->psl->qSize);

result = pslTransMap(mapOpts, inPsl, mapAln->psl);

/* Nothing is written unless pslTransMap returned an alignment. */
if (result != NULL)
    {
    if (suffix != NULL)
        addQNameSuffix(result);
    pslTabOut(result, outPslFh);
    /* The two side outputs are optional. */
    if (mapInfoFh != NULL)
        writeMapInfo(mapInfoFh, inPsl, mapAln, result);
    if (mappingPslFh != NULL)
        pslTabOut(mapAln->psl, mappingPslFh);
    }
pslFree(&result);
}
Esempio n. 13
0
void somePsls(char *database, char *table, char *inList, char *outPsl)
/* somePsls - Get some psls from database.
 *   database - database to connect to.
 *   table    - table queried for alignments.
 *   inList   - file with one query name per row.
 *   outPsl   - file the matching alignments are written to.
 * For each name in inList, every row of table whose qName equals that
 * name is loaded as a psl and written to outPsl. */
{
char *words[1], **row;
FILE *f = mustOpen(outPsl, "w");
struct lineFile *lf = lineFileOpen(inList, TRUE);
char query[256];
struct psl *psl;
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;

while (lineFileRow(lf, words))
    {
    sqlSafef(query, sizeof query, "select * from %s where qName = '%s'", table, words[0]);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        /* row+1 skips the table's first column - presumably a bin
         * column; confirm against the table schema. */
        psl = pslLoad(row+1);
        pslTabOut(psl, f);
        pslFree(&psl);
        }
    sqlFreeResult(&sr);
    }
/* Release everything opened above; none of it used to be closed. */
sqlDisconnect(&conn);
lineFileClose(&lf);
carefulClose(&f);
}
Esempio n. 14
0
struct altGraphX *agFromGp(char *db, struct genePred *gp, struct sqlConnection *conn, 
			   int maxGap, FILE *out)
/** Build an altGraphX record from the psl alignments that getPsls() finds
    for the genePred.  Returns NULL when there are no alignments, or when
    none of them ends up on the genePred's strand. */
{
struct altGraphX *graph = NULL;
struct dnaSeq *genoSeq = NULL;
struct ggMrnaAli *aliList = NULL, *ali = NULL, *aliAfter = NULL, *sameStrandAlis = NULL;
struct psl *loaded = NULL, *cur = NULL, *kept = NULL, *after = NULL;
char *chrom = gp->chrom;
int chromStart = BIGNUM;
int chromEnd = -1;
int loadedCount;

verbose(2, "agFromGp on %s %s:%d-%d\n", gp->name, gp->chrom, gp->txStart, gp->txEnd);

loaded = getPsls(gp, conn);
loadedCount = slCount(loaded);
verbose(3, "  got %d psls\n", loadedCount);
if(loadedCount == 0)
    {
    verbose(2, "No available alignments for %s.", gp->name);
    return NULL;
    }

/* Let the alignments decide how wide a region to work on. */
expandToMaxAlignment(loaded, chrom, &chromStart, &chromEnd);
verbose(3, "  expanded to %s:%d-%d\n", chrom, chromStart, chromEnd);

/* Genomic sequence for that region. */
genoSeq = dnaFromChrom(db, chrom, chromStart, chromEnd, dnaLower);

/* Keep psls that pass the intron/single-exon test.  Rejected ones are
 * freed only when useChromKeeper is off. */
cur = loaded;
while(cur != NULL)
    {
    after = cur->next;
    if(singleExonOk || pslHasIntron(cur, genoSeq, chromStart))
        {
        slAddHead(&kept, cur);
        }
    else if(!useChromKeeper)
        {
        pslFree(&cur);
        }
    cur = after;
    }
verbose(3, "  got %d psls after intron/singleExon check\n", slCount(kept));

/* Convert the surviving psls into ggMrnaAli records. */
aliList = pslListToGgMrnaAliList(kept, gp->chrom, chromStart, chromEnd, genoSeq, maxGap);
verbose(3, "  got %d in maList\n", slCount(aliList));

/* Split by strand: matching alignments are collected, the rest freed. */
ali = aliList;
while(ali != NULL)
    {
    aliAfter = ali->next;
    verbose(4, "      ma->strand %s, gp->strand %s\n", ali->strand, gp->strand);
    if(ali->strand[0] == gp->strand[0])
        {
        slSafeAddHead(&sameStrandAlis, ali);
        }
    else
        {
        ggMrnaAliFree(&ali);
        }
    ali = aliAfter;
    }
slReverse(&sameStrandAlis);

verbose(3, "  got %d in ma on same strand\n", slCount(sameStrandAlis));

/* Without same-strand alignments there is no graph to build, so release
 * the sequence here. */
if(sameStrandAlis == NULL)
    {
    dnaSeqFree(&genoSeq);
    ggMrnaAliFreeList(&sameStrandAlis);
    }
else
    {
    graph = agFromAlignments(db, sameStrandAlis, genoSeq, conn, chromStart, chromEnd,  out);
    }

/* The kept psls are freed only when useChromKeeper is off. */
if(!useChromKeeper)
    pslFreeList(&kept);
return graph;
}
Esempio n. 15
0
void reviewOne(char *dir)
/* Review sanity files in one contig dir.
 * Reads dir/break.lst into a block list and hash, then tallies each
 * alignment in dir/sanity.psl against its block and adds the per-block
 * results to the file-level counters.  Returns quietly when break.lst is
 * missing or yields no blocks. */
{
char fileName[512];
struct block *blockList = NULL, *block;
struct hash *blockHash;
struct psl *psl;
struct lineFile *lf;
int i, aliSize;

/* Bounded formatting: an over-long dir can no longer overrun fileName. */
snprintf(fileName, sizeof(fileName), "%s/break.lst", dir);
if (!fileExists(fileName))
    return;
/* The hash is created only once it is needed, so the early return above
 * no longer leaks it. */
blockHash = newHash(16);
readBlockInfo(fileName, blockHash, &blockList);
if (blockList == NULL)
    {
    freeHash(&blockHash);
    return;
    }

snprintf(fileName, sizeof(fileName), "%s/sanity.psl", dir);
lf = lineFileOpen(fileName, TRUE);
while ((psl = pslNext(lf)) != NULL)
    {
    if (pslCalcMilliBad(psl, FALSE) < 20)
        {
        aliSize = psl->match + psl->repMatch;
        block = hashMustFindVal(blockHash, psl->qName);
        /* Count the hit in the first bucket whose upper threshold
         * covers the aligned size. */
        for (i=0; i<10; ++i)
            {
            if (aliSize <= thresholds[i+1])
                {
                ++block->hitCount[i];
                break;
                }
            }
        if (aliSize >= blockSize-2)
            ++block->perfectCount;
        }
    else
        ++weakAliCount;
    pslFree(&psl);
    }
lineFileClose(&lf);

/* Loop through list gathering statistics on how blocks
 * hit genome. */
for (block = blockList; block != NULL; block = block->next)
    {
    int numBest = 0;
    int numGood = 0;
    if (block->maskedOut)
        ++repMaskedCount;
    else
        {
        ++blockCount;
        if (block->perfectCount)
            ++perfectCount;
        /* Find the highest non-empty bucket; i keeps its index. */
        for (i=9; i >= 0; --i)
            {
            if ((numBest = block->hitCount[i]) > 0)
                break;
            }
        if (numBest == 0)
            {
            ++missCount;
            fprintf(missLog, "%s\n", block->name);
            }
        else
            ++hitCount[i];
        /* More than one hit in the top three buckets is a duplicate. */
        for (i=7; i<10; ++i)
            numGood += block->hitCount[i];
        if (numGood > 1)
            ++dupeCount;
        }
    }
freeHash(&blockHash);
slFreeList(&blockList);
}
void pslSort2(char *outDir, char *tempDir, boolean noHead)
/* Do second step of sort - merge all sorted files in tempDir
 * to final outdir.
 *   outDir  - directory receiving one <targetAcc>.psl file per target.
 *   tempDir - directory holding the tmp*.psl inputs; they are removed
 *             once the merge is done.
 *   noHead  - when TRUE no psl header is written to the output files.
 * Aborts if tempDir has no tmp*.psl files. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
FILE *f = NULL;
char lastTargetAcc[256];
char targetAcc[256];


strcpy(lastTargetAcc, "");
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    sprintf(fileName, "%s/%s", tempDir, tmp->name);
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }
printf("writing %s", outDir);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    /* Progress dot every 0x10000 rounds. */
    if ( (++aliCount & 0xffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }
    for (mid = midList; mid != NULL; mid = mid->next)
        {
        /* Refill an empty slot; a file is closed once it runs dry. */
        if (mid->lf != NULL && mid->psl == NULL)
            {
            if ((mid->psl = nextPsl(mid->lf)) == NULL)
                lineFileClose(&mid->lf);
            }
        if (mid->psl != NULL)
            {
            if (bestMid == NULL || pslCmpTarget(&mid->psl, &bestMid->psl) < 0)
                bestMid = mid;
            }
        }
    if (bestMid == NULL)
        break;
    /* Switch output files whenever the target accession changes. */
    getTargetAcc(bestMid->psl->tName, targetAcc);
    if (!sameString(targetAcc, lastTargetAcc))
        {
        strcpy(lastTargetAcc, targetAcc);
        carefulClose(&f);
        sprintf(fileName, "%s/%s.psl", outDir, targetAcc);
        f = mustOpen(fileName, "w");
        if (!noHead)
            pslWriteHead(f);
        }
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
carefulClose(&f);
printf("\n");

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    sprintf(fileName, "%s/%s", tempDir, tmp->name);
    remove(fileName);
    }
/* The merge loop only exits once every mid->psl has been freed and every
 * mid->lf closed, so only the list nodes remain to be released.
 * NOTE(review): assumes struct midFile owns nothing beyond the lf and psl
 * fields used here - confirm against its definition. */
slFreeList(&midList);
slFreeList(&tmpList);
}
Esempio n. 17
0
static void clusterClone(int argc, char *argv[])
/* Process each psl file named in argv[1..argc-1].  Consecutive alignments
 * whose query name, cut at its first '_', gives the same accession are
 * grouped.  For each group the alignments with coverage above minCover
 * are collected per target name and handed to processResult(); a group
 * without any such alignment is reported through verbose().
 * NOTE(review): a file with no alignments reaches the final verbose()
 * call with NULL names - confirm that this is acceptable. */
{
int i;

for (i=1; i < argc; ++i)
    {
    struct lineFile *lf;
    struct psl *psl;
    unsigned tSize;
    char *prevAccPart = NULL;
    char *prevAccName = NULL;
    char *prevTargetName = NULL;
    struct hashEl *el;
    struct hash *chrHash = newHash(0);
    struct hash *coordHash = newHash(0);
    struct coordEl *coord;
    struct coordEl **coordListPt = NULL;
    unsigned querySize = 0;
    int partCount = 0;
    int partsConsidered = 0;

    verbose(2,"#\tprocess: %s\n", argv[i]);
    lf=pslFileOpen(argv[i]);
    while ((psl = pslNext(lf)) != NULL)
        {
        char *accName = NULL;
        char *targetName = NULL;
        int chrCount = 0;
        double percentCoverage;

        accName = cloneString(psl->qName);
        if (prevAccPart == NULL)
            {
            prevAccPart = cloneString(psl->qName);  /* first time */
            querySize = psl->qSize;
            ++partsConsidered;
            }
        chopSuffixAt(accName,'_');

        if (prevAccName == NULL)
            prevAccName = cloneString(accName);  /* first time */
        if (prevTargetName == NULL)
            prevTargetName = cloneString(psl->tName);  /* first time */

        /*	encountered a new accession name, process the one we
         *	were working on
         */
        if (differentWord(accName, prevAccName))
            {
            if (partCount > 0)
                processResult(chrHash, coordHash, prevAccName, querySize,
                    partsConsidered);
            else
                verbose(1,"# ERROR %s %s - no coordinates found in %d parts considered\n",
                    prevTargetName, prevAccName, partsConsidered);
            freeMem(prevAccName);
            prevAccName = cloneString(accName);
            /* start the new accession with empty tables and counters */
            freeHash(&chrHash);
            freeHash(&coordHash);
            chrHash = newHash(0);
            coordHash = newHash(0);
            querySize = 0;
            partCount = 0;
            partsConsidered = 0;
            }

        tSize = psl->tEnd - psl->tStart;
        percentCoverage = 100.0*((double)(tSize+1)/(psl->qSize + 1));
        /* a new full query name means another part of the accession */
        if (differentWord(psl->qName, prevAccPart))
            {
            ++partsConsidered;
            querySize += psl->qSize;
            freeMem(prevAccPart);
            prevAccPart = cloneString(psl->qName);
            }

        targetName = cloneString(psl->tName);
        if (differentWord(targetName, prevTargetName))
            {
            freeMem(prevTargetName);
            prevTargetName = cloneString(targetName);
            }
        /*	keep a hash of chrom names encountered	*/
        el = hashLookup(chrHash, targetName);
        if (el == NULL)
            {
            if (percentCoverage > minCover)
                {
                hashAddInt(chrHash, targetName, 1);
                chrCount = 1;
                }
            else
                {
                hashAddInt(chrHash, targetName, 0);
                chrCount = 0;
                }
            }
        else
            {
            if (percentCoverage > minCover)
                {
                chrCount = ptToInt(el->val) + 1;
                el->val=intToPt(chrCount);
                }
            }

        /*	when coverage is sufficient	*/
        if (percentCoverage > minCover)
            {
            ++partCount;
            /* The coordinate record is allocated only here, where it is
             * stored; it used to be allocated for every alignment and
             * leaked when the coverage was too low. */
            AllocVar(coord);
            coord->start = psl->tStart;
            coord->end = psl->tEnd;
            coord->qSize = psl->qSize;
            coord->strand = sameWord(psl->strand,"+") ? 1 : 0;
            coord->name = cloneString(psl->qName);
            /*	for each chrom name, accumulate a list of coordinates */
            el = hashLookup(coordHash, targetName);
            if (el == NULL)
                {
                AllocVar(coordListPt);
                hashAdd(coordHash, targetName, coordListPt);
                }
            else
                {
                coordListPt = el->val;
                }
            slAddHead(coordListPt,coord);
            verbose(2,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
                psl->qName, psl->qSize, tSize, tSize - psl->qSize,
                percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
                psl->strand);
            }
        else
            {
            verbose(3,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
                psl->qName, psl->qSize, tSize, tSize - psl->qSize,
                percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
                psl->strand);
            }

        freeMem(accName);
        freeMem(targetName);
        pslFree(&psl);
        }
    /* flush the last accession of the file */
    if (partCount > 0)
        processResult(chrHash, coordHash, prevAccName, querySize,
            partsConsidered);
    else
        verbose(1,"# ERROR %s %s - no coordinates found\n",
            prevTargetName, prevAccName);
    freeMem(prevAccName);
    /* These two name copies were previously leaked for every file. */
    freeMem(prevAccPart);
    freeMem(prevTargetName);
    freeHash(&chrHash);
    freeHash(&coordHash);
    lineFileClose(&lf);
    }
}	/*	static void clusterClone()	*/
Esempio n. 18
0
static void chkPslTable(struct gbSelect* select, struct sqlConnection* conn,
                        char* rootTable, char* chrom,
                        struct metaDataTbls* metaDataTbls,
                        unsigned typeFlags)
/* Run chkPsl() on every row of a PSL table, optionally limited to query
 * names starting with select->accPrefix.  With a chrom the table is
 * <chrom>_<rootTable> and may be missing; with chrom NULL the table is
 * rootTable itself and a missing table is reported (a warning in testMode,
 * gbError otherwise). */
{
struct hTableInfo* tblInfo;
struct sqlResult *sr;
char **row;
char tblName[64];
char accWhere[64];
char query[512];
unsigned rowIx = 0;
unsigned binCols;

/* Always name the chrom table explicitly: there is an mrna table which is
 * not psl, so a lookup by root name with a chrom that doesn't exist would
 * return that table rather than null. */
if (chrom == NULL)
    safef(tblName, sizeof(tblName), "%s", rootTable);
else
    safef(tblName, sizeof(tblName), "%s_%s", chrom, rootTable);

gbVerbEnter(3, "chkPslTable %s", tblName);

tblInfo = hFindTableInfo(select->release->genome->database, chrom, tblName);
if (tblInfo != NULL)
    {
    /* Skip one leading column when the table has a bin column. */
    binCols = (tblInfo->hasBin) ? 1 : 0;
    // FIXME: might be better as sqlDyString
    accWhere[0] = '\0';
    if (select->accPrefix != NULL)
        sqlSafefFrag(accWhere, sizeof(accWhere), " WHERE qName LIKE '%s%%'",
              select->accPrefix);
    sqlSafef(query, sizeof(query), "SELECT * FROM %s%-s", tblName, accWhere);
    sr = sqlGetResult(conn, query);
    for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
        {
        struct psl* psl = pslLoad(row+binCols);
        chkPsl(psl, rowIx, select->release->genome->database, tblName,
               metaDataTbls, typeFlags);
        pslFree(&psl);
        rowIx++;
        }
    sqlFreeResult(&sr);
    }
else if (chrom == NULL)
    {
    /* Only the all-chromosome table is required to exist. */
    if (testMode)
        fprintf(stderr, "Warning: no psl table %s.%s\n",
                select->release->genome->database, tblName);
    else
        gbError("no psl table %s.%s", select->release->genome->database,
                tblName);
    }
gbVerbLeave(3, "chkPslTable %s", tblName);
}
Esempio n. 19
0
void affyPslAndAtlasToBedNew(char *pslFile, char *atlasFile, char *bedOut, 
	char *expRecOut)
/** Main function that does all the work for new-style atlas files.
 *   pslFile   - alignments to convert (not read when shortOut is set).
 *   atlasFile - expression table; its first line lists the experiments.
 *   bedOut    - output file.
 *   expRecOut - passed to lineToExp() together with the header line.
 * Each data row is clamped to minExpVal and turned into log2 ratios
 * against the row's median.  With shortOut set the rows go straight to
 * bedOut; otherwise they are hashed by affy id and joined with the
 * alignments in pslFile to write beds with expression scores. */
{
struct lineFile *lf = lineFileOpen(atlasFile, TRUE);
char *line, *name;
int i, wordCount, expCount;
char **row;
double *data, median;
double invMedian;
char *affyId;
struct hash *hash = newHash(17);
struct psl *psl;
FILE *f = NULL;
int dataCount = 0, pslCount = 0, bedCount = 0;
int minExpVal = 20;

/* Open Atlas file and use first line to create experiment table. */
if (!lineFileNextReal(lf, &line))
    errAbort("%s is empty", lf->fileName);
if (startsWith("Affy", line))
    line += 4;
if (line[0] != '\t')
    errAbort("%s doesn't seem to be a new format atlas file", lf->fileName);
expCount = lineToExp(line+1, expRecOut);
if (expCount <= 0)
    errAbort("No experiments in %s it seems", lf->fileName);
warn("%d experiments\n", expCount);

f = mustOpen(bedOut, "w");

/* Build up a hash keyed by affyID with a double array of data
 * for value.  Do output in short case. */
AllocArray(row, expCount);
while (lineFileNextReal(lf, &line))
    {
    int dataKept = 0;	/* set once the hash holds the data array */

    affyId = nextWord(&line);

    wordCount = chopByWhite(line, row, expCount);
    if (wordCount != expCount)
        errAbort("Expecting %d data points, got %d line %d of %s", 
		expCount, wordCount, lf->lineIx, lf->fileName);
    if (hashLookup(hash, affyId))
        {
        warn("Duplicate %s, skipping all but first.", affyId);
        continue;
        }
    AllocArray(data, expCount);
    for (i=0; i<expCount; ++i)
        {
        data[i] = atof(row[i]);
        if (data[i] < minExpVal)
            data[i] = minExpVal;
        }
    median = findPositiveMedian(data, expCount, minExpVal);
    if (median >= 0)
        {
        invMedian = 1.0/median;
        for (i=0; i<expCount; ++i)
            {
            double val = data[i];
            val = safeLog2(invMedian*val);
            data[i] = val;
            }
        if (shortOut)
            shortDataOut(f, affyId, expCount, data);
        else
            {
            hashAdd(hash, affyId, data);
            dataKept = 1;
            }
        }
    /* The array used to leak whenever it was not stored in the hash.
     * NOTE(review): assumes shortDataOut() does not keep the pointer. */
    if (!dataKept)
        freeMem(data);
    data = NULL;
    ++dataCount;
    }
/* The word pointers in row refer to lines that are gone by now. */
freeMem(row);
lineFileClose(&lf);
warn("%d rows of expression data\n", dataCount);

/* Stream through psl file, converting it to bed with expression data. */
if (!shortOut)
    {
    lf = pslFileOpen(pslFile);
    while ((psl = pslNext(lf)) != NULL)
        {
        ++pslCount;
        /* get probe id from sequence name */
        name=parseNameFromHgc(psl->qName);
        data = hashFindVal(hash, name);
        if (data != NULL)
            {
            struct bed *bed = bedFromPsl(psl);
            bed->expCount = expCount;
            AllocArray(bed->expIds, expCount);
            AllocArray(bed->expScores, expCount);
            for (i=0; i<expCount; ++i)
                {
                bed->expScores[i] = data[i];
                bed->expIds[i] = i;
                }
            bedTabOutN(bed, 15, f);
            ++bedCount;

            bedFree(&bed);
            }
        pslFree(&psl);
        }
    warn("%d records in %s", pslCount, pslFile);
    warn("%d records written to %s", bedCount, bedOut);
    }
/* lf was already closed above when shortOut is set. */
lineFileClose(&lf);
carefulClose(&f);
}
void pslDropOverlap(char *inName, char *outName)
/* Copy the psl file inName to outName, trimming self alignments.
 * For an alignment whose query and target names are equal:
 *   - it is dropped entirely when tStart equals qStart;
 *   - otherwise every block whose strand-adjusted target start equals its
 *     query start is removed and its size subtracted from match.
 * All other alignments are copied unchanged.  A summary of the skipped
 * blocks is printed at the end. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
struct psl *psl;
char *line;
int lineSize;
int skipMatch = 0;
int totMatch = 0, totMis = 0, totIns = 0, totRepMatch = 0;
int totSkip = 0;
int totLines = 0;

if (!lineFileNext(lf, &line, &lineSize))
    errAbort("%s is empty\n", inName);
if (startsWith("psLayout version", line))
   {
   int i;
   uglyf("Skipping header\n");
   /* Four more lines follow the one just read. */
   for (i=0; i<4; ++i)
       lineFileNext(lf, &line, &lineSize);
   }
else
    lineFileReuse(lf);
while ((psl = pslNext(lf)) != NULL)
    {
    totLines++;
    totMatch += psl->match;
    totMis += psl->misMatch;
    totIns += psl->blockCount-1;
    totRepMatch += psl->repMatch;
    if (sameString(psl->qName, psl->tName))
        {
        int i;
        int newBlockCount = psl->blockCount;
        if (psl->tStart == psl->qStart)
            {
            pslFree(&psl);
            continue;
            }
        /* Walk the blocks backwards so that deleting block i does not
         * shift the blocks still to be visited. */
        for (i = psl->blockCount-1 ; i >= 0 ; i--)
            {
            int ts = psl->tStarts[i];
            int te = psl->tStarts[i]+psl->blockSizes[i];
            int qs = psl->qStarts[i];
            int qe = psl->qStarts[i]+psl->blockSizes[i];
            if (psl->strand[0] == '-')
                reverseIntRange(&qs, &qe, psl->qSize);
            if (psl->strand[1] == '-')
                reverseIntRange(&ts, &te, psl->tSize);
            if (ts == qs)
                {
                /* Remember the size before the block is deleted.  The
                 * skipMatch total used to read blockSizes[i] after the
                 * deletion and so counted the wrong block. */
                unsigned droppedSize = psl->blockSizes[i];
                newBlockCount--;
                psl->match -= droppedSize;
                printf( "skip block size %d #%d blk %d %d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\n",
                  psl->blockSizes[i],i, psl->blockCount, psl->match, psl->misMatch, psl->repMatch, psl->nCount, 
                  psl->strand,
                  psl->qName, psl->qSize, psl->qStart, psl->qEnd,
                  psl->tName, psl->tSize, psl->tStart, psl->tEnd
                  );
                /* debug */
                if (psl->match > 200000000)
                    {
                    printf("blk %d : ", psl->blockCount);
                    pslOutRev(psl, stdout);
                    }
                deleteElement(psl->tStarts, i , psl->blockCount);
                deleteElement(psl->qStarts, i , psl->blockCount);
                deleteElement(psl->blockSizes, i , psl->blockCount);
                totSkip++;
                skipMatch += droppedSize;
                }
            }
        psl->blockCount = newBlockCount;
        }

    pslTabOut(psl, f);

    pslFree(&psl);
    }
printf( "Total skipped %d blocks out of %d alignments, match %d out of %d, in %s\n",
	totSkip, totLines, skipMatch, totMatch,  outName );
fclose(f);
lineFileClose(&lf);
}
/* Version for Zoo species */
boolean convertCoordinatesZoo(FILE *goodOut, FILE *badOut, 
			void (*goodResult)(FILE *out, struct coordConvRep *report),
			void (*badResult)(FILE *out, struct coordConvRep *report)) 
/* Tries to convert the chrom:chromStart-chromEnd range of origGenome to
 * newGenome using the psl rows of the track named <origGenome>_<newGenome>,
 * then reports through one of the callbacks:
 *   goodResult(goodOut, report) when the conversion succeeded,
 *   badResult(badOut, report)   otherwise.
 * In general goodResult and badResult either generate html or text,
 * depending on whether we are in cgi or testing mode.
 * Returns TRUE on success.  The conversion fails when no row is found,
 * when the rows flip inconsistently in and out of an inversion, or when
 * two consecutive rows lie too far apart in the new genome. */
{
struct coordConvRep *ccr = createCoordConvRep_mod();
struct dbDb *newDbRec = NULL, *oldDbRec = NULL;
struct sqlConnection *conn = sqlConnect(origGenome);
struct sqlResult *sr = NULL;

boolean success = FALSE;

/* Keeps track if we're in an inverted match or not */
boolean inversion = FALSE;

/* Two possible reasons to fail */
boolean incoherent = FALSE;
boolean max_apart= FALSE;

char track[256];
char success_message[256];
char **row;
int rowOffset = 0;
int conv_total=0;
int iteration = 0;

/* End/start of the previous row in the original (ref) and new (comp)
   genome.  A gap between rows counts as too far apart when it is more
   than 10 times the gap in the original genome and more than 10 000
   bases. */
int ref_end=0,comp_end=0,comp_start=0;

/* Load info from databases into ccr */
oldDbRec = loadDbInformation_mod(origGenome);
ccr->from->chrom = cloneString(chrom);
ccr->from->chromStart = chromStart;
ccr->from->chromEnd = chromEnd;
ccr->from->version = cloneString(oldDbRec->name);
ccr->from->date = cloneString(oldDbRec->description);
ccr->from->nibDir = cloneString(oldDbRec->nibPath);
ccr->seqSize=1000;
newDbRec = loadDbInformation_mod(newGenome);
ccr->to->version = cloneString(newDbRec->name);
ccr->to->date = cloneString(newDbRec->description);
ccr->to->nibDir = cloneString(newDbRec->nibPath);
ccr->good=FALSE;

/* Create the correct track name...  Will have to be changed when multiple versions? */
sprintf(track,"%s_%s",origGenome,newGenome);

/* Get the information from loading the track, unless the track name
   contains both a "1" and a "2": tracks connecting zoo versions 1 and 2
   are not queried. */
if(!(strstr(track,"2") && strstr(track,"1")))
    {
    sr = hRangeQuery(conn, track, chrom, chromStart, chromEnd, NULL, &rowOffset);
    }

/* sr stays NULL when the query was skipped; do not read rows from it. */
while (sr != NULL && (row = sqlNextRow(sr)) != NULL)
    {
    /* Find the corresponding track */
    struct psl *psl = pslLoad(row+rowOffset);
    
    /* If first time through... */
    if(iteration==0)
        {
        /* Fill in stuff if first time through... */
        ccr->to->chrom=cloneString(psl->qName);
        ccr->to->chromStart=psl->qStart;
        
        /* Actual point of conversion of coordinates */
        ccr->from->next->chromStart=psl->tStart;      
        ccr->good=TRUE;
        
        success=TRUE;
        }
    
    /* Check for erroneous conversion if not first time through: */
    /* inversions, massive insertions... */
    
    /* Check for inversion (old start is "bigger" than new start) */
    if(iteration > 0)
        {
        if((comp_start> psl->qStart))
            {
            /* If not currently in an inversion state */
            if(!inversion )
                /* Entering an inversion is tolerated only early on */
                if(iteration > 2)
                    incoherent=TRUE;
            
            /* Set inversion state variable to true */
            inversion = TRUE;
            
            /* Check to see if there are too great distances ... */
            if( ((comp_start - psl->qEnd)>(10 * (psl->tStart - ref_end))) && ((comp_start - psl->qEnd) > 10000))
                max_apart=TRUE;
            }
        else 
            /* No inversion */
            {
            /* Check if previous state was an inversion (then flip flop)...*/
            if(inversion)
                incoherent = TRUE;
            else
                {
                /* Check to see if the mapping is too far apart */
                if( ((psl->qStart - comp_end) > (10 * (psl->tStart - ref_end))) && ((psl->qStart - comp_end) > 10000))
                    max_apart=TRUE;
                }
            }
        }
    
    if(inversion)
        {
        if(iteration == 1)
            ccr->to->chromEnd=comp_end;
        
        ccr->to->chromStart=psl->qStart;
        }
    else
        ccr->to->chromEnd=psl->qEnd;
    
    ccr->from->next->chromEnd=psl->tEnd;
    
    if(max_apart || incoherent)
        {
        success=FALSE;
        /* Free the current row's psl; it used to leak on this exit. */
        pslFree(&psl);
        break;
        }
    
    /* Count only the part of this row beyond the previous row's end. */
    if(psl->tStart > ref_end)
        conv_total+=(psl->tEnd - psl->tStart);
    else
        conv_total+=(psl->tEnd - ref_end);
    
    ref_end=psl->tEnd;
    comp_end=psl->qEnd;
    comp_start=psl->qStart;
        
    iteration++;
    pslFree(&psl);
    }

/* Release the query result and the connection; neither used to be. */
if (sr != NULL)
    sqlFreeResult(&sr);
sqlDisconnect(&conn);
		    
if(!success)
    {
    /* Check to see if using version two of zoo.  Not integrated into the database at this stage... */
    if((strstr(origGenome,"2") && strstr(newGenome,"1"))|| (strstr(newGenome,"2") && strstr(origGenome,"1")))
        sprintf(success_message,"Couldn't convert between these two genomes since the cross conversion between the two zoo dataset hasn't been fully integrated into the database");
    else if (max_apart)
        sprintf(success_message, "Coordinates couldn't reliably be converted between the two species.  Try using a smaller window. ");
    else if (incoherent)
        sprintf(success_message, "Coordinates couldn't be converted due to inconsistent inversions.");
    else
        sprintf(success_message,"Couldn't find a corresponding region for the original genome to the new genome.");
    
    ccr->msg=cloneString(success_message);
    badResult(badOut,ccr);
    }
else
    {
    sprintf(success_message,"Successfully converted (%3.1f%% of the original region was converted.)",((float)(conv_total * 100))/(float)(chromEnd-chromStart));
    ccr->msg=cloneString(success_message);
    goodResult(goodOut,ccr);
    }

dbDbFree(&oldDbRec);
dbDbFree(&newDbRec);
coordConvRepFreeList(&ccr); 
return success;
}
void oneChrom(char *database, char *chrom, char *refAliTrack, char *bedTrack,
              struct hash *otherHash, struct stats *stats)
/* Process one chromosome.  Loads the bed regions on chrom (from a file when
 * bedTrack ends in ".bed", otherwise from the database track of that name),
 * loads the refAliTrack alignments on chrom, and folds the part of each
 * alignment that overlaps a bed region into stats.  Also adds the number of
 * beds and their total size to stats->bedCount and stats->bedBaseCount. */
{
    struct bed *bedList = NULL, *bed;
    struct sqlConnection *conn = hAllocConn(database);
    struct sqlResult *sr;
    char **row;
    int rowOffset;
    int chromSize = hChromSize(database, chrom);
    struct binKeeper *bk = binKeeperNew(0, chromSize);
    struct psl *pslList = NULL;
    struct dnaSeq *chromSeq = NULL;

    /* Collect the bed items that lie on this chromosome. */
    if (endsWith(bedTrack, ".bed"))
    {
        /* bedTrack is a three-column bed file; keep only rows for chrom. */
        struct lineFile *lf = lineFileOpen(bedTrack, TRUE);
        char *words[3];
        while (lineFileRow(lf, words))
        {
            if (sameString(chrom, words[0]))
            {
                bed = bedLoad3(words);
                slAddHead(&bedList, bed);
            }
        }
        lineFileClose(&lf);
    }
    else
    {
        /* bedTrack is a database track; query just this chromosome. */
        sr = hChromQuery(conn, bedTrack, chrom, NULL, &rowOffset);
        while ((row = sqlNextRow(sr)) != NULL)
        {
            bed = bedLoad3(row+rowOffset);
            slAddHead(&bedList, bed);
        }
        sqlFreeResult(&sr);
    }
    /* slAddHead built the list backwards; restore input order. */
    slReverse(&bedList);
    uglyf("Loaded beds\n");

    /* Load the alignments, indexing each by its target range so the ones
     * overlapping a bed can be looked up through the binKeeper. */
    sr = hChromQuery(conn, refAliTrack, chrom, NULL, &rowOffset);
    while ((row = sqlNextRow(sr)) != NULL)
    {
        struct psl *psl = pslLoad(row + rowOffset);
        slAddHead(&pslList, psl);
        binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
    }
    sqlFreeResult(&sr);
    uglyf("Loaded psls\n");

    /* Fetch entire chromosome into memory. */
    chromSeq = hLoadChrom(database, chrom);
    uglyf("Loaded human seq\n");

    for (bed = bedList; bed != NULL; bed = bed->next)
    {
        struct binElement *el, *list = binKeeperFind(bk, bed->chromStart, bed->chromEnd);
        for (el = list; el != NULL; el = el->next)
        {
            struct psl *fullPsl = el->val;
            /* Work on a copy trimmed to the bed's range; a NULL result is
             * skipped, the stored fullPsl is left as it was. */
            struct psl *psl = pslTrimToTargetRange(fullPsl,
                                                   bed->chromStart, bed->chromEnd);
            if (psl != NULL)
            {
                foldPslIntoStats(psl, chromSeq, otherHash, stats);
                pslFree(&psl);
            }
        }
        slFreeList(&list);
        stats->bedCount += 1;
        stats->bedBaseCount += bed->chromEnd - bed->chromStart;
    }

    /* Release everything loaded for this chromosome.  The psls are owned by
     * pslList; the binKeeper only holds pointers to them. */
    freeDnaSeq(&chromSeq);
    pslFreeList(&pslList);
    bedFreeList(&bedList);
    binKeeperFree(&bk);
    hFreeConn(&conn);
}
Esempio n. 23
0
struct psl* pslTransMap(unsigned opts, struct psl *inPsl, struct psl *mapPsl)
/* map a psl via a mapping psl, a single psl is returned, or NULL if it
 * couldn't be mapped. */
{
int mappedPslMax = 8; /* allocated space in output psl */
int iMapBlk = 0;
char inPslOrigStrand[3];
boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl));
boolean cnv1 = (pslIsProtein(inPsl) && !pslIsProtein(mapPsl));
boolean cnv2 = (pslIsProtein(mapPsl) && !pslIsProtein(inPsl));
int iBlock;
struct psl* mappedPsl;

/* sanity check size, but allow names to vary to allow ids to have
 * unique-ifying suffixes. */
if (inPsl->tSize != mapPsl->qSize)
    errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)",
            inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize);

/* convert protein PSLs */
if (cnv1)
    pslProtToNA(inPsl);
if (cnv2)
    pslProtToNA(mapPsl);

/* need to ensure common sequence is in same orientation, save strand for later */
safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand);
if (rcInPsl)
    pslRc(inPsl);

mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);

/* Fill in ungapped blocks.  */
for (iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
    {
    struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
    while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
                    &mappedPslMax))
        continue;
    }

/* finish up psl, or free if no blocks were added */
assert(mappedPsl->blockCount <= mappedPslMax);
if (mappedPsl->blockCount == 0)
    pslFree(&mappedPsl);  /* nothing made it */
else
    {
    setPslBounds(mappedPsl);
    adjustOrientation(opts, inPsl, inPslOrigStrand, mappedPsl);
    }

/* restore input */
if (rcInPsl)
    {
    pslRc(inPsl);
    strcpy(inPsl->strand, inPslOrigStrand);
    }
if (cnv1)
    pslNAToProt(inPsl);
if (cnv2)
    pslNAToProt(mapPsl);

return mappedPsl;
}
Esempio n. 24
0
static void pslSort2TmpPath(char *path, size_t pathSize, char *dir, char *name)
/* Write "dir/name" into path, aborting instead of overflowing or truncating. */
{
int len = snprintf(path, pathSize, "%s/%s", dir, name);
if (len < 0 || (size_t)len >= pathSize)
    errAbort("Path too long: %s/%s", dir, name);
}

void pslSort2(char *outFile, char *tempDir)
/* Do second step of sort - merge all sorted files in tempDir
 * to final.  Every tmp*.psl file in tempDir is opened, the lowest-sorting
 * record (by pslCmpQuery) among them is written to outFile until all are
 * exhausted, and the temp files are then removed.  Aborts if tempDir has
 * no tmp*.psl files, if a temp file path does not fit the path buffer, or
 * if outFile cannot be closed cleanly. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
FILE *f = mustOpen(outFile, "w");


if (!nohead)
    pslWriteHead(f);
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    pslSort2TmpPath(fileName, sizeof(fileName), tempDir, tmp->name);
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }
verbose(1, "writing %s", outFile);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    /* Progress dot once every 0x10000 passes through this loop. */
    if ( (++aliCount & 0xffff) == 0)
        {
        verboseDot();
        fflush(stdout);
        }
    for (mid = midList; mid != NULL; mid = mid->next)
        {
        /* Refill this file's pending record; close the file once it
         * has no more records. */
        if (mid->lf != NULL && mid->psl == NULL)
            {
            if ((mid->psl = nextPsl(mid->lf)) == NULL)
                lineFileClose(&mid->lf);
            }
        if (mid->psl != NULL)
            {
            if (bestMid == NULL || pslCmpQuery(&mid->psl, &bestMid->psl) < 0)
                bestMid = mid;
            }
        }
    /* No file has a pending record left: the merge is complete. */
    if (bestMid == NULL)
        break;
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
printf("\n");
/* The temp files are deleted below, so make sure the merged output
 * really reached the file before going on. */
if (fclose(f) != 0)
    errAbort("Error closing %s", outFile);

/* The following really shouldn't be necessary.... */
for (mid = midList; mid != NULL; mid = mid->next)
    lineFileClose(&mid->lf);

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    pslSort2TmpPath(fileName, sizeof(fileName), tempDir, tmp->name);
    remove(fileName);
    }
}