Пример #1
0
void axtAndBed(char *inAxt, char *inBed, char *outAxt)
/* axtAndBed - Intersect an axt with a bed file and output axt.
 * Each alignment read from inAxt is looked up by target name in the hash
 * built from inBed.  The parts of the alignment's target range that are
 * covered by at least one overlapping bed item are written to outAxt with
 * axtSubsetOnT; overlapping or abutting bed items are merged first so each
 * covered run is written once.  Alignments whose target is not in the hash,
 * or that overlap no bed item, produce no output. */
{
struct hash *tHash = readBed(inBed); /* target keyed; values carry a binKeeper in ->bk */
struct lineFile *lf = lineFileOpen(inAxt, TRUE);
struct axt *axt;
struct binElement *list = NULL, *el;
FILE *f = mustOpen(outAxt, "w");
struct axtScoreScheme *ss = axtScoreSchemeDefault();

while ((axt = axtRead(lf)) != NULL)
    {
    struct chromInfo *ci = hashFindVal(tHash, axt->tName);
    if (ci != NULL)
        {
        list = binKeeperFind(ci->bk, axt->tStart, axt->tEnd);
        if (list != NULL)
            {
            /* Flatten out any overlapping elements by projecting them
             * onto a 0/1 valued character array and then looking for
             * runs of 1 in this array. */
            int tStart = axt->tStart;
            int tEnd = axt->tEnd;
            int tSize = tEnd - tStart;
            int i, runStart = 0;
            char c, lastC = 0;
            char *merger = NULL;

            /* One slot more than tSize: the last slot is never set to 1, so
             * a run reaching tEnd is still closed by the scan below.  This
             * relies on AllocArray handing back zeroed memory. */
            AllocArray(merger, tSize+1);
            for (el = list; el != NULL; el = el->next)
                {
                /* Convert to alignment-relative coordinates and clip. */
                int s = el->start - tStart;
                int e = el->end - tStart;
                int sz;
                if (s < 0) s = 0;
                if (e > tSize) e = tSize;
                sz = e - s;
                if (sz > 0)
                    memset(merger + s, 1, sz);
                }
            for (i=0; i<=tSize; ++i)
                {
                c = merger[i];
                if (c && !lastC)
                    {
                    /* 0 -> 1 transition: a covered run starts here. */
                    runStart = i;
                    lastC = c;
                    }
                else if (!c && lastC)
                    {
                    /* 1 -> 0 transition: write out the run just ended. */
                    axtSubsetOnT(axt, runStart+tStart, i+tStart, ss, f);
                    lastC = c;
                    }
                }
            freez(&merger);
            slFreeList(&list);
            }
        }
    axtFree(&axt);
    }
/* Release the input and flush/close the output. */
lineFileClose(&lf);
carefulClose(&f);
}
struct seqPair *readAxtBlocks(char *fileName, struct hash *pairHash, FILE *f)
/* Read the axt file fileName and collect its blocks into seqPair records.
 * There is one seqPair per distinct (query name, query strand, target name)
 * combination; each is stored in pairHash under the key
 * "<qName><qStrand><tName>".  f is handed to lineFileSetMetaDataOutput.
 * Returns the list of seqPairs created here, sorted with seqPairCmp. */
{
struct lineFile *in = lineFileOpen(fileName, TRUE);
struct dyString *key = newDyString(512);
struct seqPair *pairList = NULL;
struct seqPair *pair;
struct axt *ali;

lineFileSetMetaDataOutput(in, f);
lineFileSetUniqueMetaData(in);
for (;;)
    {
    ali = axtRead(in);
    if (ali == NULL)
        break;

    /* Build the hash key for this alignment's sequence pair. */
    dyStringClear(key);
    dyStringPrintf(key, "%s%c%s", ali->qName, ali->qStrand, ali->tName);

    pair = hashFindVal(pairHash, key->string);
    if (pair == NULL)
        {
        /* First alignment for this pair: create and register it. */
        AllocVar(pair);
        slAddHead(&pairList, pair);
        hashAddSaveName(pairHash, key->string, pair, &pair->name);
        pair->qName = cloneString(ali->qName);
        pair->tName = cloneString(ali->tName);
        pair->qStrand = ali->qStrand;
        }
    axtAddBlocksToBoxInList(&pair->blockList, ali);
    ++pair->axtCount;
    axtFree(&ali);
    }
lineFileClose(&in);
dyStringFree(&key);
slSort(&pairList, seqPairCmp);
return pairList;
}
Пример #3
0
void axtToPsl(char *inName, char *tSizeFile, char *qSizeFile, char *outName)
/* axtToPsl - Convert axt to psl format.
 * Sequence sizes come from tSizeFile and qSizeFile.  Each alignment in
 * inName is turned into a psl with pslFromAlign and written tab-separated
 * to outName; alignments for which pslFromAlign returns NULL are skipped. */
{
struct hash *tSizeHash = readSizes(tSizeFile);
struct hash *qSizeHash = readSizes(qSizeFile);
struct lineFile *in = lineFileOpen(inName, TRUE);
FILE *out = mustOpen(outName, "w");
struct axt *ali;
char strand[2];

strand[1] = '\0';
for (;;)
    {
    struct psl *psl;
    int qSize, tSize;
    int qStart, qEnd;

    ali = axtRead(in);
    if (ali == NULL)
        break;

    qSize = findSize(qSizeHash, ali->qName);
    qStart = ali->qStart;
    qEnd = ali->qEnd;
    /* Minus-strand query coordinates are flipped before conversion. */
    if (ali->qStrand == '-')
        reverseIntRange(&qStart, &qEnd, qSize);
    strand[0] = ali->qStrand;
    tSize = findSize(tSizeHash, ali->tName);

    psl = pslFromAlign(ali->qName, qSize, qStart, qEnd, ali->qSym,
                       ali->tName, tSize, ali->tStart, ali->tEnd, ali->tSym,
                       strand, PSL_IS_SOFTMASK);
    if (psl != NULL)
        {
        pslTabOut(psl, out);
        pslFree(&psl);
        }
    axtFree(&ali);
    }
lineFileClose(&in);
carefulClose(&out);
}
Пример #4
0
void predict(struct c1Counts cKozak[10], struct c1Counts *cAll, 
	char *axtFile, char *outFile, struct hash *rsiHash)
/* Predict location of initial ATG.
 * The ten per-position counts in cKozak are converted to odds matrices
 * against cAll.  For every alignment in axtFile whose target name is in
 * rsiHash with cdsStart >= 5, one tab-separated line goes to outFile:
 * target name, annotated cdsStart and its motif score, the best-hit
 * position and score, and the first-hit position and score. */
{
struct lineFile *in = lineFileOpen(axtFile, TRUE);
FILE *out = mustOpen(outFile, "w");
struct oddsMatrix kozak[10];
struct axt *ali;
int bestPos, firstPos, actualPos;
double bestScore, firstScore, actualScore;
int ix;

for (ix = 0; ix < 10; ix++)
    countToOdds(cKozak + ix, cAll, kozak + ix);

for (;;)
    {
    struct refSeqInfo *rsi;

    ali = axtRead(in);
    if (ali == NULL)
        break;

    rsi = hashFindVal(rsiHash, ali->tName);
    if (rsi == NULL || rsi->cdsStart < 5)
        {
        /* No annotation, or too little sequence before the CDS start. */
        axtFree(&ali);
        continue;
        }

    findBestHit(ali, kozak, 10, &bestPos, &bestScore, &firstPos,
                &firstScore);

    /* Score the motif at the annotated position. */
    actualPos = tIxToSymIx(ali, rsi->cdsStart - 5);
    actualScore = scoreMotif(kozak, 10,
                ali->tSym + actualPos, ali->qSym + actualPos);

    fprintf(out, "%s\t%d\t%f\t%d\t%f\t%d\t%f\n", ali->tName,
                rsi->cdsStart, actualScore,
                tIxFromSymIx(ali, bestPos) + 5, bestScore,
                tIxFromSymIx(ali, firstPos) + 5, firstScore);
    axtFree(&ali);
    }
carefulClose(&out);
lineFileClose(&in);
}
Пример #5
0
void twinOrfStats(char *axtFile, char *raFile, char *outFile)
/* twinOrfStats - Collect stats on refSeq cDNAs aligned to another species via axtForEst.
 * Reads refSeq info from raFile and alignments from axtFile.  Alignments
 * whose target has refSeq info, has cdsStart >= 5 and passes checkAtg are
 * added to the count tables, which are then dumped to outFile.  If the
 * "predict" option is set, predict() is also run with that value as its
 * output file. */
{
struct hash *rsiHash = readRefRa(raFile);
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
FILE *f = mustOpen(outFile, "w");
struct axt *axt;
/* Static so the tables start zeroed and stay off the stack. */
static struct countMatrix kozak[10], all, utr5, utr3, cds;
static struct c2Counts c2All, c2Utr5, c2Utr3, c2Cds;
char label[64];
char *predictFile = optionVal("predict", NULL);
int i;
struct codonCounts codons;

initCounts(&codons, 1);

threshold = optionFloat("threshold", threshold);
while ((axt = axtRead(lf)) != NULL)
    {
    struct refSeqInfo *rsi = hashFindVal(rsiHash, axt->tName);
    if (rsi != NULL && rsi->cdsStart >= 5)
        {
        if (checkAtg(axt, rsi->cdsStart))
            {
            /* Ten positions starting 5 bases before the CDS start. */
            for (i=0; i<10; ++i)
                addPos(&kozak[i], axt, rsi->cdsStart - 5 + i);
            addRange(&all, &c2All, axt, 0, rsi->size);
            addRange(&utr5, &c2Utr5, axt, 0, rsi->cdsStart);
            addRange(&cds, &c2Cds, axt, rsi->cdsStart, rsi->cdsEnd);
            addRange(&utr3, &c2Utr3, axt, rsi->cdsEnd, rsi->size);
            addCodons(&codons, axt, rsi->cdsStart, rsi->cdsEnd-3);
            }
        }
    axtFree(&axt);
    }
lineFileClose(&lf);
dumpCounts(f, &all, "all");
dumpCounts(f, &utr5, "utr5");
dumpCounts(f, &cds, "cds");
dumpCounts(f, &utr3, "utr3");
dumpM1(f, &c2All, "c2_all");
dumpM1(f, &c2Utr5, "c2_utr5");
dumpM1(f, &c2Cds, "c2_cds");
dumpM1(f, &c2Utr3, "c2_utr3");
for (i=0; i<10; ++i)
    {
    /* Label positions relative to the CDS start: -5 .. 4. */
    snprintf(label, sizeof(label), "kozak[%d]", i-5);
    dumpCounts(f, &kozak[i], label);
    }
dumpCodon(f, &codons, "codon");
/* All statistics are written; close the output file. */
carefulClose(&f);
if (predictFile)
    {
    predict(kozak, &all, axtFile, predictFile, rsiHash);
    }
}
Пример #6
0
void writeMousePartsAsMaf(FILE *f, struct hash *mouseHash, 
	char *ratMouseDir, char *mouseChrom,
	int mouseStart, int mouseEnd, int mouseChromSize, 
	struct hash *rSizeHash, struct hash *dupeHash)
/* Write out mouse/rat alignments that intersect given region of mouse.
 * This gets a little involved because we need to do random access on
 * the mouse/rat alignment files, which are too big to fit into memory.
 * On disk we have a mouse/rat alignment file for each mouse chromosome,
 * and an index of it.  When we first access a mouse chromosome we load
 * the index for that chromosome into memory, and open the alignment file.
 * We then do a seek and read to load a particular alignment.
 *
 * dupeHash records "<mouseChrom>.<fileOffset>" for every alignment already
 * written, so an alignment is written at most once across calls.  Returns
 * without writing anything if the chromosome's cache has no open file. */
{
struct mouseChromCache *mcc = NULL;
struct binElement *list = NULL, *el;
char aliName[512];

/* Get cache for this mouse chromosome */
mcc = hashFindVal(mouseHash, mouseChrom);
if (mcc == NULL)
    {
    mcc = newMouseChromCache(mouseChrom, mouseChromSize, ratMouseDir);
    hashAdd(mouseHash, mouseChrom, mcc);
    }
if (mcc->lf == NULL)
    return;

/* Get list of positions and process one axt into a maf for each */
list = binKeeperFindSorted(mcc->bk, mouseStart, mouseEnd);
for (el = list; el != NULL; el = el->next)
    {
    struct axt *axt;
    struct mafAli temp;
    long long *pPos, pos;
    int rChromSize;

    /* The bin element's value points at the file offset of the alignment. */
    pPos = el->val;
    pos = *pPos;
    snprintf(aliName, sizeof(aliName), "%s.%lld", mouseChrom, pos);
    if (hashLookup(dupeHash, aliName))
        continue;       /* Already written. */
    hashAdd(dupeHash, aliName, NULL);

    lineFileSeek(mcc->lf, pos, SEEK_SET);
    axt = axtRead(mcc->lf);
    /* An indexed offset with no alignment behind it means index and file
     * disagree; stop with a message rather than dereference NULL. */
    if (axt == NULL)
        errAbort("Couldn't read alignment at offset %lld for %s",
                pos, mouseChrom);
    rChromSize = hashIntVal(rSizeHash, axt->qName);
    prefixAxt(axt, rPrefix, mPrefix);
    mafFromAxtTemp(axt, mouseChromSize, rChromSize, &temp);
    mafWriteGood(f, &temp);
    axtFree(&axt);
    }
slFreeList(&list);
}
void axtPretty(char *inName, char *outName)
/* axtPretty - Convert axt to more human readable format.
 * Every alignment in inName is written to outName with axtOutPretty.  The
 * line width comes from the "line" option and defaults to 70. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
struct axt *axt;
int lineSize = optionInt("line", 70);

while ((axt = axtRead(lf)) != NULL)
    {
    axtOutPretty(axt, lineSize, f);
    axtFree(&axt);
    }
/* Release the input and flush/close the output. */
lineFileClose(&lf);
carefulClose(&f);
}
void axtSwapFile(char *source, char *targetSizes, char *querySizes, char *dest)
/* axtSwapFile - Swap source and query in an axt file.
 * Sequence sizes are loaded from targetSizes and querySizes.  Each alignment
 * in source is passed to axtSwap together with the sizes of its target and
 * query sequences, then written to dest. */
{
    struct hash *tHash = loadIntHash(targetSizes);
    struct hash *qHash = loadIntHash(querySizes);
    struct lineFile *lf = lineFileOpen(source, TRUE);
    FILE *f = mustOpen(dest, "w");
    struct axt *axt;

    while ((axt = axtRead(lf)) != NULL)
    {
        axtSwap(axt, hashIntVal(tHash, axt->tName), hashIntVal(qHash, axt->qName));
        axtWrite(axt, f);
        axtFree(&axt);
    }
    /* Release the input and flush/close the output. */
    lineFileClose(&lf);
    carefulClose(&f);
}
Пример #9
0
void axtDropSelf(char *inFile, char *outFile)
/* axtDropSelf - Drop alignments that just align same thing to itself.
 * An alignment is dropped when query and target agree in name, strand,
 * start and end; every other alignment in inFile is copied to outFile. */
{
FILE *f = mustOpen(outFile, "w");
struct lineFile *lf = lineFileOpen(inFile, TRUE);
struct axt *axt;
while ((axt = axtRead(lf)) != NULL)
    {
    if (axt->qStart != axt->tStart || axt->qEnd != axt->tEnd ||
        axt->qStrand != axt->tStrand || !sameString(axt->qName, axt->tName))
        {
        axtWrite(axt,f);
        }
    axtFree(&axt);
    }
/* Release the input and flush/close the output. */
lineFileClose(&lf);
carefulClose(&f);
}
Пример #10
0
void axtSplitByTarget(char *inName, char *outDir)
/* axtSplitByTarget - Split a single axt file into one file per target.
 * outDir is created with makeDir.  For each alignment the output file is
 * chosen by getSplitFile from the target name and target start, and the
 * alignment is written there.  totalWritten (not declared in this function)
 * is advanced by an estimate of the record size. */
{
struct hash *outHash = newHash(8);  /* FILE valued hash */
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct axt *axt;

makeDir(outDir);
while ((axt = axtRead(lf)) != NULL)
    {
    FILE *f = getSplitFile(outHash, outDir, axt->tName, axt->tStart);
    axtWrite(axt, f);
    /* Estimate only: name and symbol lengths plus a flat 40 bytes.
     * NOTE(review): the 40 presumably stands in for the numeric fields and
     * separators - confirm before relying on the exact total. */
    totalWritten += strlen(axt->tName) + strlen(axt->qName) + 40 + strlen(axt->qSym)+ strlen(axt->tSym);
    axtFree(&axt);
    }
lineFileClose(&lf);
/* NOTE(review): the FILE handles stored in outHash are not closed here;
 * confirm whether getSplitFile or the caller is responsible for that. */
}
void subsetAxt(char *inName, char *outName, char *scoreFile, int threshold)
/* subsetAxt - Rescore alignments and output those over threshold.
 * The scoring scheme is read from scoreFile.  Each alignment in inName is
 * handed to subsetOne together with the scheme, the threshold and the
 * output file.  Aborts if threshold is not positive. */
{
struct axtScoreScheme *ss = axtScoreSchemeRead(scoreFile);
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
struct axt *axt;

if (threshold <= 0)
    errAbort("Threshold must be a positive number");
while ((axt = axtRead(lf)) != NULL)
    {
    subsetOne(axt, ss, threshold, f);
    axtFree(&axt);
    }
/* Release the input and flush/close the output. */
lineFileClose(&lf);
carefulClose(&f);
}
Пример #12
0
struct binKeeper *loadAxtsIntoRange(char *fileName, char *tPrefix, char *qPrefix)
/* Load all alignments from the axt file fileName into a new binKeeper that
 * spans 0..maxChromSize, each indexed by its target start and end.  The
 * alignments are not freed here; the returned binKeeper holds them.
 * tPrefix and qPrefix are not used. */
{
struct lineFile *in = lineFileOpen(fileName, TRUE);
struct binKeeper *keeper = binKeeperNew(0, maxChromSize);
struct axt *ali;
int loaded = 0;

for (;;)
    {
    ali = axtRead(in);
    if (ali == NULL)
        break;
    binKeeperAdd(keeper, ali->tStart, ali->tEnd, ali);
    loaded += 1;
    }
uglyf("LOaded %d from %s\n", loaded, fileName);
lineFileClose(&in);
return keeper;
}
Пример #13
0
void axtIndex(char *in, char *out)
/* axtIndex - Create summary file for axt.
 * For every alignment in the axt file 'in', writes one line to 'out' with
 * the target start, the target span (tEnd - tStart), and the file offset
 * at which the alignment's record begins. */
{
struct lineFile *lf = lineFileOpen(in, TRUE);
FILE *f = mustOpen(out, "w");
struct axt *axt;

for (;;)
    {
    /* Take the offset before reading so it points at the record start. */
    off_t pos = lineFileTell(lf);
    axt = axtRead(lf);
    if (axt == NULL)
        break;
    /* %lld takes a signed long long, so cast to exactly that type. */
    fprintf(f, "%d %d %lld\n", axt->tStart, axt->tEnd - axt->tStart, (long long) pos);
    axtFree(&axt);
    }
lineFileClose(&lf);
carefulClose(&f);
}
Пример #14
0
void newStitch3(char *axtFile, char *output)
/* newStitch3 - Another stitching experiment - with kd-trees.
 * Alignments from axtFile scoring below 500 are discarded.  Each remaining
 * alignment becomes one cBlock holding its coordinates and score, grouped
 * by query name, query strand and target name.  Each group is then passed
 * to chainPair with the output file, its blocks in input order. */
{
struct hash *pairHash = newHash(0);  /* Hash keyed by qSeq<strand>tSeq */
struct dyString *dy = newDyString(512);
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
struct axt *axt;
struct seqPair *spList = NULL, *sp;
FILE *f = mustOpen(output, "w");

/* Read input file and divide alignments into various parts. */
while ((axt = axtRead(lf)) != NULL)
    {
    struct cBlock *block;
    if (axt->score < 500)
        {
        axtFree(&axt);
        continue;
        }
    dyStringClear(dy);
    dyStringPrintf(dy, "%s%c%s", axt->qName, axt->qStrand, axt->tName);
    sp = hashFindVal(pairHash, dy->string);
    if (sp == NULL)
        {
        AllocVar(sp);
        slAddHead(&spList, sp);
        hashAddSaveName(pairHash, dy->string, sp, &sp->name);
        }
    AllocVar(block);
    block->qStart = axt->qStart;
    block->qEnd = axt->qEnd;
    block->tStart = axt->tStart;
    block->tEnd = axt->tEnd;
    block->score = axt->score;
    slAddHead(&sp->blockList, block);
    axtFree(&axt);
    }
lineFileClose(&lf);

for (sp = spList; sp != NULL; sp = sp->next)
    {
    /* Blocks were pushed on the head, so reverse to restore input order. */
    slReverse(&sp->blockList);
    chainPair(sp, f);
    }
dyStringFree(&dy);
carefulClose(&f);
}
void axtRescore(char *in, char *out)
/* axtRescore - Recalculate scores in axt.
 * The scoring scheme in scoreScheme (not declared in this function) is
 * written to 'out' first.  Each alignment from 'in' then has its score
 * replaced by axtScore under that scheme and is written to 'out'.  The
 * output file is also registered as the input's metadata output. */
{
struct lineFile *lf = lineFileOpen(in, TRUE);
FILE *f = mustOpen(out, "w");
struct axt *axt;

lineFileSetMetaDataOutput(lf, f);
axtScoreSchemeDnaWrite(scoreScheme, f, "axtRescore");
for (;;)
    {
    axt = axtRead(lf);
    if (axt == NULL)
        break;
    axt->score = axtScore(axt, scoreScheme);
    axtWrite(axt, f);
    axtFree(&axt);
    }
/* Close the reader before the file it may still write metadata to. */
lineFileClose(&lf);
carefulClose(&f);
}
void axtDropOverlap(char *inName, char *tSizeFile, char *qSizeFile, char *outName)
/* Used for cleaning up self alignments - deletes all overlapping self
 * alignments.  An alignment is dropped when query and target have the same
 * name and the query range, converted to plus-strand coordinates for
 * minus-strand queries, equals the target range.  Every other alignment in
 * inName is copied to outName.  Query sizes come from qSizeFile; tSizeFile
 * is accepted but not used. */
{
struct hash *qSizeHash = readSizes(qSizeFile);
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
struct axt *axt;

while ((axt = axtRead(lf)) != NULL)
    {
    int isSelf = 0;
    if (sameString(axt->qName, axt->tName))
        {
        int qs = axt->qStart;
        int qe = axt->qEnd;
        if (axt->qStrand == '-')
            reverseIntRange(&qs, &qe, findSize(qSizeHash, axt->qName));
        isSelf = (axt->tStart == qs && axt->tEnd == qe);
        }
    if (!isSelf)
        axtWrite(axt, f);
    /* Free on every path, including dropped alignments. */
    axtFree(&axt);
    }
fclose(f);
lineFileClose(&lf);
}
Пример #17
0
void setAliBits(char *axtBestDir, char *chrom, int chromSize,
	Bits *aliBits, Bits *matchBits)
/* Set bits where there are alignments and matches.
 * Reads <axtBestDir>/<chrom>.axt and passes every alignment to axtSetBits
 * with chromSize, aliBits and matchBits.  If the file name does not fit the
 * local buffer, or the file cannot be opened, a warning is issued and the
 * bit arrays are left untouched. */
{
char axtFileName[512];
struct axt *axt;
struct lineFile *lf;
int nameLen;

/* Bounded formatting: snprintf returns the length it needed, so a value
 * at or above the buffer size means the name was truncated. */
nameLen = snprintf(axtFileName, sizeof(axtFileName), "%s/%s.axt",
	axtBestDir, chrom);
if (nameLen < 0 || nameLen >= (int)sizeof(axtFileName))
    {
    warn("File name too long: %s/%s.axt", axtBestDir, chrom);
    return;
    }
if ((lf = lineFileMayOpen(axtFileName, TRUE)) == NULL)
    {
    warn("Couldn't open %s", axtFileName);
    return;
    }
while ((axt = axtRead(lf)) != NULL)
    {
    axtSetBits(axt, chromSize, aliBits, matchBits);
    axtFree(&axt);
    }
lineFileClose(&lf);
}
struct mafAli *axtLoadAsMafInRegion(struct sqlConnection *conn, char *table,
	char *chrom, int start, int end,
	char *tPrefix, char *qPrefix, int tSize,  struct hash *qSizeHash)
/* Return list of alignments in region from axt external file as a maf.
 * The table is queried for rows overlapping chrom:start-end.  Each row names
 * an external file id and an offset; the axt record at that offset is read
 * and converted with mafFromAxt, using tSize for the target and the size
 * found in qSizeHash for the query.  The result is in query-result order. */
{
char **row;
unsigned int extFileId = 0;
struct lineFile *lf = NULL;
struct mafAli *maf, *mafList = NULL;
struct axt *axt;
int rowOffset;
struct sqlResult *sr = hRangeQuery(conn, table, chrom,
    start, end, NULL, &rowOffset);

while ((row = sqlNextRow(sr)) != NULL)
    {
    struct scoredRef ref;
    scoredRefStaticLoad(row + rowOffset, &ref);
    /* (Re)open when nothing is open yet or the row names another file.
     * Testing lf as well covers a first row whose extFile equals the
     * initial extFileId of 0. */
    if (lf == NULL || ref.extFile != extFileId)
        {
        char *path = hExtFileName(sqlGetDatabase(conn),"extFile", ref.extFile);
        /* Close the previous file before replacing the handle. */
        lineFileClose(&lf);
        lf = lineFileOpen(path, TRUE);
        extFileId = ref.extFile;
        /* NOTE(review): ownership of path is not visible here - confirm
         * whether hExtFileName's result must be freed by the caller. */
        }
    lineFileSeek(lf, ref.offset, SEEK_SET);
    axt = axtRead(lf);
    if (axt == NULL)
        internalErr();
    maf = mafFromAxt(axt, tSize, tPrefix, hashIntVal(qSizeHash, axt->qName), qPrefix);
    axtFree(&axt);
    slAddHead(&mafList, maf);
    }
sqlFreeResult(&sr);
lineFileClose(&lf);
/* Items were pushed on the head; restore row order. */
slReverse(&mafList);
return mafList;
}
Пример #19
0
void axtQueryCount(char *fileName)
/* axtQueryCount - Count bases covered on each query sequence.
 * Adds up qEnd - qStart over all alignments in fileName for each query
 * name, then prints "<name>\t<covered>" lines to stdout in the order given
 * by qInfoCmpName.  Overlapping alignments are each counted in full. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *hash = newHash(0);
struct axt *axt;
struct qInfo *qList = NULL, *q;

while ((axt = axtRead(lf)) != NULL)
    {
    char *qName = axt->qName;
    if ((q = hashFindVal(hash, qName)) == NULL)
        {
        /* First alignment for this query: create its record. */
        AllocVar(q);
        slAddHead(&qList, q);
        hashAddSaveName(hash, qName, q, &q->name);
        }
    q->covered += axt->qEnd - axt->qStart;
    axtFree(&axt);
    }
lineFileClose(&lf);
slSort(&qList, qInfoCmpName);
for (q = qList; q != NULL; q = q->next)
    printf("%s\t%d\n", q->name, q->covered);
}
Пример #20
0
void ggcChrom(struct chromGenes *chrom, char *axtFile, 
	struct ggcInfo *g, struct hash *restrictHash, 
	FILE *fParts)
/* Tabulate matches on chromosome.
 * Pass one reads every alignment in axtFile and fills two per-base arrays
 * of chrom->size entries (hits and covers, described below).  Pass two walks
 * chrom->geneList and tallies those arrays into the accumulators in g for
 * UTRs, CDS, upstream/downstream regions, transcription and translation
 * start/end windows, the individual coding exon classes, splice sites and
 * introns.  If restrictHash is non-NULL only genes whose name is in it are
 * used.  If fParts is non-NULL one line of per-gene sizes and counts is
 * written to it for each gene used.  Aborts if an alignment extends past
 * chrom->size or has a minus-strand target. */
{
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
bool *hits, *covers;
/* hitCount is incremented below but not otherwise used in this function;
 * coverCount is never modified. */
int hitCount = 0, coverCount = 0;
struct axt *axt;
struct genePred *gp;
/* Width of the windows tallied around boundaries; closeHalf is how far
 * such a window starts before the boundary. */
int closeSize = g->closeSize;
int closeHalf = closeSize/2;

/* Build up array of booleans - one per base - which are
 * 1's where mouse/human align and bases match, zero 
 * elsewhere. */
AllocArray(hits, chrom->size);
AllocArray(covers, chrom->size);
printf("%s (%d bases)\n", chrom->name, chrom->size);
while ((axt = axtRead(lf)) != NULL)
    {
    int tPos = axt->tStart;
    int symCount = axt->symCount, i;
    char t, q, *tSym = axt->tSym, *qSym = axt->qSym;

    if (axt->tEnd > chrom->size)
        errAbort("tEnd %d, chrom size %d in %s", 
		axt->tEnd, chrom->size, axtFile);
    if (axt->tStrand == '-')
        errAbort("Can't handle minus strand on target in %s", axtFile);
    /* Walk the alignment columns.  tPos advances only on columns where the
     * target has a base, so it follows the target coordinate. */
    for (i=0; i<symCount; ++i)
        {
	t = tSym[i];
	if (t != '-')
	    {
	    q = qSym[i];
	    /* Case-insensitive match of target and query symbol. */
	    if (toupper(t) == toupper(q))
		{
	        hits[tPos] = TRUE;
		++hitCount;
		}
	    /* covers: 1 where the query has a gap opposite this target base,
	     * 2 where the query has a base.  Positions never visited keep
	     * their initial value. */
	    if (q == '-')
	       covers[tPos] = 1;
	    else
	       covers[tPos] = 2;
	    ++tPos;
	    }
	}
    axtFree(&axt);
    }

for (gp = chrom->geneList; gp != NULL; gp = gp->next)
    {
    int exonIx;
    int utr3Size = 0, utr5Size = 0, cdsAllSize = 0;
    int utr3Pos = 0, utr5Pos = 0, cdsAllPos = 0;
    bool *utr3Hits = NULL, *utr3Covers = NULL;
    bool *utr5Hits = NULL, *utr5Covers = NULL;
    bool *cdsAllHits = NULL, *cdsAllCovers = NULL;
    bool isRev = (gp->strand[0] == '-');


    /* Filter out genes not in restrict hash if any. */
    /* totalGenes, reviewedGenes and genesUsed are counters that are not
     * declared in this function. */
    ++totalGenes;
    if (restrictHash != NULL)
        if (!hashLookup(restrictHash, gp->name))
	    continue;
    ++reviewedGenes;

    /* Filter out genes without meaningful UTRs */
    /* That is, genes where either end of the transcript lies less than
     * closeSize/2 bases from the coding region. */
    if (gp->cdsStart - gp->txStart < g->closeSize/2 || 
    	gp->txEnd - gp->cdsEnd < g->closeSize/2)
        continue;
    ++genesUsed;

    /* Total up UTR and CDS sizes. */
    /* Exon sequence before cdsStart counts as 5' UTR on the plus strand and
     * as 3' UTR on the minus strand; sequence after cdsEnd the other way
     * round.  eSize and oneUtr are declared but not used. */
    for (exonIx=0; exonIx<gp->exonCount; ++exonIx)
	 {
	 int eStart = gp->exonStarts[exonIx];
	 int eEnd = gp->exonEnds[exonIx];
	 int eSize = eEnd - eStart;
	 int oneUtr, oneCds;
	 oneCds = rangeIntersection(gp->cdsStart, gp->cdsEnd, eStart, eEnd);
	 if (oneCds > 0)
	     {
	     cdsAllSize += oneCds;
	     }
	 if (eStart < gp->cdsStart)
	     {
	     int utrStart = eStart;
	     int utrEnd = min(gp->cdsStart, eEnd);
	     int utrSize = utrEnd - utrStart;
	     if (isRev)
		 utr3Size += utrSize;
	     else
		 utr5Size += utrSize;
	     }
	 if (eEnd > gp->cdsEnd)
	     {
	     int utrStart = max(gp->cdsEnd, eStart);
	     int utrEnd = eEnd;
	     int utrSize = utrEnd - utrStart;
	     if (isRev)
		 utr5Size += utrSize;
	     else
		 utr3Size += utrSize;
	     }
	 }

    /* Condense hits from UTRs and CDSs */
    /* Buffers are allocated only for non-empty regions; the others stay
     * NULL and are passed on with a size of zero. */
    if (utr5Size > 0)
	{
	AllocArray(utr5Hits, utr5Size);
	AllocArray(utr5Covers, utr5Size);
	}
    if (utr3Size > 0)
	{
	AllocArray(utr3Hits, utr3Size);
	AllocArray(utr3Covers, utr3Size);
	}
    if (cdsAllSize > 0)
	{
	AllocArray(cdsAllHits, cdsAllSize);
	AllocArray(cdsAllCovers, cdsAllSize);
	}
    /* Second walk over the exons, using the same region boundaries as the
     * sizing loop above, copying each piece of hits/covers to the end of
     * the matching condensed buffer. */
    for (exonIx=0; exonIx<gp->exonCount; ++exonIx)
	{
	int eStart = gp->exonStarts[exonIx];
	int eEnd = gp->exonEnds[exonIx];
	int eSize = eEnd - eStart;
	int oneUtr, oneCds;
	oneCds = rangeIntersection(gp->cdsStart, gp->cdsEnd, eStart, eEnd);
	if (oneCds > 0)
	    {
	    /* Coding part of this exon starts at the later of the exon start
	     * and cdsStart; its length is oneCds.  The local cdsEnd is set
	     * but not used. */
	    int cdsStart = eStart;
	    int cdsEnd = gp->cdsEnd;

	    if (cdsStart < gp->cdsStart)
		cdsStart = gp->cdsStart;
	    memcpy(cdsAllHits + cdsAllPos, hits + cdsStart, oneCds * sizeof(*hits));
	    memcpy(cdsAllCovers + cdsAllPos, covers + cdsStart, oneCds * sizeof(*covers));
	    cdsAllPos += oneCds;
	    }
	if (eStart < gp->cdsStart)
	    {
	    int utrStart = eStart;
	    int utrEnd = min(gp->cdsStart, eEnd);
	    int utrSize = utrEnd - utrStart;
	    if (isRev)
		{
		memcpy(utr3Hits + utr3Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr3Covers + utr3Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr3Pos += utrSize;
		}
	    else
		{
		memcpy(utr5Hits + utr5Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr5Covers + utr5Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr5Pos += utrSize;
		}
	    }
	if (eEnd > gp->cdsEnd)
	    {
	    int utrStart = max(gp->cdsEnd, eStart);
	    int utrEnd = eEnd;
	    int utrSize = utrEnd - utrStart;
	    if (isRev)
		{
		memcpy(utr5Hits + utr5Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr5Covers + utr5Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr5Pos += utrSize;
		}
	    else
		{
		memcpy(utr3Hits + utr3Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr3Covers + utr3Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr3Pos += utrSize;
		}
	    }
	}
    /* The copy loop must have filled exactly what the sizing loop counted. */
    assert(utr3Pos == utr3Size);
    assert(utr5Pos == utr5Size);
    assert(cdsAllPos == cdsAllSize);

    tallyHits(&g->utr5, utr5Hits, utr5Covers, utr5Size, isRev);
    tallyHits(&g->utr3, utr3Hits, utr3Covers, utr3Size, isRev);
    tallyHits(&g->cdsAll, cdsAllHits, cdsAllCovers, cdsAllSize, isRev);

    /* Optionally write out file with gene by gene info. */
    if (fParts != NULL)
        {
	/* Write header line first time through. */
	/* firstTime is static, so the header is written once per program
	 * run, not once per call. */
	static boolean firstTime = TRUE;
	if (firstTime)
	    {
	    firstTime = FALSE;
	    fprintf(fParts, "#accession\tsize_5\tali_5\tmatch_5\tsize_c\tali_c\tmatch_c\tsize_3\tali_3\tmatch_3\n");
	    }
	fprintf(fParts, "%s\t", gp->name);
	fprintf(fParts, "%d\t%d\t%d\t", utr5Size, 
		countBools(utr5Covers, utr5Size),
		countBools(utr5Hits, utr5Size));
	fprintf(fParts, "%d\t%d\t%d\t", cdsAllSize, 
		countBools(cdsAllCovers, cdsAllSize),
		countBools(cdsAllHits, cdsAllSize));
	fprintf(fParts, "%d\t%d\t%d\n", utr3Size, 
		countBools(utr3Covers, utr3Size),
		countBools(utr3Hits, utr3Size));
	}

    /* Tally upstream/downstream hits. */
    /* s1..e1 is a closeSize window starting closeHalf before txStart and
     * s2..e2 the same around txEnd.  On the minus strand the roles of the
     * two ends, and of upstream and downstream, are swapped. */
	{
	int s1 = gp->txStart - closeHalf;
	int e1 = s1 + closeSize;
	int s2 = gp->txEnd - closeHalf;
	int e2 = s2 + closeSize;
	if (isRev)
	    {
	    tallyInRange(&g->down, hits, covers, chrom->size, gp->txStart - g->baseDown,
		gp->txStart, isRev);
	    tallyInRange(&g->up, hits, covers, chrom->size, gp->txEnd, 
		gp->txEnd + g->baseUp, isRev);
	    tallyInRange(&g->txEnd, hits, covers, chrom->size, s1, e1, isRev);
	    tallyInRange(&g->txStart, hits, covers, chrom->size, s2, e2, isRev);
	    }
	else
	    {
	    tallyInRange(&g->up, hits, covers, chrom->size, gp->txStart - g->baseUp,
		gp->txStart, isRev);
	    tallyInRange(&g->down, hits, covers, chrom->size, gp->txEnd, 
		gp->txEnd + g->baseDown, isRev);
	    tallyInRange(&g->txStart, hits, covers, chrom->size, s1, e1, isRev);
	    tallyInRange(&g->txEnd, hits, covers, chrom->size, s2, e2, isRev);
	    }
	}

    /* Tally hits in coding exons */
    /* Each exon falls into at most one class below.  Exon bounds are
     * trimmed to the coding region before tallying. */
    for (exonIx=0; exonIx < gp->exonCount; ++exonIx)
        {
	int eStart = gp->exonStarts[exonIx];
	int eEnd = gp->exonEnds[exonIx];
	/* Single coding exon. */
	if (eStart <= gp->cdsStart && eEnd >= gp->cdsEnd)
	   {
	   eStart = gp->cdsStart;
	   eEnd = gp->cdsEnd;
	   tallyInRange(&g->cdsSingle, hits, covers, chrom->size,
	   		eStart, eEnd, isRev);
	   }
	/* Initial coding exon */
	/* Lowest-coordinate coding exon: first on the plus strand, last on
	 * the minus strand.  cs..ce is a closeSize window around cdsStart. */
	else if (eStart < gp->cdsStart && eEnd > gp->cdsStart)
	    {
	    int cs = gp->cdsStart - closeHalf;
	    int ce = cs + closeSize;
	    eStart = gp->cdsStart;
	    if (isRev)
	        {
		tallyInRange(&g->tlEnd, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsLast, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    else
	        {
		tallyInRange(&g->tlStart, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsFirst, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    }
	/* Final coding exon */
	/* Highest-coordinate coding exon: last on the plus strand, first on
	 * the minus strand.  cs..ce is a closeSize window around cdsEnd. */
	else if (eStart < gp->cdsEnd && eEnd > gp->cdsEnd)
	    {
	    int cs = gp->cdsEnd - closeHalf;
	    int ce = cs + closeSize;
	    eEnd = gp->cdsEnd;
	    if (isRev)
	        {
		tallyInRange(&g->tlStart, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsFirst, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    else
	        {
		tallyInRange(&g->tlEnd, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsLast, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    }
	/* Middle (but not only) coding exon */
	else if (eStart >= gp->cdsStart && eEnd <= gp->cdsEnd)
	    {
	    tallyInRange(&g->cdsMiddle, hits, covers, chrom->size, eStart, eEnd, isRev);
	    }
	else
	    {
	    /* Exon lies entirely outside the coding region: nothing to do. */
	    }
	}
	

    /* Tally hits in introns and splice sites. */
    /* The intron before exon exonIx runs from the end of the previous exon
     * to the start of this one.  A closeSize window around each of its ends
     * is tallied as a splice site, with 5' and 3' swapped on the minus
     * strand. */
    for (exonIx=1; exonIx<gp->exonCount; ++exonIx)
        {
	int iStart = gp->exonEnds[exonIx-1];
	int iEnd = gp->exonStarts[exonIx];
	int s1 = iStart - closeHalf;
	int e1 = s1 + closeSize;
	int s2 = iEnd - closeHalf;
	int e2 = s2 + closeSize;
	if (isRev)
	    {
	    tallyInRange(&g->splice3, hits, covers, chrom->size, 
		    s1, e1, isRev);
	    tallyInRange(&g->splice5, hits, covers, chrom->size, 
		    s2, e2, isRev);
	    }
	else
	    {
	    tallyInRange(&g->splice5, hits, covers, chrom->size, 
		    s1, e1, isRev);
	    tallyInRange(&g->splice3, hits, covers, chrom->size, 
		    s2, e2, isRev);
	    }
	tallyInRange(&g->intron, hits, covers, chrom->size, iStart, iEnd, isRev);
	}
    /* Release this gene's condensed buffers. */
    freez(&utr5Hits);
    freez(&utr3Hits);
    freez(&cdsAllHits);
    freez(&utr5Covers);
    freez(&utr3Covers);
    freez(&cdsAllCovers);
    }
freez(&hits);
freez(&covers);
lineFileClose(&lf);
}
Пример #21
0
void twinOrfStats(char *axtFile, char *raFile, char *outFile)
/* twinOrfStats - Collect stats on refSeq cDNAs aligned to another species via axtForEst. */
{
struct hash *rsiHash = readRefRa(raFile);
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
FILE *f = mustOpen(outFile, "w");
struct axt *axt;
static struct c1Counts c1Kozak[10], c1all, c1utr5, c1utr3, c1cds;
static struct c2Counts c2Kozak[10], c2All, c2Utr5, c2Utr3, c2Cds;
static struct c3Counts c3All, c3Utr5, c3Utr3, c3Cds;
char label[64];
char *predictFile = optionVal("predict", NULL);
int i;
static struct c3Counts cod1, cod2, cod3, stop, earlyCod1, earlyCod2, earlyCod3;
int earlySize;

initC3Counts(&cod1, 0);
initC3Counts(&cod2, 0);
initC3Counts(&cod3, 0);
initC3Counts(&earlyCod1, 0);
initC3Counts(&earlyCod2, 0);
initC3Counts(&earlyCod3, 0);
initC3Counts(&c3Utr3, 0);
initC3Counts(&c3Utr5, 0);
initC3Counts(&stop, 0);

threshold = optionFloat("threshold", threshold);
earlyAaSize = optionInt("earlyAaSize", earlyAaSize);
earlySize = 3*earlyAaSize;
while ((axt = axtRead(lf)) != NULL)
    {
    struct refSeqInfo *rsi = hashFindVal(rsiHash, axt->tName);
    if (rsi != NULL && rsi->cdsStart >= 6)
        {
	if (checkAtg(axt, rsi->cdsStart))
	    {
	    for (i=0; i<10; ++i)
		addPos(&c1Kozak[i], &c2Kozak[i], axt, rsi->cdsStart - 5 + i);
	    addRange(&c1all, &c2All, &c3All, axt, 0, rsi->size);
	    addRange(&c1utr5, &c2Utr5, &c3Utr5, axt, 0, rsi->cdsStart);
	    addRange(&c1cds, &c2Cds, &c3Cds, axt, rsi->cdsStart, rsi->cdsEnd);
	    addRange(&c1utr3, &c2Utr3, &c3Utr3, axt, rsi->cdsEnd, rsi->size);

	    /* The +3+1 in the expression below breaks down as so:  the
	     * +3 is to move past the first 'ATG' codon, which is part of
	     * the Kozak consensus model, not the coding model.  The +1
	     * is so that we look at the 2nd and 3rd bases of the previous
	     * codon, and the first base of the current codon.   */
	    addCodons(&earlyCod1, axt, rsi->cdsStart+3+1, rsi->cdsStart+1+earlySize);
	    addCodons(&earlyCod2, axt, rsi->cdsStart+3+2, rsi->cdsStart+2+earlySize);
	    addCodons(&earlyCod3, axt, rsi->cdsStart+3+3, rsi->cdsStart+3+earlySize);
	    addCodons(&cod1, axt, rsi->cdsStart+3+1+earlySize, rsi->cdsEnd-5);
	    addCodons(&cod2, axt, rsi->cdsStart+3+2+earlySize, rsi->cdsEnd-4);
	    addCodons(&cod3, axt, rsi->cdsStart+3+3+earlySize, rsi->cdsEnd-3);
	    addCodons(&stop, axt, rsi->cdsEnd-3, rsi->cdsEnd);
	    }
	}
    axtFree(&axt);
    }
lineFileClose(&lf);

dumpC1(f, &c1all, "c1_all");
dumpC2(f, &c2All, "c2_all");
dumpC3(f, &c3All, "c3_all");

dumpC1(f, &c1utr5, "c1_utr5");
dumpC2(f, &c2Utr5, "c2_utr5");
dumpC3(f, &c3Utr5, "c3_utr5");

dumpC1(f, &c1cds, "c1_cds");
dumpC2(f, &c2Cds, "c2_cds");
dumpC3(f, &c3Cds, "c3_cds");

dumpC1(f, &c1utr3, "c1_utr3");
dumpC2(f, &c2Utr3, "c2_utr3");
dumpC3(f, &c3Utr3, "c3_utr3");

for (i=0; i<10; ++i)
    {
    sprintf(label, "c1_kozak[%d]", i-5);
    dumpC1(f, &c1Kozak[i], label);
    sprintf(label, "c2_kozak[%d]", i-5);
    dumpC2(f, &c2Kozak[i], label);
    }
dumpC3(f, &earlyCod1, "earlyCod1");
dumpC3(f, &earlyCod2, "earlyCod2");
dumpC3(f, &earlyCod3, "earlyCod3");
dumpC3(f, &cod1, "cod1");
dumpC3(f, &cod2, "cod2");
dumpC3(f, &cod3, "cod3");
dumpC3(f, &stop, "stop");

if (predictFile)
    {
    predict(c1Kozak, &c1all, axtFile, predictFile, rsiHash);
    }
}
Пример #22
0
void checkExp(char *bedFileName, char *tNibDir, char *nibList)
{
struct lineFile *bf = lineFileOpen(bedFileName , TRUE), *af = NULL;
char *row[PSEUDOGENELINK_NUM_COLS] ;
struct pseudoGeneLink *ps;
char *tmpName[512], cmd[512];
struct axt *axtList = NULL, *axt, *mAxt = NULL;
struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seqList = NULL;
struct nibInfo *qNib = NULL, *tNib = NULL;
FILE *op;
int ret;

if (nibHash == NULL)
    nibHash = hashNew(0);
while (lineFileNextRow(bf, row, ArraySize(row)))
    {
    struct misMatch *misMatchList = NULL;
    struct binKeeper *bk = NULL;
    struct binElement *el, *elist = NULL;
    struct psl *mPsl = NULL, *rPsl = NULL, *pPsl = NULL, *psl ;
    struct misMatch *mf = NULL;
    ps = pseudoGeneLinkLoad(row);
    tmpName[0] = cloneString(ps->name);
    chopByChar(tmpName[0], '.', tmpName, sizeof(tmpName));
    verbose(2,"name %s %s:%d-%d\n",
            ps->name, ps->chrom, ps->chromStart,ps->chromEnd);
    /* get expressed retro from hash */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart, ps->chromEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        rPsl = el->val;
        verbose(2,"retroGene %s %s:%d-%d\n",rPsl->qName, ps->chrom, ps->chromStart,ps->chromEnd);
        }
    /* find mrnas that overlap parent gene */
    bk = hashFindVal(mrnaHash, ps->gChrom);
    elist = binKeeperFindSorted(bk, ps->gStart , ps->gEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        pPsl = el->val;
        verbose(2,"parent %s %s:%d %d,%d\n",
                pPsl->qName, pPsl->tName,pPsl->tStart,
                pPsl->match, pPsl->misMatch);
        }
    /* find self chain */
    bk = hashFindVal(chainHash, ps->chrom);
    elist = binKeeperFind(bk, ps->chromStart , ps->chromEnd ) ;
    slSort(&elist, chainCmpScoreDesc);
    for (el = elist; el != NULL ; el = el->next)
        {
        struct chain *chain = el->val, *subChain, *retChainToFree, *retChainToFree2;
        int qs = chain->qStart;
        int qe = chain->qEnd;
        int id = chain->id;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        if (!sameString(chain->qName , ps->gChrom) || 
                !positiveRangeIntersection(qs, qe, ps->gStart, ps->gEnd))
            {
            verbose(2," wrong chain %s:%d-%d %s:%d-%d parent %s:%d-%d\n", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd,
                ps->gChrom,ps->gStart,ps->gEnd);
            continue;
            }
        verbose(2,"chain id %d %4.0f",chain->id, chain->score);
        chainSubsetOnT(chain, ps->chromStart+7, ps->chromEnd-7, 
            &subChain,  &retChainToFree);
        if (subChain != NULL)
            chain = subChain;
        chainSubsetOnQ(chain, ps->gStart, ps->gEnd, 
            &subChain,  &retChainToFree2);
        if (subChain != NULL)
            chain = subChain;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        verbose(2," %s:%d-%d %s:%d-%d ", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd);
        if (subChain != NULL)
            verbose(2,"subChain %s:%d-%d %s:%d-%d\n",
                    subChain->qName, subChain->qStart, subChain->qEnd, 
                    subChain->tName,subChain->tStart,subChain->tEnd);

	qNib = nibInfoFromCache(nibHash, tNibDir, chain->qName);
	tNib = nibInfoFromCache(nibHash, tNibDir, chain->tName);
	tSeq = nibInfoLoadStrand(tNib, chain->tStart, chain->tEnd, '+');
	qSeq = nibInfoLoadStrand(qNib, chain->qStart, chain->qEnd, chain->qStrand);
	axtList = chainToAxt(chain, qSeq, chain->qStart, tSeq, chain->tStart,
	    maxGap, BIGNUM);
        verbose(2,"axt count %d misMatch cnt %d\n",slCount(axtList), slCount(misMatchList));
        for (axt = axtList; axt != NULL ; axt = axt->next)
            {
            addMisMatch(&misMatchList, axt, chain->qSize);
            }
        verbose(2,"%d in mismatch list %s id %d \n",slCount(misMatchList), chain->qName, id);
        chainFree(&retChainToFree);
        chainFree(&retChainToFree2);
        break;
        }
    /* create axt of each expressed retroGene to parent gene */
        /* get alignment for each mrna overlapping retroGene */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart , ps->chromEnd ) ;
    {
    char queryName[512];
    char axtName[512];
    char pslName[512];
    safef(queryName, sizeof(queryName), "/tmp/query.%s.fa", ps->chrom);
    safef(axtName, sizeof(axtName), "/tmp/tmp.%s.axt", ps->chrom);
    safef(pslName, sizeof(pslName), "/tmp/tmp.%s.psl", ps->chrom);
    op = fopen(pslName,"w");
    for (el = elist ; el != NULL ; el = el->next)
        {
        psl = el->val;
        pslOutput(psl, op, '\t','\n');
        qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0);

        if (qSeq != NULL)
            slAddHead(&seqList, qSeq);
        else
            errAbort("seq %s not found \n", psl->qName);
        }
    fclose(op);
    faWriteAll(queryName, seqList);
    safef(cmd,sizeof(cmd),"pslPretty -long -axt %s %s %s %s",pslName , nibList, queryName, axtName);
    ret = system(cmd);
    if (ret != 0)
        errAbort("ret is %d %s\n",ret,cmd);
    verbose(2, "ret is %d %s\n",ret,cmd);
    af = lineFileOpen(axtName, TRUE);
    while ((axt = axtRead(af)) != NULL)
        slAddHead(&mAxt, axt);
    lineFileClose(&af);
    }
    slReverse(&mAxt);
    /* for each parent/retro pair, count bases matching retro and parent better */
    for (el = elist; el != NULL ; el = el->next)
        {
        int i, scoreRetro=0, scoreParent=0, scoreNeither=0;
        struct dyString *parentMatch = newDyString(16*1024);
        struct dyString *retroMatch = newDyString(16*1024);
        mPsl = el->val;

        if (mAxt != NULL)
            {
            verbose(2,"mrna %s %s:%d %d,%d axt %s\n",
                    mPsl->qName, mPsl->tName,mPsl->tStart,
                    mPsl->match, mPsl->misMatch, 
                    mAxt->qName);
            assert(sameString(mPsl->qName, mAxt->qName));
            for (i = 0 ; i< (mPsl->tEnd-mPsl->tStart) ; i++)
                {
                int j = mAxt->tStart - mPsl->tStart;
                verbose(5, "listLen = %d\n",slCount(&misMatchList));
                if ((mf = matchFound(&misMatchList, (mPsl->tStart)+i)) != NULL)
                    {
                    if (toupper(mf->retroBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match retro[%d] %d %c == %c parent %c %d\n",
                                i,mf->retroLoc, mf->retroBase, mAxt->qSym[j+i], 
                                mf->parentBase, mf->parentLoc);
                        dyStringPrintf(retroMatch, "%d,", mf->retroLoc);
                        scoreRetro++;
                        }
                    else if (toupper(mf->parentBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match parent[%d] %d %c == %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->qSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        dyStringPrintf(parentMatch, "%d,", mf->parentLoc);
                        scoreParent++;
                        }
                    else
                        {
                        verbose (3,"match neither[%d] %d %c != %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->tSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        scoreNeither++;
                        }
                    }
                }
            verbose(2,"final score %s parent %d retro %d  neither %d\n",
                    mPsl->qName, scoreParent, scoreRetro, scoreNeither);
            fprintf(outFile,"%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n",
                    ps->chrom, ps->chromStart, ps->chromEnd, ps->name, ps->score, 
                    mPsl->tName, mPsl->tStart, mPsl->tEnd, mPsl->qName, 
                    scoreParent, scoreRetro, scoreNeither, parentMatch->string, retroMatch->string);
            mAxt = mAxt->next;
            }
        dyStringFree(&parentMatch);
        dyStringFree(&retroMatch);
        }
    }
}
void liftAxt(char *destFile, struct hash *liftHash, 
	int sourceCount, char *sources[], boolean querySide)
/* Lift up coordinates in .axt file.
 *   destFile    - output file, opened for writing with mustOpen
 *   liftHash    - lookup table handed to findLift
 *   sourceCount - number of entries in sources
 *   sources     - input .axt files, processed in order
 *   querySide   - TRUE to lift the query coordinates and name,
 *                 FALSE to lift the target coordinates and name
 * Records with no lift spec are dropped unless the file-level 'how'
 * setting equals carryMissing, in which case they are written unchanged. */
{
FILE *f = mustOpen(destFile, "w");
int sourceIx;
int dotMod = dots;

for (sourceIx = 0; sourceIx < sourceCount; ++sourceIx)
    {
    char *source = sources[sourceIx];
    struct lineFile *lf = lineFileOpen(source, TRUE);
    struct axt *axt;
    lineFileSetMetaDataOutput(lf, f);
    verbose(1, "Lifting %s\n", source);
    while ((axt = axtRead(lf)) != NULL)
        {
	struct liftSpec *spec;
	/* Work on a shallow copy so the original record, which still owns
	 * its strings, can be freed untouched. */
	struct axt a = *axt;
	char *seqName;
	if (querySide)
	    seqName = a.qName;
	else
	    seqName = a.tName;
	spec = findLift(liftHash, seqName, lf);
	if (spec == NULL)
	    {
	    if (how != carryMissing)
	        {
		axtFree(&axt);
		continue;
		}
	    }
	else
	    {
	    int offset;
	    char strand = (querySide ? a.qStrand : a.tStrand);
	    cantHandleSpecRevStrand(spec);
	    if (strand == '-')
		{
		/* Minus strand coordinates count from the other end, so the
		 * offset is the distance from the contig end to the end of
		 * the new sequence. */
		int ctgEnd = spec->offset + spec->oldSize;
		offset = spec->newSize - ctgEnd;
		}
	    else
		offset = spec->offset;
	    if (querySide)
	        {
		a.qStart += offset;
		a.qEnd += offset;
		a.qName = spec->newName;
		}
	    else
	        {
		a.tStart += offset;
		a.tEnd += offset;
		a.tName = spec->newName;
		if (strand == '-')
                    warn("Target minus strand, please double check results.");
                }
            }
        axtWrite(&a, f);
        axtFree(&axt);
        doDots(&dotMod);
        }
    lineFileClose(&lf);
    if (dots)
        verbose(1, "\n");
    }

/* Flush and release the output so buffered records reach destFile.  The
 * close is skipped when f is the process's stdout, in case mustOpen handed
 * that stream back for destFile. */
if (f != stdout)
    {
    if (fclose(f) != 0)
        warn("Error closing %s", destFile);
    }
else
    fflush(f);
}
Пример #24
0
void axtHiQualDiffs(char *axtFile, struct hash *qacHash, FILE *f)
/* Write out high quality diffs in axtFile to f.
 *   axtFile - alignment file read with axtRead
 *   qacHash - quality records looked up by query name (must contain every
 *             query name in axtFile, since hashMustFindVal is used)
 *   f       - output stream, one tab-separated line per reported diff
 * A mismatching column is reported when all of these hold:
 *   - the quality at the query base is at least diffQualMin,
 *   - every quality in the winSize-wide window around it is at least
 *     winQualMin,
 *   - the same-sized window of alignment columns lies inside the block,
 *     contains no gap and has at most winMaxDiff mismatches,
 *   - with ignore98 set, the quality at the base is not 98.
 * With chimpPos set the query name and position are appended to the line. */
{
char *qName = cloneString("");
UBYTE *qQuals = NULL;
struct qac *qac = NULL;
struct axt *axt = NULL;
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
int qStart, qDir, qPos, qWinStart, qWinEnd, tPos;
int qWinSize     = optionInt("winSize",     11);
int qQualMin     = optionInt("diffQualMin", 30);
int qWinQualMin  = optionInt("winQualMin",  25);
int qWinMaxDiff  = optionInt("winMaxDiff",  2);
boolean qIndelOk = optionExists("indelOk");  /* NOTE(review): read but never consulted below */
boolean qIgnore98 = optionExists("ignore98");
boolean chimpPos = optionExists("chimpPos");
int qHalfWinSize = qWinSize/2;

while ((axt = axtRead(lf)) != NULL)
    {
    char *qSym = axt->qSym, *tSym = axt->tSym;
    int symIx, symCount = axt->symCount;
    char qc,tc;
    toUpperN(qSym, symCount);
    toUpperN(tSym, symCount);
    /* Only uncompress the quality scores when the query sequence changes. */
    if (!sameString(axt->qName, qName))
        {
        freez(&qName);
        qName = cloneString(axt->qName);
        qac = hashMustFindVal(qacHash, qName);
        freez(&qQuals);
        qQuals = needHugeMem(qac->uncSize);
        rleUncompress(qac->data, qac->compSize, qQuals, qac->uncSize);
        }
    /* On the minus strand walk the quality array backwards from the
     * mirrored start position. */
    if (axt->qStrand == '+')
        {
        qStart = axt->qStart;
        qDir = 1;
        }
    else
        {
        qStart = qac->uncSize - axt->qStart - 1;
        qDir = -1;
        }
    qPos = qStart;
    tPos = axt->tStart;
    for (symIx = 0; symIx < symCount; ++symIx)
        {
        qc = qSym[symIx];
        tc = tSym[symIx];
        if (qc == '-')
            tPos += 1;
        else if (tc == '-')
            qPos += qDir;
        else 
            {
            if (qc != tc)
                {
                qWinStart = qPos - qHalfWinSize;
                qWinEnd = qWinStart + qWinSize;
                if (qWinStart >= 0 && qWinEnd < qac->uncSize
                        && qQuals[qPos] >= qQualMin)
                    {
                    int i;
                    boolean ok = TRUE;
                    for (i = qWinStart; i<qWinEnd; ++i)
                        if (qQuals[i] < qWinQualMin)
                            {
                            ok = FALSE;
                            break;
                            }
                    if (ok)
                        {
                        int diffCount = 0;
                        int symWinStart = symIx - qHalfWinSize;
                        int symWinEnd = symWinStart + qWinSize;
                        /* A window reaching past either end of this block
                         * cannot be checked; reading it would run outside
                         * the symbol arrays, so treat it as failing. */
                        if (symWinStart < 0 || symWinEnd > symCount)
                            ok = FALSE;
                        for (i=symWinStart; ok && i < symWinEnd; ++i)
                            {
                            char wq = qSym[i];
                            char wt = tSym[i];
                            if (wq == '-' || wt == '-')
                                ok = FALSE;
                            else if (wq != wt)
                                ++diffCount;
                            }
                        if (ok && diffCount <= qWinMaxDiff && (!qIgnore98 || qQuals[qPos] != 98) )
                            {
                            if (chimpPos)
                                fprintf(f, "%s\t%d\t%d\t%c\t%c\t%s\t%d\t%d\n",
                                        axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx], axt->qName, qPos, qPos+1);
                            else
                                fprintf(f, "%s\t%d\t%d\t%c\t%c\n",
                                        axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx]);
                            }
                        }
                    }
                }
            qPos += qDir;
            tPos += 1;
            }
        }
    axtFree(&axt);
    }
lineFileClose(&lf);
/* Release the per-query buffers kept alive across records. */
freez(&qQuals);
freez(&qName);
}
Пример #25
0
void axtCalcMatrix(int fileCount, char *files[])
/* axtCalcMatrix - Calculate substitution matrix and make indel histogram.
 *   fileCount - number of entries in files
 *   files     - axt files to read; statistics are pooled over all of them
 * Prints to stdout: the substitution matrix as fractions of all counted
 * base pairs, insert/delete histogram rows as percentages of total target
 * bases, the perfect/gapless run summaries, and overall totals.
 * Histogram size comes from the "maxInsert" option (default 21); run
 * histograms are sized by the file-level maxPerfect. */
{
int *histIns, *histDel, *histPerfect, *histGapless;
int maxInDel = optionInt("maxInsert", 21);
static int matrix[4][4];
static char bestGapless[256], bestPerfect[256];
int i, j, total = 0;
double scale;
int fileIx;
struct axt *axt;
static int trans[4] = {A_BASE_VAL, C_BASE_VAL, G_BASE_VAL, T_BASE_VAL};
static char *bases[4] = {"A", "C", "G", "T"};
int totalT = 0, totalMatch = 0, totalMismatch = 0, 
	tGapStart = 0, tGapExt=0, qGapStart = 0, qGapExt = 0;

AllocArray(histIns, maxInDel+1);
AllocArray(histDel, maxInDel+1);
AllocArray(histPerfect, maxPerfect+1);
AllocArray(histGapless, maxPerfect+1);
for (fileIx = 0; fileIx < fileCount; ++fileIx)
    {
    char *fileName = files[fileIx];
    struct lineFile *lf = lineFileOpen(fileName, TRUE);
    while ((axt = axtRead(lf)) != NULL)
        {
	totalT += axt->tEnd - axt->tStart;
	addMatrix(matrix, axt->tSym, axt->qSym, axt->symCount);
	addInsert(histIns, maxInDel, axt->tSym, axt->symCount,
		&tGapStart, &tGapExt);
	addInsert(histDel, maxInDel, axt->qSym, axt->symCount,
		&qGapStart, &qGapExt);
	addPerfect(axt, histPerfect, maxPerfect, 
		axt->qSym, axt->tSym, axt->symCount, bestPerfect);
	addGapless(axt, histGapless, maxPerfect, 
		axt->qSym, axt->tSym, axt->symCount, bestGapless);
	axtFree(&axt);
	}
    lineFileClose(&lf);
    }


printf("   ");
for (i=0; i<4; ++i)
    printf("%5s ", bases[i]);
printf("\n");

/* Diagonal cells are matches, everything else mismatches. */
for (i=0; i<4; ++i)
    {
    for (j=0; j<4; ++j)
	{
	int one = matrix[i][j];
        total += one;
	if (i == j)
	    totalMatch += one;
	else
	    totalMismatch += one;
	}
    }
/* With no aligned bases at all, print zeros rather than dividing by zero. */
scale = (total != 0) ? 1.0 / total : 0.0;

/* Rows and columns are printed in A,C,G,T order; trans maps that order
 * onto the indices used inside matrix. */
for (i=0; i<4; ++i)
    {
    int it = trans[i];
    printf(" %s", bases[i]);
    for (j=0; j<4; ++j)
        {
	int jt = trans[j];
	printf(" %5.4f", matrix[it][jt] * scale);
	}
    printf("\n");
    }
printf("\n");

/* Print up to 20 histogram rows, but never index past the maxInDel+1
 * entries that were allocated when a smaller maxInsert is given. */
for (i=1; i<21 && i<=maxInDel; ++i)
    {
    double insPct = 0.0, delPct = 0.0;
    if (totalT != 0)
        {
	insPct = 100.0*histIns[i]/totalT;
	delPct = 100.0*histDel[i]/totalT;
	}
    if (i == 20)
        printf(">=");
    printf("%2d  %6.4f%% %6.4f%%\n", i, insPct, delPct);
    }

printf("\n");
printMedianEtc("perfect", histPerfect, maxPerfect, bestPerfect);
printMedianEtc("gapless", histGapless, maxPerfect, bestGapless);
printf("\n");
printLabeledPercent("totalT:    ", totalT, totalT);
printLabeledPercent("matches:   ", totalMatch, totalT);
printLabeledPercent("mismatches:", totalMismatch, totalT);
printLabeledPercent("tGapStart: ", tGapStart, totalT);
printLabeledPercent("qGapStart: ", qGapStart, totalT);
printLabeledPercent("tGapExt:   ", tGapExt, totalT);
printLabeledPercent("qGapExt:   ", qGapExt, totalT);
printLabeledPercent("baseId:    ", totalMatch, totalMatch+totalMismatch);
}