Ejemplo n.º 1
0
struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed)
/* Get sequence defined by bed.  Exclude introns. */
{
struct dnaSeq *seq;
if (bed->blockCount <= 1)
    {
    seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd);
    freeMem(seq->name);
    seq->name = cloneString(bed->name);
    }
else
    {
    int totalBlockSize = bedTotalBlockSize(bed);
    AllocVar(seq);
    seq->name = cloneString(bed->name);
    seq->dna = needMem(totalBlockSize+1);
    seq->size = totalBlockSize;
    int i;
    int seqOffset = 0;
    for (i=0; i<bed->blockCount; ++i)
        {
	int exonSize = bed->blockSizes[i];
	int exonStart = bed->chromStart + bed->chromStarts[i];
	struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize);
	memcpy(seq->dna + seqOffset, exon->dna, exonSize);
	seqOffset += exonSize;
	dnaSeqFree(&exon);
	}
    }
if (bed->strand[0] == '-')
    reverseComplement(seq->dna, seq->size);
return seq;
}
Ejemplo n.º 2
0
void readCachedSeqPart(char *seqName, int start, int size, boolean getMasked,
     struct hash *hash, struct dlList *fileCache, 
     struct dnaSeq **retSeq, int *retOffset, boolean *retIsNib)
/* Read sequence hopefully using file cashe. If sequence is in a nib
 * file just read part of it. */
{
struct seqFilePos *sfp = hashMustFindVal(hash, seqName);
FILE *f = openFromCache(fileCache, sfp);
if (sfp->isTwoBit)
    {
    *retSeq = twoBitReadSeqFrag((struct twoBitFile *)f, seqName, start, start + size);
    *retOffset = start;
    *retIsNib = TRUE;
    }
else if (sfp->isNib)
    {
    *retSeq = nibLdPartMasked((getMasked ? NIB_MASK_MIXED : 0), sfp->file, f, sfp->pos, start, size);
    *retOffset = start;
    *retIsNib = TRUE;
    }
else
    {
    if (getMasked)
        errAbort("masked sequences not supported with fasta files");
    *retSeq = readSeqFromFaPos(sfp, f);
    *retOffset = 0;
    *retIsNib = FALSE;
    }
}
Ejemplo n.º 3
0
void readCachedSeqPart(char *seqName, int start, int size, 
     struct hash *hash, struct dlList *fileCache, 
     struct dnaSeq **retSeq, int *retOffset, boolean *retIsPartial)
/* Read sequence hopefully using file cashe. If sequence is in a nib
 * file just read part of it. */
{
struct seqFilePos *sfp = hashMustFindVal(hash, seqName);
FILE *f = openFromCache(fileCache, sfp->file);
if (sfp->isNib)
    {
    *retSeq = nibLdPartMasked(NIB_MASK_MIXED, sfp->file, f, sfp->pos, start, size);
    *retOffset = start;
    *retIsPartial = TRUE;
    }
else if (sfp->isTwoBit)
    {
    *retSeq = twoBitReadSeqFrag(sfp->tbf, seqName, start, start+size);
    *retOffset = start;
    *retIsPartial = TRUE;
    }
else
    {
    *retSeq = readSeqFromFaPos(sfp, f);
    *retOffset = 0;
    *retIsPartial = FALSE;
    }
}
Ejemplo n.º 4
0
void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, 
	int start, int end)
/* Output sequence. */
{
struct dnaSeq *seq = twoBitReadSeqFrag(tbf, seqSpec, start, end);
if (noMask)
    toUpperN(seq->dna, seq->size);
faWriteNext(f, seq->name, seq->dna, seq->size);
dnaSeqFree(&seq);
}
Ejemplo n.º 5
0
struct dnaSeq *nibTwoCacheSeq(struct nibTwoCache *ntc, char *seqName)
/* Return all of sequence. This will have repeats in lower case. */
{
if (ntc->isTwoBit)
    return twoBitReadSeqFrag(ntc->tbf, seqName, 0, 0);
else
    {
    struct nibInfo *nib = nibInfoFromCache(ntc->nibHash, ntc->pathName, seqName);
    return nibLdPart(nib->fileName, nib->f, nib->size, 0, nib->size);
    }
}
Ejemplo n.º 6
0
struct dnaSeq *dnaLoadSingle(char *fileName, int *retStart, int *retEnd, int *retParentSize)
/* Return sequence if it's a nib file or 2bit part, NULL otherwise. */
{
struct dnaSeq *seq = NULL;
unsigned start = 0, end = 0;
int parentSize = 0;
if (nibIsFile(fileName))
    {
    /* Save offset out of fileName for auto-lifting */
    char filePath[PATH_LEN];
    char name[PATH_LEN];
    nibParseName(0, fileName, filePath, name, &start, &end);

    if (end != 0)	/* It's just a range. */
        {
	FILE *f;
	int size;
	nibOpenVerify(filePath, &f, &size);
	parentSize = size;
	}
    seq =  nibLoadAllMasked(NIB_MASK_MIXED, fileName);
    if (end == 0)
         parentSize = end = seq->size;
    freez(&seq->name);
    seq->name = cloneString(name);
    }
else if (twoBitIsRange(fileName))
    {
    /* Save offset out of fileName for auto-lifting */
    char *rangeSpec = cloneString(fileName);
    int start, end;
    char *file, *seqName;
    twoBitParseRange(rangeSpec, &file, &seqName, &start, &end);

    /* Load sequence. */
        {
	struct twoBitFile *tbf = twoBitOpen(file);
	parentSize = twoBitSeqSize(tbf, seqName);
	seq = twoBitReadSeqFrag(tbf, seqName, start, end);
	twoBitClose(&tbf);
	}
    if (end == 0)
        end = seq->size;
    freez(&rangeSpec);
    }
if (retStart != NULL)
    *retStart = start;
if (retEnd != NULL)
    *retEnd = end;
if (retParentSize != NULL)
    *retParentSize = parentSize;
return seq;
}
Ejemplo n.º 7
0
void doFetch(char *inputFileName, char *sequenceFileName, char *outputFileName)
/* lookup sequence for each line */
{
struct lineFile *lf = NULL;
char *line;
char *row[6];
int elementCount;
struct twoBitFile *tbf;

char *fileChrom = NULL;
int start = 0;
int end = 0;
char *name = NULL;
int score = 0;
char *strand = NULL;

struct dnaSeq *chunk = NULL;

FILE *outputFileHandle = mustOpen(outputFileName, "w");

tbf = twoBitOpen(sequenceFileName);

lf = lineFileOpen(inputFileName, TRUE);
while (lineFileNext(lf, &line, NULL))
    {
    elementCount = chopString(line, "\t", row, ArraySize(row));
    if (elementCount != 6) continue;

    fileChrom = cloneString(row[0]);
    start = sqlUnsigned(row[1]);
    end = sqlUnsigned(row[2]);
    name = cloneString(row[3]);
    score = sqlUnsigned(row[4]);
    strand = cloneString(row[5]);

    if (start == end) continue;
    assert (end > start);

    chunk = twoBitReadSeqFrag(tbf, fileChrom, start, end);
    touppers(chunk->dna);
    if (sameString(strand, "-"))
        reverseComplement(chunk->dna, chunk->size);
    fprintf(outputFileHandle, "%s\t%d\t%d\t%s\t%d\t%s\t%s\n", fileChrom, start, end, name, score, strand, chunk->dna);
    dnaSeqFree(&chunk);
    }

lineFileClose(&lf);
carefulClose(&outputFileHandle);
}
Ejemplo n.º 8
0
struct dnaSeq *nibTwoLoadOne(char *pathName, char *seqName)
/* Return sequence from a directory full of nibs or a .2bit file. 
 * The sequence will have repeats in lower case. */
{
struct dnaSeq *seq;
if (twoBitIsFile(pathName))
    {
    struct twoBitFile *tbf = twoBitOpen(pathName);
    seq = twoBitReadSeqFrag(tbf, seqName, 0, 0);
    twoBitClose(&tbf);
    }
else
    {
    char path[512];
    safef(path, sizeof(path), "%s/%s.nib", pathName, seqName);
    seq = nibLoadAllMasked(NIB_MASK_MIXED, path);
    }
return seq;
}
Ejemplo n.º 9
0
struct dnaSeq *readFromCache(struct dlList *cache, char *dirName, char *seqName,
	int start, int size, int seqSize, boolean isTwoBit)
/* Return dnaSeq read from the appropriate nib file.  
 * You need to dnaSeqFree this when done (it is the nib
 * file that is cached, not the sequence). */
{
struct cachedSeqFile *cn = openFromCache(cache, dirName, seqName, isTwoBit);
if (isTwoBit)
    {
    return twoBitReadSeqFrag(cn->tbf, seqName, start, start+size);
    }
else
    {
    if (seqSize != cn->size)
	errAbort("%s/%s is %d bases in .lav file and %d in .nib file\n",
	   dirName, seqName, seqSize, cn->size); 
    if ((start+size) > cn->size )
	printf("%s/%s is %d bases in .lav file and %d in .nib file start %d size %d end %d\n",
	   dirName, seqName, seqSize, cn->size, start, size, start+size); 
    return nibLdPartMasked(NIB_MASK_MIXED, cn->fileName, cn->f, cn->size, start, size);
    }
}
Ejemplo n.º 10
0
void seqFromPsl(char *inPsl, char *inTwoBit, char *outFa)
/* seqFromPsl - Extract masked sequence from database corresponding to psl file. */
{
struct twoBitFile *tbf = twoBitOpen(inTwoBit);
struct lineFile *lf = pslFileOpen(inPsl);
FILE *f = mustOpen(outFa, "w");
struct psl *psl;

while ((psl = pslNext(lf)) != NULL)
    {
    char faHead[512];
    struct dnaSeq *seq = twoBitReadSeqFrag(tbf, psl->tName,
    	psl->tStart, psl->tEnd);
    if (psl->strand[0] == '-')
        reverseComplement(seq->dna, seq->size);
    safef(faHead, sizeof(faHead), "%s (%s:%d-%d)", 
    	psl->qName, psl->tName, psl->tStart+1, psl->tEnd);
    if (hardMask)
        lowerToN(seq->dna, seq->size);
    faWriteNext(f, faHead, seq->dna, seq->size);
    }
carefulClose(&f);
}
void loadIfNewSeq(char *seqPath, boolean isTwoBit, char *newName, char strand, 
	char **pName, struct dnaSeq **pSeq, char *pStrand)
/* Load sequence unless it is already loaded.  Reverse complement
 * if necessary. */
{
struct dnaSeq *seq;
if (sameString(newName, *pName))
    {
    if (strand != *pStrand)
        {
	seq = *pSeq;
	reverseComplement(seq->dna, seq->size);
	*pStrand = strand;
	}
    }
else
    {
    char fileName[512];
    freeDnaSeq(pSeq);
    if (isTwoBit)
        {
	struct twoBitFile *tbf = twoBitOpenCached(seqPath);
	*pSeq = seq = twoBitReadSeqFrag(tbf, newName, 0, 0);
	verbose(1, "Loaded %d bases of %s from %s\n", seq->size, newName, seqPath);
	}
    else
	{
	snprintf(fileName, sizeof(fileName), "%s/%s.nib", seqPath, newName);
	*pSeq = seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
	verbose(1, "Loaded %d bases in %s\n", seq->size, fileName);
	}
    *pName = newName;
    *pStrand = strand;
    if (strand == '-')
	reverseComplement(seq->dna, seq->size);
    }
}
void fillHoles(struct mafAli *mafList, struct subSpecies *speciesList, struct twoBitFile *twoBit)
{
int lastEnd = 100000000;
struct mafAli *prevMaf = NULL, *maf, *nextMaf;
struct subSpecies *species;

/*
for(species = speciesList; species; species = species->next)
    {
    blockStatus = &species->blockStatus;
    blockStatus->mc->rightStatus = MAF_NEW_STATUS;
    blockStatus->mc->rightLen = 0;
    }
    */

for(maf = mafList; maf ; prevMaf = maf, maf = nextMaf)
    {
    struct mafComp *mc = NULL, *masterMc, *lastMc = NULL;
    struct mafAli *newMaf = NULL;
    struct blockStatus *blockStatus;

    nextMaf = maf->next;

    masterMc=maf->components;
    if (masterMc->start > lastEnd)
	{
	struct subSpecies *species;

	for(species = speciesList; species; species = species->next)
	    {
	    mc = NULL;
//	    printf("looking at %s\n",species->name);
	    blockStatus = &species->blockStatus;
	    if (blockStatus->mc)
		{
//	    printf("should match at %s\n",blockStatus->mc->src);
		switch (blockStatus->mc->rightStatus)
		    {
		    case MAF_MISSING_STATUS:
		    //printf("missing right\n");
		    case MAF_NEW_NESTED_STATUS:
		    case MAF_MAYBE_NEW_NESTED_STATUS:
		    case MAF_CONTIG_STATUS:
		    case MAF_TANDEM_STATUS:
		    case MAF_INSERT_STATUS:
			AllocVar(mc);
			mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus;
			mc->rightLen = mc->leftLen = blockStatus->mc->rightLen;
			mc->src = blockStatus->mc->src;
			mc->srcSize = blockStatus->mc->srcSize;
			mc->strand = blockStatus->mc->strand;
			mc->start = blockStatus->mc->start + blockStatus->mc->size;
			if (lastMc == NULL)
			    {
			    struct mafComp *miniMasterMc = NULL;
			    char *seqName;
			    struct dnaSeq *seq;

			    AllocVar(miniMasterMc);
			    miniMasterMc->next = mc;
			    miniMasterMc->strand = '+';
			    miniMasterMc->srcSize = masterMc->srcSize;
			    miniMasterMc->src = masterMc->src;
			    miniMasterMc->start = lastEnd;
			    miniMasterMc->size =  masterMc->start - lastEnd;

			    if ((seqName = strchr(miniMasterMc->src, '.')) != NULL)
				seqName++;
			    else 
			    	seqName = miniMasterMc->src;

//			    printf("hole filled from %d to %d\n",lastEnd, masterMc->start);
			    seq = twoBitReadSeqFrag(twoBit, seqName, lastEnd, masterMc->start);
			    miniMasterMc->text = seq->dna;

			    AllocVar(newMaf);
			    newMaf->textSize = maf->textSize;
			    newMaf->components = miniMasterMc;
			    newMaf->next = maf;
			    if (prevMaf)
				prevMaf->next = newMaf;
			    else
				mafList = newMaf;
			    //masterMc = miniMasterMc;
			    }
			else
			    {
			    lastMc->next = mc;
			    }
			lastMc = mc;
			if  (blockStatus->mc->rightStatus ==  MAF_MISSING_STATUS)
			    {
			    if (addN)
				{
				char buffer[256];

				safef(buffer, sizeof(buffer), "%s.N",species->name);
				mc->src = cloneString(buffer);
				mc->start = 0;
				mc->srcSize = 200000;
				mc->size =  masterMc->start - lastEnd;
				mc->text = needMem(mc->size + 1);
				memset(mc->text, 'N', mc->size);
				}
			    }
			else
			    {
			    if (addDash)
				{
				mc->size = masterMc->size;
				mc->srcSize = blockStatus->mc->srcSize;
				mc->text = needMem(mc->size + 1);
				memset(mc->text, '-', mc->size);
				mc->text[mc->size] = 0;
				if (mc->size == 0)
				    errAbort("bad dash add");
				mc->size = 0;
				}
			    }
			break;
		    }
		}
	    }
	}
    lastEnd = masterMc->start + masterMc->size;
    for(lastMc = masterMc; lastMc->next; lastMc = lastMc->next)
	;

    for(species = speciesList; species; species = species->next)
	{
	blockStatus = &species->blockStatus;
	
	mc = NULL;
	if ((blockStatus->masterStart <= masterMc->start) && 
	    (blockStatus->masterEnd > masterMc->start) && 
	 ((mc = mafMayFindCompPrefix(maf, species->name,NULL)) == NULL))
	    {
	    if (blockStatus->mc != NULL)
		{
		switch (blockStatus->mc->rightStatus)
		    {
		    case MAF_MISSING_STATUS:
		    case MAF_CONTIG_STATUS:
		    case MAF_TANDEM_STATUS:
		    case MAF_INSERT_STATUS:
		    case MAF_NEW_NESTED_STATUS:
		    case MAF_MAYBE_NEW_NESTED_STATUS:
			AllocVar(mc);
			mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus;
			if (mc->rightStatus == MAF_NEW_NESTED_STATUS)
			    mc->rightStatus = MAF_INSERT_STATUS;
			if (mc->leftStatus == MAF_NEW_NESTED_STATUS)
			    mc->leftStatus = MAF_INSERT_STATUS;
			mc->rightLen = mc->leftLen = blockStatus->mc->rightLen;
			mc->src = blockStatus->mc->src;
			mc->strand = blockStatus->mc->strand;
			mc->srcSize = blockStatus->mc->srcSize;
			mc->start = blockStatus->mc->start + blockStatus->mc->size ;
			lastMc->next = mc;
			lastMc = mc;
			if  (addN && (blockStatus->mc->rightStatus ==  MAF_MISSING_STATUS))
			    {
			    char buffer[256];

			    safef(buffer, sizeof(buffer), "%s.N",species->name);
			    mc->src = cloneString(buffer);
			    mc->start = 0;
			    mc->srcSize = 200000;
			    mc->size = maf->textSize;
			    mc->text = needMem(mc->size + 1);
			    memset(mc->text, 'N', mc->size);
			    }
			else if (addDash)
			    {
				mc->size = masterMc->size;
			    mc->text = needMem(mc->size + 1);
			    if (mc->size == 0)
				errAbort("bad dash add");
			    memset(mc->text, '-', mc->size);
			    mc->text[mc->size] = 0;
			    mc->size = 0;
			    }
			    
			break;
		    default:
			break;
		    }
		}
	    }
	if (mc)
	    {
	    blockStatus->mc = mc;
	    }
	}
    }
}
Ejemplo n.º 13
0
void searchOneIndex(int fileCount, char *files[], struct genoFind *gf, char *outName, 
	boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
/* Search all sequences in all files against single genoFind index. */
{
int i;
char *fileName;
int count = 0; 
long long totalSize = 0;

gfOutputHead(gvo, outFile);
for (i=0; i<fileCount; ++i)
    {
    fileName = files[i];
    if (nibIsFile(fileName))
        {
	struct dnaSeq *seq;

	if (isProt)
	    errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName);
	seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
	freez(&seq->name);
	seq->name = cloneString(fileName);
	searchOneMaskTrim(seq, isProt, gf, outFile,
			  maskHash, &totalSize, &count);
	freeDnaSeq(&seq);
	}
    else if (twoBitIsSpec(fileName))
	{
	struct twoBitSpec *tbs = twoBitSpecNew(fileName);
	struct twoBitFile *tbf = twoBitOpen(tbs->fileName);
	if (isProt)
	    errAbort("%s is a two bit file, which doesn't work for proteins.", 
	    	fileName);
	if (tbs->seqs != NULL)
	    {
	    struct twoBitSeqSpec *ss = NULL;
	    for (ss = tbs->seqs;  ss != NULL;  ss = ss->next)
		{
		struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name,
						       ss->start, ss->end);
		searchOneMaskTrim(seq, isProt, gf, outFile,
				  maskHash, &totalSize, &count);
		dnaSeqFree(&seq);
		}
	    }
	else
	    {
	    struct twoBitIndex *index = NULL;
	    for (index = tbf->indexList; index != NULL; index = index->next)
		{
		struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0);
		searchOneMaskTrim(seq, isProt, gf, outFile,
				  maskHash, &totalSize, &count);
		dnaSeqFree(&seq);
		}
	    }
	twoBitClose(&tbf);
	}
    else
        {
	static struct dnaSeq seq;
	struct lineFile *lf = lineFileOpen(fileName, TRUE);
	while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
	    {
	    searchOneMaskTrim(&seq, isProt, gf, outFile,
			      maskHash, &totalSize, &count);
	    }
	lineFileClose(&lf);
	}
    }
carefulClose(&outFile);
if (showStatus)
    printf("Searched %lld bases in %d sequences\n", totalSize, count);
}
Ejemplo n.º 14
0
static struct dnaSeq *dnaLoadNextFromStack(struct dnaLoad *dl)
/* Load next piece of DNA from stack of files.  Return NULL
 * when stack is empty. */
{
struct dnaLoadStack *dls;
struct dnaSeq *seq = NULL;
while ((dls = dl->stack) != NULL)
    {
    if (dls->twoBit)
        {
	if (dls->tbi != NULL)
	    {
	    seq = twoBitReadSeqFrag(dls->twoBit, dls->tbi->name, 0, 0);
	    dls->tbi = dls->tbi->next;
	    return seq;
	    }
	else
	    {
	    dl->stack = dls->next;
	    dnaLoadStackFree(&dls);
	    }
	}
    else if (dls->textIsFa)
        {
	DNA *dna;
	char *name;
	int size;
	if (faMixedSpeedReadNext(dls->textFile, &dna, &size, &name))
	    {
	    AllocVar(seq);
	    seq->dna = needLargeMem(size+1);
	    memcpy((void *)seq->dna, (void *)dna, size);
	    seq->dna[size] = 0;
	    seq->size = size;
	    seq->name = cloneString(name);
	    dl->curStart = 0;
	    dl->curEnd = size;
	    dl->curSize = size;
	    return seq;
	    }
	else
	    {
	    dl->stack = dls->next;
	    dnaLoadStackFree(&dls);
	    }
	}
    else	/* It's a file full of file names. */
        {
	char *line;
	if (lineFileNextReal(dls->textFile, &line))
	    {
	    line  = trimSpaces(line);
	    if ((seq = dnaLoadSingle(line, &dl->curStart, &dl->curEnd, &dl->curSize)) != NULL)
	         return seq;
	    else
	         {
		 struct dnaLoadStack *newDls;
		 newDls = dnaLoadStackNew(line);
		 slAddHead(&dl->stack, newDls);
		 }
	    }
	else
	    {
	    dl->stack = dls->next;
	    dnaLoadStackFree(&dls);
	    }
	}
    }
dl->finished = TRUE;
return NULL;
}
Ejemplo n.º 15
0
void checkExp(char *bedFileName, char *tNibDir, char *nibList)
{
struct lineFile *bf = lineFileOpen(bedFileName , TRUE), *af = NULL;
char *row[PSEUDOGENELINK_NUM_COLS] ;
struct pseudoGeneLink *ps;
char *tmpName[512], cmd[512];
struct axt *axtList = NULL, *axt, *mAxt = NULL;
struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seqList = NULL;
struct nibInfo *qNib = NULL, *tNib = NULL;
FILE *op;
int ret;

if (nibHash == NULL)
    nibHash = hashNew(0);
while (lineFileNextRow(bf, row, ArraySize(row)))
    {
    struct misMatch *misMatchList = NULL;
    struct binKeeper *bk = NULL;
    struct binElement *el, *elist = NULL;
    struct psl *mPsl = NULL, *rPsl = NULL, *pPsl = NULL, *psl ;
    struct misMatch *mf = NULL;
    ps = pseudoGeneLinkLoad(row);
    tmpName[0] = cloneString(ps->name);
    chopByChar(tmpName[0], '.', tmpName, sizeof(tmpName));
    verbose(2,"name %s %s:%d-%d\n",
            ps->name, ps->chrom, ps->chromStart,ps->chromEnd);
    /* get expressed retro from hash */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart, ps->chromEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        rPsl = el->val;
        verbose(2,"retroGene %s %s:%d-%d\n",rPsl->qName, ps->chrom, ps->chromStart,ps->chromEnd);
        }
    /* find mrnas that overlap parent gene */
    bk = hashFindVal(mrnaHash, ps->gChrom);
    elist = binKeeperFindSorted(bk, ps->gStart , ps->gEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        pPsl = el->val;
        verbose(2,"parent %s %s:%d %d,%d\n",
                pPsl->qName, pPsl->tName,pPsl->tStart,
                pPsl->match, pPsl->misMatch);
        }
    /* find self chain */
    bk = hashFindVal(chainHash, ps->chrom);
    elist = binKeeperFind(bk, ps->chromStart , ps->chromEnd ) ;
    slSort(&elist, chainCmpScoreDesc);
    for (el = elist; el != NULL ; el = el->next)
        {
        struct chain *chain = el->val, *subChain, *retChainToFree, *retChainToFree2;
        int qs = chain->qStart;
        int qe = chain->qEnd;
        int id = chain->id;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        if (!sameString(chain->qName , ps->gChrom) || 
                !positiveRangeIntersection(qs, qe, ps->gStart, ps->gEnd))
            {
            verbose(2," wrong chain %s:%d-%d %s:%d-%d parent %s:%d-%d\n", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd,
                ps->gChrom,ps->gStart,ps->gEnd);
            continue;
            }
        verbose(2,"chain id %d %4.0f",chain->id, chain->score);
        chainSubsetOnT(chain, ps->chromStart+7, ps->chromEnd-7, 
            &subChain,  &retChainToFree);
        if (subChain != NULL)
            chain = subChain;
        chainSubsetOnQ(chain, ps->gStart, ps->gEnd, 
            &subChain,  &retChainToFree2);
        if (subChain != NULL)
            chain = subChain;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        verbose(2," %s:%d-%d %s:%d-%d ", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd);
        if (subChain != NULL)
            verbose(2,"subChain %s:%d-%d %s:%d-%d\n",
                    subChain->qName, subChain->qStart, subChain->qEnd, 
                    subChain->tName,subChain->tStart,subChain->tEnd);

	qNib = nibInfoFromCache(nibHash, tNibDir, chain->qName);
	tNib = nibInfoFromCache(nibHash, tNibDir, chain->tName);
	tSeq = nibInfoLoadStrand(tNib, chain->tStart, chain->tEnd, '+');
	qSeq = nibInfoLoadStrand(qNib, chain->qStart, chain->qEnd, chain->qStrand);
	axtList = chainToAxt(chain, qSeq, chain->qStart, tSeq, chain->tStart,
	    maxGap, BIGNUM);
        verbose(2,"axt count %d misMatch cnt %d\n",slCount(axtList), slCount(misMatchList));
        for (axt = axtList; axt != NULL ; axt = axt->next)
            {
            addMisMatch(&misMatchList, axt, chain->qSize);
            }
        verbose(2,"%d in mismatch list %s id %d \n",slCount(misMatchList), chain->qName, id);
        chainFree(&retChainToFree);
        chainFree(&retChainToFree2);
        break;
        }
    /* create axt of each expressed retroGene to parent gene */
        /* get alignment for each mrna overlapping retroGene */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart , ps->chromEnd ) ;
    {
    char queryName[512];
    char axtName[512];
    char pslName[512];
    safef(queryName, sizeof(queryName), "/tmp/query.%s.fa", ps->chrom);
    safef(axtName, sizeof(axtName), "/tmp/tmp.%s.axt", ps->chrom);
    safef(pslName, sizeof(pslName), "/tmp/tmp.%s.psl", ps->chrom);
    op = fopen(pslName,"w");
    for (el = elist ; el != NULL ; el = el->next)
        {
        psl = el->val;
        pslOutput(psl, op, '\t','\n');
        qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0);

        if (qSeq != NULL)
            slAddHead(&seqList, qSeq);
        else
            errAbort("seq %s not found \n", psl->qName);
        }
    fclose(op);
    faWriteAll(queryName, seqList);
    safef(cmd,sizeof(cmd),"pslPretty -long -axt %s %s %s %s",pslName , nibList, queryName, axtName);
    ret = system(cmd);
    if (ret != 0)
        errAbort("ret is %d %s\n",ret,cmd);
    verbose(2, "ret is %d %s\n",ret,cmd);
    af = lineFileOpen(axtName, TRUE);
    while ((axt = axtRead(af)) != NULL)
        slAddHead(&mAxt, axt);
    lineFileClose(&af);
    }
    slReverse(&mAxt);
    /* for each parent/retro pair, count bases matching retro and parent better */
    for (el = elist; el != NULL ; el = el->next)
        {
        int i, scoreRetro=0, scoreParent=0, scoreNeither=0;
        struct dyString *parentMatch = newDyString(16*1024);
        struct dyString *retroMatch = newDyString(16*1024);
        mPsl = el->val;

        if (mAxt != NULL)
            {
            verbose(2,"mrna %s %s:%d %d,%d axt %s\n",
                    mPsl->qName, mPsl->tName,mPsl->tStart,
                    mPsl->match, mPsl->misMatch, 
                    mAxt->qName);
            assert(sameString(mPsl->qName, mAxt->qName));
            for (i = 0 ; i< (mPsl->tEnd-mPsl->tStart) ; i++)
                {
                int j = mAxt->tStart - mPsl->tStart;
                verbose(5, "listLen = %d\n",slCount(&misMatchList));
                if ((mf = matchFound(&misMatchList, (mPsl->tStart)+i)) != NULL)
                    {
                    if (toupper(mf->retroBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match retro[%d] %d %c == %c parent %c %d\n",
                                i,mf->retroLoc, mf->retroBase, mAxt->qSym[j+i], 
                                mf->parentBase, mf->parentLoc);
                        dyStringPrintf(retroMatch, "%d,", mf->retroLoc);
                        scoreRetro++;
                        }
                    else if (toupper(mf->parentBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match parent[%d] %d %c == %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->qSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        dyStringPrintf(parentMatch, "%d,", mf->parentLoc);
                        scoreParent++;
                        }
                    else
                        {
                        verbose (3,"match neither[%d] %d %c != %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->tSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        scoreNeither++;
                        }
                    }
                }
            verbose(2,"final score %s parent %d retro %d  neither %d\n",
                    mPsl->qName, scoreParent, scoreRetro, scoreNeither);
            fprintf(outFile,"%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n",
                    ps->chrom, ps->chromStart, ps->chromEnd, ps->name, ps->score, 
                    mPsl->tName, mPsl->tStart, mPsl->tEnd, mPsl->qName, 
                    scoreParent, scoreRetro, scoreNeither, parentMatch->string, retroMatch->string);
            mAxt = mAxt->next;
            }
        dyStringFree(&parentMatch);
        dyStringFree(&retroMatch);
        }
    }
}
Ejemplo n.º 16
0
struct axt *pslToAxt(struct psl *psl, struct hash *qHash, char *tNibDir, 
	struct dlList *fileCache)
{
static char *tName = NULL, *qName = NULL;
static struct dnaSeq *tSeq = NULL;
struct dyString *q = newDyString(16*1024);
struct dyString *t = newDyString(16*1024);
int blockIx;
int qs, ts ;
int lastQ = 0, lastT = 0, size;
int qOffset = 0;
int tOffset = 0;
struct axt *axt = NULL;
boolean qIsNib = FALSE;
boolean tIsNib = FALSE;
int cnt = 0;
//struct dnaSeq *tSeq = NULL;
struct nibInfo *tNib = NULL;

struct dnaSeq *qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0);
   // hGenBankGetMrna(psl->qName, NULL);
/*
freeDnaSeq(&qSeq);
freez(&qName);
assert(mrnaList != NULL);
for (mrna = mrnaList; mrna != NULL ; mrna = mrna->next)
    {
    assert(mrna != NULL);
    cnt++;
    if (sameString(mrna->name, psl->qName))
        {
        qSeq = cloneDnaSeq(mrna);
        assert(qSeq != NULL);
        break;
        }
    }
    */
if (qSeq == NULL)
    {
    warn("mrna sequence data not found %s, searched %d sequences\n",psl->qName,cnt);
    dyStringFree(&q);
    dyStringFree(&t);
    dnaSeqFree(&tSeq);
    dnaSeqFree(&qSeq);
    return NULL;
    }
if (qSeq->size != psl->qSize)
    {
    warn("sequence %s aligned is different size %d from mrna.fa file %d \n",psl->qName,psl->qSize,qSeq->size);
    dyStringFree(&q);
    dyStringFree(&t);
    dnaSeqFree(&tSeq);
    dnaSeqFree(&qSeq);
    return NULL;
    }
qName = cloneString(psl->qName);
if (qIsNib && psl->strand[0] == '-')
    qOffset = psl->qSize - psl->qEnd;
else
    qOffset = 0;
verbose(5,"qString len = %d qOffset = %d\n",qSeq->size,qOffset);
if (tName == NULL || !sameString(tName, psl->tName) || tIsNib)
    {
    freeDnaSeq(&tSeq);
    freez(&tName);
    tName = cloneString(psl->tName);
    tNib = nibInfoFromCache(nibHash, tNibDir, tName);
    assert(tNib !=NULL);
    tSeq = nibInfoLoadStrand(tNib, psl->tStart, psl->tEnd, '+');
    assert(tSeq !=NULL);
    tOffset = psl->tStart;
    //readCachedSeqPart(tName, psl->tStart, psl->tEnd-psl->tStart, 
//	tHash, fileCache, &tSeq, &tOffset, &tIsNib);
    }
verbose(4,"strand t %s \n",psl->strand);
if (tSeq != NULL)
    verbose(5,"tString len = %d tOffset = %d\n",tSeq->size,tOffset);
else
    errAbort("tSeq is NULL\n");
if (psl->strand[0] == '-')
    reverseComplement(qSeq->dna, qSeq->size);
//if (strlen(psl->strand) > 1 )
//    if (psl->strand[1] == '-')
//        reverseComplement(tSeq->dna, tSeq->size);
for (blockIx=0; blockIx < psl->blockCount; ++blockIx)
    {
    qs = psl->qStarts[blockIx] - qOffset;
    ts = psl->tStarts[blockIx] - tOffset;

    if (blockIx != 0)
        {
	int qGap, tGap, minGap;
	qGap = qs - lastQ;
	tGap = ts - lastT;
	minGap = min(qGap, tGap);
	if (minGap > 0)
	    {
	    writeGap(q, qGap, qSeq->dna + lastQ, t, tGap, tSeq->dna + lastT);
	    }
	else if (qGap > 0)
	    {
	    writeInsert(q, t, qSeq->dna + lastQ, qGap);
	    }
	else if (tGap > 0)
	    {
	    writeInsert(t, q, tSeq->dna + lastT, tGap);
	    }
	}
    size = psl->blockSizes[blockIx];
    assert(qSeq != NULL);
    dyStringAppendN(q, qSeq->dna + qs, size);
    lastQ = qs + size;
    dyStringAppendN(t, tSeq->dna + ts, size);
    lastT = ts + size;
    }

if (strlen(q->string) != strlen(t->string))
    warn("Symbol count(t) %d != %d inconsistent at t %s:%d and qName %s\n%s\n%s\n",
    	(int)strlen(t->string), (int)strlen(q->string), psl->tName, psl->tStart, psl->qName, t->string, q->string);
if (psl->strand[0] == '-')
    {
    reverseComplement(q->string, q->stringSize);
    reverseComplement(t->string, t->stringSize);
    }
axt = axtCreate(q->string, t->string, min(q->stringSize,t->stringSize), psl);
dyStringFree(&q);
dyStringFree(&t);
//dnaSeqFree(&tSeq);
dnaSeqFree(&qSeq);
if (qIsNib)
    freez(&qName);
//if (tIsNib)
//    freez(&tName);
return axt;
}