Esempio n. 1
0
void filterPsls()
/* Load every psl from pslIn, filter the list first with
 * filterBySeqIdentity() and then with filterByBasePct(), write whatever is
 * left to pslOut, and report the before/after counts through msg().
 * Reads the file-level settings pslIn, pslOut, seqIdent and basePct. */
{
struct psl *origPslList = NULL, *pslList = NULL;
int startCount = 0, stopCount = 0;
char buff[256];

origPslList = pslLoadAll(pslIn);

/* some messages for the user */
startCount = slCount(origPslList);
snprintf(buff, sizeof(buff), "Filtering %d psl using seqIdent=%g and basePct=%g\n",
         startCount, seqIdent, basePct);
msg(buff);

/* do our filtering */
pslList = filterBySeqIdentity(seqIdent, origPslList);
pslFreeList(&origPslList);
origPslList = filterByBasePct(basePct, pslList);

/* Output is only written when something survived both filters. */
if (origPslList != NULL)
    {
    stopCount = slCount(origPslList);
    pslWriteAll(origPslList, pslOut, FALSE);
    }

/* One free per list; this also runs for the empty (NULL) result. */
pslFreeList(&origPslList);
pslFreeList(&pslList);

/* let the user know we're done */
snprintf(buff, sizeof(buff), "After filtering %d of %d are left\n", stopCount, startCount);
msg(buff);
}
Esempio n. 2
0
void protDat(char *protName, char *blatName, char *aliasFile, char *outName)
/* For every psl in protName whose qName is also a qName in blatName and a
 * key in the alias file, write that psl to outName with its qName replaced
 * by "qName.tName:tStart-tEnd.kgName[.spName]", where the target fields come
 * from the matching blat psl.
 *
 * The alias file is read as 3 columns per row (key, kgName, spName), or as
 * 2 columns (key, kgName) when the "fb" option is set.
 *
 * NOTE(review): outFile and the alias lineFile are not closed here, and the
 * loaded psl lists and hashes are not freed; confirm the caller exits soon
 * after. */
{
FILE *outFile = mustOpen(outName, "w");
struct hash *blatHash = newHash(10);
struct hash *aliasHash = newHash(10);
struct psl *pslPtr, *protPsls, *blatPsl;
struct lineFile *lf = lineFileOpen(aliasFile, TRUE);
struct alias *alPtr;
char buffer[1024];
char *words[3];
int numWords = optionExists("fb") ? 2 : 3;

/* Alias table: first column is the key, the remaining columns the names. */
while (lineFileNextRow(lf, words, numWords))
    {
    AllocVar(alPtr);
    alPtr->kgName = cloneString(words[1]);
    if (numWords == 3)
        alPtr->spName = cloneString(words[2]);
    hashAdd(aliasHash, cloneString(words[0]), alPtr);
    }

protPsls = pslLoadAll(protName);

/* Index the blat alignments by query name. */
for (pslPtr = pslLoadAll(blatName); pslPtr != NULL; pslPtr = pslPtr->next)
    hashAdd(blatHash, pslPtr->qName, pslPtr);

for (pslPtr = protPsls; pslPtr != NULL; pslPtr = pslPtr->next)
    {
    char *savedName;
    int len;

    blatPsl = hashFindVal(blatHash, pslPtr->qName);
    if (blatPsl == NULL)
        continue;
    alPtr = hashFindVal(aliasHash, pslPtr->qName);
    if (alPtr == NULL)
        continue;

    /* Bounded formatting: the names come from input files, so never write
     * past the end of buffer. */
    if (numWords == 3)
        len = snprintf(buffer, sizeof(buffer), "%s.%s:%d-%d.%s.%s", pslPtr->qName,
                       blatPsl->tName, blatPsl->tStart, blatPsl->tEnd,
                       alPtr->kgName, alPtr->spName);
    else
        len = snprintf(buffer, sizeof(buffer), "%s.%s:%d-%d.%s", pslPtr->qName,
                       blatPsl->tName, blatPsl->tStart, blatPsl->tEnd,
                       alPtr->kgName);
    if (len < 0 || len >= (int)sizeof(buffer))
        fprintf(stderr, "warning: output name for %s truncated to %d characters\n",
                pslPtr->qName, (int)sizeof(buffer) - 1);

    /* Borrow the stack buffer as qName only while writing, then restore the
     * real name so the psl never keeps a pointer to this function's stack. */
    savedName = pslPtr->qName;
    pslPtr->qName = buffer;
    pslTabOut(pslPtr, outFile);
    pslPtr->qName = savedName;
    }
}
void readInPslHash(struct hash *pslHash, char *file)
/* Load all psls from file and add each one to pslHash under its qName.
 * The hash values point at the loaded psl records, which are not freed
 * here. */
{
struct psl *cur = pslLoadAll(file);
while (cur != NULL)
    {
    hashAdd(pslHash, cur->qName, cur);
    cur = cur->next;
    }
}
Esempio n. 4
0
struct hash *hashPsls(char *fileName)
/* Load every psl in fileName and return a hash of them keyed by qName.
 * Also reports the number of psls loaded through uglyf(). */
{
struct psl *head = pslLoadAll(fileName);
struct hash *byName = newHash(20);
struct psl *cur = head;

while (cur != NULL)
    {
    hashAdd(byName, cur->qName, cur);
    cur = cur->next;
    }
uglyf("Loaded %d psls from %s\n", slCount(head), fileName);
return byName;
}
Esempio n. 5
0
struct hash *hashPsls(char *pslFileName)
/* Load the psls in pslFileName, rewrite each qName in place so only the
 * probe name remains, and return a hash keyed by that name.  Each hash value
 * is the list of psls sharing the name.  The file-level string `prefix`
 * marks where the probe name starts.  An empty file yields an empty hash. */
{
struct psl *pslList = NULL, *psl = NULL, *pslSubList = NULL, *pslNext = NULL;
struct hash *pslHash = newHash(15);
char *last = NULL;
char *tmp = NULL;

pslList = pslLoadAll(pslFileName);

/* Fix psl names */
for (psl = pslList; psl != NULL; psl = psl->next)
    {
    /* Cut the name at its last ';' when it has one. */
    tmp = strrchr(psl->qName, ';');
    if (tmp != NULL)
        *tmp = '\0';
    tmp = strstr(psl->qName, prefix);
    assert(tmp);
    /* checks if there are 2 occurrences of ":" in probe name as in full name */
    /* if probe name is shortened to fit in the seq table, there is only 1 ":"*/
    /* e.g. full: consensus:HG-U133A:212933_x_at; short:HG-U133A:212933_x_at;*/
    if (countChars(psl->qName, *prefix) == 2)
        {
        tmp = strstr(tmp + 1, prefix);
        assert(tmp);
        }
    tmp = tmp + strlen(prefix);
    /* tmp points inside psl->qName, so source and destination overlap:
     * shift the tail (including its terminator) down with memmove. */
    memmove(psl->qName, tmp, strlen(tmp) + 1);
    }

/* Sort based on query name. */
slSort(&pslList, pslCmpQuery);

/* For each psl, if it is has the same query name add it to the
   sublist. Otherwise store the sublist in the hash and start
   another. */
for (psl = pslList; psl != NULL; psl = pslNext)
    {
    /* Saved first: the psl is re-linked onto the sublist below. */
    pslNext = psl->next;
    if (last != NULL && differentWord(last, psl->qName))
        {
        hashAddUnique(pslHash, last, pslSubList);
        pslSubList = NULL;
        }
    slAddTail(&pslSubList, psl);
    last = psl->qName;
    }

/* Add the last sublist; there is none when no psls were loaded. */
if (last != NULL)
    hashAddUnique(pslHash, last, pslSubList);
return pslHash;
}
Esempio n. 6
0
struct hapRegions *hapRegionsNew(char *hapPslFile, FILE *hapRefMappedFh, FILE *hapRefCDnaFh)
/* Build a hapRegions object from the PSL alignments in hapPslFile (haplotype
 * pseudo-chromosomes aligned to the haplotype regions of the reference
 * chromosomes).  The two file handles are stored in the object as given.
 * Each loaded PSL is handed to addHapMapping(). */
{
    struct psl *pending = pslLoadAll(hapPslFile);
    struct psl *cur;
    struct hapRegions *regions;

    AllocVar(regions);
    regions->refMap = hashNew(12);
    regions->hapMap = hashNew(12);
    regions->hapRefMappedFh = hapRefMappedFh;
    regions->hapRefCDnaFh = hapRefCDnaFh;

    /* Drain the loaded list one record at a time. */
    for (cur = slPopHead(&pending); cur != NULL; cur = slPopHead(&pending))
        addHapMapping(regions, cur);
    return regions;
}
Esempio n. 7
0
static struct hash* loadPslByQname(char* inPslFile)
/* Read all PSLs from inPslFile into a hash keyed by qName; each hash value
 * is the list of PSLs for that query.  Any PSL whose target strand is not
 * '+' is passed through pslRc() first, so later processing only sees
 * positive target strands. */
{
struct hash *byQName = hashNew(0);
struct psl *remaining = pslLoadAll(inPslFile);
struct psl *cur;

for (cur = slPopHead(&remaining); cur != NULL; cur = slPopHead(&remaining))
    {
    struct hashEl *slot;
    struct psl **bucket;

    if (pslTStrand(cur) != '+')
        pslRc(cur);
    /* The hash element's val is used directly as the head of the per-query list. */
    slot = hashStore(byQName, cur->qName);
    bucket = (struct psl **)&slot->val;
    slAddHead(bucket, cur);
    }
return byQName;
}
Esempio n. 8
0
void doPsls(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, 
	    char *netTable, char *pslFileName, char *pslTableName,
	    char *outBedName, char *selectedFileName, 
            int *foundCount, int *notFoundCount)
/* Map psls over to the orthologous genome.  Psls are loaded from pslFileName
 * when it is given, otherwise from pslTableName restricted to chrom.  Each
 * psl whose tName equals chrom is converted with orthoBedFromPsl(); results
 * with at least one block are written to outBedName as 12-field beds and
 * counted in *foundCount, all others are counted in *notFoundCount.  When
 * selectedFileName is non-NULL the source psl of every written bed is
 * written there as well. */
{
struct psl *alignments = NULL, *cur = NULL;
FILE *bedFh = NULL;
FILE *selectedFh = NULL;

/* Load psls. */
warn("Loading psls.");
if (pslFileName != NULL)
    alignments = pslLoadAll(pslFileName);
else
    alignments = loadPslFromTable(conn, pslTableName, chrom, 0, BIGNUM);

/* Convert psls. */
warn("Converting psls.");
assert(outBedName);
bedFh = mustOpen(outBedName, "w");
if (selectedFileName != NULL)
    selectedFh = mustOpen(selectedFileName, "w");

for (cur = alignments; cur != NULL; cur = cur->next)
    {
    struct bed *ortho = NULL;

    /* Only alignments on the requested chromosome are considered. */
    if (!(differentString(cur->tName, chrom)))
        {
        occassionalDot();
        ortho = orthoBedFromPsl(conn, db, orthoDb, netTable, cur);
        if (ortho == NULL || ortho->blockCount <= 0)
            {
            *notFoundCount += 1;
            }
        else
            {
            *foundCount += 1;
            bedTabOutN(ortho, 12, bedFh);
            if (selectedFh != NULL)
                pslTabOut(cur, selectedFh);
            }
        bedFree(&ortho);
        }
    }
carefulClose(&selectedFh);
carefulClose(&bedFh);
}
Esempio n. 9
0
void loadPslsFromFile(char *pslFile, char *chrom, struct sqlConnection *conn)
/** Load psls from pslFile instead of the database.  The file-level range
 * minPslStart..maxPslEnd is widened to cover every loaded psl, the
 * chromPslBin and agxSeenBin bin keepers are created over that range, and
 * the psls whose tName equals chrom are added to chromPslBin.  Psls on any
 * other target are freed.  conn is not used by this function. */
{
struct psl *loaded = pslLoadAll(pslFile);
struct psl *cur = NULL, *following = NULL;

/* First pass: the range is taken over all psls, whatever their target. */
cur = loaded;
while (cur != NULL)
    {
    minPslStart = min(cur->tStart, minPslStart);
    maxPslEnd = max(cur->tEnd, maxPslEnd);
    cur = cur->next;
    }
chromPslBin = binKeeperNew(minPslStart, maxPslEnd);
agxSeenBin = binKeeperNew(minPslStart, maxPslEnd);

/* Second pass: keep the psls on chrom, release the rest. */
cur = loaded;
while (cur != NULL)
    {
    /* Read the link first; cur may be freed below. */
    following = cur->next;
    if (sameString(cur->tName, chrom))
        binKeeperAdd(chromPslBin, cur->tStart, cur->tEnd, cur);
    else
        pslFree(&cur);
    cur = following;
    }
}
Esempio n. 10
0
struct bed *createBedsFromPsls(char *pslFile, int expCount)
/** Convert every psl in pslFile to a bed and return the beds in file order.
 * Each bed gets its name from parseNameFromHgc(psl->qName), score and
 * expCount set to 0, and expIds / expScores arrays allocated with room for
 * expCount entries.  The loaded psls are freed before returning. */
{
struct bed *converted = NULL;
struct psl *loaded = pslLoadAll(pslFile);
struct psl *cur = loaded;

while (cur != NULL)
    {
    struct bed *item = bedFromPsl(cur);

    /* Replace the name bedFromPsl() assigned with the one parsed from qName. */
    freez(&item->name);
    item->name = parseNameFromHgc(cur->qName);
    item->score = 0;
    item->expCount = 0;
    item->expIds = needMem(sizeof(int)*expCount);
    item->expScores = needMem(sizeof(float)*expCount);
    slAddHead(&converted, item);
    cur = cur->next;
    }
/* Head insertion built the list backwards; restore input order. */
slReverse(&converted);
pslFreeList(&loaded);
return converted;
}
struct psl* doDnaAlignment(struct dnaSeq *seq, char *db, char *blatHost, 
	char *port, char *nibDir, struct hash *tFileCache) 
/* Align seq against the blat server at blatHost:port on both strands and
 * return the resulting psl list.  The alignments are written to a temporary
 * psl file, loaded back, and the file is removed.  Returns NULL without
 * contacting the server when the sequence contains a very long run of 'n'.
 * Aborts when seq or db is NULL or when seq->size disagrees with the length
 * of seq->dna.
 * NOTE(review): seq->dna is passed to reverseComplement() for the second
 * strand and is not restored afterwards; confirm callers expect that. */
{
struct tempName pslTn;
struct gfOutput *pslWriter;
struct psl *hits = NULL;
FILE *tmpFh = NULL;
int blatConn = 0;

if (seq == NULL || db == NULL)
    errAbort("coordConv::doDnaAlignment() - dnaSeq and/or db can't be NULL.");
if (strlen(seq->dna) != seq->size)
    errAbort("coordConv::doDnaAlignment() - there seems to be something fishy about %s: the size doesn't equal the length", seq->name);

/* if there are too many n's it can cause the blat server to hang */
if (strstr(seq->dna, "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn") != NULL)
    return NULL;

/* Alignments go through a temporary psl file. */
makeTempName(&pslTn,"ccR", ".psl");
tmpFh = mustOpen(pslTn.forCgi, "w");
pslWriter = gfOutputPsl(920, FALSE, FALSE, tmpFh, FALSE, FALSE);
gfOutputHead(pslWriter, tmpFh);

/* Forward strand. */
blatConn = gfConnect(blatHost, port);
gfAlignStrand(&blatConn, nibDir, seq, FALSE, 20, tFileCache, pslWriter);

/* Reverse strand, on a fresh connection. */
reverseComplement(seq->dna, seq->size);
blatConn = gfConnect(blatHost, port);
gfAlignStrand(&blatConn, nibDir, seq, TRUE, 20, tFileCache, pslWriter);

gfOutputQuery(pslWriter, tmpFh);
carefulClose(&tmpFh);

/* Read the results back and drop the temporary file. */
hits = pslLoadAll(pslTn.forCgi);
remove(pslTn.forCgi);
gfOutputFree(&pslWriter);
return hits;
}
Esempio n. 12
0
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.
 * One txInfo row is written to outFile for every 12-column row of txBedFile.
 *   txBedFile        - transcripts, read row by row with bedLoad12().
 *   cdsEvFile        - cdsEvidence records, hashed by name (names must be unique);
 *                      supplies orfSize/startComplete/endComplete.
 *   txCdsPredictFile - cdsEvidence records, hashed by name (names must be unique);
 *                      supplies cdsScore.
 *   altSpliceFile    - 6-field beds turned into a hash of keepers; used by the
 *                      retained-intron / strange-splice / atac / bleed helpers.
 *   exceptionFile    - passed to genbankExceptionsHash(); only the resulting
 *                      selenocysteine hash is consulted in this function.
 *   sizePolyAFile    - name/int file; the value is subtracted from the psl qSize.
 *   pslFile          - alignments hashed by qName (several per name allowed).
 *   flipFile         - words hashed; accessions present here also get the
 *                      opposite strand tried when scoring overlap.
 * Aborts when a transcript that needs an alignment has none in pslFile, or
 * when none of its alignments overlaps the bed. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info.  altStartHash is filled in but not used below. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls.  hashAdd (not hashAddUnique) is used, so one qName may map
 * to several alignments; they are walked with hashLookupNext() below. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	info.sourceAcc = cloneString(bed->name);
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    /* NOTE(review): info.sourceAcc is never freed in this loop; the cloneString
     * branch at least allocates per transcript - confirm that is acceptable. */
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	/* This psl shadows the function-level psl used while loading. */
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		/* Toggle the strand in place, score, then toggle back so the
		 * stored psl is left as loaded. */
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	/* bestPsl stays NULL when every alignment had zero overlap. */
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	/* NOTE(review): neither divisor below is checked for zero. */
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv.  Only looked up when
     * the bed has a non-empty thick region. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * A gap of 1 or 2 bases sets genomicFrameShift, a gap of 3 sets
     * genomicStop, and any other gap size counts as a boundary between
     * exons. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home.  Only the output file is closed; lf, the hashes and
 * the loaded lists are left as they are. */
carefulClose(&f);
}
Esempio n. 13
0
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, 
	char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome)
/* txGeneCdsMap - Create mapping between CDS region of gene and genome.
 * For every bed in inBed that has a non-empty thick region, a CDS-to-RNA
 * mapping is written to cdsToRna and an RNA-to-genome mapping to
 * rnaToGenome.  For transcripts flagged isRefSeq in inInfo the CDS-to-RNA
 * mapping is derived from the refPep/tx alignments via findAndMapPsl();
 * for all others it is produced by fakeCdsToMrna(). */
{
/* txInfo records, keyed by transcript name. */
struct hash *infoByName = hashNew(18);
struct txInfo *info, *allInfo = txInfoLoadAll(inInfo);
info = allInfo;
while (info != NULL)
    {
    hashAdd(infoByName, info->name, info);
    info = info->next;
    }

/* cdsPick rows, keyed by name.  Loaded row by row rather than with
 * cdsPicksLoadAll, because empty fields cause that autoSql-generated
 * routine problems. */
struct hash *pickByName = newHash(18);
struct cdsPick *pick;
struct lineFile *pickLf = lineFileOpen(inPicks, TRUE);
char *fields[CDSPICK_NUM_COLS];
while (lineFileRowTab(pickLf, fields))
    {
    pick = cdsPickLoad(fields);
    hashAdd(pickByName, pick->name, pick);
    }
lineFileClose(&pickLf);

/* refPep/tx alignments, keyed by the transcript (target) name. */
struct hash *refPslByTx = hashNew(18);
struct psl *aln, *allAlns  = pslLoadAll(refPepToTxPsl);
aln = allAlns;
while (aln != NULL)
    {
    hashAdd(refPslByTx, aln->tName, aln);
    aln = aln->next;
    }

struct hash *refToPep = hashTwoColumnFile(refToPepTab);
struct hash *sizeByChrom = hashNameIntFile(chromSizes);

/* Transcripts to process. */
struct bed *bed, *allBeds = bedLoadNAll(inBed, 12);

/* Open both outputs, then stream through the beds. */
FILE *cdsToRnaFh = mustOpen(cdsToRna, "w");
FILE *rnaToGenomeFh = mustOpen(rnaToGenome, "w");
int refSeen = 0, refMapped = 0;
for (bed = allBeds; bed != NULL; bed = bed->next)
    {
    /* Nothing to map for beds without a thick region. */
    if (bed->thickStart >= bed->thickEnd)
        continue;

    char *chrom = bed->chrom;
    int chromSize = hashIntVal(sizeByChrom, chrom);
    info = hashMustFindVal(infoByName, bed->name);
    /* pick is looked up here but not otherwise used in this loop. */
    pick = hashMustFindVal(pickByName, bed->name);
    if (!info->isRefSeq)
        {
        fakeCdsToMrna(bed, cdsToRnaFh);
        }
    else
        {
        char *refAcc = txAccFromTempName(bed->name);
        if (!startsWith("NM_", refAcc))
            errAbort("Don't think I did find that refSeq acc, got %s", refAcc);
        char *protAcc = hashMustFindVal(refToPep, refAcc);
        ++refSeen;
        if (findAndMapPsl(bed, protAcc, refPslByTx, chromSize, cdsToRnaFh))
            ++refMapped;
        }
    fakeRnaToGenome(bed, chromSize, rnaToGenomeFh);
    }
verbose(1, "Missed %d of %d refSeq protein mappings.  A small number of RefSeqs just map\n"
           "to genome in the UTR.\n", refSeen - refMapped, refSeen);
carefulClose(&cdsToRnaFh);
carefulClose(&rnaToGenomeFh);
}
Esempio n. 14
0
int main(int argc, char *argv[]) {
   /*
    * Arguments/options.  Each one points at the argv string handed back by
    * getopt_long(), so option values of any length are accepted without
    * copying into fixed-size buffers.
    */
   char *outputFile = NULL;
   char *inputFile = NULL;
   char *query = NULL;
   char *target = NULL;

   ///////////////////////////////////////////////////////////////////////////
   // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
   ///////////////////////////////////////////////////////////////////////////

   while(1) {
      static struct option long_options[] = {
         { "query", required_argument, 0, 'q' },
         { "target", required_argument, 0, 't' },
         { "outputFile", required_argument, 0, 'o' },
         { "inputFile", required_argument, 0, 'i' },
         { "help", no_argument, 0, 'h' },
         { 0, 0, 0, 0 }
      };

      int option_index = 0;

      int key = getopt_long(argc, argv, "i:o:q:t:h", long_options, &option_index);

      if(key == -1) {
         break;
      }

      switch(key) {
         case 'i':
            inputFile = optarg;
            break;
         case 'o':
            outputFile = optarg;
            break;
         case 'q':
            query = optarg;
            break;
         case 't':
            target = optarg;
            break;
         case 'h':
            usage();
            return 0;
         default:
            usage();
            return 1;
      }
   }

   ///////////////////////////////////////////////////////////////////////////
   // (0) Check the inputs.
   ///////////////////////////////////////////////////////////////////////////

   // All four options are required; checked at run time so the test is not
   // compiled out in NDEBUG builds.
   if(inputFile == NULL || outputFile == NULL || query == NULL || target == NULL) {
      usage();
      return 1;
   }

   FILE *fileHandle = fopen(outputFile, "w");
   if(fileHandle == NULL) {
      perror(outputFile);
      return 1;
   }
   pslWriteHead(fileHandle);
   struct psl *pslList = pslLoadAll(inputFile);
   mapPSLs(pslList, fileHandle, query, target);

   // fclose() flushes buffered output, so a failure here means lost data.
   if(fclose(fileHandle) != 0) {
      perror(outputFile);
      return 1;
   }

   return 0;
}