示例#1
0
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.
 * Streams txBedFile one 12-field row at a time and writes one txInfo record
 * per row to outFile.  All other inputs are loaded up front and looked up per
 * transcript:
 *   cdsEvFile        - CDS evidence, hashed by name (added with hashAddUnique).
 *   txCdsPredictFile - loaded with the same loader as cdsEvFile; supplies cdsScore.
 *   altSpliceFile    - loaded as 6-field beds, then turned into a keeper hash
 *                      used by the retained-intron/splice helpers.
 *   exceptionFile    - parsed by genbankExceptionsHash into selenocysteine and
 *                      alt-start hashes (only the selenocysteine one is used here).
 *   sizePolyAFile    - loaded with hashNameIntFile; looked up by bed name,
 *                      defaulting to 0.
 *   pslFile          - alignments hashed by qName; several may share a name.
 *   flipFile         - words hashed by hashWordsInFile; an accession found here
 *                      is also tried on the opposite strand.
 * Aborts (errAbort) when a transcript that needs an alignment has no entry in
 * pslFile, or has entries but none with positive overlap. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info.  altStartHash is filled in but not consulted
 * anywhere in this function. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls.  Plain hashAdd (not hashAddUnique) because one query may have
 * several alignments; they are walked with hashLookupNext below. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	/* The name is used as the accession as-is.  Borrow the bed's string,
	 * just as info.name does, instead of cloning it: the clone was never
	 * freed, which leaked one string per such transcript.  The bed is not
	 * freed until after txInfoTabOut below, so the pointer stays valid. */
	info.sourceAcc = bed->name;
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	/* Reuses the function-level psl cursor; the load loop above is done
	 * with it, and redeclaring it here would only shadow it. */
	struct psl *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    /* Strictly greater: an alignment with zero overlap never becomes
	     * bestPsl, and ties keep the earlier alignment. */
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		/* Toggle the strand in place, measure, then toggle back so the
		 * shared psl is left as loaded (given strand[0] is '+' or '-'). */
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics.  The polyA tail size is
	 * taken off the query size before computing coverage. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	/* NOTE(review): nothing here guards against polyA >= qSize, which
	 * would make sourceSize zero or negative - confirm inputs rule it out. */
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv.  Only looked up when
     * the bed has a non-empty thick (coding) region. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * A gap of 1 or 2 bases flags a frame shift and a gap of exactly 3 flags
     * a stop; neither starts a new exon.  Any other gap size counts as a
     * boundary between exons. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed.  info.name and info.sourceAcc point into the bed
     * (or at memory this loop does not own), so the write must come first. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
示例#2
0
/* main_bismark - entry point of the bismark sub-command.
 *
 * Expects three positional arguments after the options: a chromosome size
 * file, a CpG bed file and a BAM/SAM file name (several may be given as one
 * comma-separated argument).  Options, as parsed below:
 *   -S  sets optSam     (forwarded to bismarkBamParse)
 *   -C  sets optaddChr  (forwarded to bismarkBamParse)
 *   -s  stats only: output files are not sorted
 *   -b  per-C (bismark-like) CpG output instead of per-CpG
 *   -F  full mode: forces -b, clears -s, writes per-strand CG/CHG/CHH tracks
 *   -B  keep the per-strand density bed files instead of deleting them
 *   -c  read coverage threshold (default 5)
 *   -I  insert size cutoff (default 500)
 *   -o  output file prefix (default: derived from the first BAM/SAM name)
 *   -h/-?  print usage
 * Returns 0 on success, the usage function's result when usage is shown, and
 * 1 for an unexpected option code.  Fatal problems go through errAbort. */
int main_bismark (int argc, char *argv[]) {
    
    char *output, *outReportfile, *outCpGfile, *outbedGraphfile, *row[100], *samfilecopy;
    char *forwardcg, *forwardchg, *forwardchh, *forwardread, *forwardread1;
    char *reversecg, *reversechg, *reversechh, *reverseread, *reverseread1;
    unsigned long long int *cnt;
    unsigned long long int *cnt2 = NULL;    /* stays NULL in full mode */
    int optSam = 0, c, optaddChr = 0, optStats = 0, optBis = 0, optFull = 0, optKeep = 0;
    unsigned int optisize = 500;
    int optcov = 5;
    char *optoutput = NULL;
    /* cpgHash is always replaced by one of the cpgBed2BinKeeperHash* calls
     * below, so start it as NULL; allocating a hash here only leaked it. */
    struct hash *cpgHash = NULL;
    struct hash *chgHash = newHash(0);
    struct hash *chhHash = newHash(0);
    time_t start_time, end_time;
    start_time = time(NULL);
    
    while ((c = getopt(argc, argv, "SCsbFBo:c:I:h?")) >= 0) {
        switch (c) {
            case 'S': optSam = 1; break;
            case 'C': optaddChr = 1; break;
            case 's': optStats = 1; break;
            case 'b': optBis = 1; break;
            case 'F': optFull = 1; break;
            case 'B': optKeep = 1; break;
            case 'c': optcov = (int)strtol(optarg, 0, 0); break;
            case 'I': optisize = (unsigned int)strtol(optarg, 0, 0); break;
            case 'o': optoutput = strdup(optarg); break;
            case 'h':
            case '?': return bismark_usage();
            default: return 1;
        }
    }
    if (optind + 3 > argc)
        return bismark_usage();

    char *chr_size_file = argv[optind];
    char *cpg_bed_file = argv[optind+1];
    char *sam_file = argv[optind+2];

    fprintf(stderr, "* CpG file provided: %s\n", cpg_bed_file);
    fprintf(stderr, "* Insert size cutoff: %u\n", optisize);
    fprintf(stderr, "* Read coverage threshold: %i\n", optcov);
   
    struct hash *chrHash = hashNameIntFile(chr_size_file);
    
    /* Split a private copy of the argument on commas; row[] points into
     * samfilecopy, so the copy must stay alive until the report is written. */
    samfilecopy = cloneString(sam_file);
    int numFields = chopByChar(samfilecopy, ',', row, ArraySize(row));
    fprintf(stderr, "* Provided %i BAM/SAM file(s)\n", numFields);
    /* row[0] is read below to derive the default output prefix; it is only
     * initialised when at least one field was produced. */
    if (numFields < 1)
        errAbort("No BAM/SAM file name provided.");


    if(optFull) {
        fprintf(stderr, "* Warning: will run in Full mode, 8 track files and 1 report file will be generated\n");
        fprintf(stderr, "* Warning: will output stats over each C (in CHG)\n");
        fprintf(stderr, "* Warning: will output stats over each C (in CHH)\n");
        optStats = 0;
        optBis = 1;
    }
    
    if(optStats) {
        fprintf(stderr, "* Warning: will report stats only as -s specified\n");
    }
    // if use select bismark like output, read cpgHash at each C stats
    if(optBis) {
        fprintf(stderr, "* Warning: will output stats over each C (in CpG)\n");
        cpgHash = cpgBed2BinKeeperHashBismark(chrHash, cpg_bed_file);
    }else{
        fprintf(stderr, "* Warning: will output stats over each CpG\n");
        cpgHash = cpgBed2BinKeeperHash(chrHash, cpg_bed_file);
    }

    /* Either way `output` ends up owning heap memory (strdup'd option value
     * or a clone), so it is released with the other strings at the end. */
    if(optoutput) {
        output = optoutput;
    } else {
        output = cloneString(get_filename_without_ext(basename(row[0])));
    }
    

    /* Build every output file name from the prefix. */
    if(asprintf(&outCpGfile, "%s.CpG.bedGraph", output) < 0)
        errAbort("Mem Error.\n");
    if(asprintf(&outbedGraphfile, "%s.density.bedGraph", output) < 0)
        errAbort("Mem Error.\n");
    if (asprintf(&outReportfile, "%s.report", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardcg, "%s.forward.CG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardchg, "%s.forward.CHG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardchh, "%s.forward.CHH.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardread, "%s.forward.Density.bed", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardread1, "%s.forward.Density.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reversecg, "%s.reverse.CG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reversechg, "%s.reverse.CHG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reversechh, "%s.reverse.CHH.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reverseread, "%s.reverse.Density.bed", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reverseread1, "%s.reverse.Density.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    

    //sam file to bed file
    cnt = bismarkBamParse(sam_file, chrHash, cpgHash, chgHash, chhHash, forwardread, reverseread, optSam, optaddChr, optFull, optisize);
    
    //write to file
    if (optFull){
        fprintf(stderr, "* Output CpG methylation calls\n");
        writecpgBismarkLite(cpgHash, forwardcg, reversecg, optcov);
        fprintf(stderr, "* Output CHG methylation calls\n");
        writecpgBismarkLiteHash(chgHash, forwardchg, reversechg, optcov);
        fprintf(stderr, "* Output CHH methylation calls\n");
        writecpgBismarkLiteHash(chhHash, forwardchh, reversechh, optcov);
        fprintf(stderr, "* Sorting methylation calls\n");
        sortBedfile(forwardcg);
        sortBedfile(reversecg);
        sortBedfile(forwardchg);
        sortBedfile(reversechg);
        sortBedfile(forwardchh);
        sortBedfile(reversechh);
        fprintf(stderr, "* Sorting density bed\n");
        sortBedfile(forwardread);
        sortBedfile(reverseread);
        fprintf(stderr, "* Generating density bedGraph\n");
        bedItemOverlapCount(chrHash, forwardread, forwardread1);
        bedItemOverlapCount(chrHash, reverseread, reverseread1);
    }else{
        cnt2 = writecpgBismark(cpgHash, outbedGraphfile, outCpGfile, optStats, optcov);
        //sort both outputs unless only stats were requested
        if(!optStats) {
            fprintf(stderr, "* Sorting output density\n");
            sortBedfile(outbedGraphfile);
            fprintf(stderr, "* Sorting output CpG methylation call\n");
            sortBedfile(outCpGfile);
        }
    }

    //write report file
    fprintf(stderr, "* Preparing report file\n");
    writeReportBismark(outReportfile, cnt, cnt2, numFields, row, optBis, hashIntSum(chrHash));

    if(!optKeep){
        fprintf(stderr, "* Deleting (huge) density bed files\n");
        unlink(forwardread);
        unlink(reverseread);
    }
    
    //cleaning
    hashFree(&chrHash);
    hashFree(&cpgHash);
    hashFree(&chgHash);
    hashFree(&chhHash);
    free(outCpGfile);
    free(outbedGraphfile);
    free(outReportfile);
    free(samfilecopy);
    free(output);
    free(forwardcg);
    free(forwardchg);
    free(forwardchh);
    free(forwardread);
    free(forwardread1);
    free(reversecg);
    free(reversechg);
    free(reversechh);
    free(reverseread);
    free(reverseread1);
    end_time = time(NULL);
    fprintf(stderr, "* Done, time used %.0f seconds.\n", difftime(end_time, start_time));
    return 0;
}
示例#3
0
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, 
	char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome)
/* txGeneCdsMap - Create mapping between CDS region of gene and genome.
 * For every bed in inBed that has a non-empty thick region, one record is
 * written to cdsToRna (via findAndMapPsl for RefSeq transcripts, via
 * fakeCdsToMrna otherwise) and one to rnaToGenome (via fakeRnaToGenome).
 * Beds without a thick region are skipped entirely. */
{
/* Index the txInfo records by transcript name. */
struct hash *txInfoByName = hashNew(18);
struct txInfo *txInfoEl, *txInfoAll = txInfoLoadAll(inInfo);
txInfoEl = txInfoAll;
while (txInfoEl != NULL)
    {
    hashAdd(txInfoByName, txInfoEl->name, txInfoEl);
    txInfoEl = txInfoEl->next;
    }

/* Index the picks by name.  These are read row by row with cdsPickLoad
 * instead of cdsPicksLoadAll, since empty fields give that autoSql-generated
 * loader trouble. */
struct hash *pickByName = newHash(18);
struct lineFile *pickLf = lineFileOpen(inPicks, TRUE);
char *pickRow[CDSPICK_NUM_COLS];
while (lineFileRowTab(pickLf, pickRow))
    {
    struct cdsPick *loadedPick = cdsPickLoad(pickRow);
    hashAdd(pickByName, loadedPick->name, loadedPick);
    }
lineFileClose(&pickLf);

/* Index the refPep/tx alignments by target (transcript) name. */
struct hash *pepPslByTx = hashNew(18);
struct psl *pepPsl, *pepPslAll = pslLoadAll(refPepToTxPsl);
pepPsl = pepPslAll;
while (pepPsl != NULL)
    {
    hashAdd(pepPslByTx, pepPsl->tName, pepPsl);
    pepPsl = pepPsl->next;
    }

/* Two small lookup tables: RefSeq accession to protein, and chromosome sizes. */
struct hash *pepByRefAcc = hashTwoColumnFile(refToPepTab);
struct hash *sizeByChrom = hashNameIntFile(chromSizes);

/* All transcripts are loaded before either output file is opened. */
struct bed *tx, *txAll = bedLoadNAll(inBed, 12);

FILE *cdsToRnaOut = mustOpen(cdsToRna, "w");
FILE *rnaToGenomeOut = mustOpen(rnaToGenome, "w");
int refSeqSeen = 0, refSeqMapped = 0;
for (tx = txAll; tx != NULL; tx = tx->next)
    {
    /* Nothing to map when there is no thick region. */
    if (tx->thickStart >= tx->thickEnd)
	continue;

    int txChromSize = hashIntVal(sizeByChrom, tx->chrom);
    struct txInfo *txInfoRec = hashMustFindVal(txInfoByName, tx->name);
    /* The pick itself is not needed, but the transcript must have one. */
    hashMustFindVal(pickByName, tx->name);

    if (!txInfoRec->isRefSeq)
	{
	fakeCdsToMrna(tx, cdsToRnaOut);
	}
    else
	{
	char *nmAcc = txAccFromTempName(tx->name);
	if (!startsWith("NM_", nmAcc))
	    errAbort("Don't think I did find that refSeq acc, got %s", nmAcc);
	char *pepAcc = hashMustFindVal(pepByRefAcc, nmAcc);
	refSeqSeen += 1;
	if (findAndMapPsl(tx, pepAcc, pepPslByTx, txChromSize, cdsToRnaOut))
	    refSeqMapped += 1;
	}
    fakeRnaToGenome(tx, txChromSize, rnaToGenomeOut);
    }
verbose(1, "Missed %d of %d refSeq protein mappings.  A small number of RefSeqs just map\n"
           "to genome in the UTR.\n", refSeqSeen - refSeqMapped, refSeqSeen);
carefulClose(&cdsToRnaOut);
carefulClose(&rnaToGenomeOut);
}
示例#4
0
文件: filter.c 项目: lidaof/iteres
/* main_filter - entry point of the filter sub-command.
 *
 * Expects four positional arguments after the options: chromosome size file,
 * repeat size file, rmsk file and a SAM/BAM file.  At most one of the repeat
 * filters -n (name), -c (class) and -f (family) may be given; without one,
 * all repeats are reported under the label "ALL".  -N selects which entry of
 * the counter array returned by the SAM/BAM parser is handed to
 * writeFilterOut (0, 1, 2 or 3; anything else aborts).
 * Writes "<prefix>_<filter>.iteres.loci" and "<prefix>_<filter>.iteres.reportloci",
 * where the prefix is -o or is derived from the SAM/BAM file name.
 * Returns 0 on success, the usage function's result when usage is shown, and
 * 1 for an unexpected option code.  Fatal problems go through errAbort. */
int main_filter(int argc, char *argv[]){
    char *output, *subfam, *out, *outReport;
    unsigned long long int *cnt;
    /* NOTE(review): these four are passed by address to rmsk2binKeeperHash;
     * whether it fills or replaces them is not visible here. */
    struct hash *hashRmsk = newHash(0);
    struct hash *hashRep = newHash(0);
    struct hash *hashFam = newHash(0);
    struct hash *hashCla = newHash(0);
    int optSam = 0, optthreshold = 1;
    char *optoutput = NULL, *optname = NULL, *optclass = NULL, *optfamily = NULL;
    unsigned int optreadlist = 0, optQual = 10, optisize = 500, optExt = 150;
    int filterField = 0, c, optDup = 0, optNorm = 0, optaddChr = 0, optDis =0, optTreat = 0;
    float optCov = 0.0001;

    time_t start_time, end_time;
    start_time = time(NULL);
    
    while ((c = getopt(argc, argv, "SQ:g:N:n:c:t:f:rRTDCE:I:o:h?")) >= 0) {
        switch (c) {
            case 'S': optSam = 1; break;
            case 'Q': optQual = (unsigned int)strtol(optarg, 0, 0); break;
            case 'g': optCov = atof(optarg); break;
            case 'N': optNorm = (int)strtol(optarg, 0, 0); break;
            case 't': optthreshold = (int)strtol(optarg, 0, 0); break;
            case 'r': optreadlist = 1; break;
            case 'R': optDup = 1; break;
            case 'T': optTreat = 1; break;
            case 'D': optDis = 1; break;
            case 'C': optaddChr = 1; break;
            case 'n': optname = strdup(optarg); break;
            case 'c': optclass = strdup(optarg); break;
            case 'f': optfamily = strdup(optarg); break;
            case 'E': optExt = (unsigned int)strtol(optarg, 0, 0); break;
            case 'I': optisize = (unsigned int)strtol(optarg, 0, 0); break;
            case 'o': optoutput = strdup(optarg); break;
            case 'h':
            case '?': return filter_usage();
            default: return 1;
        }
    }
    if (optind + 4 > argc)
        return filter_usage();

    char *chr_size_file = argv[optind];
    char *rep_size_file = argv[optind+1];
    char *rmsk_file = argv[optind+2];
    char *sam_file = argv[optind+3];
    
    /* The three filters are mutually exclusive. */
    int filterCount = (optname != NULL) + (optclass != NULL) + (optfamily != NULL);
    if (filterCount > 1)
        errAbort("Please specify only one filter, either -n, -c or -f.");
    
    /* Map the normalization method to an index into the counter array. */
    int nindex = 0;
    switch (optNorm) {
        case 0: nindex = 7; break;
        case 1: nindex = 8; break;
        case 2: nindex = 6; break;
        case 3: nindex = 4; break;
        default: errAbort("Wrong normalization method specified");
    }
    
    /* Pick the filter label and the field it applies to.  subfam is assigned
     * exactly once; cloning "ALL" first and then overwriting it leaked the
     * first clone whenever a filter was given. */
    if (optname) {
        subfam = cloneString(optname);
        filterField = 10;
    } else if (optclass) {
        subfam = cloneString(optclass);
        filterField = 11;
    } else if (optfamily) {
        subfam = cloneString(optfamily);
        filterField = 12;
    } else {
        subfam = cloneString("ALL");
    }
    /* Also reached when the user literally asked for a filter named "ALL". */
    if (sameString(subfam, "ALL")){
        fprintf(stderr, "* You didn't specify any filter, will output all repeats\n");
        filterField = 0;
    }
    
    /* Either way `output` ends up owning heap memory, released at the end. */
    if(optoutput) {
        output = optoutput;
    } else {
        output = cloneString(get_filename_without_ext(basename(sam_file)));
    }
    
    struct hash *chrHash = hashNameIntFile(chr_size_file);
    struct hash *repHash = hashNameIntFile(rep_size_file);
    
    fprintf(stderr, "* Start to parse the rmsk file\n");
    rmsk2binKeeperHash(rmsk_file, chrHash, repHash, &hashRmsk, &hashRep, &hashFam, &hashCla, filterField, subfam);
    
    //sam file
    fprintf(stderr, "* Start to parse the SAM/BAM file\n");
    cnt = samFile2nodupRepbedFileNew(sam_file, chrHash, hashRmsk, hashRep, hashFam, hashCla, optSam, optQual, 1, optDup, optaddChr, optDis, optisize, optExt, optCov, optTreat, NULL, NULL, 0);


    fprintf(stderr, "* Preparing the output file\n");
    if (asprintf(&out, "%s_%s.iteres.loci", output, subfam) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&outReport, "%s_%s.iteres.reportloci", output, subfam) < 0)
        errAbort("Preparing output wrong");
    
    writeFilterOut(hashRmsk, out, optreadlist, optthreshold, subfam, cnt[nindex]); 
    
    //write report file
    fprintf(stderr, "* Preparing report file\n");
    writeReport(outReport, cnt, optQual, subfam);
    
    hashFree(&chrHash);
    hashFree(&repHash);
    hashFree(&hashRmsk);
    hashFree(&hashRep);
    hashFree(&hashFam);
    hashFree(&hashCla);
    free(out);
    free(outReport);
    free(subfam);
    free(output);
    
    end_time = time(NULL);
    fprintf(stderr, "* Done, time used %.0f seconds.\n", difftime(end_time, start_time));
    return 0;    
}