Exemplo n.º 1
0
/* remove_self - copy a MAF file to stdout, dropping selected blocks.
 *
 * Usage: remove_self maf-file
 *
 * A block is dropped when the start of its first component is greater than
 *   - the start of the second component, if the second is on '+', or
 *   - srcSize - start - size of the second component, if it is on '-'
 *     (presumably the same position expressed on the opposite strand --
 *     confirm against the MAF coordinate convention).
 * Blocks with fewer than two components cannot be compared and are written
 * unchanged.
 *
 * Returns 0 on success, 1 when the argument count is wrong. */
int main(int argc, char** argv) {
	struct mafFile* mf;
	struct mafAli* ali;
	struct mafComp* mc;
	
	if ( argc != 2) {
		printf("remove_self maf-file\n");
		return 1;
	}
	
	init_scores70();
	
	mafWriteStart(stdout, 0);
	
	mf = mafOpen(argv[1], 0);
	while((ali = mafNext(mf)) != NULL) {
		int keep = 1;

		mc = ali->components;

		/* Guard against blocks without a second component: the
		 * comparisons below read mc->next. */
		if(mc != NULL && mc->next != NULL) {
			if(mc->next->strand == '+' && mc->start > mc->next->start)
				keep = 0;
			else if(mc->next->strand == '-' && mc->start > (mc->next->srcSize - mc->next->start - mc->next->size))
				keep = 0;
		}

		if(keep)
			mafWrite(stdout, ali);

		/* Each block is owned by this loop; release it so memory use
		 * does not grow with the size of the input. */
		mafAliFree(&ali);
	}

	mafFileFree(&mf);
	
	mafWriteEnd(stdout);

	return 0;
}
void mafMeFirst(char *inMaf, char *meFile, char *outMaf)
/* mafMeFirst - Move component to top if it is one of the named ones.  Useful 
 * in conjunction with mafFrags when you don't want the one with the gene name 
 * to be in the middle.
 *
 * inMaf  - maf file to read.
 * meFile - file whose words are loaded into a hash by hashWordsInFile.
 * outMaf - maf file to write.
 *
 * Aborts via errAbort when compInHash finds no component for a block. */
{
struct hash *meHash = hashWordsInFile(meFile, 18);
struct mafFile *mf = mafOpen(inMaf);
FILE *f = mustOpen(outMaf, "w");
mafWriteStart(f, mf->scoring);
struct mafAli *maf;
while ((maf = mafNext(mf)) != NULL)
    {
    struct mafComp *comp = compInHash(maf, meHash);
    if (comp == NULL)
        errAbort("No components in %s in maf ending line %d of %s",
		meFile, mf->lf->lineIx, mf->lf->fileName);
    /* Unlink the chosen component and put it back at the head of the list. */
    slRemoveEl(&maf->components, comp);
    slAddHead(&maf->components, comp);
    mafWrite(f, maf);
    mafAliFree(&maf);
    }

mafWriteEnd(f);
carefulClose(&f);
/* Release the input handle and the word hash, which were previously leaked. */
mafFileFree(&mf);
hashFree(&meHash);
}
Exemplo n.º 3
0
/* Scan every block of a MAF file for motif matches on both strands and
 * output the matches found.
 *
 * get_args receives pointers to the output mode, maf file, id buffer, motif
 * list, ordering flag, row count, cutoff and allowed misses; presumably it
 * fills them from the command line -- confirm against get_args.
 *
 * Only blocks with exactly nrow components are searched.  Each such block is
 * searched as read (FORWARD), then every component's text is passed through
 * do_revcomp and the block is searched again (REVERSE).
 *
 * Returns 0. */
int main (int argc, char **argv)
{
    int nrow = DEFAULT_NROW,
	which_seq = 0,
	do_order = 0,
	misses_allowed = 0,
	output_mode = -1;
    float cutoff = DEFAULT_CUTOFF;
    char *strand = NULL,
	 *id = NULL;
    struct mafFile *file = NULL;
    struct mafAli *ali = NULL;
    struct mafComp *comp = NULL;
    struct MOTIF *motif = NULL;
    struct MATCH *matches = NULL;

    id = ckalloc (STRSIZE);
    get_args (argc, argv, &output_mode, &file, &id, &motif, &do_order, &nrow, &cutoff, &misses_allowed);
    /* one strand character per expected row, plus the terminator */
    strand = ckalloc (sizeof (char) * (nrow + 1));

    while (NULL != (ali = mafNext (file)))
    {
        /* Count the components, recording strands only while they fit:
         * strand has room for nrow characters, and a block may carry more
         * than nrow components. */
        for (comp = ali->components, which_seq = 0; comp; comp = comp->next, which_seq++)
        {
            if (which_seq < nrow)
                strand[which_seq] = comp->strand;
        }

        /* skip blocks that don't have all seqs in them */
        if (which_seq != nrow)
        {
            mafAliFree (&ali);
            continue;
        }
        strand[nrow] = '\0';

        /* forward strand */
        get_matches (&matches, FORWARD, ali, nrow, motif, do_order, misses_allowed);

        /* reverse strand */
        for (comp = ali->components; comp; comp = comp->next)
            do_revcomp((uchar *)comp->text, ali->textSize );

        get_matches (&matches, REVERSE, ali, nrow, motif, do_order, misses_allowed);

        /* output matches */
        if (matches)
            output_matches (matches, strand, id, nrow, motif);
        free_match_list (&matches);

        mafAliFree (&ali);
    }

    mafFileFree (&file);
    free (strand);
    free_motif_list (&motif);
    free (id);

    return 0;
}
Exemplo n.º 4
0
struct mafFile *mafReadAll(char *fileName)
/* Open fileName and pull every alignment mafNext yields into the
 * alignments list of the returned mafFile.  Blocks are pushed on the head
 * of the list as they are read and the list is reversed once at the end. */
{
struct mafFile *all = mafOpen(fileName);

for (;;)
    {
    struct mafAli *block = mafNext(all);
    if (block == NULL)
        break;
    slAddHead(&all->alignments, block);
    }

slReverse(&all->alignments);
return all;
}
Exemplo n.º 5
0
/* Build a malnSet from the blocks of a MAF file.  Every block read is
 * validated with checkMafAli, handed to addMafAli and then freed.
 * defaultBranchLength is passed to addMafAli, where it is used to assign
 * branch lengths when inferring trees from the MAF.  The finished set is
 * checked with malnSet_assert before it is returned. */
struct malnSet *malnSet_constructFromMaf(struct Genomes *genomes, char *mafFileName, int maxInputBlkWidth, double defaultBranchLength, struct Genome *treelessRootGenome) {
    struct malnSet *result = malnSet_construct(genomes, mafFileName);
    struct mafFile *input = mafOpen(mafFileName);
    struct mafAli *block;

    /* mafAliFree clears block; the loop step fetches the next one */
    for (block = mafNext(input); block != NULL; block = mafNext(input)) {
        checkMafAli(block);
        addMafAli(result, block, maxInputBlkWidth, defaultBranchLength, treelessRootGenome);
        mafAliFree(&block);
    }

    malnSet_assert(result);
    mafFileFree(&input);
    return result;
}
void mafStats(char *twoBitFile, char *mafDir, char *outFile)
/* mafStats - Calculate basic stats on maf file including species-by-species 
 * coverage and percent ID.
 *
 * twoBitFile - twoBit file whose index supplies the sequence names and sizes.
 * mafDir     - directory holding one <name>.maf per sequence; a missing file
 *              produces a warning and is skipped.
 * outFile    - file that receives a "counts:" and a "precents:" line per
 *              species.
 *
 * Percentages whose denominator is zero are reported as 0.00% instead of
 * dividing by zero. */
{
struct twoBitFile *tbf = twoBitOpen(twoBitFile);
FILE *f = mustOpen(outFile, "w");
struct twoBitIndex *ix;
long genomeSize = 0;
struct hash *speciesHash = hashNew(0);
struct speciesAcc *speciesList = NULL, *species;
for (ix = tbf->indexList; ix != NULL; ix = ix->next)
    {
    /* The sequence counts toward genomeSize even when its maf is missing. */
    unsigned chromSize = twoBitSeqSizeNoNs(tbf, ix->name);
    genomeSize += chromSize;
    char mafFileName[PATH_LEN];
    safef(mafFileName, sizeof(mafFileName), "%s/%s.maf", mafDir, ix->name);
    struct mafFile *mf = mafMayOpen(mafFileName);
    verbose(1, "processing %s\n", ix->name);
    if (mf == NULL)
        {
	warn("%s doesn't exist", mafFileName);
	continue;
	}
    struct mafAli *maf;
    while ((maf = mafNext(mf)) != NULL)
        {
	struct mafComp *mc;
	/* Upper-case every component that carries text before counting. */
	for (mc = maf->components; mc != NULL; mc = mc->next)
	    {
	    if (mc->text != NULL)
		toUpperN(mc->text, maf->textSize);
	    }
	addCounts(maf, speciesHash, &speciesList);
	mafAliFree(&maf);
	}
    mafFileFree(&mf);
    }
/* The index is no longer needed; close the twoBit file (previously leaked). */
twoBitClose(&tbf);
slReverse(&speciesList);

for (species = speciesList; species != NULL; species = species->next)
    {
    double covPct = 0.0, aliPct = 0.0, idPct = 0.0;
    if (genomeSize != 0)
        {
	covPct = 100.0 * species->covCount/genomeSize;
	aliPct = 100.0 * species->aliCount/genomeSize;
	}
    if (species->aliCount != 0)
	idPct = 100.0 * species->idCount/species->aliCount;
    fprintf(f, "counts: %s\t%ld\t%ld\t%ld\n", species->name, species->covCount, species->aliCount, species->idCount);
    fprintf(f, "precents: %s\t%4.2f%%\t%4.2f%%\t%4.2f%%\n", 
    	species->name, covPct, aliPct, idPct);
    }
carefulClose(&f);
}
void mafToPsl(char *querySrc, char *targetSrc, char *inName, char *outName)
/* mafToPsl - Convert maf to psl format.  Each alignment read from inName is
 * passed to mafAliToPsl together with querySrc, targetSrc and the stream
 * opened on outName, then freed. */
{
struct mafFile *mafIn = mafOpen(inName);
FILE *pslOut = mustOpen(outName, "w");
struct mafAli *block;

/* mafAliFree clears block; the loop step fetches the next one */
for (block = mafNext(mafIn); block != NULL; block = mafNext(mafIn))
    {
    mafAliToPsl(querySrc, targetSrc, block, pslOut);
    mafAliFree(&block);
    }

carefulClose(&pslOut);
mafFileFree(&mafIn);
}
Exemplo n.º 8
0
void mafToFa(char *inName, char *outName)
/* mafToFa - convert maf file to fasta.  Each alignment read from inName is
 * passed to mafAliToFa with the stream opened on outName, then freed. */
{
struct mafFile *mafIn = mafOpen(inName);
FILE *faOut = mustOpen(outName, "w");
struct mafAli *block = mafNext(mafIn);

while (block != NULL)
    {
    mafAliToFa(block, faOut);
    mafAliFree(&block);
    block = mafNext(mafIn);
    }

carefulClose(&faOut);
mafFileFree(&mafIn);
}
struct mafAli *mafLoadInRegion2(struct sqlConnection *conn,
    struct sqlConnection *conn2, char *table, char *chrom,
    int start, int end, char *file)
/* Return list of alignments in region.
 *
 * Rows come from hRangeQuery on conn.  When file is given, every alignment
 * is read from it and rows must have extFile == 0; otherwise each row's
 * extFile is resolved to a path through conn2 and must be non-zero.  The
 * returned list is sorted with mafCmp. */
{
struct mafFile *mf = NULL;
struct mafAli *mafList = NULL;
unsigned int openExtFile = 0;	/* extFile id of the maf currently open */
int rowOffset;
char **row;

if (file != NULL)
    mf = mafOpen(file);

struct sqlResult *sr = hRangeQuery(conn, table, chrom,
    start, end, NULL, &rowOffset);

for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    struct scoredRef ref;
    struct mafAli *maf;

    scoredRefStaticLoad(row + rowOffset, &ref);
    if (file != NULL)
	{
	if (ref.extFile != 0)
	    errAbort("expect extFile to be zero if file specified\n");
	}
    else if (ref.extFile == 0)
	errAbort("expect extFile to be not zero or file specified\n");

    /* Switch to another external file only when the id changes. */
    if (ref.extFile != openExtFile)
	{
	char *path = hExtFileNameC(conn2, "extFile", ref.extFile);
	mafFileFree(&mf);
	mf = mafOpen(path);
	openExtFile = ref.extFile;
	}

    /* Jump to the stored offset and read the one alignment found there. */
    lineFileSeek(mf->lf, ref.offset, SEEK_SET);
    maf = mafNext(mf);
    if (maf == NULL)
        internalErr();
    slAddHead(&mafList, maf);
    }

sqlFreeResult(&sr);
mafFileFree(&mf);
slReverse(&mafList);
/* hRangeQuery may return items out-of-order when bin is used in the query,
 * so sort here in order to avoid trouble at base-level view: */
slSort(&mafList, mafCmp);
return mafList;
}
struct mafAli *readMafs(struct mafFile *mf)
/* Read every alignment from mf and return them as a list in the order read.
 *
 * The first component of each alignment is treated as the master.  Its src
 * is split at the first '.' into species and chromosome; the first alignment
 * sets masterSpecies/masterChrom and every later one must name the same
 * species, otherwise the function aborts.
 *
 * For every other component a linkBlock is allocated and pushed onto the
 * strandHead keyed by "<masterChrom><strand><chrom>" inside that species'
 * subSpecies entry; both are created on first sight.  The link blocks keep
 * pointers into the alignments, so the alignments are returned, not freed.
 *
 * masterSpecies, masterChrom, speciesHash, speciesList and strandHeads are
 * declared outside this function.
 *
 * Aborts via errAbort when a src or key does not fit the local buffers, or
 * when a non-master src (or the master, once it is needed for a key) has no
 * '.' separator. */
{
struct mafAli *maf;
char buffer[2048];
char buffer2[2048];
struct strandHead *strandHead;
struct mafAli *mafList = NULL;

while((maf = mafNext(mf)) != NULL)
    {
    struct mafComp *mc, *masterMc = maf->components;
    char *species = buffer;
    char *chrom;

    /* buffer is a fixed-size stack array: refuse names that do not fit */
    if (strlen(masterMc->src) >= sizeof(buffer))
	errAbort("source name too long: %s\n", masterMc->src);
    strcpy(species, masterMc->src);
    chrom = strchr(species,'.');
    if (chrom)
	*chrom++ = 0;
    if (masterSpecies == NULL)
	{
	masterSpecies = cloneString(species);
	masterChrom = cloneString(chrom);
	//printf("master %s %s\n",masterSpecies,masterChrom);
	}
    else
	{
	if (!sameString(masterSpecies, species))
	    errAbort("first species (%s) not master species (%s)\n",species,masterSpecies);
	}

    for(mc= masterMc->next; mc; mc = mc->next)
	{
	struct linkBlock *linkBlock;
	struct subSpecies *subSpecies = NULL;
	int keyLen;

	if (strlen(mc->src) >= sizeof(buffer))
	    errAbort("source name too long: %s\n", mc->src);
	strcpy(species, mc->src);
	chrom = strchr(species,'.');
	/* the split below writes through chrom, so it must exist */
	if (chrom == NULL)
	    errAbort("source %s is not of the form species.chrom\n", mc->src);
	*chrom++ = 0;

	if ((subSpecies = hashFindVal(speciesHash, species)) == NULL)
	    {
	    //printf("new species %s\n",species);
	    AllocVar(subSpecies);
	    subSpecies->name = cloneString(species);
	    subSpecies->hash = newHash(6);
	    subSpecies->blockStatus.strand = '+';
	    subSpecies->blockStatus.masterStart = masterMc->start;
	    slAddHead(&speciesList, subSpecies);
	    hashAdd(speciesHash, species, subSpecies);
	    }
	subSpecies->blockStatus.masterEnd = masterMc->start + masterMc->size ;

	/* masterChrom is NULL when the master src had no '.'; it cannot be
	 * formatted with %s in that case */
	if (masterChrom == NULL)
	    errAbort("master source %s is not of the form species.chrom\n", masterMc->src);
	keyLen = snprintf(buffer2, sizeof(buffer2), "%s%c%s", masterChrom,mc->strand,chrom);
	if (keyLen < 0 || (size_t)keyLen >= sizeof(buffer2))
	    errAbort("strand key too long for %s and %s\n", masterChrom, chrom);
	if ((strandHead = hashFindVal(subSpecies->hash, buffer2)) == NULL)
	    {
	    //printf("new strand %s for species %s\n",buffer2, species);
	    AllocVar(strandHead);
	    hashAdd(subSpecies->hash, buffer2, strandHead);
	    strandHead->name = cloneString(buffer2);
	    strandHead->species = cloneString(species);
	    strandHead->qName = cloneString(chrom);
	    strandHead->qSize = mc->srcSize;
	    strandHead->strand = mc->strand;
	    slAddHead(&strandHeads, strandHead);
	    }
	/* query range comes from this component, target range from the master */
	AllocVar(linkBlock);
	linkBlock->mc = mc;
	linkBlock->cb.qStart = mc->start;
	linkBlock->cb.qEnd = mc->start + mc->size;
	linkBlock->cb.tStart = masterMc->start;
	linkBlock->cb.tEnd = masterMc->start + masterMc->size;


	slAddHead(&strandHead->links, linkBlock);
	}
    slAddHead(&mafList, maf);
    }
slReverse(&mafList);

return mafList;
}
Exemplo n.º 11
0
void scanMaf(char *database, char *fileName, struct hash *chromHash, boolean covRestrict, int spCount)
/* Scan through maf file (which must be sorted by
 * chromosome) and fill in coverage histograms on 
 * each chromosome.
 *
 * database    - passed to restrictGaps.
 * fileName    - maf file to read; also passed to closeChromCov.
 * chromHash   - maps a chromosome name to its struct chromSizes; a missing
 *               name aborts inside hashMustFindVal.
 * covRestrict - when set, restrictCov is applied to the fresh arrays using
 *               the chromosome's restrictList.
 * spCount     - alignments with fewer components than this are skipped.
 *
 * Three per-base arrays are kept for the current chromosome: cov (first
 * component's range), align (runs of columns with no gap in any counted
 * row) and id (runs of columns where every counted row matches the first
 * row and the base is neither '-' nor 'N').  When the chromosome changes the
 * arrays are handed to closeChromCov and new ones are allocated. */
{
struct mafFile *mf = mafOpen(fileName);
struct mafAli *ali = NULL;
struct mafComp *comp = NULL;
struct chromSizes *lastCs = NULL, *cs = NULL;
char *chrom = NULL;
int start = 0, end = 0, size = 0, j, k;
int idStart = 0, idEnd = 0, idSize = 0;
UBYTE *cov = NULL;
UBYTE *align = NULL;
UBYTE *id = NULL;
char *tPtr[MAXALIGN];
bool hit = FALSE;

while ((ali = mafNext(mf)) != NULL)
    {
    int cCount = slCount(ali->components);
    int i = 1;
    int nextStart, idNextStart;

    comp = ali->components; 
    tPtr[0] = comp->text;
    /* The chromosome is the part of src after the first '.'; a src without
     * a '.' is used whole.  The NULL test has to happen before stepping
     * past the dot, otherwise it can never succeed. */
    chrom = strchr(comp->src,'.');
    if (chrom == NULL)
         chrom = comp->src;
    else
         chrom++;
    start = comp->start;
    idStart = comp->start;
    nextStart = idNextStart = start;
    cs = hashMustFindVal(chromHash, chrom);
    if (cs != lastCs)
        {
	/* new chromosome: flush the previous arrays and start fresh ones */
	if (lastCs != NULL)
	    closeChromCov(fileName, lastCs, &cov, &align, &id);
	AllocArray(cov, cs->totalSize);
	AllocArray(align, cs->totalSize);
	AllocArray(id, cs->totalSize);
	if (covRestrict)
            {
	    restrictCov(cov, cs->totalSize, cs->restrictList);
	    restrictCov(align, cs->totalSize, cs->restrictList);
	    restrictCov(id, cs->totalSize, cs->restrictList);
            }
	restrictGaps(database, cov, cs->totalSize, chrom);
	restrictGaps(database, align, cs->totalSize, chrom);
	restrictGaps(database, id, cs->totalSize, chrom);
	cs->unrestrictedSize = calcUnrestrictedSize(cov, cs->totalSize);
	lastCs = cs;
	}
    /* don't count if few alignments than spCount */
    if ((ali->components->next == NULL) || (cCount < spCount))
        {
        mafAliFree(&ali);
        continue;
        }
    //printf("coverage %d, size %d\n", start, comp->size);
    incNoOverflow(cov+start, comp->size);
    /* Collect the text rows of the remaining components; rows with
     * size <= 0 are left out and cCount is reduced to match. */
    for (comp = ali->components->next; comp != NULL; comp = comp->next)
        {
	if (comp->size > 0)	// do not process e lines
	    {
	    tPtr[i] = comp->text;
	    i++;
	    assert (i < MAXALIGN-1);
	    }
	else
	    --cCount;
        }
    size = 0;
    assert(cs != NULL);
    /* count gapless columns */
    for (j = 0 ; j<ali->textSize ; j++)
        {
	hit = TRUE;
        /* look for aligning bases in query seqs , abort if any is a gap */
        for (i = 1 ; i < cCount ; i++)
            {
            if (tPtr[i][j] == '-' || tPtr[0][j] == '-')
                {
     //   printf("align %d, size %d\n", start, size);
                /* a gap ends the current run: record it and restart at the
                 * current target position */
                incNoOverflow(align+start, size);
                cs->totalDepth += size;
                start = nextStart;
                size = 0;
                hit = FALSE;
                break;
                }
            }
        if (hit)
            size++;
        /* if there is a gap in the target, start a new alignment block*/
        if (tPtr[0][j] != '-')
            nextStart++;
        }
    assert(cs!=NULL);
    end = start+size;
    if (end > cs->totalSize)
	{
        if (cs->name != NULL)
            errAbort("End %d past end %ld of %f\n", end, (long)cs->totalSize, ali->score);
        else
            {
            if (ali!=NULL)
                errAbort("End %d past end %ld %f\n", end, (long)cs->totalSize, ali->score );
            else
                errAbort("End %d past end %ld \n", end, (long)cs->totalSize);
            }
	}
    /* record the run still open at the end of the block */
    incNoOverflow(align+start, size-1);
    cs->totalDepth += size-1;

    /* count percent id */
    idSize = 0;
    assert(cs != NULL);
    for (k = 0 ; k<ali->textSize ; k++)
        {
	hit = TRUE;
        char tc = toupper(tPtr[0][k]);
        for (i = 1 ; i < cCount ; i++)
            {
            if (toupper(tPtr[i][k]) != tc || tc == '-' || tc == 'N')
                {
                /* a mismatch, gap or N ends the current identical run */
                incNoOverflow(id+idStart, idSize);
                idStart = idNextStart;
                idSize = 0;
                hit = FALSE;
                break;
                }
            }
        if (hit)
            idSize++;
        /* skip over gaps */
        if (tc != '-')
            idNextStart++;
        }
    assert(cs!=NULL);
    idEnd = idStart+idSize;
    if (idEnd > cs->totalSize)
	{
        if (cs->name != NULL)
            errAbort("End %d past end %ld of %f\n", idEnd, (long)cs->totalSize, ali->score);
        else
            {
            if (ali!=NULL)
                errAbort("End %d past end %ld %f\n", idEnd, (long)cs->totalSize, ali->score );
            else
                errAbort("End %d past end %ld \n", idEnd, (long)cs->totalSize);
            }
	}
    incNoOverflow(id+idStart, idSize-1);
    mafAliFree(&ali);
    }
/* cs stays NULL when the file held no alignments; there is nothing to flush
 * in that case. */
if (cs != NULL)
    closeChromCov(fileName, cs, &cov, &align, &id);
}
Exemplo n.º 12
0
void extractMafs(char *file, FILE *f, struct hash *regionHash)
/* extract MAFs in a file from regions specified in hash
 *
 * file       - maf file to read.
 * f          - stream the extracted alignments are written to; replaced by a
 *              per-region file when outDir is set.
 * regionHash - maps a chromosome name to a list of struct bed regions.  The
 *              entry is modified here: each processed region is removed and
 *              replaced by the next one in its list.
 *
 * outDir, dir and keepInitialGaps are not declared in this function
 * (presumably file-level settings -- confirm). */
{
char *chrom = NULL;
struct bed *bed = NULL;
struct mafFile *mf = mafOpen(file);
struct mafAli *maf = NULL;
struct mafComp *mc;
char path[256];

verbose(1, "extracting from %s\n", file);
maf = mafNext(mf);
/* Each pass handles one region of the chromosome named by the first
 * component of the current maf. */
while (maf)
    {
    mc = maf->components;
    /* NOTE(review): the previous chrom string is not freed when it is replaced. */
    if (!chrom || differentString(chrom, chromFromSrc(mc->src)))
        chrom = cloneString(chromFromSrc(mc->src));         /* new chrom */
    bed = (struct bed *)hashFindVal(regionHash, chrom);
    if (!bed)
        {
        /* no regions on this chrom -- skip to next chrom */
        do
            mafAliFree(&maf);
        while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom));
        continue;  // start over with this maf
        }
    verbose(2, "region: %s:%d-%d\n", 
            bed->chrom, bed->chromStart+1, bed->chromEnd);
    if (outDir)
        {
        /* one output file per region: end the current one, start the next.
         * The last file started here is not ended inside this function.
         * NOTE(review): the test is on outDir but the path is built from
         * dir -- confirm these are meant to be two different variables. */
        if (f)
            endOutFile(f);
        safef(path, sizeof (path), "%s/%s.maf", dir, bed->name);
        f = startOutFile(path);
        }

    /* skip mafs before region, stopping if chrom changes */
    while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) &&
        (mc->start + mc->size) <= bed->chromStart)
        {
        mafAliFree(&maf);
        maf = mafNext(mf);
        }

    /* extract all mafs and pieces of mafs in region */
    while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) &&
        (bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start))
        {
        /* range of the first component before any trimming */
        int mafStart = mc->start;
        int mafEnd = mc->start + mc->size;
        /* full keeps the untrimmed alignment; maf may become a subset of it */
        struct mafAli *full = maf;
        if (mafStart < bed->chromStart || mafEnd > bed->chromEnd)
            {
            /* the alignment sticks out of the region: write only the part
             * returned by mafSubsetE for the region's range */
            full = maf;
            maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps);
            mc = maf->components;
            }
        verbose(2, "   %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size);
        mafWrite(f, maf);
        /* When the alignment runs past chromEnd+1, the remainder becomes the
         * next maf to examine; otherwise the next one is read from the file. */
        struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1)
            ? mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf);
        /* free the trimmed copy, if one was made, then the original */
        if (maf != full)
            mafAliFree(&maf);
        mafAliFree(&full);
        maf = nextMaf;
        }
    /* get next region */
    /* the hash entry for this chrom is moved on to the following bed, if any */
    hashRemove(regionHash, bed->chrom);
    if (bed->next)
        hashAdd(regionHash, bed->chrom, bed->next);
    }
mafFileFree(&mf);
}
void hgLoadMafSummary(char *db, char *table, char *fileName)
/* hgLoadMafSummary - Load a summary table of pairs in a maf into a database.
 *
 * db       - passed to hGetMinIndexLength when the table is created.
 * table    - name of the summary table and of the tab file written in ".".
 * fileName - maf file to summarise.
 *
 * Each maf whose master component has srcSize >= minSeqSize is passed to
 * processMaf, after being cut into pieces of at most maxSize master bases.
 * Unless test is set, the table is created up front and the tab file is
 * loaded into it at the end.
 *
 * test, database, minSeqSize, maxSize and summaryCount are not declared in
 * this function (presumably file-level settings -- confirm). */
{
long mafCount = 0, allMafCount = 0;
struct mafComp *mcMaster = NULL;
struct mafAli *maf;
struct mafFile *mf = mafOpen(fileName);
/* only connected when not in test mode */
struct sqlConnection *conn = NULL;
FILE *f = hgCreateTabFile(".", table);
long componentCount = 0;
struct hash *componentHash = newHash(0);

if (!test)
    {
    conn = sqlConnect(database);
    mafSummaryTableCreate(conn, table, hGetMinIndexLength(db));
    }
verbose(1, "Indexing and tabulating %s\n", fileName);

/* process mafs */
while ((maf = mafNext(mf)) != NULL)
    {
    mcMaster = mafMaster(maf, mf, fileName);
    allMafCount++;
    if (mcMaster->srcSize < minSeqSize)
        {
        /* skipped alignments still have to be released */
        mafAliFree(&maf);
	continue;
        }
    while (mcMaster->size > maxSize)
        {
        /* break maf into maxSize pieces */
        int end = mcMaster->start + maxSize;
        struct mafAli *subMaf = 
                mafSubset(maf, mcMaster->src, mcMaster->start, end);
        verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src,
                                        mcMaster->start, mcMaster->size);
        componentCount += 
            processMaf(subMaf, componentHash, f, mf, fileName);
        mafAliFree(&subMaf);
        /* keep the part after the piece just processed; mcMaster points
         * into the old maf, so it is refreshed right after the swap */
        subMaf = mafSubset(maf, mcMaster->src, 
                                end, end + (mcMaster->size - maxSize));
        mafAliFree(&maf);
        maf = subMaf;
        mcMaster = mafMaster(maf, mf, fileName);
        }
    if (mcMaster->size != 0)
        {
        /* remainder of maf after splitting off maxSize submafs */
        componentCount += 
            processMaf(maf, componentHash, f, mf, fileName);
        }
    mafAliFree(&maf);
    mafCount++;
    }
mafFileFree(&mf);
flushSummaryBlocks(componentHash, f);
verbose(1, 
    "Created %ld summary blocks from %ld components and %ld mafs from %s\n",
        summaryCount, componentCount, allMafCount, fileName);
if (test)
    return;
verbose(1, "Loading into %s table %s...\n", database, table);
hgLoadTabFile(conn, ".", table, &f);
verbose(1, "Loading complete\n");
hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", 
                        summaryCount, fileName);
}