void scanMaf(char *database, char *fileName, struct hash *chromHash, boolean covRestrict, int spCount)
/* Scan through maf file (which must be sorted by
 * chromosome) and fill in coverage histograms on
 * each chromosome.
 *   database    - passed through to restrictGaps().
 *   fileName    - maf file to read; also handed to closeChromCov().
 *   chromHash   - maps chromosome name to its struct chromSizes; a name
 *                 missing from the hash aborts (hashMustFindVal).
 *   covRestrict - when set, restrictCov() is applied to each freshly
 *                 allocated array using cs->restrictList.
 *   spCount     - blocks with fewer components than this still trigger the
 *                 per-chromosome setup but are otherwise skipped.
 * Three per-base arrays are kept for the current chromosome: cov (extent of
 * the first component), align (runs of columns with no gap in any counted
 * row) and id (runs of columns where every counted row equals the first row
 * and the first row is neither '-' nor 'N').  They are handed to
 * closeChromCov() whenever the chromosome changes and once more after the
 * last alignment.
 * NOTE(review): mf is never released in this function; presumably the maf
 * file should be closed before returning - confirm. */
{
struct mafFile *mf = mafOpen(fileName);
struct mafAli *ali = NULL;
struct mafComp *comp = NULL;
struct chromSizes *lastCs = NULL, *cs = NULL;
char *chrom = NULL;
int start = 0, end = 0, size = 0, j, k;
int idStart = 0, idEnd = 0, idSize = 0;
UBYTE *cov = NULL;
UBYTE *align = NULL;
UBYTE *id = NULL;
char *tPtr[MAXALIGN];
bool hit = FALSE;

while ((ali = mafNext(mf)) != NULL)
    {
    int cCount = slCount(ali->components);
    int i = 1;
    int nextStart, idNextStart;
    char *dot;

    comp = ali->components;
    tPtr[0] = comp->text;

    /* Chromosome is whatever follows the first '.' in the source name; a
     * source without a dot is used whole.  The strchr() result has to be
     * tested before adding 1, otherwise the no-dot case is never detected. */
    dot = strchr(comp->src, '.');
    chrom = (dot != NULL) ? dot + 1 : comp->src;

    start = comp->start;
    idStart = comp->start;
    nextStart = idNextStart = start;
    cs = hashMustFindVal(chromHash, chrom);
    if (cs != lastCs)
        {
        /* New chromosome: flush the previous one, then set up fresh arrays. */
        if (lastCs != NULL)
            closeChromCov(fileName, lastCs, &cov, &align, &id);
        AllocArray(cov, cs->totalSize);
        AllocArray(align, cs->totalSize);
        AllocArray(id, cs->totalSize);
        if (covRestrict)
            {
            restrictCov(cov, cs->totalSize, cs->restrictList);
            restrictCov(align, cs->totalSize, cs->restrictList);
            restrictCov(id, cs->totalSize, cs->restrictList);
            }
        restrictGaps(database, cov, cs->totalSize, chrom);
        restrictGaps(database, align, cs->totalSize, chrom);
        restrictGaps(database, id, cs->totalSize, chrom);
        cs->unrestrictedSize = calcUnrestrictedSize(cov, cs->totalSize);
        lastCs = cs;
        }

    /* don't count if fewer alignments than spCount */
    if ((ali->components->next == NULL) || (cCount < spCount))
        {
        mafAliFree(&ali);
        continue;
        }
    incNoOverflow(cov+start, comp->size);

    /* Collect the text rows to compare against the first row.  Components
     * with size 0 are left out and cCount is lowered to match, so that
     * tPtr[0..cCount-1] is exactly the set of rows examined below. */
    for (comp = ali->components->next; comp != NULL; comp = comp->next)
        {
        if (comp->size > 0) // do not process e lines
            {
            tPtr[i] = comp->text;
            i++;
            assert (i < MAXALIGN-1);
            }
        else
            --cCount;
        }
    size = 0;
    assert(cs != NULL);

    /* count gapless columns */
    for (j = 0 ; j<ali->textSize ; j++)
        {
        hit = TRUE;
        /* look for aligning bases in query seqs , abort if any is a gap */
        for (i = 1 ; i < cCount ; i++)
            {
            if (tPtr[i][j] == '-' || tPtr[0][j] == '-')
                {
                /* Close the current run and restart at the current
                 * first-row position. */
                incNoOverflow(align+start, size);
                cs->totalDepth += size;
                start = nextStart;
                size = 0;
                hit = FALSE;
                break;
                }
            }
        if (hit)
            size++;
        /* nextStart only advances on columns where the first row has a base */
        if (tPtr[0][j] != '-')
            nextStart++;
        }
    end = start+size;
    if (end > cs->totalSize)
        {
        /* ali is known to be non-NULL here, so only the two messages that
         * include the score can ever be produced. */
        if (cs->name != NULL)
            errAbort("End %d past end %ld of %f\n", end, (long)cs->totalSize, ali->score);
        else
            errAbort("End %d past end %ld %f\n", end, (long)cs->totalSize, ali->score );
        }
    incNoOverflow(align+start, size-1);
    cs->totalDepth += size-1;

    /* count percent id */
    idSize = 0;
    for (k = 0 ; k<ali->textSize ; k++)
        {
        char tc = toupper(tPtr[0][k]);

        hit = TRUE;
        for (i = 1 ; i < cCount ; i++)
            {
            if (toupper(tPtr[i][k]) != tc || tc == '-' || tc == 'N')
                {
                incNoOverflow(id+idStart, idSize);
                idStart = idNextStart;
                idSize = 0;
                hit = FALSE;
                break;
                }
            }
        if (hit)
            idSize++;
        /* skip over gaps */
        if (tc != '-')
            idNextStart++;
        }
    idEnd = idStart+idSize;
    if (idEnd > cs->totalSize)
        {
        if (cs->name != NULL)
            errAbort("End %d past end %ld of %f\n", idEnd, (long)cs->totalSize, ali->score);
        else
            errAbort("End %d past end %ld %f\n", idEnd, (long)cs->totalSize, ali->score );
        }
    incNoOverflow(id+idStart, idSize-1);
    mafAliFree(&ali);
    }

/* A file without alignments never selected a chromosome, so there is
 * nothing to close in that case. */
if (cs != NULL)
    closeChromCov(fileName, cs, &cov, &align, &id);
}
void extractMafs(char *file, FILE *f, struct hash *regionHash) /* extract MAFs in a file from regions specified in hash */ { char *chrom = NULL; struct bed *bed = NULL; struct mafFile *mf = mafOpen(file); struct mafAli *maf = NULL; struct mafComp *mc; char path[256]; verbose(1, "extracting from %s\n", file); maf = mafNext(mf); while (maf) { mc = maf->components; if (!chrom || differentString(chrom, chromFromSrc(mc->src))) chrom = cloneString(chromFromSrc(mc->src)); /* new chrom */ bed = (struct bed *)hashFindVal(regionHash, chrom); if (!bed) { /* no regions on this chrom -- skip to next chrom */ do mafAliFree(&maf); while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom)); continue; // start over with this maf } verbose(2, "region: %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd); if (outDir) { if (f) endOutFile(f); safef(path, sizeof (path), "%s/%s.maf", dir, bed->name); f = startOutFile(path); } /* skip mafs before region, stopping if chrom changes */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (mc->start + mc->size) <= bed->chromStart) { mafAliFree(&maf); maf = mafNext(mf); } /* extract all mafs and pieces of mafs in region */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start)) { int mafStart = mc->start; int mafEnd = mc->start + mc->size; struct mafAli *full = maf; if (mafStart < bed->chromStart || mafEnd > bed->chromEnd) { full = maf; maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps); mc = maf->components; } verbose(2, " %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size); mafWrite(f, maf); struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1) ? 
mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf); if (maf != full) mafAliFree(&maf); mafAliFree(&full); maf = nextMaf; } /* get next region */ hashRemove(regionHash, bed->chrom); if (bed->next) hashAdd(regionHash, bed->chrom, bed->next); } mafFileFree(&mf); }
struct mafAli *hgMafFrag(
	char *database,     /* Database, must already have hSetDb to this */
	char *track, 	    /* Name of MAF track */
	char *chrom, 	    /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand, 	    /* Chromosome strand. */
	char *outName, 	    /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* mafFrag- Extract maf sequences for a region from database.
 * This creates a somewhat unusual MAF that extends from start
 * to end whether or not there are actually alignments.  Where
 * there are no alignments (or alignments missing a species)
 * a . character fills in.   The score is always zero, and
 * the sources just indicate the species.  You can mafFree this
 * as normal.
 *
 * One text buffer (org->dy) is grown per organism.  If orderList is given,
 * the organisms are created up front in that order and meeting any other
 * organism in the alignments is an error; otherwise a single entry named
 * after database is created first and further organisms are added as
 * they are encountered.
 * NOTE(review): 'native' obtained from hChromSeq() is not released
 * anywhere in this function - confirm whether that is intended. */
{
int chromSize = hChromSize(database, chrom);
struct sqlConnection *conn = hAllocConn(database);
struct dnaSeq *native = hChromSeq(database, chrom, start, end);
struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end);
char masterSrc[128];
struct hash *orgHash = newHash(10);     /* organism name -> struct oneOrg */
struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL;
int curPos = start;     /* chromosome position up to which output exists */
int symCount = 0;       /* symbols added so far; becomes maf->textSize */
struct slName *name;
int order = 0;

/* Check that the mafs are really copacetic, the particular
 * subtype we think is in the database that this (relatively)
 * simple code can handle. */
safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom);
mafCheckFirstComponentSrc(mafList, masterSrc);
mafCheckFirstComponentStrand(mafList, '+');
slSort(&mafList, mafCmp);

/* Prebuild organisms if possible from input orderList.  The first name
 * in the list becomes nativeOrg. */
for (name = orderList; name != NULL; name = name->next)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, name->name, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    org->order = order++;
    if (nativeOrg == NULL)
        nativeOrg = org;
    }
/* Without an orderList, start with one organism named after the database. */
if (orderList == NULL)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, database, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    if (nativeOrg == NULL)
        nativeOrg = org;
    }

/* Go through all mafs in window, mostly building up
 * org->dy strings. */
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    struct mafComp *mc, *mcMaster = maf->components;
    struct mafAli *subMaf = NULL;
    order = 0;
    /* Stretch between the previous block and this one: let
     * fillInMissing() cover it.  Note curPos itself is not moved here. */
    if (curPos < mcMaster->start)
	{
	fillInMissing(nativeOrg, orgList, native, start, curPos, mcMaster->start);
	symCount += mcMaster->start - curPos;
	}
    if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst
    					            * backtracking: blocks ending
    					            * at or before curPos are
    					            * skipped entirely. */
	{
	/* Use the part of the block from curPos to end; subMaf is either
	 * a new subset (freed below) or maf itself. */
	if (mafNeedSubset(maf, masterSrc, curPos, end))
	    {
	    subMaf = mafSubset(maf, masterSrc, curPos, end);
	    if (subMaf == NULL)
	        continue;
	    }
	else
	    subMaf = maf;
	for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order)
	    {
	    /* Extract name up to dot into 'orgName' */
	    char buf[128], *e, *orgName;

	    if ((mc->size == 0) || (mc->srcSize == 0)) /* skip over components without sequence */
		continue;

	    mc->leftStatus = mc->rightStatus = 0; /* squash annotation */

	    e = strchr(mc->src, '.');
	    if (e == NULL)
		orgName = mc->src;
	    else
		{
		int len = e - mc->src;
		if (len >= sizeof(buf))
		    errAbort("organism/database name %s too long", mc->src);
		memcpy(buf, mc->src, len);
		buf[len] = 0;
		orgName = buf;
		}

	    /* Look up dyString corresponding to org, and create a
	     * new one if necessary. */
	    org = hashFindVal(orgHash, orgName);
	    if (org == NULL)
		{
		if (orderList != NULL)
		   errAbort("%s is not in orderList", orgName);
		AllocVar(org);
		slAddHead(&orgList, org);
		hashAddSaveName(orgHash, orgName, org, &org->name);
		org->dy = dyStringNew(native->size*1.5);
		/* Pad the newcomer with '.' up to the current symbol count. */
		dyStringAppendMultiC(org->dy, '.', symCount);
		if (nativeOrg == NULL)
		    nativeOrg = org;
		}
	    /* With no orderList, remember the largest component index at
	     * which this organism has been seen. */
	    if (orderList == NULL && order > org->order)
		org->order = order;
	    org->hit = TRUE;

	    /* Fill it up with alignment. */
	    dyStringAppendN(org->dy, mc->text, subMaf->textSize);
	    }
	/* Organisms absent from this block get '.' for its whole width;
	 * the hit flags are cleared for the next block. */
	for (org = orgList; org != NULL; org = org->next)
	    {
	    if (!org->hit)
		dyStringAppendMultiC(org->dy, '.', subMaf->textSize);
	    org->hit = FALSE;
	    }
	symCount += subMaf->textSize;
	/* Advance to the end of the original (unsubsetted) block. */
	curPos = mcMaster->start + mcMaster->size;
	if (subMaf != maf)
	    mafAliFree(&subMaf);
	}
    }
/* Tail of the window after the last block. */
if (curPos < end)
    {
    fillInMissing(nativeOrg, orgList, native, start, curPos, end);
    symCount += end - curPos;
    }
mafAliFreeList(&mafList);

slSort(&orgList, oneOrgCmp);
if (strand == '-')
    {
    for (org = orgList; org != NULL; org = org->next)
	reverseComplement(org->dy->string, org->dy->stringSize);
    }

/* Construct our maf */
AllocVar(maf);
maf->textSize = symCount;
for (org = orgList; org != NULL; org = org->next)
    {
    struct mafComp *mc;
    AllocVar(mc);
    if (org == orgList)
        {
	/* First organism after sorting: described either by outName with
	 * coordinates relative to the fetched sequence, or by masterSrc
	 * with chromosome coordinates. */
	if (outName != NULL)
	    {
	    mc->src = cloneString(outName);
	    mc->srcSize = native->size;
	    mc->strand = '+';
	    mc->start = 0;
	    mc->size = native->size;
	    }
	else
	    {
	    mc->src = cloneString(masterSrc);
	    mc->srcSize = chromSize;
	    mc->strand = strand;
	    if (strand == '-')
	       reverseIntRange(&start, &end, chromSize);
	    mc->start = start;
	    mc->size = end-start;
	    }
	}
    else
        {
	/* Other organisms: size is whatever countAlpha() reports for the
	 * accumulated text, always starting at 0 on '+'. */
	int size = countAlpha(org->dy->string);
	mc->src = cloneString(org->name);
	mc->srcSize = size;
	mc->strand = '+';
	mc->start = 0;
	mc->size = size;
	}
    mc->text = cloneString(org->dy->string);
    dyStringFree(&org->dy);
    slAddHead(&maf->components, mc);
    }
/* Components were pushed on the head, so put them back in orgList order. */
slReverse(&maf->components);

slFreeList(&orgList);
freeHash(&orgHash);
hFreeConn(&conn);
return maf;
}
void hgLoadMafSummary(char *db, char *table, char *fileName) /* hgLoadMafSummary - Load a summary table of pairs in a maf into a database. */ { long mafCount = 0, allMafCount = 0; struct mafComp *mcMaster = NULL; struct mafAli *maf; struct mafFile *mf = mafOpen(fileName); struct sqlConnection *conn; FILE *f = hgCreateTabFile(".", table); long componentCount = 0; struct hash *componentHash = newHash(0); if (!test) { conn = sqlConnect(database); mafSummaryTableCreate(conn, table, hGetMinIndexLength(db)); } verbose(1, "Indexing and tabulating %s\n", fileName); /* process mafs */ while ((maf = mafNext(mf)) != NULL) { mcMaster = mafMaster(maf, mf, fileName); allMafCount++; if (mcMaster->srcSize < minSeqSize) continue; while (mcMaster->size > maxSize) { /* break maf into maxSize pieces */ int end = mcMaster->start + maxSize; struct mafAli *subMaf = mafSubset(maf, mcMaster->src, mcMaster->start, end); verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src, mcMaster->start, mcMaster->size); componentCount += processMaf(subMaf, componentHash, f, mf, fileName); mafAliFree(&subMaf); subMaf = mafSubset(maf, mcMaster->src, end, end + (mcMaster->size - maxSize)); mafAliFree(&maf); maf = subMaf; mcMaster = mafMaster(maf, mf, fileName); } if (mcMaster->size != 0) { /* remainder of maf after splitting off maxSize submafs */ componentCount += processMaf(maf, componentHash, f, mf, fileName); } mafAliFree(&maf); mafCount++; } mafFileFree(&mf); flushSummaryBlocks(componentHash, f); verbose(1, "Created %ld summary blocks from %ld components and %ld mafs from %s\n", summaryCount, componentCount, allMafCount, fileName); if (test) return; verbose(1, "Loading into %s table %s...\n", database, table); hgLoadTabFile(conn, ".", table, &f); verbose(1, "Loading complete"); hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", summaryCount, fileName); }
void xmfaToMaf(char *in, char *out) /* xmfaToMaf - Convert from xmfa to maf format. */ { int c; FILE *input = mustOpen(in, "r"); FILE *output = mustOpen(out, "w"); char* commentLine; struct dnaSeq* sequence; struct mafAli *ali; struct sqlConnection* conn = hAllocConn(); mafWriteStart(output, "mlagan"); AllocVar(ali); while(myFaReadMixedNext(input, TRUE, "default name", TRUE, &commentLine, &sequence)) { char srcName[128]; c = fgetc(input); if(c == '=' || c == '>') { /* add the current sequence and process the block if we've see an '='*/ char org[32]; char chrom[32]; int start; int stop; char strand; struct mafComp *comp; double score; char buffer[1024]; ungetc(c, input); AllocVar(comp); /* parse the comment line */ sscanf(commentLine, ">%s %[^:]:%d-%d %c", org, chrom, &start, &stop, &strand); /* build the name */ safef(srcName, sizeof(srcName), "%s.%s", optionVal(org, org), chrom); comp->src = cloneString(srcName); sqlSafef(buffer, 1024, "SELECT size FROM %s.chromInfo WHERE chrom = \"%s\"", optionVal(org, org), chrom); assert(sqlQuickQuery(conn, buffer, buffer, 1024) != 0); comp->srcSize = atoi(buffer); comp->strand = strand; start = start - 1; comp->start = start; comp->size = ungappedSize(sequence); if(strand == '-') comp->start = comp->srcSize - (comp->start + comp->size); comp->text = sequence->dna; sequence->dna = 0; slAddHead(&ali->components, comp); freeDnaSeq(&sequence); if(c == '=') { fscanf(input, "= score=%lf\n", &score); ali->score = score; slReverse(&ali->components); mafWrite(output, ali); mafAliFree(&ali); AllocVar(ali); } } } mafWriteEnd(output); }