/* Get the maf alignments for a particular mafFrame.
 *
 * Loads every alignment in mafTable that mafLoadInRegion returns for
 * frame->chrom:chromStart-chromEnd.  Any alignment for which mafNeedSubset
 * reports true is replaced by its mafSubset over the frame's range, keyed on
 * the source name of the alignment's first component.  Alignments whose
 * subset comes back NULL are dropped.
 *
 * Returns the resulting list (built by head insertion, then reversed).
 * Asserts that the first-component sizes of the returned alignments add up
 * to exactly chromEnd - chromStart. */
struct mafAli *getAliForFrame(char *mafTable, struct mafFrames *frame)
{
struct sqlConnection *conn = hAllocConn();
struct mafAli *aliAll = mafLoadInRegion(conn, mafTable, frame->chrom,
    frame->chromStart, frame->chromEnd);
struct mafAli *ali;
struct mafAli *list = NULL;
struct mafAli *nextAli;

for (ali = aliAll; ali; ali = nextAli)
    {
    /* Detach this element from the loaded list before it is re-homed. */
    nextAli = ali->next;
    ali->next = NULL;

    char *masterSrc = ali->components->src;
    struct mafAli *subAli = NULL;

    if (mafNeedSubset(ali, masterSrc, frame->chromStart, frame->chromEnd))
        {
        subAli = mafSubset(ali, masterSrc, frame->chromStart, frame->chromEnd);
        if (subAli == NULL)
            {
            /* Nothing survives the trim.  The alignment has already been
             * unlinked from aliAll, so it must be released here or it is
             * leaked. */
            mafAliFree(&ali);
            continue;
            }
        }

    if (subAli)
        {
        /* Keep the trimmed copy and drop the original. */
        slAddHead(&list, subAli);
        mafAliFree(&ali);
        }
    else
        slAddHead(&list, ali);
    }
slReverse(&list);

/* Sanity check: the kept alignments must tile the frame exactly. */
int size = 0;
for (ali = list; ali; ali = ali->next)
    {
    size += ali->components->size;
    }
assert(size == frame->chromEnd - frame->chromStart);

hFreeConn(&conn);
return list;
}
/* Get the maf alignments covering chrom:start-end in mafTable of database.
 *
 * Loads every alignment mafLoadInRegion returns for the range.  Any
 * alignment for which mafNeedSubset reports true is replaced by its
 * mafSubset over [start, end), keyed on the source name of the alignment's
 * first component.  Alignments whose subset comes back NULL are dropped.
 * The resulting list (head-inserted, then reversed) is passed through
 * padOutAli and whatever that returns is handed back to the caller. */
static struct mafAli *getAliForRange(char *database, char *mafTable,
    char *chrom, int start, int end)
{
struct sqlConnection *conn = hAllocConn(database);
struct mafAli *aliAll = mafLoadInRegion(conn, mafTable, chrom, start, end);
struct mafAli *ali;
struct mafAli *list = NULL;
struct mafAli *nextAli;

/* The connection is not used again after the load. */
hFreeConn(&conn);

for (ali = aliAll; ali; ali = nextAli)
    {
    /* Detach this element from the loaded list before it is re-homed. */
    nextAli = ali->next;
    ali->next = NULL;

    char *masterSrc = ali->components->src;
    struct mafAli *subAli = NULL;

    if (mafNeedSubset(ali, masterSrc, start, end))
        {
        subAli = mafSubset(ali, masterSrc, start, end);
        if (subAli == NULL)
            {
            /* Nothing survives the trim.  The alignment has already been
             * unlinked from aliAll, so it must be released here or it is
             * leaked. */
            mafAliFree(&ali);
            continue;
            }
        }

    if (subAli)
        {
        /* Keep the trimmed copy and drop the original. */
        slAddHead(&list, subAli);
        mafAliFree(&ali);
        }
    else
        slAddHead(&list, ali);
    }
slReverse(&list);

list = padOutAli(list, database, chrom, start, end);

return list;
}
void extractMafs(char *file, FILE *f, struct hash *regionHash) /* extract MAFs in a file from regions specified in hash */ { char *chrom = NULL; struct bed *bed = NULL; struct mafFile *mf = mafOpen(file); struct mafAli *maf = NULL; struct mafComp *mc; char path[256]; verbose(1, "extracting from %s\n", file); maf = mafNext(mf); while (maf) { mc = maf->components; if (!chrom || differentString(chrom, chromFromSrc(mc->src))) chrom = cloneString(chromFromSrc(mc->src)); /* new chrom */ bed = (struct bed *)hashFindVal(regionHash, chrom); if (!bed) { /* no regions on this chrom -- skip to next chrom */ do mafAliFree(&maf); while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom)); continue; // start over with this maf } verbose(2, "region: %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd); if (outDir) { if (f) endOutFile(f); safef(path, sizeof (path), "%s/%s.maf", dir, bed->name); f = startOutFile(path); } /* skip mafs before region, stopping if chrom changes */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (mc->start + mc->size) <= bed->chromStart) { mafAliFree(&maf); maf = mafNext(mf); } /* extract all mafs and pieces of mafs in region */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start)) { int mafStart = mc->start; int mafEnd = mc->start + mc->size; struct mafAli *full = maf; if (mafStart < bed->chromStart || mafEnd > bed->chromEnd) { full = maf; maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps); mc = maf->components; } verbose(2, " %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size); mafWrite(f, maf); struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1) ? 
mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf); if (maf != full) mafAliFree(&maf); mafAliFree(&full); maf = nextMaf; } /* get next region */ hashRemove(regionHash, bed->chrom); if (bed->next) hashAdd(regionHash, bed->chrom, bed->next); } mafFileFree(&mf); }
void hgLoadMafSummary(char *db, char *table, char *fileName) /* hgLoadMafSummary - Load a summary table of pairs in a maf into a database. */ { long mafCount = 0, allMafCount = 0; struct mafComp *mcMaster = NULL; struct mafAli *maf; struct mafFile *mf = mafOpen(fileName); struct sqlConnection *conn; FILE *f = hgCreateTabFile(".", table); long componentCount = 0; struct hash *componentHash = newHash(0); if (!test) { conn = sqlConnect(database); mafSummaryTableCreate(conn, table, hGetMinIndexLength(db)); } verbose(1, "Indexing and tabulating %s\n", fileName); /* process mafs */ while ((maf = mafNext(mf)) != NULL) { mcMaster = mafMaster(maf, mf, fileName); allMafCount++; if (mcMaster->srcSize < minSeqSize) continue; while (mcMaster->size > maxSize) { /* break maf into maxSize pieces */ int end = mcMaster->start + maxSize; struct mafAli *subMaf = mafSubset(maf, mcMaster->src, mcMaster->start, end); verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src, mcMaster->start, mcMaster->size); componentCount += processMaf(subMaf, componentHash, f, mf, fileName); mafAliFree(&subMaf); subMaf = mafSubset(maf, mcMaster->src, end, end + (mcMaster->size - maxSize)); mafAliFree(&maf); maf = subMaf; mcMaster = mafMaster(maf, mf, fileName); } if (mcMaster->size != 0) { /* remainder of maf after splitting off maxSize submafs */ componentCount += processMaf(maf, componentHash, f, mf, fileName); } mafAliFree(&maf); mafCount++; } mafFileFree(&mf); flushSummaryBlocks(componentHash, f); verbose(1, "Created %ld summary blocks from %ld components and %ld mafs from %s\n", summaryCount, componentCount, allMafCount, fileName); if (test) return; verbose(1, "Loading into %s table %s...\n", database, table); hgLoadTabFile(conn, ".", table, &f); verbose(1, "Loading complete"); hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", summaryCount, fileName); }
struct mafAli *hgMafFrag(
	char *database,     /* Database, must already have hSetDb to this */
	char *track,        /* Name of MAF track */
	char *chrom,        /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand,        /* Chromosome strand. */
	char *outName,      /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* mafFrag - Extract maf sequences for a region from database.
 * This creates a somewhat unusual MAF that extends from start
 * to end whether or not there are actually alignments.  Where
 * there are no alignments (or alignments missing a species)
 * a . character fills in.  The score is always zero, and
 * the sources just indicate the species.  You can mafFree this
 * as normal.
 *
 * Aborts (errAbort) if a component's organism/database prefix does not
 * fit in the 128-byte name buffer, or if orderList is given and an
 * alignment contains an organism that is not in it.
 *
 * NOTE(review): the sequence held in 'native' is not released anywhere in
 * this function -- confirm whether the caller of hChromSeq is expected to
 * free it. */
{
int chromSize = hChromSize(database, chrom);
struct sqlConnection *conn = hAllocConn(database);
struct dnaSeq *native = hChromSeq(database, chrom, start, end);
struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end);
char masterSrc[128];
struct hash *orgHash = newHash(10);
struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL;
int curPos = start, symCount = 0;   /* curPos: next chrom position still to be covered;
                                     * symCount: alignment columns emitted so far */
struct slName *name;
int order = 0;

/* Check that the mafs are really copacetic, the particular
 * subtype we think is in the database that this (relatively)
 * simple code can handle. */
safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom);
mafCheckFirstComponentSrc(mafList, masterSrc);
mafCheckFirstComponentStrand(mafList, '+');
slSort(&mafList, mafCmp);

/* Prebuild organisms if possible from input orderList.  The first name
 * in orderList becomes nativeOrg. */
for (name = orderList; name != NULL; name = name->next)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, name->name, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    org->order = order++;
    if (nativeOrg == NULL)
        nativeOrg = org;
    }
/* With no orderList, seed the list with a single org named after the
 * database; it becomes nativeOrg. */
if (orderList == NULL)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, database, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    if (nativeOrg == NULL)
        nativeOrg = org;
    }

/* Go through all mafs in window, mostly building up
 * org->dy strings. */
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    struct mafComp *mc, *mcMaster = maf->components;
    struct mafAli *subMaf = NULL;
    order = 0;
    /* Gap between what has been covered so far and the start of this maf. */
    if (curPos < mcMaster->start)
	{
	fillInMissing(nativeOrg, orgList, native, start,
		curPos, mcMaster->start);
	symCount += mcMaster->start - curPos;
	}
    if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst
                                                    * backtracking */
	{
	if (mafNeedSubset(maf, masterSrc, curPos, end))
	    {
	    subMaf = mafSubset(maf, masterSrc, curPos, end);
	    /* No usable subset: move on to the next maf; curPos is left
	     * where it was. */
	    if (subMaf == NULL)
	        continue;
	    }
	else
	    subMaf = maf;
	for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order)
	    {
	    /* Extract name up to dot into 'orgName' */
	    char buf[128], *e, *orgName;

	    if ((mc->size == 0) || (mc->srcSize == 0))
	        /* skip over components without sequence */
	        continue;

	    mc->leftStatus = mc->rightStatus = 0; /* squash annotation */

	    e = strchr(mc->src, '.');
	    if (e == NULL)
		orgName = mc->src;
	    else
		{
		int len = e - mc->src;
		if (len >= sizeof(buf))
		    errAbort("organism/database name %s too long", mc->src);
		memcpy(buf, mc->src, len);
		buf[len] = 0;
		orgName = buf;
		}

	    /* Look up dyString corresponding to org, and create a
	     * new one if necessary. */
	    org = hashFindVal(orgHash, orgName);
	    if (org == NULL)
		{
		if (orderList != NULL)
		    errAbort("%s is not in orderList", orgName);
		AllocVar(org);
		slAddHead(&orgList, org);
		hashAddSaveName(orgHash, orgName, org, &org->name);
		org->dy = dyStringNew(native->size*1.5);
		/* A newly seen org is back-filled with '.' for every column
		 * already emitted. */
		dyStringAppendMultiC(org->dy, '.', symCount);
		if (nativeOrg == NULL)
		    nativeOrg = org;
		}
	    /* Without an orderList, remember the largest component index
	     * at which this org has appeared. */
	    if (orderList == NULL && order > org->order)
		org->order = order;
	    org->hit = TRUE;

	    /* Fill it up with alignment. */
	    dyStringAppendN(org->dy, mc->text, subMaf->textSize);
	    }
	/* Orgs absent from this maf get '.' for each of its columns; the
	 * hit flags are reset for the next maf. */
	for (org = orgList; org != NULL; org = org->next)
	    {
	    if (!org->hit)
		dyStringAppendMultiC(org->dy, '.', subMaf->textSize);
	    org->hit = FALSE;
	    }
	symCount += subMaf->textSize;
	/* Advance using the master component of the maf as loaded, not of
	 * the subset. */
	curPos = mcMaster->start + mcMaster->size;
	if (subMaf != maf)
	    mafAliFree(&subMaf);
	}
    }
/* Trailing stretch after the last maf. */
if (curPos < end)
    {
    fillInMissing(nativeOrg, orgList, native, start, curPos, end);
    symCount += end - curPos;
    }
mafAliFreeList(&mafList);

slSort(&orgList, oneOrgCmp);
if (strand == '-')
    {
    for (org = orgList; org != NULL; org = org->next)
	reverseComplement(org->dy->string, org->dy->stringSize);
    }

/* Construct our maf */
AllocVar(maf);
maf->textSize = symCount;
for (org = orgList; org != NULL; org = org->next)
    {
    struct mafComp *mc;
    AllocVar(mc);
    /* The first org after sorting supplies the first component: either a
     * caller-named one spanning just the fetched sequence, or the
     * database.chrom source with real chromosome coordinates. */
    if (org == orgList)
        {
	if (outName != NULL)
	    {
	    mc->src = cloneString(outName);
	    mc->srcSize = native->size;
	    mc->strand = '+';
	    mc->start = 0;
	    mc->size = native->size;
	    }
	else
	    {
	    mc->src = cloneString(masterSrc);
	    mc->srcSize = chromSize;
	    mc->strand = strand;
	    /* start and end are passed by address here, so the call may
	     * rewrite them in place for the '-' strand. */
	    if (strand == '-')
	       reverseIntRange(&start, &end, chromSize);
	    mc->start = start;
	    mc->size = end-start;
	    }
	}
    else
        {
	/* Every other org gets a component sized by countAlpha of its
	 * accumulated text, starting at 0. */
	int size = countAlpha(org->dy->string);
	mc->src = cloneString(org->name);
	mc->srcSize = size;
	mc->strand = '+';
	mc->start = 0;
	mc->size = size;
	}
    mc->text = cloneString(org->dy->string);
    dyStringFree(&org->dy);
    slAddHead(&maf->components, mc);
    }
/* Components were head-inserted above. */
slReverse(&maf->components);

slFreeList(&orgList);
freeHash(&orgHash);
hFreeConn(&conn);
return maf;
}