int main(int argc, char *argv[]) /* read snpTable, output deletions that pass input filtering */ { struct slName *chromList = NULL; if (argc != 4) usage(); database = argv[1]; hSetDb(database); snpTable = argv[2]; if (!hTableExists(snpTable)) errAbort("no %s table\n", snpTable); exceptionsTable = argv[3]; if (!hTableExists(exceptionsTable)) errAbort("no %s table\n", exceptionsTable); verbose(1, "loading exceptions...\n"); getExceptions(); chromList = hAllChromNames(); outputFileHandle = mustOpen("deletions.tab", "w"); getDeletions(); carefulClose(&outputFileHandle); return 0; }
struct bed *hWholeTrackAsBedList(char *track)
/* Collect the beds of every chromosome of a track into a single list.
 * For each name returned by hAllChromNames() the beds are fetched with
 * hGetBedRange(track, name, 0, 0, NULL) and joined to the running result
 * with slCat(newBeds, result). */
{
struct slName *allChroms = hAllChromNames();
struct slName *cur = allChroms;
struct bed *wholeList = NULL;

while (cur != NULL)
    {
    struct bed *perChrom = hGetBedRange(track, cur->name, 0, 0, NULL);
    /* New beds go in as the first slCat argument, accumulated ones second. */
    wholeList = slCat(perChrom, wholeList);
    cur = cur->next;
    }
slFreeList(&allChroms);
return wholeList;
}
static void selectFromSplitTable(char *db, char *table, struct hTableInfo *tblInfo,
                                 struct sqlConnection *conn, FILE *outFh)
/* Select from a split table.  For every chromosome of db that inclChrom()
 * accepts, the per-chromosome table name "<chrom>_<table>" is built and
 * passed to selectFromTable() together with tblInfo, conn and outFh. */
{
struct slName *chroms = hAllChromNames(db);
struct slName *chrom;
char chromTable[256];

for (chrom = chroms; chrom != NULL; chrom = chrom->next)
    {
    if (inclChrom(chrom->name))
        {
        safef(chromTable, sizeof(chromTable), "%s_%s", chrom->name, table);
        selectFromTable(chromTable, tblInfo, conn, outFh);
        }
    }
/* Release the chromosome name list; it was previously leaked. */
slFreeList(&chroms);
}
void testIntersect(char *db, char *track1, char *track2)
/* testIntersect - Test some ideas on intersections.
 * Sets the current database to db, then calls intersectOnChrom() for
 * track1/track2 on each chromosome: only the one named by the "chrom"
 * option when that option exists, otherwise every name returned by
 * hAllChromNames(). */
{
struct slName *chromList = NULL, *chrom;
struct sqlConnection *conn;

hSetDb(db);
if (optionExists("chrom"))
    chromList = slNameNew(optionVal("chrom", NULL));
else
    chromList = hAllChromNames();

conn = hAllocConn();
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    intersectOnChrom(db, conn, chrom->name, track1, track2);
hFreeConn(&conn);

/* Release the chromosome list; it was previously leaked. */
slFreeList(&chromList);
}
void hgChroms(char *db)
/* hgChroms - print chromosomes for a genome.
 * Writes one name per line to stdout for each chromosome of db that
 * inclChrom() accepts.  When noPrefix is set, a leading "chr" is dropped
 * from the printed name. */
{
struct slName *chrom, *chroms = hAllChromNames(db);

for (chrom = chroms; chrom != NULL; chrom = chrom->next)
    {
    if (inclChrom(chrom))
        {
        if (noPrefix && startsWith("chr", chrom->name))
            printf("%s\n", chrom->name + strlen("chr"));
        else
            printf("%s\n", chrom->name);
        }
    }
/* Release the chromosome list; it was previously leaked. */
slFreeList(&chroms);
}
void chromKeeperInit(char *db)
/* Set up the chromKeeper for one database (hg15, mm2, etc).
 * Fills the chromNames and chromRanges arrays with one entry per
 * chromosome of db: a copy of the chromosome name and a binKeeper spanning
 * 0..hChromSize(db, name).  chromCount is set to the number of entries.
 * Asserts that neither array has been allocated yet. */
{
struct slName *chromList = hAllChromNames(db);
struct slName *el = chromList;
int ix;

chromCount = slCount(chromList);
assert(chromNames == NULL && chromRanges == NULL);
AllocArray(chromNames, chromCount);
AllocArray(chromRanges, chromCount);

for (ix = 0; el != NULL; el = el->next, ix++)
    {
    int chromSize = hChromSize(db, el->name);
    chromRanges[ix] = binKeeperNew(0, chromSize);
    chromNames[ix] = cloneString(el->name);
    }
slFreeList(&chromList);
}
void getChromNamesAndDirForDb(char *db)
/* Look up nibPath for db in the central dbDb table and set two globals:
 * dirName, built from nibPath with its last '/'-separated component
 * replaced by "mixedNib/", and chromNames, from hAllChromNames().
 * Aborts when dbDb has no nibPath for db. */
{
struct sqlConnection *conn = hConnectCentral();
char query[512];
char buff[512];
char *tmpMark = NULL;
int buffSize = 512;

sqlSafef(query, sizeof(query), "select nibPath from dbDb where name='%s'", db);
if (sqlQuickQuery(conn, query, buff, buffSize) == NULL)
    errAbort("Couldn't find nib dir for genome %s\n", db);

dirName = needMem(buffSize*sizeof(char));

/* Cut nibPath at its last '/', if there is one, before appending. */
tmpMark = strrchr(buff, '/');
if (tmpMark != NULL)
    *tmpMark = '\0';

/* safef rather than snprintf, so a path too long for dirName is reported
 * instead of being silently truncated into a wrong directory name. */
safef(dirName, buffSize, "%s/mixedNib/", buff);

chromNames = hAllChromNames();
hDisconnectCentral(&conn);
}
void bedCons(char *database, char *refAliTrack, char *bedTrack) /* bedCons - Look at conservation of a BED track vs. a refence * (nonredundant) alignment track. */ { struct slName *chromList, *chrom; struct stats *stats = NULL; struct hash *otherHash = makeOtherHash(database, "mouseChrom"); if (optionExists("chrom")) chromList = newSlName(optionVal("chrom", NULL)); else chromList = hAllChromNames(database); AllocVar(stats); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { uglyf("%s\n", chrom->name); oneChrom(database, chrom->name, refAliTrack, bedTrack, otherHash, stats); } printStats(stats); }
void eisenInput(char *database, char *outFile) /* eisenInput - Create input for Eisen-style cluster program. */ { struct slName *chromList = NULL, *chromEl; FILE *f = mustOpen(outFile, "w"); char *chrom; struct hash *refLinkHash = hashNew(0); struct refLink *refLinkList; struct hash *erHash = hashNew(0); struct expRecord *erList = NULL, *er; /* Load info good for all chromosomes. */ refLinkList = loadRefLink(database, refLinkHash); erList = loadExpRecord(expRecordTable, "hgFixed"); for (er = erList; er != NULL; er = er->next) { char sid[16]; snprintf(sid, sizeof(sid), "%u", er->id); hashAdd(erHash, sid, er); } /* Do it chromosome by chromosome. */ chromList = hAllChromNames(database); for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next) { chrom = chromEl->name; uglyf("%s\n", chrom); oneChromInput(database, chrom, hChromSize(database, chrom), "rnaCluster", expTrack, refLinkHash, erHash, f); } /* Cleanup time! */ expRecordFreeList(&erList); freeHash(&erHash); refLinkFreeList(&refLinkList); freeHash(&refLinkHash); }
void expToRna(char *database, char *rnaTable, char *expTable, char *outName) /* expToRna - Make a little two column table that associates * rnaClusters with expression info. */ { struct slName *chromList = NULL, *chromEl; char *chrom = optionVal("chrom", NULL); FILE *f = mustOpen(outName, "w"); if (chrom != NULL) chromList = newSlName(chrom); else chromList = hAllChromNames(database); for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next) { chrom = chromEl->name; uglyf("%s\n", chrom); doOneChrom(database, chrom, rnaTable, expTable, f); } printf("%d dupe, %d uniq, %d miss, %d total, %d hits\n", dupeCount, uniqCount, missCount, dupeCount + uniqCount + missCount, hitCount); }
void regionPicker(char *database, char *axtBestDir, char *output) /* regionPicker - Code to pick regions to annotate deeply.. */ { struct sqlConnection *conn = NULL; struct slName *allChroms = NULL, *chrom = NULL; struct region *regionList = NULL, *region; FILE *f = mustOpen(output, "w"); struct stats *stats; struct scoredWindow *winList = NULL; struct hash *chromLimitHash = NULL; AllocVar(stats); chromLimitHash = getChromLimits(database); /* Figure out which regions to process from command line. * By default will do whole genome. */ if (sameWord(clRegion, "genome")) { allChroms = hAllChromNames(database); for (chrom = allChroms; chrom != NULL; chrom = chrom->next) { if (!endsWith(chrom->name, "_random")) { AllocVar(region); region->name = cloneString(chrom->name); region->chrom = cloneString(chrom->name); region->start = 0; region->end = hChromSize(database, chrom->name); slAddHead(®ionList, region); } } slReverse(®ionList); } else if (startsWith("chr", clRegion) && strchr(clRegion, ':') == NULL) { AllocVar(region); region->name = cloneString(clRegion); region->chrom = cloneString(clRegion); region->start = 0; region->end = hChromSize(database, clRegion); slAddHead(®ionList, region); } else { regionList = loadRegionFile(database, clRegion); } /* Gather statistics one region at a time and then * print them. */ conn = hAllocConn(database); for (region = regionList; region != NULL; region = region->next) { printf("Processing %s %s:%d-%d\n", region->name, region->chrom, region->start, region->end); statsOnSpan(database, conn, region, axtBestDir, stats, f, &winList); } fprintf(f, "\n"); reportStats(stats, f); fprintf(f, "\n"); uglyf("Got %d windows with no gaps\n", slCount(winList)); countChromWindows(database, winList, f); outputPicks(winList, database, chromLimitHash, stats, f); }
void snpValid() /* Test snpMap --> dbSnpRs/affy for one assembly. */ { char *Org; char *dbSnpTbl = NULL; struct dbSnpRs *dbSnps = NULL; struct dbSnpRs *dbSnp = NULL; struct affy10KDetails *affy10s = NULL; struct affy10KDetails *affy10 = NULL; struct affy120KDetails *affy120s = NULL; struct affy120KDetails *affy120 = NULL; struct axtScoreScheme *simpleDnaScheme = NULL; int match = 0; /* good match of minimal acceptable quality */ int mismatch = 0; /* unacceptable match quality */ int missing = 0; /* unable to find rsId in dbSnpRs/affy */ int goodrc = 0; /* matches after reverse-complement */ int assemblyDash = 0; /* assembly context is just a single dash - (complex cases) */ int gapNib = 0; /* nib returns n's, we are in the gap */ int totalMatch = 0; int totalMismatch = 0; int totalMissing = 0; int totalGoodrc = 0; int totalAssemblyDash = 0; int totalGapNib = 0; boolean affy = FALSE; int mode = 3; void *next = NULL; char *id = NULL; char *seq = NULL; char affy120id[12]; int matchScore = 100; int misMatchScore = 100; int gapOpenPenalty = 400; int gapExtendPenalty = 50; int noDna = 0; int snpMapRows = 0; /* controls whether affy120k, affy10k, or dbSnpRs is used currently affys are human only */ if (!hDbIsActive(db)) { printf("Currently no support for db %s\n", db); return; } hSetDb(db); Org = hOrganism(db); if (sameWord(Org,"Human")) affy = TRUE; if (sameWord(Org,"Human")) dbSnpTbl = "dbSnpRsHg"; else if (sameWord(Org,"Mouse")) dbSnpTbl = "dbSnpRsMm"; else if (sameWord(Org,"Rat")) dbSnpTbl = "dbSnpRsRn"; else { printf("Currently no support for Org %s\n", Org); return; } simpleDnaScheme = axtScoreSchemeSimpleDna(matchScore, misMatchScore, gapOpenPenalty, gapExtendPenalty); uglyf("dbSnp Table=%s \n",dbSnpTbl); uglyf("Affy=%s \n", affy ? 
"TRUE" : "FALSE" ); dbSnps = readDbSnps(dbSnpTbl); printf("read hgFixed.%s \n",dbSnpTbl); if (affy) { affy10s = readAffy10(); printf("read hgFixed.affy10KDetails \n"); affy120s = readAffy120(); printf("read hgFixed.affy120KDetails \n"); } int bogus = 0; // debug if (0) { printf("rsId assembly-sequence \n"); printf("---------------------------------------------- \n"); for (dbSnp = dbSnps; dbSnp != NULL; dbSnp = dbSnp->next) { printf("%s %s \n", dbSnp->rsId, dbSnp->assembly ); // debug: cut it short for testing only if (++bogus > 1) break; } printf("\n"); printf("\n"); } bogus=0; struct slName *cns = hAllChromNames(); struct slName *cn=NULL; if (!cns) { printf("testDb: hAllChromNames returned empty list \n"); return; } if (affy) { mode=1; /* start on affy120 with numbers in snpMap.rsId */ } else { mode=2; /* start on dbSnps with "rs*" in snpMap.rsId */ } for (cn = cns; cn != NULL; cn = cn->next) { struct dnaSeq *chromSeq = NULL; struct snpMap *snps = NULL; struct snpMap *snp = NULL; if (chr != NULL) if (!sameWord(chr,cn->name)) continue; //uglyf("testDb: beginning chrom %s \n",cn->name); chromSeq = hLoadChrom(cn->name); printf("testDb: chrom %s : size (%u) \n",cn->name,chromSeq->size); snps = readSnps(cn->name); printf("read %s.snpMap where chrom=%s \n",db,cn->name); dbSnp = dbSnps; affy10 = affy10s; affy120 = affy120s; printf("=========================================================\n"); for (snp = snps; snp != NULL; snp = snp->next) { int cmp = -1; char *nibDna=NULL; char *nibDnaRc=NULL; ++snpMapRows; /* printf("%s %s %u %u %s\n", snp->name, snp->chrom, snp->chromStart, snp->chromEnd, nibDna ); */ while (cmp < 0) { while (cmp < 0) { switch (mode) { case 1: next = affy120; break; case 2: next = dbSnp; break; case 3: next = affy10; break; } if (next == NULL) { switch (mode) { case 1: ++mode; break; case 2: ++mode; break; case 3: cmp = 1; break; } } else { break; } } if (cmp < 0) { switch (mode) { case 1: safef(affy120id, sizeof(affy120id), "%d", affy120->affyId); /* 
have int type but want string */ id = affy120id; break; case 2: id = dbSnp->rsId; break; case 3: id = affy10->affyId; break; } cmp=mystrcmp(id, snp->name); } if (cmp < 0) { switch (mode) { case 1: affy120 = affy120->next; break; case 2: dbSnp = dbSnp->next; break; case 3: affy10 = affy10->next; break; } } } if (cmp==0) { int strand=1; char *rc = NULL; int m = 0; int lf = 0; /* size of left flank context (lower case dna) */ int rf = 0; /* size of right flank context (lower case dna) */ int ls = 0; /* total size of assembly dna context plus actual region in dbSnpRs/affy */ char *origSeq = NULL; /* use to display the original dnSnpRs.assembly seq */ switch (mode) { case 1: seq = affy120->sequenceA; break; case 2: seq = dbSnp->assembly; break; case 3: seq = affy10->sequenceA; break; } if (sameString(seq,"-")) { ++assemblyDash; if (Verbose) printf("(no assembly context) rsId=%s chrom=%s %u %u \n assembly=%s \n\n", id, snp->chrom, snp->chromStart, snp->chromEnd, seq ); continue; } origSeq = seq; lf = leftFlank(origSeq); rf = rightFlank(origSeq); seq = cloneString(origSeq); stripDashes(seq); /* remove dashes indicating insert to simplify and correct processing of nib data */ ls = strlen(seq); /* used to be: lengthOneDash(seq); */ //debug //uglyf("about to call checkandFetchNib origSeq=%s lf=%d, rf=%d ls=%d \n", origSeq, lf, rf, ls); nibDna = checkAndFetchNib(chromSeq, snp, lf, ls); if (nibDna==NULL) { ++noDna; printf("no dna for %s %s %u %u \n", snp->name, snp->chrom, snp->chromStart, snp->chromEnd ); continue; } //debug //uglyf("got past checkandFetchNib call: \n nibDna=%s \n",nibDna); if (allNs(nibDna)) { ++gapNib; ++mismatch; if (Verbose) printf("(nib gap) rsId=%s chrom=%s %u %u \n assembly=%s \n snpMap=%s \n\n", id, snp->chrom, snp->chromStart, snp->chromEnd, seq, nibDna ); continue; } m = misses(seq,nibDna); if (m > 1) { //debug //uglyf("rc: about to call checkandFetchNib \n"); rc = checkAndFetchNib(chromSeq, snp, rf, ls); if (rc==NULL) { ++noDna; printf("no dna for 
%s %s %u %u \n", snp->name, snp->chrom, snp->chromStart, snp->chromEnd ); continue; } //debug //uglyf("rc: got past checkandFetchNib call: \n rc Dna=%s \n",rc); reverseComplement(rc,strlen(rc)); int n = misses(seq, rc); if (n < m) { strand=-1; m = n; } } if (m <= 1) { ++match; if (strand < 1) ++goodrc; } else { struct dnaSeq query, target; struct axt *axtAln = NULL; int bestScore = 0; ZeroVar(&query); query.dna = seq; query.size = strlen(query.dna); ZeroVar(&target); target.dna = nibDna; target.size = strlen(target.dna); axtAln = axtAffine(&query, &target, simpleDnaScheme); strand = 1; if (axtAln) { bestScore = axtAln->score / ls; } axtFree(&axtAln); if (bestScore < threshold) { ZeroVar(&target); target.dna = rc; target.size = strlen(target.dna); axtAln = axtAffine(&query, &target, simpleDnaScheme); if ((axtAln) && (bestScore < (axtAln->score / ls))) { strand = -1; bestScore = axtAln->score / ls; } axtFree(&axtAln); } if (bestScore >= threshold) { ++match; if (strand < 1) ++goodrc; } else { ++mismatch; } if ((bestScore < threshold) || Verbose) { printf( "score=%d misses=%u strand=%d rsId=%s chrom=%s %u %u lf=%d ls=%d \n" " assembly=%s \n" " snpMap=%s \n" "rc snpMap=%s \n" "\n", bestScore, m, strand, id, snp->chrom, snp->chromStart, snp->chromEnd, lf, ls, seq, nibDna, rc ); } } freez(&rc); freez(&seq); } else { char snpLkup[10] = ""; /* this id is missing from dbSnpRs/affy! 
*/ ++missing; switch (mode) { case 1: safef(snpLkup,sizeof(snpLkup),"%s","affy120"); break; case 2: safef(snpLkup,sizeof(snpLkup),"%s",dbSnpTbl); break; case 3: safef(snpLkup,sizeof(snpLkup),"%s","affy10"); break; } if (Verbose) printf("snpMap.name=%s is missing from %s (now at %s) \n\n",snp->name,snpLkup,id); } freez(&nibDna); // debug: cut it short for testing only //break; } snpMapFreeList(&snps); dnaSeqFree(&chromSeq); printf("\n\n\n Total matches for chrom %s:\n ",cn->name); printf(" matches: %u \n ",match); printf(" mismatches: %u \n",mismatch); printf("missing from dbSnpRs: %u \n",missing); printf(" rev compl matches: %u \n",goodrc); printf(" assembly = -: %u \n",assemblyDash); printf(" nib in gap : %u \n",gapNib); printf("\n\n=========================================\n"); totalMatch += match; totalMismatch += mismatch; totalMissing += missing; totalGoodrc += goodrc; totalAssemblyDash += assemblyDash; totalGapNib += gapNib; match = 0; mismatch = 0; missing = 0; goodrc = 0; assemblyDash = 0; gapNib = 0; // debug: cut it to just one or two chrom for testing //if (++bogus > 1) // break; printf("\n"); printf("\n"); } slFreeList(&cns); dbSnpRsFreeList(&dbSnps); if (affy) { affy10KDetailsFreeList(&affy10s); affy120KDetailsFreeList(&affy120s); } axtScoreSchemeFree(&simpleDnaScheme); printf("\n\n\n Grand Totals: \n "); printf(" matches: %u \n ",totalMatch); printf(" mismatches: %u \n",totalMismatch); printf("missing from dbSnpRs: %u \n",totalMissing); printf(" rev compl matches: %u \n",totalGoodrc); printf(" assembly = -: %u \n",totalAssemblyDash); printf(" nib in gap : %u \n",totalGapNib); printf("\n Total rows in snpMap: %u \n ",snpMapRows); printf("\n # no dna found for : %u \n ",noDna); printf("\n\n=========================================\n"); }
int checkTableCoords(char *db) /* Check several invariants (see comments in check*() above), * summarize errors, return nonzero if there are errors. */ { struct sqlConnection *conn = hAllocConn(db); struct slName *tableList = NULL, *curTable = NULL; struct slName *allChroms = NULL; boolean gotError = FALSE; allChroms = hAllChromNames(db); if (theTable == NULL) tableList = getTableNames(conn); else if (sqlTableExists(conn, theTable)) tableList = newSlName(theTable); else errAbort("Error: specified table \"%s\" does not exist in database %s.", theTable, db); for (curTable = tableList; curTable != NULL; curTable = curTable->next) { struct hTableInfo *hti = NULL; struct slName *chromList = NULL, *chromPtr = NULL; char *table = curTable->name; char tableChrom[32], trackName[128], tableChromPrefix[33]; hParseTableName(db, table, trackName, tableChrom); hti = hFindTableInfo(db, tableChrom, trackName); if (hti != NULL && hti->isPos) { /* watch out for presence of both split and non-split tables; * hti for non-split will be replaced with hti of split. */ if (splitAndNonSplitExist(conn, table, tableChrom)) continue; safef(tableChromPrefix, sizeof(tableChromPrefix), "%s_", tableChrom); if (hti->isSplit) chromList = newSlName(tableChrom); else chromList = allChroms; /* invariant: chrom must be described in chromInfo. */ /* items with bad chrom will be invisible to hGetBedRange(), so * catch them here by SQL query. 
*/ /* The SQL query is too huge for scaffold-based db's, check count: */ if (hChromCount(db) <= MAX_SEQS_SUPPORTED) { if (isNotEmpty(hti->chromField)) { struct dyString *bigQuery = newDyString(1024); dyStringClear(bigQuery); sqlDyStringPrintf(bigQuery, "select count(*) from %s where ", table); for (chromPtr=chromList; chromPtr != NULL; chromPtr=chromPtr->next) { sqlDyStringPrintf(bigQuery, "%s != '%s' ", hti->chromField, chromPtr->name); if (chromPtr->next != NULL) dyStringAppend(bigQuery, "AND "); } gotError |= reportErrors(BAD_CHROM, table, sqlQuickNum(conn, bigQuery->string)); dyStringFree(&bigQuery); } for (chromPtr=chromList; chromPtr != NULL; chromPtr=chromPtr->next) { char *chrom = chromPtr->name; struct bed *bedList = hGetBedRange(db, table, chrom, 0, 0, NULL); if (hti->isSplit && isNotEmpty(hti->chromField)) gotError |= checkSplitTableOnlyChrom(bedList, table, hti, tableChrom); gotError |= checkStartEnd(bedList, table, hti, testChromSize(chrom)); if (hti->hasCDS) gotError |= checkCDSStartEnd(bedList, table, hti); if (hti->hasBlocks && !ignoreBlocks) gotError |= checkBlocks(bedList, table, hti); bedFreeList(&bedList); } } } } return gotError; }