void txCdsGoodBed(char *database, char *outBed, char *outCds) /* txCdsGoodBed - Create positive example training set for SVM. This is based on * the refSeq reviewed genes, but we fragment a certain percentage of them so as * not to end up with a SVM that *requires* a complete transcript. */ { struct sqlConnection *conn = sqlConnect(database); char *refTrack = "refGene"; char *statusTable = "refSeqStatus"; if (!sqlTableExists(conn, refTrack)) errAbort("table %s doesn't exist in %s", refTrack, database); if (!sqlTableExists(conn, statusTable)) errAbort("table %s doesn't exist in %s", statusTable, database); FILE *fBed = mustOpen(outBed, "w"); FILE *fCds = mustOpen(outCds, "w"); char *query = "NOSQLINJ select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds " "from refGene r,refSeqStatus s where r.name=s.mrnaAcc and s.status='Reviewed'"; struct sqlResult *sr = sqlGetResult(conn, query); char **row; double randScale = 1.0/RAND_MAX; int id = 0; while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row); int start = gp->txStart, end = gp->txEnd; char *type = "refReviewed"; if (rand()*randScale < frag) { double midRatio = rand()*randScale; if (midRatio > 0.5) gpFragLimits(gp, 0, midRatio, &start, &end); else gpFragLimits(gp, midRatio, 1.0, &start, &end); type = "refFrag"; } gpPartOutAsBed(gp, start, end, fBed, type, ++id, 0); gpPartOutAsCds(gp, start, end, fCds, type, id); } carefulClose(&fBed); }
void txCdsBadBed(char *database, char *altSpliceBed, char *outBed) /* txCdsBadBed - Create a bed file with regions that don't really have CDS, * but that might look like it.. */ { /* Open up database and make sure all the tables we want are there. */ char *refTrack = "refGene"; char *vegaPseudo = "vegaPseudoGene"; char *retroPseudo = "retroMrnaInfo"; struct sqlConnection *conn = sqlConnect(database); if (!sqlTableExists(conn, refTrack)) errAbort("table %s doesn't exist in %s", refTrack, database); if (!sqlTableExists(conn, vegaPseudo)) errAbort("table %s doesn't exist in %s", vegaPseudo, database); if (!sqlTableExists(conn, retroPseudo)) errAbort("table %s doesn't exist in %s", retroPseudo, database); /* Read in alt file and output larger retained and bleeding introns. */ struct bed *bed, *intronyList = loadRetainedAndBleeding(altSpliceBed); FILE *f = mustOpen(outBed, "w"); for (bed = intronyList; bed != NULL; bed = bed->next) { int size = bed->chromEnd - bed->chromStart; if (size > 400) { fprintf(f, "%s\t%d\t%d\t", bed->chrom, bed->chromStart, bed->chromEnd); fprintf(f, "%s%d\t", bed->name, ++id); fprintf(f, "%d\t%s\t", bed->score, bed->strand); fprintf(f, "0\t0\t0\t1\t"); fprintf(f, "%d,\t%d,\n", bed->chromEnd - bed->chromStart, 0); } } /* Read in refGene, and write out larger 3' UTRs, and occassional antisense copies. */ char query[512]; safef(query, sizeof(query), "select * from %s", refTrack); int rowOffset = 0; if (sqlFieldIndex(conn, refTrack, "bin") == 0) rowOffset = 1; struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row + rowOffset); int start, end; if (gp->strand[0] == '+') { start = gp->cdsEnd; end = gp->txEnd; } else { start = gp->txStart; end = gp->cdsStart; } if (end - start > 400) { gpPartOutAsBed(gp, start, end, f, "utr", ++id, 400); } if (rand()%20 == 0) { gp->strand[0] = (gp->strand[0] == '+' ? '-' : '+'); gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "anti", ++id, 0); } } sqlFreeResult(&sr); /* Write out vega pseudo-genes. */ safef(query, sizeof(query), "select * from %s", vegaPseudo); rowOffset = 0; if (sqlFieldIndex(conn, vegaPseudo, "bin") == 0) rowOffset = 1; sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row + rowOffset); gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "vega", ++id, 0); } /* Write out retroGenes. */ safef(query, sizeof(query), "select * from %s where score > 600", retroPseudo); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct bed *bed = bedLoad12(row); char name[128]; safef(name, sizeof(name), "retro_%d_%s", ++id, bed->name); bed->name = name; bedTabOutN(bed, 12, f); } carefulClose(&f); }