Esempio n. 1
0
void txCdsGoodBed(char *database, char *outBed, char *outCds)
/* txCdsGoodBed - Create positive example training set for SVM. This is based on
 * the refSeq reviewed genes, but we fragment a certain percentage of them so as
 * not to end up with a SVM that *requires* a complete transcript.
 *   database - database to connect to.  We errAbort unless it has both the
 *              refGene and refSeqStatus tables.
 *   outBed   - file to create; written via gpPartOutAsBed, once per gene.
 *   outCds   - file to create; written via gpPartOutAsCds, once per gene.
 * The fraction of genes that get fragmented is taken from frag, which is
 * defined outside of this function. */
{
struct sqlConnection *conn = sqlConnect(database);
char *refTrack = "refGene";
char *statusTable = "refSeqStatus";

/* Fail early with a readable message rather than on the join below. */
if (!sqlTableExists(conn, refTrack))
    errAbort("table %s doesn't exist in %s", refTrack, database);
if (!sqlTableExists(conn, statusTable))
    errAbort("table %s doesn't exist in %s", statusTable, database);

FILE *fBed = mustOpen(outBed, "w");
FILE *fCds = mustOpen(outCds, "w");

/* Columns are listed explicitly (no select *) so that the row can be handed
 * straight to genePredLoad without having to skip a leading bin column. */
char *query =
   "NOSQLINJ select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds "
   "from refGene r,refSeqStatus s where r.name=s.mrnaAcc and s.status='Reviewed'";
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
double randScale = 1.0/RAND_MAX;	/* Maps rand() output onto 0.0 - 1.0 */
int id = 0;				/* Running counter, shared by the bed and cds record. */
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *gp = genePredLoad(row);
    int start = gp->txStart, end = gp->txEnd;	/* Default: whole transcript. */
    char *type = "refReviewed";
    if (rand()*randScale < frag)
        {
	/* Fragment this one.  A single random ratio decides both which side
	 * is kept and how much: above 0.5 we ask gpFragLimits for the range
	 * 0..midRatio, otherwise for midRatio..1.0.
	 * NOTE(review): presumably these are fractions of the transcript -
	 * confirm against gpFragLimits. */
	double midRatio = rand()*randScale;
	if (midRatio > 0.5)
	     gpFragLimits(gp, 0, midRatio, &start, &end);
	else
	     gpFragLimits(gp, midRatio, 1.0, &start, &end);
	type = "refFrag";
	}
    /* id is bumped once per gene, so both outputs carry the same number. */
    gpPartOutAsBed(gp, start, end, fBed, type, ++id, 0);
    gpPartOutAsCds(gp, start, end, fCds, type, id);
    }

/* NOTE(review): sr, conn and the per-row gp are not released in this
 * function - confirm that the caller exits right after, or add cleanup. */

/* Close both outputs; each was opened above and needs its own close. */
carefulClose(&fBed);
carefulClose(&fCds);
}
void txCdsBadBed(char *database, 
	char *altSpliceBed, char *outBed)
/* txCdsBadBed - Create a bed file with regions that don't really have CDS, 
 * but that might look like it.. 
 *   database     - database to connect to.  We errAbort unless it has the
 *                  refGene, vegaPseudoGene and retroMrnaInfo tables.
 *   altSpliceBed - input handed to loadRetainedAndBleeding.
 *   outBed       - file to create; receives all four kinds of records below.
 * Records are written in this order: retained/bleeding introns, refGene 3' UTRs
 * with occasional antisense copies, vega pseudo-genes, then retroGenes.
 * NOTE(review): id is not declared in this function, so it has to come from
 * file scope - confirm where it is defined and initialized. */
{
/* Open up database and make sure all the tables we want are there. */
char *refTrack = "refGene";
char *vegaPseudo = "vegaPseudoGene";
char *retroPseudo = "retroMrnaInfo";
struct sqlConnection *conn = sqlConnect(database);
if (!sqlTableExists(conn, refTrack))
    errAbort("table %s doesn't exist in %s", refTrack, database);
if (!sqlTableExists(conn, vegaPseudo))
    errAbort("table %s doesn't exist in %s", vegaPseudo, database);
if (!sqlTableExists(conn, retroPseudo))
    errAbort("table %s doesn't exist in %s", retroPseudo, database);

/* Read in alt file and output larger retained and bleeding introns. */
struct bed *bed, *intronyList = loadRetainedAndBleeding(altSpliceBed);
FILE *f = mustOpen(outBed, "w");
for (bed = intronyList; bed != NULL; bed = bed->next)
    {
    int size = bed->chromEnd - bed->chromStart;
    if (size > 400)
	{
	/* Hand-written 12 column line: the region becomes a single block
	 * that starts at offset 0 and spans the whole item. */
	fprintf(f, "%s\t%d\t%d\t", bed->chrom, bed->chromStart, bed->chromEnd);
	fprintf(f, "%s%d\t", bed->name, ++id);
	fprintf(f, "%d\t%s\t", bed->score, bed->strand);
	fprintf(f, "0\t0\t0\t1\t");
	fprintf(f, "%d,\t%d,\n", size, 0);
	}
    }

/* Read in refGene, and write out larger 3' UTRs, and occasional antisense copies.  */
char query[512];
safef(query, sizeof(query), "select * from %s", refTrack);
int rowOffset = 0;
/* With select *, a leading bin column has to be skipped before genePredLoad. */
if (sqlFieldIndex(conn, refTrack, "bin") == 0)
    rowOffset = 1;
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *gp = genePredLoad(row + rowOffset);
    int start, end;
    /* The 3' UTR lies after the CDS on the + strand and before it on the
     * - strand, in chromosome coordinates. */
    if (gp->strand[0] == '+')
        {
	start = gp->cdsEnd;
	end = gp->txEnd;
	}
    else
        {
	start = gp->txStart;
	end = gp->cdsStart;
	}
    if (end - start > 400)
        {
	gpPartOutAsBed(gp, start, end, f, "utr", ++id, 400);
	}
    /* Roughly one gene in twenty is also written out on the opposite strand.
     * The flip happens after the UTR has been written, so it only affects
     * the "anti" record. */
    if (rand()%20 == 0)
        {
	gp->strand[0] = (gp->strand[0] == '+' ? '-' : '+');
	gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "anti", ++id, 0);
	}
    }
sqlFreeResult(&sr);

/* Write out vega pseudo-genes. */
safef(query, sizeof(query), "select * from %s", vegaPseudo);
rowOffset = 0;
if (sqlFieldIndex(conn, vegaPseudo, "bin") == 0)
    rowOffset = 1;
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *gp = genePredLoad(row + rowOffset);
    gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "vega", ++id, 0);
    }
/* Release this result before sr is reused for the next query. */
sqlFreeResult(&sr);

/* Write out retroGenes. */
safef(query, sizeof(query), "select * from %s where score > 600", retroPseudo);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct bed *bed = bedLoad12(row);
    char name[128];
    safef(name, sizeof(name), "retro_%d_%s", ++id, bed->name);
    /* Point the record at the local buffer just long enough to print it.
     * NOTE(review): the previous bed->name pointer is dropped here and bed
     * itself is never released - confirm whether that matters for this tool. */
    bed->name = name;
    bedTabOutN(bed, 12, f);
    }
sqlFreeResult(&sr);

/* NOTE(review): conn, intronyList and the per-row genePreds are not released
 * in this function - confirm that the caller exits right after. */
carefulClose(&f);
}