示例#1
0
void pairedEndQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf)
/* Look for other end,  do a pairwise alignment, and save results in database. */
{
verbose(2, "pairedEndQa on %u %s %s\n", ef->id, ef->cdwFileName, ef->submitFileName);
/* Get other end, return if not found. */
struct cdwValidFile *otherVf = cdwOppositePairedEnd(conn, ef, vf);
if (otherVf == NULL)
    return;

if (otherVf->fileId > vf->fileId)
    return;

struct cdwValidFile *vf1, *vf2;
struct cdwQaPairedEndFastq *pair = cdwQaPairedEndFastqFromVfs(conn, vf, otherVf, &vf1, &vf2);
if (pair != NULL)
    {
    cdwValidFileFree(&otherVf);
    return;
    }

/* Get target assembly and figure out path for BWA index. */
struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, vf->ucscDb);
assert(assembly != NULL);
char genoFile[PATH_LEN];
safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", 
    cdwValDataDir, assembly->ucscDb, assembly->ucscDb);

verbose(1, "aligning subsamples on %u vs. %u paired reads\n", vf1->fileId, vf2->fileId);

/* Make alignments of subsamples. */
char *sample1 = NULL, *sample2 = NULL, *sai1 = NULL, *sai2 = NULL;
makeTmpSai(conn, vf1, genoFile, &sample1, &sai1);
makeTmpSai(conn, vf2, genoFile, &sample2, &sai2);

/* Make paired end alignment */
char *tmpSam = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sam"));
char command[6*PATH_LEN];
safef(command, sizeof(command),
   "bwa sampe -n 1 -N 1 -f %s %s %s %s %s %s"
   , tmpSam, genoFile, sai1, sai2, sample1, sample2);
mustSystem(command);

/* Make ra file with pairing statistics */
char *tmpRa = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".ra"));
safef(command, sizeof(command), 
    "edwSamPairedEndStats -maxInsert=%d %s %s", maxInsert, tmpSam, tmpRa);
mustSystem(command);

/* Read RA file into variables. */
struct cdwQaPairedEndFastq *pe = cdwQaPairedEndFastqOneFromRa(tmpRa);

/* Update database with record. */
struct sqlConnection *freshConn = cdwConnectReadWrite();
char query[256];
sqlSafef(query, sizeof(query),
    "insert into cdwQaPairedEndFastq "
    "(fileId1,fileId2,concordance,distanceMean,distanceStd,distanceMin,distanceMax,recordComplete) "
    " values (%u,%u,%g,%g,%g,%g,%g,1)"
    , vf1->fileId, vf2->fileId, pe->concordance, pe->distanceMean
    , pe->distanceStd, pe->distanceMin, pe->distanceMax);
sqlUpdate(conn, query);
sqlDisconnect(&freshConn);

/* Clean up and go home. */
cdwValidFileFree(&otherVf);
remove(sample1);
remove(sample2);
remove(sai1);
remove(sai2);
remove(tmpSam);
remove(tmpRa);
#ifdef SOON
#endif /* SOON */
freez(&sample1);
freez(&sample2);
freez(&sai1);
freez(&sai2);
freez(&tmpSam);
freez(&tmpRa);
cdwQaPairedEndFastqFree(&pe);
cdwValidFileFree(&otherVf);
}
示例#2
0
void doClusterMotifDetails(struct sqlConnection *conn, struct trackDb *tdb, 
                                struct factorSource *cluster)
/* Display details about TF binding motif(s) in cluster */
{
char *motifTable = trackDbSetting(tdb, "motifTable");         // localizations
char *motifPwmTable = trackDbSetting(tdb, "motifPwmTable");   // PWM used to draw sequence logo
char *motifMapTable = trackDbSetting(tdb, "motifMapTable");   // map target to motif
struct slName *motifNames = NULL, *mn; // list of canonical motifs for the factor
struct dnaMotif *motif = NULL;
struct bed6FloatScore *hit = NULL, *maxHit = NULL;
char **row;
char query[256];

if (motifTable != NULL && sqlTableExists(conn, motifTable))
    {
    struct sqlResult *sr;
    int rowOffset;
    char where[256];

    if (motifMapTable == NULL || !sqlTableExists(conn, motifMapTable))
        {
        // Assume cluster name is motif name if there is no map table
        motifNames = slNameNew(cluster->name);
        }
    else
        {
        sqlSafef(query, sizeof(query),
                "select motif from %s where target = '%s'", motifMapTable, cluster->name);
        char *ret = sqlQuickString(conn, query);
        if (ret == NULL)
            {
            // missing target from table -- no canonical motif
            webNewEmptySection();
            return;
            }
        motifNames = slNameListFromString(ret, ',');
        }
    for (mn = motifNames; mn != NULL; mn = mn->next)
        {
        sqlSafefFrag(where, sizeof(where), "name='%s' order by score desc limit 1", mn->name);
        sr = hRangeQuery(conn, motifTable, cluster->chrom, cluster->chromStart,
                     cluster->chromEnd, where, &rowOffset);
        if ((row = sqlNextRow(sr)) != NULL)
            {
            hit = bed6FloatScoreLoad(row + rowOffset);
            if (maxHit == NULL || maxHit->score < hit->score)
                maxHit = hit;
            }
        sqlFreeResult(&sr);
        }
    }
if (maxHit == NULL)
    {
    // Maintain table layout
    webNewEmptySection();
    return;
    }
hit = maxHit;

webNewSection("Canonical Motif in Cluster");
char posLink[1024];
safef(posLink, sizeof(posLink),"<a href=\"%s&db=%s&position=%s%%3A%d-%d\">%s:%d-%d</a>",
        hgTracksPathAndSettings(), database, 
            cluster->chrom, hit->chromStart+1, hit->chromEnd,
            cluster->chrom, hit->chromStart+1, hit->chromEnd);
printf("<b>Motif Name:</b>  %s<br>\n", hit->name);
printf("<b>Motif Score");
printf(":</b>  %.2f<br>\n", hit->score);
printf("<b>Motif Position:</b> %s<br>\n", posLink);
printf("<b>Motif Strand:</b> %c<br>\n", (int)hit->strand[0]);

struct dnaSeq *seq = hDnaFromSeq(database, seqName, hit->chromStart, hit->chromEnd, dnaLower);
if (seq == NULL)
    return;
if (hit->strand[0] == '-')
    reverseComplement(seq->dna, seq->size);
if (motifPwmTable != NULL && sqlTableExists(conn, motifPwmTable))
    {
    motif = loadDnaMotif(hit->name, motifPwmTable);
    if (motif == NULL)
        return;
    motifLogoAndMatrix(&seq, 1, motif);
    }
}
matrix * NMFgetSampleAccuracy(struct hash *config)
/*Read all the folds and calculate training and testing accuracies from best models*/
{
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * validationDir = hashMustFindVal(config, "validationDir");
char * modelDir = hashMustFindVal(config, "modelDir");
int fold, folds = foldsCountFromDataDir(config);
int split, splits  = splitsCountFromDataDir(config);

matrix * accuracies = NULL;
char filename[256];
FILE * fp;
for(split = 1; split <= splits; split++)
    {
    for(fold = 1; fold <= folds; fold++)
        {
        //cat togetehr the training and validation KH values and record which were used to train
        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/metadata.tab", trainingDir, split, fold);
        fp = fopen(filename, "r");
        if(fp == NULL)
            errAbort("Couldn't open file %s\n", filename);
        matrix * trMetadata = f_fill_matrix(fp, 1);
        fclose(fp);

        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/metadata.tab", validationDir, split, fold);
        fp = fopen(filename, "r");
        if(fp == NULL)
            errAbort("Couldn't open file %s\n", filename);
        matrix * valMetadata = f_fill_matrix(fp, 1);
        fclose(fp);

        struct slInt * trainingList = list_indices(trMetadata->cols);
        matrix * metadata = append_matrices(trMetadata, valMetadata, 1);

        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/NMFpredictor.training.results", modelDir, split, fold);
        fp = fopen(filename , "r");
        if(!fp)
            errAbort("Couldn't open training results file %s", filename);
        matrix * trainingPred = f_fill_matrix(fp, 1);
        fclose(fp);

        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/NMFpredictor.validation.results", modelDir, split, fold);
        fp = fopen(filename , "r");
        if(!fp)
            errAbort("Couldn't open validation results file %s", filename);
        matrix * valPred = f_fill_matrix(fp, 1);
        fclose(fp);

        //calc the accuracy by sample
        matrix * predictions = append_matrices(trainingPred, valPred, 1);
        matrix * accuraciesInFold = NMFpopulateAccuracyMatrix(predictions, metadata, trainingList);
        //add the accuracies to the running totals
        if(split == 1 && fold == 1)
            accuracies = copy_matrix(accuraciesInFold);
        else
            add_matrices_by_colLabel(accuracies, accuraciesInFold);
		//clean up
        free_matrix(trainingPred);
        free_matrix(valPred);
        free_matrix(predictions);
        free_matrix(trMetadata);
        free_matrix(valMetadata);
        free_matrix(metadata);
        free_matrix(accuraciesInFold);
        }
    }
//normalize accuracies over number of splits and folds
int i;
for(i = 0; i < accuracies->cols; i++)
    {
    if(accuracies->graph[0][i] != NULL_FLAG)
        accuracies->graph[0][i] = (accuracies->graph[0][i] / ((folds-1) * splits));
    if(accuracies->graph[1][i] != NULL_FLAG)
        accuracies->graph[1][i] = (accuracies->graph[1][i] / (1 * splits));
    }
return accuracies;
}
void sqlExecProgProfile(char *profile, char *prog, char **progArgs, int userArgc, char *userArgv[])
/* 
 * Exec one of the sql programs using user and password defined in localDb.XXX variables from ~/.hg.conf 
 * progArgs is NULL-terminate array of program-specific arguments to add,
 * which maybe NULL. userArgv are arguments passed in from the command line.
 * The program is execvp-ed, this function does not return. 
 */
{
int i, j = 0, nargc=cntArgv(progArgs)+userArgc+6, defaultFileNo, returnStatus;
pid_t child_id;
char **nargv, defaultFileName[256], defaultFileArg[256], *homeDir;

// install cleanup signal handlers
sqlProgInitSigHandlers();

/* Assemble defaults file */
if ((homeDir = getenv("HOME")) == NULL)
    errAbort("sqlExecProgProfile: HOME is not defined in environment; cannot create temporary password file");

nukeOldCnfs(homeDir);
// look for special parameter -profile=name
for (i = 0; i < userArgc; i++)
    if (startsWith("-profile=", userArgv[i]))
	profile=cloneString(userArgv[i]+strlen("-profile="));

safef(defaultFileName, sizeof(defaultFileName), "%s/.hgsql.cnf-XXXXXX", homeDir);
defaultFileNo=sqlMakeDefaultsFile(defaultFileName, profile, "client");

safef(defaultFileArg, sizeof(defaultFileArg), "--defaults-file=%s", defaultFileName);

AllocArray(nargv, nargc);

nargv[j++] = prog;
nargv[j++] = defaultFileArg;   /* --defaults-file must come before other options */
if (progArgs != NULL)
    {
    for (i = 0; progArgs[i] != NULL; i++)
        nargv[j++] = progArgs[i];
    }
for (i = 0; i < userArgc; i++)
    if (!startsWith("-profile=", userArgv[i]))
	nargv[j++] = userArgv[i];
nargv[j++] = NULL;

// flush before forking so we can't accidentally get two copies of the output
fflush(stdout);
fflush(stderr);

child_id = fork();
killChildPid = child_id;
if (child_id == 0)
    {
    execvp(nargv[0], nargv);
    _exit(42);  /* Why 42?  Why not?  Need something user defined that mysql isn't going to return */
    }
else
    {
    /* Wait until the child process completes, then delete the temp file */
    wait(&returnStatus);
    unlink (defaultFileName);
    if (WIFEXITED(returnStatus))
        {
	int childExitStatus = WEXITSTATUS(returnStatus);
        if (childExitStatus == 42)
            errAbort("sqlExecProgProfile: exec failed");
	else
	    // Propagate child's exit status:
	    _exit(childExitStatus);
        }
    else
        errAbort("sqlExecProgProfile: child process exited with abnormal status %d", returnStatus);
    }
}
示例#5
0
static void parseDbXrefs()
/* Parse the db_xref entries for various features to build a single dbx entry
 * in the kvt and to obtain the locus and mim ids for the kvt */
{
static char* LOCUS_ID = "LocusID:";
static char* GENE_ID = "GeneID:";
static char* MIM_ID = "MIM:";
struct slName* head = NULL, *xref, *prevXref;
struct keyVal* dbXrefKv = NULL;
struct keyVal* locusLinkIdKv = NULL;
struct keyVal* geneIdKv = NULL;
struct keyVal* omimIdKv = NULL;
if (dbXrefBuf == NULL)
    dbXrefBuf = dyStringNew(256);
dyStringClear(dbXrefBuf);
if (omimIdBuf == NULL)
    omimIdBuf = dyStringNew(256);
dyStringClear(omimIdBuf);
locusLinkId[0] = '\0';

/* split into a list and sort so we can remove dups */
if (gbCdsDbxField->val->stringSize > 0)
    head = slCat(head, parseDbXrefStr(gbCdsDbxField->val->string));
if (gbGeneDbxField->val->stringSize > 0)
    head = slCat(head, parseDbXrefStr(gbGeneDbxField->val->string));
slNameSort(&head);

xref = head;
prevXref = NULL;
while (xref != NULL)
    {
    /* skip if dup of previous */
    if ((prevXref == NULL) || !sameString(prevXref->name, xref->name))
        {
        if (dbXrefBuf->stringSize > 0)
            dyStringAppendC(dbXrefBuf, ' ');
        dyStringAppend(dbXrefBuf, xref->name);
        updateKvt(&dbXrefKv, "dbx", dbXrefBuf->string);

        /* find number in db_xref like LocusID:27 or GeneID:27 */
        if (startsWith(LOCUS_ID, xref->name))
            {
            safef(locusLinkId, sizeof(locusLinkId), "%s",
                  xref->name+strlen(LOCUS_ID));
            updateKvt(&locusLinkIdKv, "loc", locusLinkId);
            }
        else if (startsWith(GENE_ID, xref->name))
            {
            safef(geneId, sizeof(geneId), "%s",
                  xref->name+strlen(GENE_ID));
            updateKvt(&geneIdKv, "gni", geneId);
            }
        else if (startsWith(MIM_ID, xref->name))
            {
            if (omimIdBuf->stringSize > 0)
                dyStringAppendC(omimIdBuf, ' ');
            dyStringAppend(omimIdBuf, xref->name+strlen(MIM_ID));
            updateKvt(&omimIdKv, "mim", omimIdBuf->string);
            }
        }
    prevXref = xref;
    xref = xref->next;
    }
slFreeList(&head);
}
示例#6
0
void testOutSequence(struct htmlPage *tablePage, struct htmlForm *mainForm,
     char *org, char *db, char *group, char *track, char *table, 
     int expectedRows)
/* Get as sequence and make sure count agrees with expected. */
/* mainForm not used */
{
struct htmlPage *outPage;
int attempts = 0;
struct htmlFormVar *typeVar;

if (tablePage->forms == NULL) 
    errAbort("testOutSequence: Missing form (tablePage)");

htmlPageSetVar(tablePage, NULL, hgtaOutputType, "sequence");
outPage = quickSubmit(tablePage, org, db, group, track, table,
    "seqUi1", hgtaDoTopSubmit, "submit");
while (outPage == NULL && attempts < MAX_ATTEMPTS) 
    {
    printf("testOutSequence: trying again to get seqUi1\n");
    outPage = quickSubmit(tablePage, org, db, group, track, table,
        "seqUi1", hgtaDoTopSubmit, "submit");
    attempts++;
    }
if (outPage == NULL) 
    {
    qaStatusSoftError(tablesTestList->status,
        "Error in testOutSequence - couldn't get outPage");
    return;
    }
if (outPage->forms == NULL)
    {
    qaStatusSoftError(tablesTestList->status,
        "Error in testOutSequence - missing form");
    htmlPageFree(&outPage);
    return;
    }

/* Since some genomic sequence things are huge, this will
 * only test in case where it's a gene prediction. */
typeVar = htmlFormVarGet(outPage->forms, hgtaGeneSeqType);
if (typeVar != NULL)
    {
    struct htmlPage *seqPage;
    static char *types[] = {"protein", "mRNA"};
    int i;
    for (i=0; i<ArraySize(types); ++i)
        {
        char *type = types[i];
        if (slNameInList(typeVar->values, type))
             {
	     struct htmlPage *page;
	     char testName[128];
	     htmlPageSetVar(outPage, NULL, hgtaGeneSeqType, type);
	     safef(testName, sizeof(testName), "%sSeq", type);
	     page = quickSubmit(outPage, org, db, group, track, table,
	        testName, hgtaDoGenePredSequence, "submit");
	     checkFaOutput(page, expectedRows, TRUE);
	     htmlPageFree(&page);
	     }
         }
    htmlPageSetVar(outPage, NULL, hgtaGeneSeqType, "genomic");
    serialSubmit(&outPage, org, db, group, track, table, "seqUi2", hgtaDoGenePredSequence, "submit");
    // check that outPage != NULL

    /* On genomic page uncheck intron if it's there, then get results * and count them. */
    if (htmlFormVarGet(outPage->forms, "hgSeq.intron") != NULL)
         htmlPageSetVar(outPage, NULL, "hgSeq.intron", NULL);
    seqPage = quickSubmit(outPage, org, db, group, track, table, "genomicSeq", hgtaDoGenomicDna, "submit");
    // check that seqPage != NULL
    checkFaOutput(seqPage, expectedRows, FALSE);
    htmlPageFree(&seqPage);
    }

htmlPageFree(&outPage);
}
示例#7
0
void doubleCellPrint(struct column *col, struct subjInfo *si,
        struct sqlConnection *conn)
/* print double value */
{
char *s = col->cellVal(col, si, conn);
char buf[256];
if (sameString(s,"."))  // known bad data value
    safef(buf,sizeof(buf),"%s", s);
else
    {
    if (sameWord(col->name, "LastPVisit") 	||
	sameWord(col->name, "LastTrVisit"))
	{
    	if (sameWord(s, "-1"))
	   {
    	   safef(buf,sizeof(buf),"N/A");
	   }
    	else if (sameWord(s, "-2"))
	   {
    	   safef(buf,sizeof(buf),"N/D");
	   }
    	else if (sameWord(s, "-3.000")||sameWord(s, "-3.0")||sameWord(s, "-3"))
	   {
    	   safef(buf,sizeof(buf),"&nbsp");
	   }
	else
	    {
    	    safef(buf,sizeof(buf),"%.1f",sqlDouble(s));
	    }
	}
    else if (sameWord(col->name, "LastTrCD4Blk")
         ||  sameWord(col->name, "LastPCD4Blk")
         ||  sameWord(col->name, "LastPAntiGP120")
         ||  sameWord(col->name, "LastTrAntiGP120"))
    	{
    	if (sameWord(s, "-3.000"))
	   {
    	   safef(buf,sizeof(buf),"&nbsp");
	   }
    	else if (sameWord(s, "-2"))
	   {
    	   safef(buf,sizeof(buf),"N/D");
	   }
    	else if (sameWord(s, "-1"))
	   {
    	   safef(buf,sizeof(buf),"N/A");
	   }
	else
	    {
    	    safef(buf,sizeof(buf),"%.3f",sqlDouble(s));
	    }
	}
    else
	{
    	safef(buf,sizeof(buf),"%.1f",sqlDouble(s));
	}
    }
freeMem(s);
hPrintf("<TD align=right>");
hPrintf("%s", buf);
hPrintf("</TD>");
}
static void doBlat(struct sqlConnection *conn, int taxon, char *db)
/* place probe seq from non-BAC with blat that have no alignments yet */
{
int rc = 0;
char *blatSpec=NULL;
char cmdLine[256];
char path1[256];
char path2[256];
struct dyString *dy = dyStringNew(0);

/* (non-BACs needing alignment) */
dyStringClear(dy);
dyStringPrintf(dy, 
    "select concat(\"vgPrb_\",e.id), e.seq"
    " from vgPrb e, vgPrbAli a"
    " where e.id = a.vgPrb"
    " and a.db = '%s'"
    " and a.status = 'new'"
    " and e.taxon = %d"
    " and e.type <> 'bac'"
    " and e.seq <> ''"
    " order by e.id"
    , db, taxon);
//restore: 
rc = sqlSaveQuery(conn, dy->string, "blat.fa", TRUE);
verbose(1,"rc = %d = count of sequences for blat, to get psls for taxon %d\n",rc,taxon);

if (rc == 0) 
    {
    unlink("blat.fa");
    system("rm -f blatNearBest.psl; touch blatNearBest.psl");  /* make empty file */
    return;
    }

/* make .ooc and blat on kolossus */

safef(path1,sizeof(path1),"/gbdb/%s/%s.2bit",db,db);
safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db);
//restore: 
verbose(1,"copy: [%s] to [%s]\n",path1,path2);  copyFile(path1,path2);

safef(cmdLine,sizeof(cmdLine),
"ssh kolossus 'cd %s; blat -makeOoc=11.ooc -tileSize=11"
" -repMatch=1024 %s.2bit /dev/null /dev/null'",
    getCurrentDir(),db);
//restore: 
system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");

safef(cmdLine,sizeof(cmdLine),
	"ssh kolossus 'cd %s; blat %s.2bit blat.fa -ooc=11.ooc -noHead blat.psl'",
	getCurrentDir(),db);
//restore: 
system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");

/* using blat even with -fastMap was way too slow - took over a day,
 * so instead I will make a procedure to write a fake psl for the BACs
 * which you will see called below */

safef(path2,sizeof(path2),"%s.2bit",db);
verbose(1,"rm %s\n",path2); unlink(path2); 

safef(path2,sizeof(path2),"11.ooc");
verbose(1,"rm %s\n",path2); unlink(path2); 


/* skip psl header and sort on query name */
safef(cmdLine,sizeof(cmdLine), "sort -k 10,10 blat.psl > blatS.psl");
verbose(1,"cmdLine=[%s]\n",cmdLine);
system(cmdLine); 

/* keep near best within 5% of the best */
safef(cmdLine,sizeof(cmdLine), 
    "pslCDnaFilter -globalNearBest=0.005 -minId=0.96 -minNonRepSize=20 -minCover=0.50"
    " blatS.psl blatNearBest.psl");
verbose(1,"cmdLine=[%s]\n",cmdLine);
system(cmdLine); 

unlink("blat.fa");
unlink("blat.psl");
unlink("blatS.psl");

freez(&blatSpec);
dyStringFree(&dy);
}
static void doPslMapAli(struct sqlConnection *conn, 
    int taxon, char *db, 
    int fromTaxon, char *fromDb)
{
char cmd[256];

struct dyString *dy = dyStringNew(0);
char path[256];
char dnaPath[256];
char toDb[12];

safef(toDb,sizeof(toDb),"%s", db);
toDb[0]=toupper(toDb[0]);

safef(dnaPath,sizeof(dnaPath),"/cluster/data/%s/nib", db);
if (!fileExists(dnaPath))
    {
    safef(dnaPath,sizeof(dnaPath),"/cluster/data/%s/%s.2bit", db, db);
    if (!fileExists(dnaPath))
	errAbort("unable to locate nib dir or .2bit for %s: %s", db, dnaPath);
    }
    
safef(path,sizeof(path),"/gbdb/%s/liftOver/%sTo%s.over.chain.gz", fromDb, fromDb, toDb);
if (!fileExists(path))
    errAbort("unable to locate chain file %s",path);

/* get non-bac $db.vgProbes not yet aligned */
getPslMapAli(conn, db, fromTaxon, fromDb, FALSE);
/* get bac $db.vgProbes not yet aligned */
getPslMapAli(conn, db, fromTaxon, fromDb, TRUE);
/* get .fa for pslRecalcMatch use */
getPslMapFa(conn, db, fromTaxon);

/* non-bac */
safef(cmd,sizeof(cmd),
"zcat %s | pslMap -chainMapFile -swapMap  nonBac.psl stdin stdout "
"|  sort -k 14,14 -k 16,16n > unscoredNB.psl"
,path);
verbose(1,"%s\n",cmd); system(cmd);

safef(cmd,sizeof(cmd),
"pslRecalcMatch unscoredNB.psl %s" 
" pslMap.fa nonBac.psl"
,dnaPath);
verbose(1,"%s\n",cmd); system(cmd);

/* bac */
safef(cmd,sizeof(cmd),
"zcat %s | pslMap -chainMapFile -swapMap  bac.psl stdin stdout "
"|  sort -k 14,14 -k 16,16n > unscoredB.psl"
,path);
verbose(1,"%s\n",cmd); system(cmd);

safef(cmd,sizeof(cmd),
"pslRecalcMatch unscoredB.psl %s" 
" pslMap.fa bacTemp.psl"
,dnaPath);
verbose(1,"%s\n",cmd); system(cmd);

safef(cmd,sizeof(cmd),
"pslCDnaFilter -globalNearBest=0.00001 -minCover=0.05"
" bacTemp.psl bac.psl");
verbose(1,"%s\n",cmd); system(cmd);

safef(cmd,sizeof(cmd),"cat bac.psl nonBac.psl > vgPrbPslMap.psl");
verbose(1,"%s\n",cmd); system(cmd);

dyStringFree(&dy);

}
示例#10
0
static struct tissueSampleVals *getTissueSampleVals(char *geneId, boolean doLogTransform,
                                                char *version, double *maxValRet)
/* Get sample data for the gene.  Optionally log10 it. Return maximum value seen */
{
struct hash *tsHash = hashNew(0);
struct tissueSampleVals *tsv;
struct hashEl *hel;
struct slDouble *val;
double maxVal = 0;
struct gtexSampleData *sd = NULL;
char query[256];
char **row;
char buf[256];
char *sampleDataTable = "gtexSampleData";
safef(buf, sizeof(buf), "%s%s", sampleDataTable, gtexVersionSuffixFromVersion(version));
struct sqlConnection *conn = hAllocConn("hgFixed");
assert(sqlTableExists(conn, buf));
sqlSafef(query, sizeof(query), "select * from %s where geneId='%s'", buf, geneId);
struct sqlResult *sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    sd = gtexSampleDataLoad(row);
    if ((hel = hashLookup(tsHash, sd->tissue)) == NULL)
        {
        AllocVar(tsv);
        hashAdd(tsHash, sd->tissue, tsv);
        }
    else
        tsv = (struct tissueSampleVals *)hel->val;
    maxVal = max(maxVal, sd->score);
    val = slDoubleNew(sd->score);
    slAddHead(&tsv->valList, val);
    }
/*  Fill in tissue descriptions, fill values array and calculate stats for plotting
        Then make a list, suitable for sorting by tissue or score
    NOTE: Most of this not needed for R implementation */
struct gtexTissue *tis = NULL, *tissues = gtexGetTissues(version);
struct tissueSampleVals *tsList = NULL;
int i;
if (doLogTransform)
    maxVal = log10(maxVal+1.0);
for (tis = tissues; tis != NULL; tis = tis->next)
    {
    tsv = hashFindVal(tsHash, tis->name);
    if (tsv == NULL)
        {
        /* no non-zero values for this tissue/gene */
        AllocVar(tsv);
        val = slDoubleNew(0.0);
        slAddHead(&tsv->valList, val);
        }
    tsv->name = tis->name;
    tsv->description = tis->description;
    tsv->color = tis->color;
    int count = tsv->count = slCount(tsv->valList);
    double *vals = AllocArray(tsv->vals, count);
    for (i=0; i<count; i++)
        {
        val = slPopHead(&tsv->valList);
        if (doLogTransform)
            vals[i] = log10(val->val+1.0);
        else
            vals[i] = val->val;
        }
    doubleBoxWhiskerCalc(tsv->count, tsv->vals, 
                                &tsv->min, &tsv->q1, &tsv->median, &tsv->q3, &tsv->max);
    slAddHead(&tsList, tsv);
    }
if (maxValRet != NULL)
    *maxValRet = maxVal;
return tsList;
}
示例#11
0
struct hash *makeMotifBed(char *gffDir, char *outBed)
/* Make bed file from GFFs.  Return hash of transcription factors. */
{
static char *consLevelPath[3] = {"3", "2", "0"};
static char *consLevelBed[3] = {"2", "1", "0"};
static char *pLevelPath[3] = {"p001b", "p005b", "nobind"};
static char *pLevelBed[3] = {"good", "weak", "none"};
int cIx, pIx;
FILE *f = mustOpen(outBed, "w");
struct hash *tfHash = newHash(0);
struct hash *yrcHash = newHash(18);
struct yrc *yrcList = NULL, *yrc;

for (cIx=0; cIx<3; ++cIx)
   {
   for (pIx=0; pIx<3; ++pIx)
       {
       struct lineFile *lf;
       char *row[10];
       char fileName[PATH_LEN];
       char hashKey[256];

       safef(fileName, sizeof(fileName), "%s/IGR_v24.%s.%s.GFF",
       	   gffDir, consLevelPath[cIx], pLevelPath[pIx]);
       lf = lineFileOpen(fileName, TRUE);
       while (lineFileRow(lf, row))
            {
	    char *name = row[9];
	    char *e;
	    int chromIx, chromStart, chromEnd;
	    if (!sameWord(row[8], "Site"))
	        errAbort("Expecting 'Site' line %d of %s", lf->lineIx, lf->fileName);
	    e = strchr(name, ';');
	    if (e == NULL)
	        errAbort("Expecting semicolon line %d of %s", lf->lineIx, lf->fileName);
	    *e = 0;
	    chromIx = romanToArabicChrom(row[0], lf);
	    chromStart = lineFileNeedNum(lf, row, 3);
	    chromEnd = lineFileNeedNum(lf, row, 4);
	    safef(hashKey, sizeof(hashKey), "%s.%d.%d", name, chromIx, chromStart);
	    if ((yrc = hashFindVal(yrcHash, hashKey)) == NULL)
	        {
		AllocVar(yrc);
		yrc->chromIx= chromIx;
		yrc->chromStart = chromStart;
		yrc->chromEnd = chromEnd;
		yrc->name = hashStoreName(tfHash, name);
		yrc->pLevel = pIx;
		yrc->consLevel = cIx;
		hashAdd(yrcHash, hashKey, yrc);
		slAddHead(&yrcList, yrc);
		}
	    else
	        {
		if (pIx < yrc->pLevel)
		    yrc->pLevel = pIx;
		if (cIx < yrc->consLevel)
		    yrc->consLevel = cIx;
		}
	    }
       lineFileClose(&lf);
       }
   }
for (yrc = yrcList; yrc != NULL; yrc = yrc->next)
    {
    fprintf(f, "chr%d\t", yrc->chromIx+1);
    fprintf(f, "%d\t", yrc->chromStart);
    fprintf(f, "%d\t", yrc->chromEnd);
    fprintf(f, "%s\t", yrc->name);
    fprintf(f, "%d\t", (int)(1000/(yrc->pLevel + yrc->consLevel + 1)));
    fprintf(f, "%s\t", pLevelBed[yrc->pLevel]);
    fprintf(f, "%s\n", consLevelBed[yrc->consLevel]);
    }
carefulClose(&f);
hashFree(&yrcHash);
return tfHash;
}
示例#12
0
void getInvariants(char *db, struct snpExceptions *exceptionList, 
		   struct slName *chromList, char *fileBase)
/* write list of invariants to output file */
{
struct sqlConnection *conn     = hAllocConn(db);
struct sqlResult     *sr       = NULL;
struct snpExceptions *el       = NULL;
struct slName        *chrom    = NULL;
char                **row      = NULL;
char                  query[1024];
unsigned long int     invariantCount;
char                  thisFile[64];
FILE                 *outFile;
int                   colCount, i;
char                  idString[3];

for (el=exceptionList; el!=NULL; el=el->next)
    {
    if (el->exceptionId<10)
	safef(idString,sizeof(idString), "0%d", el->exceptionId);
    else
	safef(idString,sizeof(idString), "%d",  el->exceptionId);
    invariantCount = 0;
    if (startsWith("select",el->query))
	{
	safef(thisFile, sizeof(thisFile), "%s.%s.bed", fileBase, idString);
	outFile = mustOpen(thisFile, "w");
	fprintf(outFile, "# exceptionId:\t%d\n# query:\t%s;\n", el->exceptionId, el->query);
	for (chrom=chromList; chrom!=NULL; chrom=chrom->next)
	    {
	    fflush(outFile); /* to keep an eye on output progress */
	    sqlSafef(query, sizeof(query),
		  "%-s and chrom='%s'", el->query, chrom->name);
	    sr = sqlGetResult(conn, query);
	    colCount = sqlCountColumns(sr);
	    while ((row = sqlNextRow(sr))!=NULL)
		{
		invariantCount++; 
		fprintf(outFile, "%s", row[0]);
		for (i=1; i<colCount; i++)
		    fprintf(outFile, "\t%s", row[i]);
		fprintf(outFile, "\n");
		}
	    }
	}
    else if (startsWith("group",el->query))
	{
	struct slName *nameList = NULL;
	struct slName *name     = NULL;
	safef(thisFile, sizeof(thisFile), "%s.%s.bed", fileBase, idString);
	outFile = mustOpen(thisFile, "w");
	fprintf(outFile, "# exceptionId:\t%d\n# query:\t%s;\n", el->exceptionId, el->query);
	nameList = getGroupList(db, el->query);
	for (name=nameList; name!=NULL; name=name->next)
	    {
	    sqlSafef(query, sizeof(query),
		  "select chrom,chromStart,chromEnd,name,%d as score,class,locType,observed "
		  "from snp where name='%s'", el->exceptionId, name->name);
	    sr = sqlGetResult(conn, query);
	    colCount = sqlCountColumns(sr);
	    while ((row = sqlNextRow(sr))!=NULL)
		{
		invariantCount++; 
		fprintf(outFile, "%s", row[0]);
		for (i=1; i<colCount; i++)
		    fprintf(outFile, "\t%s", row[i]);
		fprintf(outFile, "\n");
		}
	    }
	}
    else
	{
	printf("Invariant %d has no query string\n", el->exceptionId);
	continue;
	}
    carefulClose(&outFile);
    printf("Invariant %d has %lu exceptions, written to this file: %s\n", 
	   el->exceptionId, invariantCount, thisFile);
    fflush(stdout);
    sqlSafef(query, sizeof(query),
	  "update snpExceptions set num=%lu where exceptionId=%d",
	  invariantCount, el->exceptionId);    
    sr=sqlGetResult(conn, query); /* there's probably a better way to do this */
    }
}
示例#13
0
static void doDownload(struct sqlConnection *conn)
/* Try to force user's browser to download by giving special response headers */
{
int imageId = cartUsualInt(cart, hgpId, 0);
char url[1024];
char *p = NULL;
char dir[256];
char name[128];
char extension[64];
int w = 0, h = 0;
int sd = -1;

if (!visiGeneImageSize(conn, imageId, &w, &h))
    imageId = 0;

if (imageId == 0)
    {
    problemPage("invalid imageId","");
    }
else
    {
    p=visiGeneFullSizePath(conn, imageId);
    splitPath(p, dir, name, extension);
    safef(url,sizeof(url),"%s%s%s", dir, name, extension);
    sd = netUrlOpen(url);
    if (sd < 0)
	{
	problemPage("Couldn't open", url);
	}
    else
	{
	char *newUrl = NULL;
	int newSd = 0;
	/* url needed for err msgs and redirect url*/
	if (netSkipHttpHeaderLinesHandlingRedirect(sd, url, &newSd, &newUrl))
	    {
	    char buf[32*1024];
	    int readSize;
	    if (newUrl)
		{
		freeMem(newUrl);
		sd = newSd;
		}
	    printf("Content-Type: application/octet-stream\n");
	    printf("Content-Disposition: attachment; filename=%s%s\n", name, extension);
	    printf("\n");
	    while ((readSize = read(sd, buf, sizeof(buf))) > 0)
	        fwrite(buf, 1,  readSize, stdout);
	    close(sd);
	    sd = -1;
	    fflush(stdout);
	    fclose(stdout);
	    }
	else
	    {
	    problemPage("Skip http header problem", url);
    	    }
	freeMem(newUrl);
	}
    }
}
示例#14
0
void doImage(struct sqlConnection *conn)
/* Put up image page. */
{
int imageId = cartUsualInt(cart, hgpId, 0);
char *sidUrl = cartSidUrlString(cart);
char buf[1024];
char url[1024];
char *p = NULL;
char dir[256];
char name[128];
char extension[64];
int w = 0, h = 0;
htmlSetBgColor(0xE0E0E0);
htmStart(stdout, "do image");

puts(
"<script type=\"text/JavaScript\">"
"document.getElementsByTagName('html')[0].style.height=\"100%\";"
"document.getElementsByTagName('body')[0].style.height=\"100%\";"
"</script>"
);

if (!visiGeneImageSize(conn, imageId, &w, &h))
    imageId = 0;

if (imageId != 0)
    {
    printf("<B>");
    smallCaption(conn, imageId);
    printf(".</B> Click image to zoom in, drag or arrow keys to move. "
	   "Caption is below.<BR>\n");

    p=visiGeneFullSizePath(conn, imageId);

    splitPath(p, dir, name, extension);
#ifdef DEBUG
    safef(buf,sizeof(buf),"../bigImageTest.html?url=%s%s/%s&w=%d&h=%d",
	    dir,name,name,w,h);
#else
    safef(buf,sizeof(buf),"../bigImage.html?url=%s%s/%s&w=%d&h=%d",
	    dir,name,name,w,h);
#endif
    printf("<IFRAME name=\"bigImg\" width=\"100%%\" height=\"90%%\" SRC=\"%s\"></IFRAME><BR>\n", buf);

    fullCaption(conn, imageId);

    safef(buf,sizeof(buf),"%s%s%s", dir, name, extension);
    safef(url,sizeof(url),"%s?%s=go&%s&%s=%d",
    	hgVisiGeneCgiName(), hgpDoDownload, sidUrl, hgpId, imageId);

    printf("<B>Full-size image:</B> %d x %d &nbsp; <A HREF='%s'> download </A> ", w, h, url);

    /* Currently this is dangerous for users with less than 1 GB RAM to use
       on large images, because their machines can thrash themselves into a coma.
       X-windows (i.e. used by FireFox) will allocate 5 bytes per pixel.
       If the image size in pixels times 5 exceeds real ram size, then
       Linux thrashes incessantly.  But you can hit ctrl-alt-F1 to
       get a text only screen, then kill the bad processes (FF) and then
       you can restore desktop with ctrl-alt-F7.  Hiram says that's a
       feature credited to SCO-Unix.  On my 1GB machines at work/home,
       I never encountered any problem what-so-ever, even with the
       largest visiGene AllenBrain - about 19000x9000 pix.

    printf(" &nbsp;&nbsp; <A HREF='%s'> view </A>\n", buf);
    */
    printf("\n");

    }
htmlEnd();
}
示例#15
0
void dbTrash(char *db)
/* dbTrash - drop tables from a database older than specified N hours. */
{
char query[256];
struct sqlResult *sr;
char **row;
int updateTimeIx;
int createTimeIx;
int dataLengthIx;
int indexLengthIx;
int nameIx;
int timeIxUsed;
unsigned long long totalSize = 0;
// expiredTableNames: table exists and is in metaInfo and subject to age limits
struct slName *expiredTableNames = NULL;
struct slName *lostTables = NULL;	// tables existing but not in metaInfo
unsigned long long lostTableCount = 0;
struct hash *expiredHash = newHash(10); // as determined by metaInfo
struct hash *notExpiredHash = newHash(10);
struct sqlConnection *conn = sqlConnect(db);

if (extFileCheck)
    checkExtFile(conn);

time_t ageSeconds = (time_t)(ageHours * 3600);	/*	age in seconds	*/
sqlSafef(query,sizeof(query),"select name,UNIX_TIMESTAMP(lastUse) from %s WHERE "
    "lastUse < DATE_SUB(NOW(), INTERVAL %ld SECOND);", CT_META_INFO,ageSeconds);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    hashAddInt(expiredHash, row[0], sqlSigned(row[1]));
sqlFreeResult(&sr);
sqlSafef(query,sizeof(query),"select name,UNIX_TIMESTAMP(lastUse) from %s WHERE "
    "lastUse >= DATE_SUB(NOW(), INTERVAL %ld SECOND);",CT_META_INFO,ageSeconds);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    hashAddInt(notExpiredHash, row[0], sqlSigned(row[1]));
sqlFreeResult(&sr);

if (tableStatus)  // show table status is very expensive, use only when asked
    {
    /*	run through the table status business to get table size information */
    sqlSafef(query,sizeof(query),"show table status");
    STATUS_INIT;
    while ((row = sqlNextRow(sr)) != NULL)
	{
	/* if not doing history too, and this is the history table, next row */
	if ((!historyToo) && (sameWord(row[nameIx],"history")))
	    continue;
	/* also skip the metaInfo table */
	if ((!historyToo) && (sameWord(row[nameIx],CT_META_INFO)))
	    continue;
	/* don't delete the extFile table  */
	if (sameWord(row[nameIx],CT_EXTFILE))
	    continue;

	SCAN_STATUS;

	if (hashLookup(expiredHash,row[nameIx]))
	    {
	    slNameAddHead(&expiredTableNames, row[nameIx]);
	    verbose(3,"%s %ld drop %s\n",row[timeIxUsed], (unsigned long)timep,
		    row[nameIx]);
	    /*	 If sizes are non-NULL, add them up	*/
	    if ( ((char *)NULL != row[dataLengthIx]) &&
		    ((char *)NULL != row[indexLengthIx]) )
		totalSize += sqlLongLong(row[dataLengthIx])
		    + sqlLongLong(row[indexLengthIx]);
	    hashRemove(expiredHash, row[nameIx]);
	    }
	else
	    {
	    if (hashLookup(notExpiredHash,row[nameIx]))
		verbose(3,"%s %ld OK %s\n",row[timeIxUsed], (unsigned long)timep,
		    row[nameIx]);
	    else
		{	/* table exists, but not in metaInfo, is it old enough ? */
		if (timep < dropTime)
		    {
		    slNameAddHead(&expiredTableNames, row[nameIx]);
		    verbose(2,"%s %ld dropt %s lost table\n",
			row[timeIxUsed], (unsigned long)timep, row[nameIx]);
		    /*       If sizes are non-NULL, add them up     */
		    if ( ((char *)NULL != row[dataLengthIx]) &&
			((char *)NULL != row[indexLengthIx]) )
			    totalSize += sqlLongLong(row[dataLengthIx])
				+ sqlLongLong(row[indexLengthIx]);
		    }
		else
		    verbose(3,"%s %ld OKt %s\n",row[timeIxUsed],
			(unsigned long)timep, row[nameIx]);
		}
	    }
	}
    sqlFreeResult(&sr);
    }
else
    {	// simple 'show tables' is more efficient than 'show table status'
    sqlSafef(query,sizeof(query),"show tables");
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
	if (hashLookup(expiredHash,row[0]))
	    {
	    slNameAddHead(&expiredTableNames, row[0]);
	    time_t lastUse = (time_t)hashIntVal(expiredHash,row[0]);
	    struct tm *lastUseTm = localtime(&lastUse);
	    verbose(3,"%4d-%02d-%02d %02d:%02d:%02d %ld drop %s\n",
		lastUseTm->tm_year+1900, lastUseTm->tm_mon+1,
		lastUseTm->tm_mday, lastUseTm->tm_hour, lastUseTm->tm_min,
		lastUseTm->tm_sec, (unsigned long)lastUse,row[0]);
	    hashRemove(expiredHash, row[0]);
	    }
	else if (hashLookup(notExpiredHash,row[0]))
	    {
	    time_t lastUse = (time_t)hashIntVal(notExpiredHash,row[0]);
	    struct tm *lastUseTm = localtime(&lastUse);
	    verbose(3,"%4d-%02d-%02d %02d:%02d:%02d %ld OK %s\n",
		lastUseTm->tm_year+1900, lastUseTm->tm_mon+1,
		lastUseTm->tm_mday, lastUseTm->tm_hour, lastUseTm->tm_min,
		lastUseTm->tm_sec, (unsigned long)lastUse,row[0]);
	    }
	else
	    {
	    struct slName *el = slNameNew(row[0]);
	    slAddHead(&lostTables, el);
	    }
        }
    sqlFreeResult(&sr);
    lostTableCount = slCount(lostTables);
    // If tables exist, but not in metaInfo, check their age to expire them.
    // It turns out even this show table status is slow too, so, only
    // run thru it if asked to eliminate lost tables.  It is better to
    // do this operation with the stand-alone perl script on the customTrash
    // database machine.
    if (delLostTable && lostTables)
	{
	struct slName *el;
	for (el = lostTables; el != NULL; el = el->next)
	    {
	    if (sameWord(el->name,"history"))
		continue;
	    if (sameWord(el->name,CT_META_INFO))
		continue;
	    if (sameWord(el->name,CT_EXTFILE))
		continue;
	    boolean oneTableOnly = FALSE; // protect against multiple tables
	    /*	get table time information to see if it is expired */
	    sqlSafef(query,sizeof(query),"show table status like '%s'", el->name);
	    STATUS_INIT;

	    while ((row = sqlNextRow(sr)) != NULL)
		{
		if (oneTableOnly)
		    errAbort("ERROR: query: '%s' returned more than one table "
				"name\n", query);
		else
		    oneTableOnly = TRUE;
		if (differentWord(row[nameIx], el->name))
		    errAbort("ERROR: query: '%s' did not return table name '%s' != '%s'\n", query, el->name, row[nameIx]);

		SCAN_STATUS;

		if (timep < dropTime)
		    {
		    slNameAddHead(&expiredTableNames, row[nameIx]);
		    verbose(2,"%s %ld dropt %s lost table\n",
			row[timeIxUsed], (unsigned long)timep, row[nameIx]);
		    }
		else
		    verbose(3,"%s %ld OKt %s\n",
			row[timeIxUsed], (unsigned long)timep, row[nameIx]);
		}
	    sqlFreeResult(&sr);
	    }
	}
    }

/*	perhaps the table was already dropped, but not from the metaInfo */
struct hashEl *elList = hashElListHash(expiredHash);
struct hashEl *el;
for (el = elList; el != NULL; el = el->next)
    {
    verbose(2,"%s exists in %s only\n", el->name, CT_META_INFO);
    if (drop)
	ctTouchLastUse(conn, el->name, FALSE); /* removes metaInfo row */
    }

if (drop)
    {
    char comment[256];
    if (expiredTableNames)
	{
	struct slName *el;
	int droppedCount = 0;
	/* customTrash DB user permissions do not have permissions to
 	 * drop tables.  Must use standard special user that has all
 	 * permissions.  If we are not using the standard user at this
 	 * point, then switch to it.
	 */
	if (sameWord(db,CUSTOM_TRASH))
	    {
	    sqlDisconnect(&conn);
	    conn = sqlConnect(db);
	    }
	for (el = expiredTableNames; el != NULL; el = el->next)
	    {
	    verbose(2,"# drop %s\n", el->name);
	    sqlDropTable(conn, el->name);
	    ctTouchLastUse(conn, el->name, FALSE); /* removes metaInfo row */
	    ++droppedCount;
	    }
	/* add a comment to the history table and finish up connection */
	if (tableStatus)
	    safef(comment, sizeof(comment), "Dropped %d tables with "
		"total size %llu, %llu lost tables",
		    droppedCount, totalSize, lostTableCount);
	else
	    safef(comment, sizeof(comment),
		"Dropped %d tables, no size info, %llu lost tables",
		    droppedCount, lostTableCount);
	verbose(2,"# %s\n", comment);
	hgHistoryComment(conn, "%s", comment);
	}
    else
	{
	safef(comment, sizeof(comment),
	    "Dropped no tables, none expired, %llu lost tables",
		lostTableCount);
	verbose(2,"# %s\n", comment);
	}
    }
else
    {
    char comment[256];
    if (expiredTableNames)
	{
	int droppedCount = slCount(expiredTableNames);
	if (tableStatus)
	    safef(comment, sizeof(comment), "Would have dropped %d tables with "
		"total size %llu, %llu lost tables",
		    droppedCount, totalSize, lostTableCount);
	else
	    safef(comment, sizeof(comment),
		"Would have dropped %d tables, no size info, %llu lost tables",
		    droppedCount, lostTableCount);
	verbose(2,"# %s\n", comment);
	}
    else
	{
	safef(comment, sizeof(comment),
	    "Would have dropped no tables, none expired, %llu lost tables",
		lostTableCount);
	verbose(2,"# %s\n", comment);
	}
    }
sqlDisconnect(&conn);
}
static void doSeqAndExtFile(struct sqlConnection *conn, char *db, char *table)
{
int rc = 0;
char cmd[256];
char path[256];
char bedPath[256];
char gbdbPath[256];
char *fname=NULL;
struct dyString *dy = dyStringNew(0);
dyStringClear(dy);
dyStringPrintf(dy, 
"select distinct concat('vgPrb_',e.id), e.seq"
" from vgPrb e join %s.%s v"
" left join %s.seq s on s.acc = v.qName"
" where concat('vgPrb_',e.id) = v.qName"
" and s.acc is NULL"
" order by e.id"
    , db, table, db);
rc = sqlSaveQuery(conn, dy->string, "vgPrbExt.fa", TRUE);
verbose(1,"rc = %d = count of sequences for vgPrbExt.fa, to use with %s track %s\n",rc,db,table);
if (rc > 0)  /* can set any desired minimum */
    {
    safef(bedPath,sizeof(bedPath),"/cluster/data/%s/bed/visiGene/",db);
    if (!fileExists(bedPath))
	{
	safef(cmd,sizeof(cmd),"mkdir %s",bedPath);
	verbose(1,"%s\n",cmd); system(cmd);
	}
    
    safef(gbdbPath,sizeof(gbdbPath),"/gbdb/%s/visiGene/",db);
    if (!fileExists(gbdbPath))
	{
	safef(cmd,sizeof(cmd),"mkdir %s",gbdbPath);
    	verbose(1,"%s\n",cmd); system(cmd);
	}
   
    while(1)
	{
	int i=0;
	safef(path,sizeof(path),"%svgPrbExt_AAAAAA.fa",bedPath);
        char *c = rStringIn("AAAAAA",path);
        srand( (unsigned)time( NULL ) );
        for(i=0;i<6;++i)
            {
            *c++ += (int) 26 * (rand() / (RAND_MAX + 1.0));
            }
	if (!fileExists(path))
	    break;
	}

    
    safef(cmd,sizeof(cmd),"cp vgPrbExt.fa %s",path);
    verbose(1,"%s\n",cmd); system(cmd);
    
    fname = rStringIn("/", path);
    ++fname;
    
    safef(cmd,sizeof(cmd),"ln -s %s %s%s",path,gbdbPath,fname);
    verbose(1,"%s\n",cmd); system(cmd);
    
    safef(cmd,sizeof(cmd),"hgLoadSeq %s %s%s", db, gbdbPath,fname);
    verbose(1,"%s\n",cmd); system(cmd);
    }

dyStringFree(&dy);
}
示例#17
0
文件: https.c 项目: blumroy/kentUtils
void *netConnectHttpsThread(void *threadParam)
/* use a thread to run socket back to user */
{
/* child */

struct netConnectHttpsParams *params = threadParam;

pthread_detach(params->thread);  // this thread will never join back with it's progenitor

int fd=0;

char hostnameProto[256];

BIO *sbio;
SSL_CTX *ctx;
SSL *ssl;

openSslInit();

ctx = SSL_CTX_new(SSLv23_client_method());

fd_set readfds;
fd_set writefds;
int err;
struct timeval tv;


/* TODO checking certificates 

char *certFile = NULL;
char *certPath = NULL;
if (certFile || certPath)
    {
    SSL_CTX_load_verify_locations(ctx,certFile,certPath);
#if (OPENSSL_VERSION_NUMBER < 0x0090600fL)
    SSL_CTX_set_verify_depth(ctx,1);
#endif
    }

// verify paths and mode.

*/


sbio = BIO_new_ssl_connect(ctx);

BIO_get_ssl(sbio, &ssl);
if(!ssl) 
    {
    xerr("Can't locate SSL pointer");
    goto cleanup;
    }

/* Don't want any retries since we are non-blocking bio now */
//SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);


safef(hostnameProto,sizeof(hostnameProto),"%s:%d",params->hostName,params->port);
BIO_set_conn_hostname(sbio, hostnameProto);

BIO_set_nbio(sbio, 1);     /* non-blocking mode */

while (1) 
    {
    if (BIO_do_connect(sbio) == 1) 
	{
	break;  /* Connected */
	}
    if (! BIO_should_retry(sbio)) 
	{
	xerr("BIO_do_connect() failed");
	char s[256];	
	safef(s, sizeof s, "SSL error: %s", ERR_reason_error_string(ERR_get_error()));
	xerr(s);
	goto cleanup;
	}

    fd = BIO_get_fd(sbio, NULL);
    if (fd == -1) 
	{
	xerr("unable to get BIO descriptor");
	goto cleanup;
	}
    FD_ZERO(&readfds);
    FD_ZERO(&writefds);
    if (BIO_should_read(sbio)) 
	{
	FD_SET(fd, &readfds);
	}
    else if (BIO_should_write(sbio)) 
	{
	FD_SET(fd, &writefds);
	}
    else 
	{  /* BIO_should_io_special() */
	FD_SET(fd, &readfds);
	FD_SET(fd, &writefds);
	}
    tv.tv_sec = (long) (DEFAULTCONNECTTIMEOUTMSEC/1000);  // timeout default 10 seconds
    tv.tv_usec = (long) (((DEFAULTCONNECTTIMEOUTMSEC/1000)-tv.tv_sec)*1000000);

    err = select(fd + 1, &readfds, &writefds, NULL, &tv);
    if (err < 0) 
	{
	xerr("select() error");
	goto cleanup;
	}

    if (err == 0) 
	{
	char s[256];	
	safef(s, sizeof s, "connection timeout to %s", params->hostName);
	xerr(s);
	goto cleanup;
	}
    }


/* TODO checking certificates 

if (certFile || certPath)
    if (!check_cert(ssl, host))
	return -1;

*/

/* we need to wait on both the user's socket and the BIO SSL socket 
 * to see if we need to ferry data from one to the other */


char sbuf[32768];  // socket buffer sv[1] to user
char bbuf[32768];  // bio buffer
int srd = 0;
int swt = 0;
int brd = 0;
int bwt = 0;
while (1) 
    {

    // Do NOT move this outside the while loop. 
    /* Get underlying file descriptor, needed for select call */
    fd = BIO_get_fd(sbio, NULL);
    if (fd == -1) 
	{
	xerr("BIO doesn't seem to be initialized in https, unable to get descriptor.");
	goto cleanup;
	}


    FD_ZERO(&readfds);
    FD_ZERO(&writefds);

    if (brd == 0)
	FD_SET(fd, &readfds);
    if (swt < srd)
	FD_SET(fd, &writefds);
    if (srd == 0)
	FD_SET(params->sv[1], &readfds);

    tv.tv_sec = (long) (DEFAULTCONNECTTIMEOUTMSEC/1000);  // timeout default 10 seconds
    tv.tv_usec = (long) (((DEFAULTCONNECTTIMEOUTMSEC/1000)-tv.tv_sec)*1000000);

    err = select(max(fd,params->sv[1]) + 1, &readfds, &writefds, NULL, &tv);

    /* Evaluate select() return code */
    if (err < 0) 
	{
	xerr("error during select()");
	goto cleanup;
	}
    else if (err == 0) 
	{
	/* Timed out - just quit */
	xerr("https timeout expired");
	goto cleanup;
	}

    else 
	{
	if (FD_ISSET(params->sv[1], &readfds))
	    {
	    swt = 0;
	    srd = read(params->sv[1], sbuf, 32768);
	    if (srd == -1)
		{
		if (errno != 104) // udcCache often closes causing "Connection reset by peer"
		    xerrno("error reading https socket");
		goto cleanup;
		}
	    if (srd == 0) 
		break;  // user closed socket, we are done
	    }

	if (FD_ISSET(fd, &writefds))
	    {
	    int swtx = BIO_write(sbio, sbuf+swt, srd-swt);
	    if (swtx <= 0)
		{
		if (!BIO_should_write(sbio))
		    {
		    ERR_print_errors_fp(stderr);
		    xerr("Error writing SSL connection");
		    goto cleanup;
		    }
		}
	    else
		{
		swt += swtx;
		if (swt >= srd)
		    {
		    swt = 0;
		    srd = 0;
		    }
		}
	    }

	if (FD_ISSET(fd, &readfds))
	    {
	    bwt = 0;
	    brd = BIO_read(sbio, bbuf, 32768);

	    if (brd <= 0)
		{
		if (BIO_should_read(sbio))
		    {
		    brd = 0;
		    continue;
		    }
		else
		    {
		    if (brd == 0) break;
		    ERR_print_errors_fp(stderr);
		    xerr("Error reading SSL connection");
		    goto cleanup;
		    }
		}
	    // write the https data received immediately back on socket to user, and it's ok if it blocks.
	    while(bwt < brd)
		{
		int bwtx = write(params->sv[1], bbuf+bwt, brd-bwt);
		if (bwtx == -1)
		    {
		    if ((errno != 104)  // udcCache often closes causing "Connection reset by peer"
		     && (errno !=  32)) // udcCache often closes causing "Broken pipe"
			xerrno("error writing https data back to user socket");
		    goto cleanup;
		    }
		bwt += bwtx;
		}
	    brd = 0;
	    bwt = 0;
	    }
	}
    }

cleanup:

BIO_free_all(sbio);
close(params->sv[1]);  /* we are done with it */

return NULL;
}
static void doPrimers(struct sqlConnection *conn, int taxon, char *db)
/* get probe seq from primers */
{
int rc = 0;
struct dyString *dy = dyStringNew(0);
char cmdLine[256];
char path1[256];
char path2[256];

dyStringClear(dy);
dyStringAppend(dy, "select e.id, p.fPrimer, p.rPrimer from probe p, vgPrbMap m, vgPrb e, gene g");
dyStringPrintf(dy, " where p.id = m.probe and m.vgPrb = e.id and g.id = p.gene and g.taxon = %d",taxon);
dyStringAppend(dy, " and e.state = 'new' and e.type='primersMrna'");
rc = sqlSaveQuery(conn, dy->string, "primers.query", FALSE);
verbose(1,"rc = %d = count of primers for mrna search for taxon %d\n",rc,taxon);

if (rc > 0) /* something to do */
    {

    dyStringClear(dy);
    dyStringPrintf(dy, "select qName from %s.all_mrna",db);
    rc = 0;
    rc = sqlSaveQuery(conn, dy->string, "accFile.txt", FALSE);
    safef(cmdLine,sizeof(cmdLine),"getRna %s accFile.txt mrna.fa",db);
    system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");
    
    verbose(1,"rc = %d = count of mrna for %s\n",rc,db);

    system("date"); system("isPcr mrna.fa primers.query isPcr.fa -out=fa"); system("date");
    system("ls -l");

    processIsPcr(conn,taxon,db);

    unlink("mrna.fa"); unlink("accFile.txt"); unlink("isPcr.fa");

    }
unlink("primers.query");    

/* find any remaining type primersMrna that couldn't be resolved and demote 
 * them to type primersGenome
 */
dyStringClear(dy);
dyStringAppend(dy, "update vgPrb set type='primersGenome'"); 
dyStringPrintf(dy, " where taxon = %d",taxon);
dyStringAppend(dy, " and state = 'new' and type='primersMrna'");
sqlUpdate(conn, dy->string);



/* get primers for those probes that did not find mrna isPcr matches 
 * and then do them against the genome instead */
dyStringClear(dy);
dyStringAppend(dy, "select e.id, p.fPrimer, p.rPrimer from probe p, vgPrbMap m, vgPrb e, gene g");
dyStringPrintf(dy, " where p.id = m.probe and m.vgPrb = e.id and g.id = p.gene and g.taxon = %d",taxon);
dyStringAppend(dy, " and e.state = 'new' and e.type='primersGenome'");
rc = 0;
rc = sqlSaveQuery(conn, dy->string, "primers.query", FALSE);
verbose(1,"rc = %d = count of primers for genome search for taxon %d\n",rc,taxon);

if (rc > 0) /* something to do */
    {
    safef(path1,sizeof(path1),"/gbdb/%s/%s.2bit",db,db);
    safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db);
    verbose(1,"copy: [%s] to [%s]\n",path1,path2);  copyFile(path1,path2);

    safef(cmdLine,sizeof(cmdLine),
	    "ssh kolossus 'cd %s; isPcr %s.2bit primers.query isPcr.fa -out=fa'",
	    getCurrentDir(),db);
    system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");
    safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db);
    verbose(1,"rm %s\n",path2); unlink(path2); system("ls -l");

    processIsPcr(conn,taxon,db);
    
    unlink("mrna.fa"); unlink("accFile.txt"); unlink("isPcr.fa");

    }
unlink("primers.query");    

/* find any remaining type primersGenome that couldn't be resolved and demote 
 * them to type refSeq
 */
dyStringClear(dy);
dyStringAppend(dy, "update vgPrb set type='refSeq'"); 
dyStringPrintf(dy, " where taxon = %d",taxon);
dyStringAppend(dy, " and state = 'new' and type='primersGenome'");
sqlUpdate(conn, dy->string);

dyStringFree(&dy);
}
示例#19
0
void testOneTable(struct htmlPage *trackPage, char *org, char *db,
	char *group, char *track, char *table)
/* Test stuff on one table if we haven't already tested this table. */
{
/* Why declared here and not globally? */
static struct hash *uniqHash = NULL;
char fullName[256];
if (uniqHash == NULL)
     uniqHash = newHash(0);
safef(fullName, sizeof(fullName), "%s.%s", db, table);
if (!hashLookup(uniqHash, fullName))
    {
    struct htmlPage *tablePage;
    struct htmlForm *mainForm;

    hashAdd(uniqHash, fullName, NULL);
    verbose(1, "Testing %s %s %s %s %s\n", naForNull(org), db, group, track, table);
    tablePage = quickSubmit(trackPage, org, db, group, 
	    track, table, "selectTable", hgtaTable, table);
    if (!isObsolete(table) && tablePage != NULL)
	{
	if ((mainForm = htmlFormGet(tablePage, "mainForm")) == NULL)
	    {
	    qaStatusSoftError(tablesTestList->status, 
		    "Couldn't get main form on tablePage for %s %s %s %s", db, group, track, table);
	    }
	else
	    {
	    testSchema(tablePage, mainForm, org, db, group, track, table);
	    testSummaryStats(tablePage, mainForm, org, db, group, track, table);
	    if (outTypeAvailable(mainForm, "bed")) 
		{
		if (outTypeAvailable(mainForm, "primaryTable"))
		    {
		    int rowCount;
		    rowCount = testAllFields(tablePage, mainForm, org, db, group, track, table);
		    testOneField(tablePage, mainForm, org, db, group, track, table, rowCount);
		    testOutSequence(tablePage, mainForm, org, db, group, track, table, rowCount);
		    testOutBed(tablePage, mainForm, org, db, group, track, table, rowCount);
		    testOutHyperlink(tablePage, mainForm, org, db, group, track, table, rowCount);
		    testOutGff(tablePage, mainForm, org, db, group, track, table);
		    if (rowCount > 0)
			testOutCustomTrack(tablePage, mainForm, org, db, group, track, table);
		    }
		}
	    else if (outTypeAvailable(mainForm, "primaryTable"))
		{
		/* If BED type is not available then the region will be ignored, and
		 * we'll end up scanning whole table.  Make sure table is not huge
		 * before proceeding. */
		if (tableSize(db, table) < 500000)
		    {
		    int rowCount;
		    rowCount = testAllFields(tablePage, mainForm, org, db, group, track, table);
		    testOneField(tablePage, mainForm, org, db, group, track, table, rowCount);
		    }
		}
	    }
	htmlPageFree(&tablePage);
	}
    carefulCheckHeap();
    }
}
static struct genePos *tameAssociationFilter(
	struct slName *termList, boolean orLogic, 
	struct column *col, struct sqlConnection *conn, struct genePos *list)
/* Handle filtering when there are no wildcards present. */
{
struct sqlResult *sr;
char **row;
struct slName *term;
struct hash *passHash = newHash(17);
struct hash *protHash = NULL;
struct hash *prevHash = NULL;
struct genePos *gp;
int protCount = 0, termCount = 0, matchRow = 0, keyRow = 0;

/* Make up protein-keyed hash if need be. */
if (col->protKey)
    {
    protHash = newHash(17);
    for (gp = list; gp != NULL; gp = gp->next)
	{
        hashAdd(protHash, gp->protein, gp->name);
	++protCount;
	}
    }
for (term = termList; term != NULL; term = term->next)
    {
    char query[1024];
    safef(query, sizeof(query), col->invQueryOne, term->name);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
	char *key = row[0];
	++matchRow;
	if (protHash != NULL)
	    key = hashFindVal(protHash, key);
	if (key != NULL)
	    {
	    ++keyRow;
	    if (prevHash == NULL || hashLookup(prevHash, key) != NULL)
		{
		hashStore(passHash, key);
		}
	    }
	}
    if (!orLogic)
	{
	hashFree(&prevHash);
	if (term->next != NULL)
	    {
	    prevHash = passHash;
	    passHash = newHash(17);
	    }
	}
    sqlFreeResult(&sr);
    ++termCount;
    }
list = weedUnlessInHash(list, passHash);
hashFree(&prevHash);
freeHash(&protHash);
freeHash(&passHash);
return list;
}
示例#21
0
void checkExp(char *bedFileName, char *tNibDir, char *nibList)
{
struct lineFile *bf = lineFileOpen(bedFileName , TRUE), *af = NULL;
char *row[PSEUDOGENELINK_NUM_COLS] ;
struct pseudoGeneLink *ps;
char *tmpName[512], cmd[512];
struct axt *axtList = NULL, *axt, *mAxt = NULL;
struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seqList = NULL;
struct nibInfo *qNib = NULL, *tNib = NULL;
FILE *op;
int ret;

if (nibHash == NULL)
    nibHash = hashNew(0);
while (lineFileNextRow(bf, row, ArraySize(row)))
    {
    struct misMatch *misMatchList = NULL;
    struct binKeeper *bk = NULL;
    struct binElement *el, *elist = NULL;
    struct psl *mPsl = NULL, *rPsl = NULL, *pPsl = NULL, *psl ;
    struct misMatch *mf = NULL;
    ps = pseudoGeneLinkLoad(row);
    tmpName[0] = cloneString(ps->name);
    chopByChar(tmpName[0], '.', tmpName, sizeof(tmpName));
    verbose(2,"name %s %s:%d-%d\n",
            ps->name, ps->chrom, ps->chromStart,ps->chromEnd);
    /* get expressed retro from hash */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart, ps->chromEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        rPsl = el->val;
        verbose(2,"retroGene %s %s:%d-%d\n",rPsl->qName, ps->chrom, ps->chromStart,ps->chromEnd);
        }
    /* find mrnas that overlap parent gene */
    bk = hashFindVal(mrnaHash, ps->gChrom);
    elist = binKeeperFindSorted(bk, ps->gStart , ps->gEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        pPsl = el->val;
        verbose(2,"parent %s %s:%d %d,%d\n",
                pPsl->qName, pPsl->tName,pPsl->tStart,
                pPsl->match, pPsl->misMatch);
        }
    /* find self chain */
    bk = hashFindVal(chainHash, ps->chrom);
    elist = binKeeperFind(bk, ps->chromStart , ps->chromEnd ) ;
    slSort(&elist, chainCmpScoreDesc);
    for (el = elist; el != NULL ; el = el->next)
        {
        struct chain *chain = el->val, *subChain, *retChainToFree, *retChainToFree2;
        int qs = chain->qStart;
        int qe = chain->qEnd;
        int id = chain->id;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        if (!sameString(chain->qName , ps->gChrom) || 
                !positiveRangeIntersection(qs, qe, ps->gStart, ps->gEnd))
            {
            verbose(2," wrong chain %s:%d-%d %s:%d-%d parent %s:%d-%d\n", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd,
                ps->gChrom,ps->gStart,ps->gEnd);
            continue;
            }
        verbose(2,"chain id %d %4.0f",chain->id, chain->score);
        chainSubsetOnT(chain, ps->chromStart+7, ps->chromEnd-7, 
            &subChain,  &retChainToFree);
        if (subChain != NULL)
            chain = subChain;
        chainSubsetOnQ(chain, ps->gStart, ps->gEnd, 
            &subChain,  &retChainToFree2);
        if (subChain != NULL)
            chain = subChain;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        verbose(2," %s:%d-%d %s:%d-%d ", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd);
        if (subChain != NULL)
            verbose(2,"subChain %s:%d-%d %s:%d-%d\n",
                    subChain->qName, subChain->qStart, subChain->qEnd, 
                    subChain->tName,subChain->tStart,subChain->tEnd);

	qNib = nibInfoFromCache(nibHash, tNibDir, chain->qName);
	tNib = nibInfoFromCache(nibHash, tNibDir, chain->tName);
	tSeq = nibInfoLoadStrand(tNib, chain->tStart, chain->tEnd, '+');
	qSeq = nibInfoLoadStrand(qNib, chain->qStart, chain->qEnd, chain->qStrand);
	axtList = chainToAxt(chain, qSeq, chain->qStart, tSeq, chain->tStart,
	    maxGap, BIGNUM);
        verbose(2,"axt count %d misMatch cnt %d\n",slCount(axtList), slCount(misMatchList));
        for (axt = axtList; axt != NULL ; axt = axt->next)
            {
            addMisMatch(&misMatchList, axt, chain->qSize);
            }
        verbose(2,"%d in mismatch list %s id %d \n",slCount(misMatchList), chain->qName, id);
        chainFree(&retChainToFree);
        chainFree(&retChainToFree2);
        break;
        }
    /* create axt of each expressed retroGene to parent gene */
        /* get alignment for each mrna overlapping retroGene */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart , ps->chromEnd ) ;
    {
    char queryName[512];
    char axtName[512];
    char pslName[512];
    safef(queryName, sizeof(queryName), "/tmp/query.%s.fa", ps->chrom);
    safef(axtName, sizeof(axtName), "/tmp/tmp.%s.axt", ps->chrom);
    safef(pslName, sizeof(pslName), "/tmp/tmp.%s.psl", ps->chrom);
    op = fopen(pslName,"w");
    for (el = elist ; el != NULL ; el = el->next)
        {
        psl = el->val;
        pslOutput(psl, op, '\t','\n');
        qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0);

        if (qSeq != NULL)
            slAddHead(&seqList, qSeq);
        else
            errAbort("seq %s not found \n", psl->qName);
        }
    fclose(op);
    faWriteAll(queryName, seqList);
    safef(cmd,sizeof(cmd),"pslPretty -long -axt %s %s %s %s",pslName , nibList, queryName, axtName);
    ret = system(cmd);
    if (ret != 0)
        errAbort("ret is %d %s\n",ret,cmd);
    verbose(2, "ret is %d %s\n",ret,cmd);
    af = lineFileOpen(axtName, TRUE);
    while ((axt = axtRead(af)) != NULL)
        slAddHead(&mAxt, axt);
    lineFileClose(&af);
    }
    slReverse(&mAxt);
    /* for each parent/retro pair, count bases matching retro and parent better */
    for (el = elist; el != NULL ; el = el->next)
        {
        int i, scoreRetro=0, scoreParent=0, scoreNeither=0;
        struct dyString *parentMatch = newDyString(16*1024);
        struct dyString *retroMatch = newDyString(16*1024);
        mPsl = el->val;

        if (mAxt != NULL)
            {
            verbose(2,"mrna %s %s:%d %d,%d axt %s\n",
                    mPsl->qName, mPsl->tName,mPsl->tStart,
                    mPsl->match, mPsl->misMatch, 
                    mAxt->qName);
            assert(sameString(mPsl->qName, mAxt->qName));
            for (i = 0 ; i< (mPsl->tEnd-mPsl->tStart) ; i++)
                {
                int j = mAxt->tStart - mPsl->tStart;
                verbose(5, "listLen = %d\n",slCount(&misMatchList));
                if ((mf = matchFound(&misMatchList, (mPsl->tStart)+i)) != NULL)
                    {
                    if (toupper(mf->retroBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match retro[%d] %d %c == %c parent %c %d\n",
                                i,mf->retroLoc, mf->retroBase, mAxt->qSym[j+i], 
                                mf->parentBase, mf->parentLoc);
                        dyStringPrintf(retroMatch, "%d,", mf->retroLoc);
                        scoreRetro++;
                        }
                    else if (toupper(mf->parentBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match parent[%d] %d %c == %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->qSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        dyStringPrintf(parentMatch, "%d,", mf->parentLoc);
                        scoreParent++;
                        }
                    else
                        {
                        verbose (3,"match neither[%d] %d %c != %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->tSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        scoreNeither++;
                        }
                    }
                }
            verbose(2,"final score %s parent %d retro %d  neither %d\n",
                    mPsl->qName, scoreParent, scoreRetro, scoreNeither);
            fprintf(outFile,"%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n",
                    ps->chrom, ps->chromStart, ps->chromEnd, ps->name, ps->score, 
                    mPsl->tName, mPsl->tStart, mPsl->tEnd, mPsl->qName, 
                    scoreParent, scoreRetro, scoreNeither, parentMatch->string, retroMatch->string);
            mAxt = mAxt->next;
            }
        dyStringFree(&parentMatch);
        dyStringFree(&retroMatch);
        }
    }
}
static void associationCalcDistances(struct order *ord, 
	struct sqlConnection *conn, /* connection to main database. */
	struct genePos **pGeneList, struct hash *geneHash, int maxCount)
/* Fill in distance fields in geneList. */
{
struct sqlResult *sr;
char **row;
struct hash *curTerms = newHash(8);
struct hash *protHash = NULL;
struct hash *lookupHash = geneHash;
char query[512];
struct genePos *gp;
char *geneId = curGeneId->name;

if (ord->protKey)
    {
    /* Build up hash of genes keyed by protein names. (The geneHash
     * passed in is keyed by the mrna name. */
    protHash = newHash(17);
    for (gp = *pGeneList; gp != NULL; gp = gp->next)
	{
	char *id = (ord->protKey 
	    ? (kgVersion == KG_III ? lookupProtein(conn, gp->name) : gp->protein)
	    : gp->name);
	hashAdd(protHash, id, gp);
	}

    /* Also switch current gene id and lookup hash to protein. */
    geneId = curGeneId->protein;
    lookupHash = protHash;
    }


/* Build up hash full of all go IDs associated with gene. */
if (geneId != NULL)
    {
    safef(query, sizeof(query), ord->queryOne, geneId);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
	{
	hashAdd(curTerms, row[0], NULL);
	}
    sqlFreeResult(&sr);
    }

/* Stream through association table counting matches. */
sr = sqlGetResult(conn, ord->queryAll);
while ((row = sqlNextRow(sr)) != NULL)
    {
    if (hashLookup(curTerms, row[1]))
	{
	struct hashEl *hel = hashLookup(lookupHash, row[0]);
	while (hel != NULL)
	    {
	    gp = hel->val;
	    gp->count += 1;
	    hel = hashLookupNext(hel);
	    }
	}
    }
sqlFreeResult(&sr);

/* Go through list translating non-zero counts to distances. */
for (gp = *pGeneList; gp != NULL; gp = gp->next)
    {
    if (gp->count > 0)
        {
	gp->distance = 1.0/gp->count;
	gp->count = 0;
	}
    if (sameString(gp->name, curGeneId->name))	/* Force self to top of list. */
        gp->distance = 0;
    }

hashFree(&protHash);
hashFree(&curTerms);
}
示例#23
0
void bamLoadItemsCore(struct track *tg, boolean isPaired)
/* Load BAM data into tg->items item list, unless zoomed out so far
 * that the data would just end up in dense mode and be super-slow. */
{
/* protect against temporary network error */
struct errCatch *errCatch = errCatchNew();
if (errCatchStart(errCatch))
    {
    struct hash *pairHash = isPaired ? hashNew(18) : NULL;
    int minAliQual = atoi(cartOrTdbString(cart, tg->tdb, BAM_MIN_ALI_QUAL, BAM_MIN_ALI_QUAL_DEFAULT));
    char *colorMode = cartOrTdbString(cart, tg->tdb, BAM_COLOR_MODE, BAM_COLOR_MODE_DEFAULT);
    char *grayMode = cartOrTdbString(cart, tg->tdb, BAM_GRAY_MODE, BAM_GRAY_MODE_DEFAULT);
    char *userTag = cartOrTdbString(cart, tg->tdb, BAM_COLOR_TAG, BAM_COLOR_TAG_DEFAULT);
    int aliQualShadeMin = 0, aliQualShadeMax = 99, baseQualShadeMin = 0, baseQualShadeMax = 40;
    parseIntRangeSetting(tg->tdb, "aliQualRange", &aliQualShadeMin, &aliQualShadeMax);
    parseIntRangeSetting(tg->tdb, "baseQualRange", &baseQualShadeMin, &baseQualShadeMax);
    struct bamTrackData btd = {tg, pairHash, minAliQual, colorMode, grayMode, userTag,
			       aliQualShadeMin, aliQualShadeMax, baseQualShadeMin, baseQualShadeMax};
    char *fileName = trackDbSetting(tg->tdb, "bigDataUrl");
    if (fileName == NULL)
	{
	if (tg->customPt)
	    {
	    errAbort("bamLoadItemsCore: can't find bigDataUrl for custom track %s", tg->track);
	    }
	else
	    {
	    struct sqlConnection *conn = hAllocConnTrack(database, tg->tdb);
	    fileName = bamFileNameFromTable(conn, tg->table, chromName);
	    hFreeConn(&conn);
	    }
	}

    char *fileName2 = hReplaceGbdb(fileName);

    char posForBam[512];
    safef(posForBam, sizeof(posForBam), "%s:%d-%d", chromName, winStart, winEnd);
    if (!isPaired)
	bamFetch(fileName2, posForBam, addBam, &btd, NULL);
    else
	{
	char *setting = trackDbSettingClosestToHomeOrDefault(tg->tdb, "pairSearchRange", "20000");
	int pairSearchRange = atoi(setting);
	if (pairSearchRange > 0)
	    safef(posForBam, sizeof(posForBam), "%s:%d-%d", chromName,
		  max(0, winStart-pairSearchRange), winEnd+pairSearchRange);
	bamFetch(fileName2, posForBam, addBamPaired, &btd, NULL);
	struct hashEl *hel;
	struct hashCookie cookie = hashFirst(btd.pairHash);
	while ((hel = hashNext(&cookie)) != NULL)
	    {
	    struct linkedFeatures *lf = hel->val;
	    if (lf->start < winEnd && lf->end > winStart)
		slAddHead(&(tg->items), lfsFromLf(lf));
	    }
	}
    freez(&fileName2);

    if (tg->visibility != tvDense)
	{
	slReverse(&(tg->items));
	if (isPaired)
	    slSort(&(tg->items), linkedFeaturesSeriesCmp);
	else if (sameString(colorMode, BAM_COLOR_MODE_STRAND))
	    slSort(&(tg->items), linkedFeaturesCmpOri);
	else if (sameString(colorMode, BAM_COLOR_MODE_GRAY) &&
		 sameString(grayMode, BAM_GRAY_MODE_ALI_QUAL))
	    slSort(&(tg->items), linkedFeaturesCmpScore);
	else
	    slSort(&(tg->items), linkedFeaturesCmpStart);
	if (slCount(tg->items) > MAX_ITEMS_FOR_MAPBOX)
	    {
	    // flag drawItems to make a mapBox for the whole track
	    tg->customInt = 1;
	    tg->mapItem = dontMapItem;
	    }
	}
    }
errCatchEnd(errCatch);
if (errCatch->gotError)
    {
    tg->networkErrMsg = cloneString(errCatch->message->string);
    tg->drawItems = bigDrawWarning;
    tg->totalHeight = bigWarnTotalHeight;
    }
errCatchFree(&errCatch);
}
示例#24
0
struct annoStreamer *annoStreamDbNew(char *db, char *table, struct annoAssembly *aa,
				     struct asObject *asObj, int maxOutRows)
/* Create an annoStreamer (subclass) object from a database table described by asObj. */
{
struct sqlConnection *conn = hAllocConn(db);
if (!sqlTableExists(conn, table))
    errAbort("annoStreamDbNew: table '%s' doesn't exist in database '%s'", table, db);
struct annoStreamDb *self = NULL;
AllocVar(self);
struct annoStreamer *streamer = &(self->streamer);
int dbtLen = strlen(db) + strlen(table) + 2;
char dbTable[dbtLen];
safef(dbTable, dbtLen, "%s.%s", db, table);
annoStreamerInit(streamer, aa, asObj, dbTable);
streamer->rowType = arWords;
streamer->setRegion = asdSetRegion;
streamer->nextRow = asdNextRow;
streamer->close = asdClose;
self->conn = conn;
self->table = cloneString(table);
char *asFirstColumnName = streamer->asObj->columnList->name;
if (sqlFieldIndex(self->conn, self->table, "bin") == 0)
    {
    self->hasBin = 1;
    self->minFinestBin = binFromRange(0, 1);
    }
if (self->hasBin && !sameString(asFirstColumnName, "bin"))
    self->omitBin = 1;
if (!asdInitBed3Fields(self))
    errAbort("annoStreamDbNew: can't figure out which fields of %s.%s to use as "
	     "{chrom, chromStart, chromEnd}.", db, table);
self->makeBaselineQuery = asdMakeBaselineQuery;
// When a table has an index on endField, sometimes the query optimizer uses it
// and that ruins the sorting.  Fortunately most tables don't anymore.
self->endFieldIndexName = sqlTableIndexOnField(self->conn, self->table, self->endField);
self->notSorted = FALSE;
// Special case: genbank-updated tables are not sorted because new mappings are
// tacked on at the end.
if (isIncrementallyUpdated(table))
    self->notSorted = TRUE;
self->mergeBins = FALSE;
self->maxOutRows = maxOutRows;
self->useMaxOutRows = (maxOutRows > 0);
self->needQuery = TRUE;
self->chromList = annoAssemblySeqNames(aa);
if (slCount(self->chromList) > 1000)
    {
    // Assembly has many sequences (e.g. scaffold-based assembly) --
    // don't break up into per-sequence queries.  Take our chances
    // with mysql being unhappy about the sqlResult being open too long.
    self->doQuery = asdDoQuerySimple;
    self->nextRowRaw = nextRowFromSqlResult;
    }
else
    {
    // All-chromosome assembly -- if table is large, perform a series of
    // chunked queries.
    self->doQuery = asdDoQueryChunking;
    self->nextRowRaw = nextRowFromBuffer;
    }
return (struct annoStreamer *)self;
}
示例#25
0
static void procGbEntry(struct lineFile *lf, struct hash *estAuthorHash)
/* process one entry in the genbank file . readGbInfo should be called
 * first */
{
char *words[16];
char date[64];
int wordCount;
DNA *dna = NULL;
int dnaSize;
char sizeString[16];
char accVer[64];
int faSize;
char *locus = gbLocusField->val->string;
char *accession = gbAccessionField->val->string;
int version = 0;
char *gi = NULL;
char *verChar = gbVersionField->val->string;
char *s;
char *org = gbOrganismField->val->string;
char *synOrg = NULL;
struct keyVal *seqKey, *sizeKey, *commentKey;
boolean isEst = FALSE;
char verNum[8];
char *com = gbCommentField->val->string;

if (locus == NULL || accession == NULL)
    errAbort("No LOCUS or no ACCESSION line near %d of %s",
             lf->lineIx, lf->fileName);
lmCleanup(&kvtMem);

/* Chop off all but first word of accession. */
s = skipLeadingSpaces(accession);
if (s != NULL)
    s = skipLeadingNonSpaces(s);
if (s != NULL)
    *s = 0;

/* Get version field (defaults to zero) */
if (verChar != NULL)
    {
    char *parts[2];
    char *accVer;
    int partCount;

    partCount = chopByWhite(verChar, parts, ArraySize(parts));

    /* Version is number after dot. */
    accVer = parts[0];
    if ((accVer = strchr(accVer, '.')) != NULL)
        version = atoi(accVer+1);
    if (partCount >= 2 && startsWith("GI:", parts[1]))
        gi = parts[1]+3;
    }

gbfFlatten(kvt);
                
/* Get additional keys. */
if (com != NULL)
    {
    if (startsWith("REVIEWED", com))
        kvtAdd(kvt, "cur", "yes");
    }
safef(verNum, sizeof(verNum), "%d", version);
kvtAdd(kvt, "ver", verNum);
if (gi != NULL)
    kvtAdd(kvt, "ngi", gi);
wordCount = chopLine(locus, words);
if (wordCount >= 6)
    {
    kvtAdd(kvt, "mol", words[3]);
    kvtAdd(kvt, "cat", words[wordCount-2]);
    ncbiDateToSqlDate(words[wordCount-1], date, sizeof(date), accession);
    kvtAdd(kvt, "dat", date);
    }
else if (wordCount == 5 && sameString(words[2], "bp") && isdigit(words[1][0]))
    {
    /* Check carefully.  Probably it's just missing the molecule type... */
    if (!isNcbiDate(words[4]))
        {
        errAbort("Strange LOCUS line in %s accession %s",
                 lf->fileName, accession);
        }
    kvtAdd(kvt, "cat", words[3]);
    ncbiDateToSqlDate(words[4], date, sizeof(date), accession);
    kvtAdd(kvt, "dat", date);
    }
else if (wordCount == 5 && sameString(words[2], "bp") && isdigit(words[1][0]))
    {
    kvtAdd(kvt, "mol", words[3]);
    }
else
    {
    errAbort("Short LOCUS line in %s accession %s",
             lf->fileName, accession);
    }
if (((wordCount >= 5) && sameString(words[4], "EST")) || 
    ((wordCount >= 6) && sameString(words[5], "EST")))
    {
    /* Try and figure out if it's a 3' or 5' EST */
    char *dir = getEstDir(gbDefinitionField->val->string, com);
    if (dir != NULL)
        kvtAdd(kvt, "dir", dir);
    isEst = TRUE;
    }

/* Handle other fields */
parseDbXrefs();
parseGene();
parseSourceOrganism();
parseMiscDiffs();
parseWarnings();

if (startsWith("synthetic construct", gbOrganismField->val->string))
    {
    synOrg = findSyntheticTarget();
    if (synOrg != NULL)
        hackSynClone();
    }

if (keepGbEntry(isEst))
    {
    /* Handle sequence part of read. */
    dna = gbfReadSequence(lf, &dnaSize);
    }
/* just discard if no sequence */
if (dna != NULL)
    {
    seqKey = kvtAdd(kvt, "seq", dna);
    safef(sizeString, sizeof(sizeString), "%d", dnaSize);
    sizeKey = kvtAdd(kvt, "siz", sizeString);
    
    if (isEst)
        {
        char *author = gbAuthorsField->val->string;
        if (author != NULL)
            {
            struct authorExample *ae;
            struct hashEl *hel;
            if ((hel = hashLookup(estAuthorHash, author)) == NULL)
                {
                AllocVar(ae);
                hel = hashAdd(estAuthorHash, author, ae);
                ae->name = hel->name;
                ae->count = 1;
                strncpy(ae->accession, accession, sizeof(ae->accession));
                slAddHead(&estAuthorList, ae);
                }
            else
                {
                ae = hel->val;
                ae->count += 1;
                }
            }
        }
    seqKey->val = NULL; /* Don't write out sequence here. */
    commentKey = kvtGet(kvt, "com");
    if (commentKey != NULL)
        commentKey->val = NULL;  /* Don't write out comment either. */

    setupOutputFiles(accession, org);

    if (faFile != NULL)
        {
        /* save fasta offset, size in ra */
        safef(accVer, sizeof(accVer), "%s.%d", accession, version);
        gbFaWriteSeq(faFile, accVer, NULL, dna, -1);
        faSize = faFile->off - faFile->recOff;
        safef(faOffStr, sizeof(faOffStr), "%lld", (long long)faFile->recOff);
        kvtAdd(kvt, "fao", faOffStr);
        safef(faSizeStr, sizeof(faSizeStr), "%d", faSize);
        kvtAdd(kvt, "fas", faSizeStr);
        }
    if (gPepFa != NULL)
        {
        /* must write before writing kvt */
        writePepSeq();
        }
    kvtWriteAll(kvt, raFile, NULL);
    if (gbIdxFile != NULL)
        {
        /* use synthetic target if it was determined */
        struct keyVal *molkv = kvtGet(kvt, "mol");
        enum molType molType = (molkv->val != NULL) ? gbParseMolType(molkv->val) : mol_mRNA;
        gbProcessedWriteIdxRec(gbIdxFile, accession, version,
                               kvtLookup(kvt, "dat"),
                               ((synOrg != NULL) ? synOrg : org),
                               molType);
        }
    }
else
    gbfSkipSequence(lf);
}
void addSdrfToStormTop(char *sdrfFile, struct tagStorm *storm)
/* Add lines of sdrfFile as children of first top level stanza in storm. */
{
struct fieldedTable *table = fieldedTableFromTabFile(sdrfFile, sdrfFile, NULL, 0 );


/* Convert ArrayExpress field names to our field names */
int fieldIx;
char *lastNonTerm = NULL;
char *lastNonUnit = NULL;
for (fieldIx=0; fieldIx < table->fieldCount; fieldIx += 1)
    {
    char tagName[256];
    aeFieldToNormalField("sdrf.", table->fields[fieldIx], tagName, sizeof(tagName));
    if (lastNonTerm != NULL && sameString("sdrf.Term_Source_REF", tagName))
	 {
         safef(tagName, sizeof(tagName), "%s_Term_Source_REF", lastNonTerm);
	 table->fields[fieldIx] = lmCloneString(table->lm, tagName);
	 }
    else if (lastNonTerm != NULL && sameString("sdrf.Term_Accession_Number", tagName))
	 {
         safef(tagName, sizeof(tagName), "%s_Term_Accession_Number", lastNonTerm);
	 table->fields[fieldIx] = lmCloneString(table->lm, tagName);
	 }
    else if (lastNonUnit != NULL && startsWith("sdrf.Unit_", tagName))
         {
	 safef(tagName, sizeof(tagName), "%s_Unit", lastNonUnit);
	 lastNonTerm = lmCloneString(table->lm, tagName);
	 table->fields[fieldIx] = lastNonTerm;
	 }
    else
	 {
         lastNonTerm = lastNonUnit = lmCloneString(table->lm, tagName);
	 table->fields[fieldIx] = lastNonTerm;
	 }
    }


/* Make up fastq field indexes to handle processing of paired reads in fastq, which
 * take two lines of sdrf file. */
char *fieldsWithFastqs[] = 
/* Fields that contain the fastq file names */
    {
    "sdrf.Comment_FASTQ_URI",
    "sdrf.Comment_SUBMITTED_FILE_NAME",
    "sdrf.Scan_Name",
    };
boolean mightReuseStanza = TRUE;
bool *reuseMultiFields;  // If set this field can vary and line still reused
AllocArray(reuseMultiFields, table->fieldCount);
int i;
for (i=0; i<ArraySize(fieldsWithFastqs); ++i)
    {
    char *field = fieldsWithFastqs[i];
    int ix = stringArrayIx(field, table->fields, table->fieldCount);
    if (ix >=0)
	reuseMultiFields[ix] = TRUE;
    else if (i == 0)
	{
	mightReuseStanza = FALSE;
        break;	    // Make sure has first one if going to do paired read fastq processing
	}
    }


/* Make up a list and hash of fieldMergers to handle conversion of columns that occur
 * multiple times to a comma-separated list of values in a single column. */
struct fieldMerger
/* Something to help merge multiple columns with same name */
    {
    struct fieldMerger *next;	/* Next in list */
    char *name;	
    struct dyString *val;	/* Comma separated value */
    };
struct hash *fieldHash = hashNew(0);
struct fieldMerger *fmList = NULL;
for (fieldIx = 0; fieldIx < table->fieldCount; ++fieldIx)
    {
    char *fieldName = table->fields[fieldIx];
    if (hashLookup(fieldHash, fieldName) == NULL)
        {
	struct fieldMerger *fm;
	AllocVar(fm);
	fm->name = fieldName;
	fm->val = dyStringNew(0);
	slAddTail(&fmList, fm);
	hashAdd(fieldHash, fieldName, fm);
	}
    }

/* Grab top level stanza and make sure there is only one. */
struct tagStanza *topStanza = storm->forest;
if (topStanza == NULL || topStanza->next != NULL)
    internalErr();

/* Scan through table, making new stanzas for each row and hooking them into topStanza */
struct fieldedRow *fr, *lastFr = NULL;
struct tagStanza *stanza = NULL;
for (fr = table->rowList; fr != NULL; fr = fr->next)
    {
    /* Empty out any existing vals */
    struct fieldMerger *fm;
    for (fm = fmList; fm != NULL; fm = fm->next)
	dyStringClear(fm->val);

    /* Add all non-empty values from this row to our fieldMergers. */
    char **row = fr->row;
    for (fieldIx = 0; fieldIx < table->fieldCount; ++fieldIx)
        {
	char *fieldName = table->fields[fieldIx];
	fm = hashMustFindVal(fieldHash, fieldName);
	char *val = row[fieldIx];
	if (!isEmpty(val))
	    csvEscapeAndAppend(fm->val, val);
	}

    /* If only the reuseMultiFields are varying, append to those values in previous stanza,
     * otherwise make a new stanza */
    if (mightReuseStanza && lastFr != NULL 
        && sameExceptForSome(lastFr->row, fr->row, table->fieldCount, reuseMultiFields))
	{
	int i;
	for (i=0; i<ArraySize(fieldsWithFastqs); ++i)
	    {
	    char *fieldName = fieldsWithFastqs[i];
	    if ((fm = hashFindVal(fieldHash, fieldName)) != NULL)
	        {
		char *newVal = fm->val->string;
		char *oldVal = tagMustFindVal(stanza, fieldName);
		int bothSize = strlen(newVal) + strlen(oldVal) + 1 + 1;
		char bothBuf[bothSize];
		safef(bothBuf, bothSize, "%s,%s", oldVal, newVal);
		tagStanzaUpdateTag(storm, stanza, fieldName, bothBuf);
		}
	    }
	}
    else
        {
	/* Output all nonempty vals to stanza */
	stanza = tagStanzaNew(storm, topStanza);
	for (fm = fmList; fm != NULL; fm = fm->next)
	    if (fm->val->stringSize > 0)
		tagStanzaAppend(storm, stanza, fm->name, fm->val->string);
	}

    lastFr = fr;
    }
slReverse(&topStanza->children);
}
示例#27
0
void doFactorSource(struct sqlConnection *conn, struct trackDb *tdb, char *item, int start, int end)
/* Display detailed info about a cluster of TFBS peaks from other tracks. */
{
char extraWhere[256];
safef(extraWhere, sizeof extraWhere, "name='%s'", item);
int rowOffset;
struct sqlResult *sr = hRangeQuery(conn, tdb->table, seqName, start, end, extraWhere, &rowOffset);
char **row = sqlNextRow(sr);
struct factorSource *cluster = NULL;
if (row != NULL)
    cluster = factorSourceLoad(row + rowOffset);
sqlFreeResult(&sr);

if (cluster == NULL)
    errAbort("Error loading cluster from track %s", tdb->track);

char *sourceTable = trackDbRequiredSetting(tdb, "sourceTable");

char *factorLink = cluster->name;
char *vocab = trackDbSetting(tdb, "controlledVocabulary");
if (vocab != NULL)
    {
    char *file = cloneFirstWord(vocab);
    factorLink = wgEncodeVocabLink(file, "term", factorLink, factorLink, factorLink, "");
    }
printf("<B>Factor:</B> %s<BR>\n", factorLink);
printf("<B>Cluster Score (out of 1000):</B> %d<BR>\n", cluster->score);
printPos(cluster->chrom, cluster->chromStart, cluster->chromEnd, NULL, TRUE, item);

/* Get list of tracks we'll look through for input. */
char *inputTrackTable = trackDbRequiredSetting(tdb, "inputTrackTable");
char query[256];
sqlSafef(query, sizeof(query), "select tableName from %s where factor='%s' order by source", 
                inputTrackTable, 
    cluster->name);

/* Next do the lists of hits and misses.  We have the hits from the non-zero signals in
 * cluster->expScores.  We need to figure out the sources actually assayed though
 * some other way.  We'll do this by one of two techniques. */
char *inputTableFieldDisplay = trackDbSetting(tdb, "inputTableFieldDisplay");
if (inputTableFieldDisplay != NULL)
    {
    struct slName *fieldList = stringToSlNames(inputTableFieldDisplay);
    char *vocab = trackDbSetting(tdb, "controlledVocabulary");

    /* In a new section put up list of hits. */
    webNewSection("Assays for %s in Cluster", cluster->name);
    webPrintLinkTableStart();
    printClusterTableHeader(fieldList, TRUE, FALSE, TRUE);
    printFactorSourceTableHits(cluster, conn, sourceTable, 
            inputTrackTable, fieldList, FALSE, vocab);
    webPrintLinkTableEnd();

    webNewSectionHeaderStart();
    char sectionTitle[128];
    safef(sectionTitle, 
            sizeof(sectionTitle),"Assays for %s Without Hits in Cluster", cluster->name);
    jsBeginCollapsibleSectionOldStyle(cart, tdb->track, "cellNoHits", sectionTitle, FALSE);
    webNewSectionHeaderEnd();
    webPrintLinkTableStart();
    printClusterTableHeader(fieldList, TRUE, FALSE, FALSE);
    printFactorSourceTableHits(cluster, conn, sourceTable, 
            inputTrackTable, fieldList, TRUE, vocab);
    webPrintLinkTableEnd();
    jsEndCollapsibleSection();
    }
else
    {
    errAbort("Missing required trackDb setting %s for track %s",
        "inputTableFieldDisplay", tdb->track);
    }
webNewSectionHeaderStart();
jsBeginCollapsibleSectionOldStyle(cart, tdb->track, "cellSources", "Cell Abbreviations", FALSE);
webNewSectionHeaderEnd();
hPrintFactorSourceAbbrevTable(conn, tdb);
jsEndCollapsibleSection();

doClusterMotifDetails(conn, tdb, cluster);
}
void hgLoadChromGraph(boolean doLoad, char *db, char *track, char *fileName)
/* hgLoadChromGraph - Load up chromosome graph. */
{
double minVal,maxVal;
struct chromGraph *el, *list;
FILE *f;
char *tempDir = ".";
char path[PATH_LEN], gbdbPath[PATH_LEN];
char *idTable = optionVal("idTable", NULL);
char *pathPrefix = NULL;

if (idTable == NULL)
    list = chromGraphLoadAll(fileName);
else 
    list = chromGraphListWithTable(fileName, db, idTable);
if (list == NULL)
    errAbort("%s is empty", fileName);

/* Figure out min/max values */
minVal = maxVal = list->val;
for (el = list->next; el != NULL; el = el->next)
    {
    if (optionExists("minusLog10"))
	{
	if (el->val == 1)
	    el->val = 0;
	else if (el->val > 0)
	    el->val = -1 * log(el->val)/log(10);
	}
    if (el->val < minVal)
        minVal = el->val;
    if (el->val > maxVal)
        maxVal = el->val;
    }


/* Sort and write out temp file. */
slSort(&list, chromGraphCmp);
f = hgCreateTabFile(tempDir, track);
for (el = list; el != NULL; el = el->next)
    chromGraphTabOut(el, f);

if (doLoad)
    {
    struct dyString *dy = dyStringNew(0);
    struct sqlConnection *conn;

    /* Set up connection to database and create main table. */
    conn = hAllocConn(db);
    dyStringPrintf(dy, createString, track, hGetMinIndexLength(db));
    sqlRemakeTable(conn, track, dy->string);

    /* Load main table and clean up file handle. */
    hgLoadTabFile(conn, tempDir, track, &f);
    hgRemoveTabFile(tempDir, track);

    /* If need be create meta table.  If need be delete old row. */
    if (!sqlTableExists(conn, "metaChromGraph"))
	sqlUpdate(conn, metaCreateString);
    else
        {
	dyStringClear(dy);
	dyStringPrintf(dy, "delete from metaChromGraph where name = '%s'", 
		track);
	sqlUpdate(conn, dy->string);
	}

    /* Make chrom graph file */
    safef(path, sizeof(path), "%s.cgb", track);
    chromGraphToBin(list, path);
    safef(path, sizeof(path), "/gbdb/%s/chromGraph", db);
    pathPrefix = optionVal("pathPrefix", path);
    safef(gbdbPath, sizeof(gbdbPath), "%s/%s.cgb", pathPrefix, track);

    /* Create new line in meta table */
    dyStringClear(dy);
    dyStringPrintf(dy, "insert into metaChromGraph values('%s',%f,%f,'%s');",
    	track, minVal, maxVal, gbdbPath);
    sqlUpdate(conn, dy->string);
    }
}
matrix * SVMgetSampleAccuracy(struct hash *config)
/*Read all the folds and calculate training and testing accuracies from best models*/
{
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * validationDir = hashMustFindVal(config, "validationDir");
char * modelDir = hashMustFindVal(config, "modelDir");
int fold, folds = foldsCountFromDataDir(config);
int split, splits = splitsCountFromDataDir(config);

matrix * accuracies = NULL;
char filename[1024];
for(split = 1; split <= splits; split++)
	{
	for(fold = 1; fold <= folds; fold++)
		{
		//cat togetehr the training and validation KH values and record which were used to train
		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.svm", trainingDir, split, fold);
	    matrix * trMetadata = SVMtoMetadataMatrix(filename);
	
		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.svm", validationDir, split, fold);
	    matrix * valMetadata = SVMtoMetadataMatrix(filename);
	    struct slInt * trainingList = list_indices(trMetadata->cols);
		matrix * metadata = append_matrices(trMetadata, valMetadata, 1);
	
		//cat together the guesses from SVM
		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/svm.training.results", modelDir, split, fold);
		matrix * trainingPred = SVMpopulatePredictionsMatrix(filename);
		safef(trainingPred->rowLabels[0],MAX_LABEL, "Prediction");
		copy_matrix_labels(trainingPred, trMetadata, 2,2);
		trainingPred->labels=1;

	    safef(filename, sizeof(filename), "%s/split%02d/fold%02d/svm.validation.results", modelDir, split, fold);
		matrix * testingPred = SVMpopulatePredictionsMatrix(filename);
		safef(testingPred->rowLabels[0], MAX_LABEL, "Prediction");
		copy_matrix_labels(testingPred, valMetadata, 2,2);
		testingPred->labels=1;

	    matrix * predictions = append_matrices(trainingPred, testingPred, 1);
		//get accuracies
		matrix * accuraciesInFold = SVMpopulateAccuracyMatrix(predictions, metadata, trainingList);
		
		//add the accuracies to the running totals
		if(split == 1 && fold == 1)
			accuracies = copy_matrix(accuraciesInFold);
		else
		    add_matrices_by_colLabel(accuracies, accuraciesInFold);
		
		//clean up
	    free_matrix(trMetadata);
	    free_matrix(valMetadata);
		free_matrix(metadata);
	    free_matrix(trainingPred);
	    free_matrix(testingPred);
		free_matrix(predictions);
		free_matrix(accuraciesInFold);
		slFreeList(&trainingList);
		}
	}

//normalize accuracies over number of splits and folds
int i;
for(i = 0; i < accuracies->cols; i++)
    {
    if(accuracies->graph[0][i] != NULL_FLAG)
        accuracies->graph[0][i] = (accuracies->graph[0][i] / ((folds-1) * splits));
    if(accuracies->graph[1][i] != NULL_FLAG)
        accuracies->graph[1][i] = (accuracies->graph[1][i] / (1 * splits));
    }
return accuracies;
}
示例#30
0
void crossCheck(struct sqlConnection *conn,
	char *motifWeights, char *geneToModule, char *geneToMotif, char *moduleToMotif)
/* Do sanity check after loading.  */
{
struct hash *motifHash = newHash(0);
struct hash *geneToModuleHash = newHash(16);
struct hash *moduleToGeneHash = newHash(16);
struct hash *moduleToMotifHash = newHash(16);
struct hash *motifToModuleHash = newHash(16);
struct hash *modMotHash = newHash(18);
char query[512], modMot[64];
struct sqlResult *sr;
char **row;
int reusedMotifCount = 0;
int fatalErrorCount = 0;

/* Load up motif hash */
sqlSafef(query, sizeof(query), "select name from %s", motifWeights);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    hashAdd(motifHash, row[0], NULL);
    }
sqlFreeResult(&sr);

/* Load up moduleToMotif table and note how many times
 * a motif is used more than once just for curiousity
 * (this is not an error condition). */
sqlSafef(query, sizeof(query), "select module,motif from %s", moduleToMotif);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *module = cloneString(row[0]);
    char *motif = cloneString(row[1]);
    if (!hashLookup(motifHash, motif))
        {
	warn("Motif %s is in %s but not %s", motif, 
		moduleToMotif, motifWeights);
	++fatalErrorCount;
	}
    hashAdd(moduleToMotifHash, module, motif);
    if (hashLookup(motifToModuleHash, motif))
        ++reusedMotifCount;
    hashAdd(motifToModuleHash, motif, module);
    safef(modMot, sizeof(modMot), "%s %s", module, motif);
    hashAdd(modMotHash, modMot, NULL);
    }
sqlFreeResult(&sr);
verbose(1, "%d motifs reuses in modules\n", reusedMotifCount);

verbose(1, "Cross-checking tables\n");

/* Load up geneToModule table, and make sure that all modules actually
 * exist in moduleToMotif table. */
sqlSafef(query, sizeof(query), "select gene,module from %s", geneToModule);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *gene = cloneString(row[0]);
    char *module = cloneString(row[1]);
    if (!hashLookup(moduleToMotifHash, module))
        {
	warn("Module %s is in %s but not %s", 
		module, geneToModule, moduleToMotif);
	++fatalErrorCount;
	}
    hashAdd(geneToModuleHash, gene, module);
    hashAdd(moduleToGeneHash, module, gene);
    }
sqlFreeResult(&sr);

/* Scan again through moduleToMotif table and make sure that
 * all modules are present in geneToModule. */
sqlSafef(query, sizeof(query), "select module from %s", moduleToMotif);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *module = row[0];
    if (!hashLookup(moduleToGeneHash, module))
        {
	warn("Module %s is in %s but not %s",
	    module, moduleToMotif, geneToModule);
	++fatalErrorCount;
	}
    }
sqlFreeResult(&sr);

/* Scan through geneToMotif table checking things. */
sqlSafef(query, sizeof(query), "select gene,name from %s", geneToMotif);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *gene = row[0];
    char *motif = row[1];
    char *module = hashFindVal(geneToModuleHash, gene);
    if (module == NULL)
        {
	warn("Gene %s is in %s but not %s", gene, geneToMotif, geneToModule);
	++fatalErrorCount;
	}
    safef(modMot, sizeof(modMot), "%s %s", module, motif);
    if (hashLookup(modMotHash, modMot) == NULL)
        {
	warn("Gene %s has motif %s, but that motif isn't in %s",
	    gene, motif, module);
	++fatalErrorCount;
	}
    if (!hashLookup(motifHash, motif))
        {
	warn("Motif %s is in %s but not %s", motif, 
		geneToMotif, motifWeights);
	++fatalErrorCount;
	}
    }
sqlFreeResult(&sr);

verbose(1, "%d errors\n", fatalErrorCount);
}