void doRnaFoldDisplay(struct sqlConnection *conn, char *geneId, char *geneName)
/* Show the RNA fold for geneId.  The table to load from and the display mode
 * are read from the cart.  In "text" mode the sequence, fold string and energy
 * are printed preformatted.  In "picture" mode a PDF and a PNG are derived
 * from the PostScript file named in the cart (only if they do not exist yet)
 * and then linked/embedded in the page.  Any other mode prints nothing. */
{
char *table = cartString(cart, hggMrnaFoldRegion);
char *how = cartString(cart, hggDoRnaFoldDisplay);
struct rnaFold *fold = loadFold(conn, table, geneId);

if (fold == NULL)
    {
    warn("Couldn't load %s from %s", geneId, table);
    return;
    }
if (sameString(how, "text"))
    {
    hPrintf("<TT><PRE>");
    hPrintf("%s\n%s (%1.2f)\n", fold->seq, fold->fold, fold->energy);
    hPrintf("</PRE></TT>");
    }
else if (sameString(how, "picture"))
    {
    char *psFile = cartString(cart, hggMrnaFoldPs);
    char *rootName = cloneString(psFile);
    char pngName[256];
    char pdfName[256];

    /* Both derived files share the PostScript file's name and differ only in
     * suffix.  Each buffer is bounded by its own size. */
    chopSuffix(rootName);
    safef(pngName, sizeof(pngName), "%s.png", rootName);
    safef(pdfName, sizeof(pdfName), "%s.pdf", rootName);
    hPrintf("<H2>%s (%s) %s energy %1.2f</H2>\n", 
    	geneName, geneId, table, fold->energy);
    /* NOTE(review): psFile comes from the cart and is pasted unescaped into
     * the shell commands below - confirm it cannot be user controlled. */
    if (!fileExists(pdfName))
         {
	 char command[512];
	 safef(command, sizeof(command), "ps2pdf %s %s" , psFile, pdfName);
	 mustSystem(command);
	 }
    hPrintf("Click <A HREF=\"%s\">here for PDF version</A><BR>", pdfName);
    if (!fileExists(pngName))
         {
	 char command[512];
	 safef(command, sizeof(command),
	 	"gs -sDEVICE=png16m -sOutputFile=%s -dBATCH -dNOPAUSE -q %s"
		, pngName, psFile);
	 mustSystem(command);
	 }
    hPrintf("<IMG SRC=\"%s\">", pngName);
    }
}
Beispiel #2
0
void makeTmpSai(struct sqlConnection *conn, struct cdwValidFile *vf, char *genoFile, 
    char **retSampleFile, char **retSaiFile)
/* Build a downsampled copy of the fastq file behind vf and run "bwa aln" on it
 * against genoFile.  The sample file name and the .sai file name are handed
 * back in *retSampleFile and *retSaiFile as cloned strings. */
{
/* Find the fastq record for this file; abort when there is none. */
long long id = vf->fileId;
struct cdwFastqFile *fastq = cdwFastqFileFromFileId(conn, id);
if (fastq == NULL)
    errAbort("No cdwFastqFile record for file id %lld", id);

/* Write a reduced fastq (FASTQ_SAMPLE_SIZE) and report where it went. */
char samplePath[PATH_LEN];
cdwMakeTempFastqSample(fastq->sampleFileName, FASTQ_SAMPLE_SIZE, samplePath);
verbose(1, "downsampled %s into %s\n", vf->licensePlate, samplePath);

/* Align the sample, redirecting bwa's output into a temp .sai file. */
char *saiPath = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sai"));
char bwaCmd[3*PATH_LEN];
safef(bwaCmd, sizeof(bwaCmd), "bwa aln -t 3 %s %s > %s", genoFile, samplePath, saiPath);
mustSystem(bwaCmd);

/* Pass the results to the caller and release the fastq record. */
*retSaiFile = saiPath;
*retSampleFile = cloneString(samplePath);
cdwFastqFileFree(&fastq);
}
Beispiel #3
0
void fastqRepeatQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf)
/* Run the repeat QA for a fastq file: align its sample against the
 * repeatMasker index with bwa, summarize the alignments into a .ra file and
 * load that through raIntoCdwRepeatQa.  Nothing is done when cdwQaRepeat
 * already has a 'total' row for the file. */
{
long long id = ef->id;

/* A 'total' row in cdwQaRepeat means this file was handled before. */
char sql[512];
sqlSafef(sql, sizeof(sql), 
    "select count(*) from cdwQaRepeat where fileId=%lld and repeatClass='total'" , id);
if (sqlQuickNum(conn, sql) != 0)
    return;

/* The fastq table tells us where the sample file lives. */
struct cdwFastqFile *fastq = cdwFastqFileForFileId(conn, id);
if (fastq == NULL)
    errAbort("No edqFastqRecord for %s",  vf->licensePlate);
char *samplePath = fastq->sampleFileName;

/* Path of the bwa index for this assembly. */
char repeatIndex[PATH_LEN];
safef(repeatIndex, sizeof(repeatIndex), "%s%s/repeatMasker/repeatMasker.fa", 
    cdwValDataDir, vf->ucscDb);

char shellCmd[3*PATH_LEN];

/* Stage 1: bwa aln writes the .sai file. */
char *saiPath = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".sai"));
safef(shellCmd, sizeof(shellCmd), "bwa aln %s %s > %s", repeatIndex, samplePath, saiPath);
mustSystem(shellCmd);

/* Stage 2: bwa samse turns it into a .sam file; the .sai is then deleted. */
char *samPath = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".sam"));
safef(shellCmd, sizeof(shellCmd), "bwa samse %s %s %s > %s", 
    repeatIndex, saiPath, samPath == NULL ? samPath : samplePath, samPath);
mustSystem(shellCmd);
remove(saiPath);

/* Stage 3: summarize the .sam into a .ra file; the .sam is then deleted. */
char *raPath = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".ra"));
safef(shellCmd, sizeof(shellCmd), "edwSamRepeatAnalysis %s %s", samPath, raPath);
mustSystem(shellCmd);
verbose(2, "mustSystem(%s)\n", shellCmd);
remove(samPath);

/* Stage 4: load the summary into the database and delete the .ra file. */
raIntoCdwRepeatQa(raPath, conn, id);
remove(raPath);

freez(&saiPath);
freez(&samPath);
freez(&raPath);
cdwFastqFileFree(&fastq);
}
Beispiel #4
0
void doSystem(char *command)
/* Echo command to stdout; run it through mustSystem only when doReal is set. */
{
printf("%s\n", command);
if (!doReal)
    return;
mustSystem(command);
}
Beispiel #5
0
void edwAlignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly,
    char *fastqPath, struct edwValidFile *vf, FILE *bedF,
    double *retMapRatio,  double *retDepth,  double *retSampleCoverage)
/* Align the sample fastq at fastqPath with bwa against the assembly's index,
 * then scan the resulting sam file to fill in the optional outputs.
 * bedF and all the ret parameters can be NULL. */
{
/* Intermediate results go through temp files; an earlier pipeline-based
 * attempt was flaky on its second run within the same application. */
char bwaIndex[PATH_LEN];
safef(bwaIndex, sizeof(bwaIndex), "%s%s/bwaData/%s.fa", 
    edwValDataDir, assembly->ucscDb, assembly->ucscDb);

char shellCmd[3*PATH_LEN];

/* Step 1: bwa aln produces the .sai file. */
char *saiPath = cloneString(rTempName(edwTempDir(), "edwSample1", ".sai"));
safef(shellCmd, sizeof(shellCmd), "bwa aln -t 3 %s %s > %s", bwaIndex, fastqPath, saiPath);
mustSystem(shellCmd);

/* Step 2: bwa samse produces the .sam file; the .sai is deleted afterwards. */
char *samPath = cloneString(rTempName(edwTempDir(), "ewdSample1", ".sam"));
safef(shellCmd, sizeof(shellCmd), "bwa samse %s %s %s > %s", 
    bwaIndex, saiPath, fastqPath, samPath);
mustSystem(shellCmd);
remove(saiPath);

/* Step 3: scan the sam file, collecting counts and the covered ranges
 * (scanSam also receives bedF for the little bed file). */
struct genomeRangeTree *coverage = genomeRangeTreeNew();
long long hits = 0, misses = 0, hitBases = 0;
scanSam(samPath, bedF, coverage, &hits, &misses, &hitBases);
verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", 
    hits, misses, hitBases, coverage);

/* Step 4: fill in whichever outputs the caller asked for. */
if (retMapRatio != NULL)
    *retMapRatio = (double)hits/(hits+misses);
if (retDepth != NULL)
    *retDepth = (double)hitBases/assembly->baseCount 
	    * (double)vf->itemCount/vf->sampleCount;
long long coveredBases = genomeRangeTreeSumRanges(coverage);
if (retSampleCoverage != NULL)
    *retSampleCoverage = (double)coveredBases/assembly->baseCount;

/* NOTE(review): saiPath and samPath are cloned but not freed here. */
genomeRangeTreeFree(&coverage);
remove(samPath);
}
Beispiel #6
0
void chainSplit(char *outDir, int inCount, char *inFiles[])
/* chainSplit - Split chains up by target or query sequence.
 * Every chain read from inFiles is appended to outDir/<name>.chain, where
 * name is the query or target name (per splitOnQ), optionally mapped through
 * lumpName when lump > 0.  Metadata lines of the inputs are collected in
 * outDir/meta.tmp; each output file is started with the sorted, unique
 * contents of that file. */
{
struct hash *hash = newHash(0);	/* output name -> open FILE */
int inIx;
char tpath[512];
FILE *meta ;
makeDir(outDir);
safef(tpath, sizeof(tpath), "%s/meta.tmp", outDir);
meta = mustOpen(tpath,"w");

for (inIx = 0; inIx < inCount; ++inIx)
    {
    struct lineFile *lf = lineFileOpen(inFiles[inIx], TRUE);
    struct chain *chain;
    FILE *f;
    lineFileSetMetaDataOutput(lf, meta);
    while ((chain = chainRead(lf)) != NULL)
        {
	char *name = (splitOnQ ? chain->qName : chain->tName);
	if (lump > 0)
	    name = lumpName(name);
	if ((f = hashFindVal(hash, name)) == NULL)
	    {
	    char path[512], cmd[512];
	    safef(path, sizeof(path),"%s/%s.chain", outDir, name);
	    /* meta stays registered with the open lineFile (and is registered
	     * again for every later input), so it must remain a valid stream.
	     * Flush instead of closing so that cat sees what was collected. */
	    fflush(meta);
	    safef(cmd,sizeof(cmd), "cat %s | sort -u > %s", tpath, path);
            mustSystem(cmd);
	    f = mustOpen(path, "a");
	    hashAdd(hash, name, f);
	    }
	chainWrite(chain, f);
	chainFree(&chain);
	}
    lineFileClose(&lf);
    }
/* All readers are gone now; close the metadata file exactly once. */
fclose(meta);
}
int main(int argc, char *argv[])
/* Compute amino acid residue frequency statistics for the proteins listed in
 * <database>.knownGene.
 *   argv[1] - protein database name, argv[2] - taxon, argv[3] - database.
 * Only proteins whose spXref2 biodatabaseID starts with '1' and whose
 * sequence has at least 100 residues are counted.  Outputs:
 *   <X>.txt / <X>.srt  - per-residue frequency lists (raw / sorted unique)
 *   pbResAvgStd.tab    - average and standard deviation per residue
 *   pbAnomLimit.tab    - values at 2.5% and 97.5% of the sorted list
 *   pbAaDist<X>.tab    - written through calDist */
{
    struct sqlConnection *conn, *conn2;
    char query2[256];
    struct sqlResult *sr2;
    char **row2;
    char cond_str[255];
    char *proteinDatabaseName;
    FILE *o1, *o2, *o3;
    FILE *fh[23];
    char temp_str[1000];
    char *accession;
    char *aaSeq;
    char *chp;
    int i, j, len;
    int ihi, ilow;
    char *answer;
    char *protDisplayId;
    int aaResCnt[30];
    char aaAlphabet[30];
    int aaResFound;
    /* Initialised so that a failed sscanf never leaves them indeterminate. */
    float fvalue1 = 0.0, fvalue2 = 0.0;
    float p1, p2;
    int icnt, jcnt;
    char *taxon;
    char *database;
    int sortedCnt;

    if (argc != 4) usage();

    strcpy(aaAlphabet, "WCMHYNFIDQKRTVPGEASLXZB");

    proteinDatabaseName = argv[1];
    taxon = argv[2];
    database = argv[3];

    o2 = mustOpen("pbResAvgStd.tab", "w");

    /* One output file per residue for the first 20 letters of the alphabet. */
    for (i=0; i<20; i++)
    {
        safef(temp_str, sizeof(temp_str), "%c.txt", aaAlphabet[i]);
        fh[i] = mustOpen(temp_str, "w");
    }

    conn  = hAllocConn(hDefaultDb());
    conn2 = hAllocConn(hDefaultDb());

    safef(query2, sizeof(query2), "select proteinID from %s.knownGene;", database);
    sr2 = sqlMustGetResult(conn2, query2);
    row2 = sqlNextRow(sr2);
    icnt = 0;
    jcnt = 0;

    for (j=0; j<MAXRES; j++)
    {
        sumJ[j] = 0;
    }

    while (row2 != NULL)
    {
        protDisplayId = row2[0];
        /* Resolve the accession: try the id as display id first, then as accession. */
        safef(cond_str, sizeof(cond_str),  "val='%s'", protDisplayId);
        accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str);

        if (accession == NULL)
        {
            safef(cond_str, sizeof(cond_str),  "acc='%s'", protDisplayId);
            accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str);
            if (accession == NULL)
            {
                verbose(2, "'%s' not found.\n", protDisplayId);
                goto skip;
            }
        }

        safef(cond_str, sizeof(cond_str),  "accession='%s'", accession);
        answer = sqlGetField("proteins040115", "spXref2", "biodatabaseID", cond_str);
        if (answer == NULL)
        {
            /* this protein might be a variant splice protein, and then it won't be in spXref2 */
            goto skip;
        }
        if (answer[0] != '1')
        {
            /* printf("%s not in SWISS-PROT\n", protDisplayId);fflush(stdout); */
            goto skip;
        }

        safef(cond_str, sizeof(cond_str),  "acc='%s'", accession);
        aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str);
        if (aaSeq == NULL)
        {
            printf("Can't find peptide sequence for %s, exiting ...\n", protDisplayId);
            fflush(stdout);
            exit(1);
        }

        len  = strlen(aaSeq);
        if (len < 100) goto skip;

        lenDouble = (double)len;

        for (j=0; j<MAXRES; j++)
        {
            aaResCnt[j] = 0;
        }

        /* Count how often each alphabet letter occurs in the sequence. */
        chp = aaSeq;
        for (i=0; i<len; i++)
        {
            aaResFound = 0;
            for (j=0; j<MAXRES; j++)
            {
                if (*chp == aaAlphabet[j])
                {
                    aaResFound = 1;
                    aaResCnt[j] ++;
                }
            }
            if (!aaResFound)
            {
                fprintf(stderr, "%c %d not a valid AA residue.\n", *chp, *chp);
            }
            chp++;
        }

        for (j=0; j<MAXRES; j++)
        {
            freq[icnt][j] = (double)aaResCnt[j]/lenDouble;
            sumJ[j] = sumJ[j] + freq[icnt][j];
        }

        for (j=0; j<20; j++)
        {
            fprintf(fh[j], "%15.7f\t%s\n", freq[icnt][j], accession);
            fflush(fh[j]);
        }
        icnt++;
        if (icnt >= MAXN)
            errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN);

skip:
        row2 = sqlNextRow(sr2);
    }

    recordCnt = icnt;
    recordCntDouble = (double)recordCnt;

    for (j=0; j<20; j++)
    {
        carefulClose(&(fh[j]));
    }

    sqlFreeResult(&sr2);
    hFreeConn(&conn);
    hFreeConn(&conn2);

    for (j=0; j<MAXRES; j++)
    {
        avg[j] = sumJ[j]/recordCntDouble;
    }

    /* Standard deviation per residue, using recordCnt-1 as divisor. */
    for (j=0; j<20; j++)
    {
        sum = 0.0;
        for (i=0; i<recordCnt; i++)
        {
            sum = sum + (freq[i][j] - avg[j]) * (freq[i][j] - avg[j]);
        }
        sigma[j] = sqrt(sum/(double)(recordCnt-1));
        fprintf(o2, "%c\t%f\t%f\n", aaAlphabet[j], avg[j], sigma[j]);
    }

    carefulClose(&o2);

    o1 = mustOpen("pbAnomLimit.tab", "w");
    for (j=0; j<20; j++)
    {
        safef(temp_str, sizeof(temp_str), "cat %c.txt|sort|uniq > %c.srt",  aaAlphabet[j], aaAlphabet[j]);
        mustSystem(temp_str);

        /* figure out how many unique entries */
        safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp",  aaAlphabet[j], aaAlphabet[j]);
        mustSystem(temp_str);
        safef(temp_str, sizeof(temp_str), "%c.tmp",  aaAlphabet[j]);
        o3 = mustOpen(temp_str, "r");
        mustGetLine(o3, temp_str, 1000);
        /* The .tmp file is deleted below, so release its handle right away;
         * otherwise one descriptor is leaked per residue. */
        carefulClose(&o3);
        /* Isolate the first number of the wc output (the line count). */
        chp = temp_str;
        while (*chp == ' ') chp++;
        while (*chp != ' ' && *chp != '\0') chp++;
        *chp = '\0';
        sortedCnt = 0;
        sscanf(temp_str, "%d", &sortedCnt);
        safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]);
        mustSystem(temp_str);

        /* cal hi and low cutoff threshold */
        ilow = (int)((float)sortedCnt * 0.025);
        ihi  = (int)((float)sortedCnt * 0.975);

        safef(temp_str, sizeof(temp_str), "%c.srt",  aaAlphabet[j]);
        o2 = mustOpen(temp_str, "r");
        i=0;
        /* Low cutoff: mean of the values on lines ilow and ilow+1. */
        for (i=0; i<ilow; i++)
        {
            mustGetLine(o2, temp_str, 1000);
        }
        sscanf(temp_str, "%f", &fvalue1);

        mustGetLine(o2, temp_str, 1000);
        sscanf(temp_str, "%f", &fvalue2);
        p1 = (fvalue1 + fvalue2)/2.0;

        /* High cutoff: same scheme around line ihi. */
        for (i=ilow+1; i<ihi; i++)
        {
            mustGetLine(o2, temp_str, 1000);
        }
        sscanf(temp_str, "%f", &fvalue1);

        mustGetLine(o2, temp_str, 1000);
        sscanf(temp_str, "%f", &fvalue2);
        p2 = (fvalue1 + fvalue2)/2.0;
        carefulClose(&o2);

        fprintf(o1, "%c\t%f\t%f\n", aaAlphabet[j], p1, p2);
        fflush(stdout);

        for (i=0; i<recordCnt; i++)
        {
            measure[i] = freq[i][j];
        }
        safef(temp_str, sizeof(temp_str), "pbAaDist%c.tab", aaAlphabet[j]);
        calDist(measure,  recordCnt,    51,     0.0, 0.005, temp_str);
    }

    carefulClose(&o1);

    return(0);
}
int main(int argc, char *argv[])
/* Build hgncXref.tab from the rows of proteins<date>.hgnc whose status is not
 * like '%Withdrawn%'.  argv[1] is the protein data date.
 * One line (symbol, refseq, uniProt, hgncId, entrez, name) is written for the
 * refSeqMapped value and for every entry of the comma separated refSeqIds
 * list; a row with neither still gets one line with an empty refseq.  Lines
 * are collected in j.dat and then sorted and de-duplicated. */
{
struct sqlConnection *conn2, *conn3;
  
char query2[256];
int queryLen;
struct sqlResult *sr2;
char **row2;

char *proteinDataDate;
 
FILE   *o2;
char *entrez;

char *chp;
char *hgncId, *name, *symbol, *refSeqIds, *uniProt;
int j;
char *locusType;

char *refseq;
boolean gotRefseq;

if (argc != 2) usage();
proteinDataDate = argv[1];

/* Bounded formatting: the fixed text alone takes about 140 of the 256 bytes,
 * so a long date argument must be rejected rather than overflow query2. */
queryLen = snprintf(query2, sizeof(query2),
	"select hgncId, symbol, name, refSeqMapped, refSeqIds, uniProt, entrezMapped, locusType from proteins%s.hgnc where status not like '%cWithdrawn%c'", 
	proteinDataDate, '%', '%');
if (queryLen < 0 || queryLen >= (int)sizeof(query2))
    {
    fprintf(stderr, "protein data date '%s' is too long\n", proteinDataDate);
    return(1);
    }
   
o2 = fopen("j.dat", "w");
if (o2 == NULL)
    {
    perror("j.dat");
    return(1);
    }
conn2= hAllocConn(hDefaultDb());
conn3= hAllocConn(hDefaultDb());

sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    j=0;
    hgncId 	= row2[j];j++;
    symbol 	= row2[j];j++;
    name	= row2[j];j++;
    refseq 	= row2[j];j++;
    refSeqIds 	= row2[j];j++;
    uniProt   	= row2[j];j++;
    entrez   	= row2[j];j++;
    locusType  	= row2[j];j++;
    
    /* Strip the "HGNC:" prefix; ids without it are kept unchanged. */
    chp = strstr(hgncId, "HGNC:");
    if (chp != NULL)
	hgncId = chp+5;

    gotRefseq = FALSE;

    /* process refSeqMapped first */

    if (!sameWord(refseq, ""))
    	{
    	fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name);
	gotRefseq = TRUE;
	}

    /* process refSeqIds next */

    /* Start the walk at the head of refSeqIds.  Without this the first entry
     * of the list (or a lone entry) was never written and the refSeqMapped
     * value was emitted in its place. */
    refseq = refSeqIds;
    chp = strstr(refSeqIds, ",");
    if (chp != NULL) 
    	{
    	*chp = '\0';
        while (chp != NULL)
	    {    
	    fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name);
	    /* Step past the separator and any blanks to the next entry. */
	    chp++;
 	    while (*chp == ' ') chp++;
	    refseq = chp;
            chp = strstr(refseq, ",");
	    if (chp != NULL) *chp = '\0';
	    }
	/* The last entry has no trailing comma. */
	fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name);
	gotRefseq = TRUE;
	}
    else
    	{
	if (!sameWord(refseq,""))
	    {
	    fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name);
	    }
	else
	    {
	    /* output the record if no RefSeq in either refSeqIds or refSeqMapped */
	    if (!gotRefseq)
	    	{
	    	fprintf(o2, 
		"%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name);
		}
	    }
	}
    row2 = sqlNextRow(sr2);
    }
sqlFreeResult(&sr2);

hFreeConn(&conn2);
hFreeConn(&conn3);
fclose(o2);
    
mustSystem("cat j.dat |sort -u >hgncXref.tab");
return(0);
}
Beispiel #9
0
int main(int argc, char *argv[])
/* Build keggPathway.tab and keggMapDesc.tab.
 *   argv[1] - database holding keggList, argv[2] - read-only genome database.
 * For every gene name in <roDb>.<table> (option "table", default knownGene)
 * the mRNA is looked up in kgXref and mapped to an Entrez gene id through
 * entrezMrna or, failing that, entrezRefseq.  Every keggList row for that id
 * produces a (kgId, locusID, mapID) line and a (mapID, desc) line.  The lines
 * are collected in j.dat / jj.dat, which are sorted, de-duplicated and then
 * removed. */
{
struct sqlConnection *conn, *conn3;
char query[256], query3[256];
struct sqlResult *sr, *sr3;
char **row, **row3;

FILE *o1, *o2;

char *locusID;	/* LocusLink ID */

char *kgTempDbName, *roDbName; 
char cond_str[200];
char *kgId;
char *mapID;
char *desc;
char *mRNA;

optionInit(&argc, argv, options);
if (argc != 3)  usage();
kgTempDbName    = argv[1];
roDbName 	= argv[2];

conn = hAllocConn(roDbName);
conn3= hAllocConn(roDbName);

o1 = fopen("j.dat",  "w");
o2 = fopen("jj.dat", "w");
if (o1 == NULL || o2 == NULL)
    {
    perror("can't create j.dat/jj.dat");
    return(1);
    }
    
table = optionVal("table", "knownGene");
sqlSafef(query, sizeof(query), "select name from %s.%s", roDbName, table);
sr = sqlMustGetResult(conn, query);
row = sqlNextRow(sr);
while (row != NULL)
    {
    kgId = row[0];
	
    /* NOTE(review): mRNA is used below without a NULL check - confirm that
     * every gene name has a kgXref row. */
    sqlSafefFrag(cond_str, sizeof(cond_str), "kgId='%s'", kgId);
    mRNA = sqlGetField(roDbName, "kgXref", "mRNA", cond_str);
    
    sqlSafefFrag(cond_str, sizeof(cond_str), "mrna='%s'", mRNA);
    locusID = sqlGetField("entrez", "entrezMrna", "geneId", cond_str);
    
    /* look for RefSeq if not found in mRNAs */
    if (locusID == NULL)
    	{
    	sqlSafefFrag(cond_str, sizeof(cond_str), "refseq='%s'", mRNA);
    	locusID = sqlGetField("entrez", "entrezRefseq", "geneId", cond_str);
	}

    if (locusID != NULL)
	{
        sqlSafef(query3, sizeof(query3), "select * from %s.keggList where locusID = '%s'", kgTempDbName, locusID);
        sr3 = sqlGetResult(conn3, query3);
        /* The loop condition is the only place that advances the cursor; a
         * second fetch inside the body would drop every other row. */
        while ((row3 = sqlNextRow(sr3)) != NULL)
            {
            mapID   = row3[1];
	    desc    = row3[2];
	    fprintf(o1, "%s\t%s\t%s\n", kgId, locusID, mapID);
	    fprintf(o2, "%s\t%s\n", mapID, desc);
            }
        sqlFreeResult(&sr3);
	}
    else
        {
	/* printf("%s not found in Entrez.\n", kgId);fflush(stdout);*/
        if (differentString(table, "knownGene"))
            {
            /* For other tables fall back to the name2 column, querying
             * keggList by the gene name itself. */
            sqlSafefFrag(cond_str, sizeof(cond_str), "name='%s'", kgId);
            locusID = sqlGetField(roDbName, table, "name2", cond_str);
            sqlSafef(query3, sizeof(query3), "select * from %s.keggList where locusID = '%s'", kgTempDbName, kgId);
            sr3 = sqlGetResult(conn3, query3);
            while ((row3 = sqlNextRow(sr3)) != NULL)
                {
                mapID   = row3[1];
                desc    = row3[2];
                fprintf(o1, "%s\t%s\t%s\n", kgId, locusID, mapID);
                fprintf(o2, "%s\t%s\n", mapID, desc);
                }
            sqlFreeResult(&sr3);
            }
        }
    row = sqlNextRow(sr);
    }
sqlFreeResult(&sr);

fclose(o1);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn3);

mustSystem("cat j.dat|sort|uniq >keggPathway.tab");
mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
Beispiel #10
0
int main(int argc, char *argv[])
/* Build kgXref.tab.
 *   argv[1] - database holding knownGene, argv[2] - protein database date,
 *   argv[3] - read-only genome database.
 * For every (name, proteinID) row of knownGene one line is written with
 * kgID, kgID, spID, protein display id, gene symbol, RefSeq id, protein
 * accession and description.  Lines are collected in j.dat, which is sorted,
 * de-duplicated and then removed. */
    {
    struct sqlConnection *conn, *conn2, *conn3;
    char query2[256];
    struct sqlResult *sr2;
    char **row2;
    char cond_str[256];  
  
    char *protDbDate;
    char *kgID;
    char *protDisplayId;
    
    FILE *o1;
    char *kgTempDb;
    char spDb[255],proteinsDb[255];
    char *ro_DB;
    char *refSeqName;
    char *hugoID;
    char *protAcc;	/* protein Accession number from NCBI */
    char *answer;
    char *emptyStr;
    char *parSpID;
    
    int leg;		/* marker for debugging */
    char *spID, *kgProteinID, *geneSymbol, *refseqID, *desc;

    if (argc != 4) usage();
    kgTempDb  = cloneString(argv[1]);
    protDbDate = cloneString(argv[2]);
    ro_DB = cloneString(argv[3]);
    
    safef(spDb, sizeof(spDb), "sp%s",  protDbDate);
    safef(proteinsDb, sizeof(proteinsDb), "proteins%s", protDbDate);

    conn = hAllocConn(ro_DB);
    conn2= hAllocConn(ro_DB);
    conn3= hAllocConn(ro_DB);

    o1 = mustOpen("j.dat", "w");

    emptyStr = strdup("");

    sqlSafef(query2, sizeof query2, "select name, proteinID from %s.knownGene;", kgTempDb);
    sr2 = sqlMustGetResult(conn2, query2);
    row2 = sqlNextRow(sr2);
    while (row2 != NULL)
	{
	kgID 		= row2[0];
	kgProteinID	= row2[1];
	
	/* These are only read, never modified or freed, so the shared empty
	 * string serves as default without allocating four strings per row. */
	refseqID 	= emptyStr;
	geneSymbol 	= emptyStr;
	desc		= emptyStr;
	protAcc		= emptyStr;

        sqlSafefFrag(cond_str, sizeof cond_str, "displayID='%s'", kgProteinID);
        spID = sqlGetField(proteinsDb, "spXref3", "accession", cond_str);
    
        /* process variant splice proteins */
	if (spID == NULL)
	    {
            sqlSafefFrag(cond_str, sizeof cond_str, "varAcc='%s'", kgProteinID);
	    spID = kgProteinID;
	    
            parSpID = sqlGetField(proteinsDb, "splicProt", "parAcc", cond_str);
	    if (parSpID != NULL)
	    	{
		/* NOTE(review): protDisplayId may come back NULL here and is
		 * used unchecked below - confirm the parent is always in spXref3. */
        	sqlSafefFrag(cond_str, sizeof cond_str, "accession='%s'", parSpID);
        	protDisplayId = sqlGetField(proteinsDb, "spXref3", "displayID", cond_str);
		}
	    else
	    	{
		fprintf(stderr, "%s not found in kgXref3 nor in varProtein.\n", kgProteinID);
		exit(1);
		}
	    }
	else
	    {
	    protDisplayId = kgProteinID;	
	    }
	/* use description for the protein as default, replace it with HUGO desc if available. */
	sqlSafefFrag(cond_str, sizeof cond_str, "displayID='%s'", protDisplayId);
        desc  = sqlGetField(proteinsDb, "spXref3", "description", cond_str);
        
        if (strstr(kgID, "NM_") != NULL)
            {
	    leg = 1;
            /* special processing for RefSeq DNA based genes */
            sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", kgID);
            refSeqName = sqlGetField(ro_DB, "refLink", "name", cond_str);
            if (refSeqName != NULL)
                {
                geneSymbol = cloneString(refSeqName);
		refseqID   = kgID;
            	sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", kgID);
            	desc = sqlGetField(ro_DB, "refLink", "product", cond_str);
		
		sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc='%s'", refseqID);
        	answer = sqlGetField(ro_DB, "refLink", "protAcc", cond_str);
        	if (answer != NULL)
            	    {
	    	    protAcc = strdup(answer);
	    	    }
                }
            }
        else
            {
            sqlSafefFrag(cond_str, sizeof cond_str, "displayID = '%s'", protDisplayId);
            hugoID = sqlGetField(proteinsDb, "spXref3", "hugoSymbol", cond_str);
            if (!((hugoID == NULL) || (*hugoID == '\0')) )
                {
		leg = 21;
                geneSymbol = cloneString(hugoID);

            	sqlSafefFrag(cond_str, sizeof cond_str, "displayID = '%s'", protDisplayId);
            	desc = sqlGetField(proteinsDb, "spXref3", "hugoDesc", cond_str);
		if (desc == NULL) 
		    {
		    printf("%s/%s don't have hugo desc ...\n", kgProteinID, protDisplayId);
		    fflush(stdout);
		    }
		}

	    refseqID = emptyStr;
	    protAcc  = emptyStr;
            sqlSafefFrag(cond_str, sizeof cond_str, "mrna = '%s'", kgID);
            answer = sqlGetField(ro_DB, "mrnaRefseq", "refseq", cond_str);
	    if (answer != NULL) 
	    	{
		refseqID = answer;
		}
	    else
	    	{
		/*printf("%s does not have a related RefSeq.\n", kgID);fflush(stdout); */
		}
	    
	    /* No HUGO symbol: fall back to the refLink name of the RefSeq. */
	    if (strlen(geneSymbol) == 0)
		{ 
		leg = 23;
		if (strlen(refseqID) != 0)
			{
			sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", refseqID);
			answer = sqlGetField(ro_DB, "refLink", "name", cond_str);
			if (answer != NULL) 
				{
				leg = 24;
				geneSymbol = strdup(answer);
				}
			}
                }
            }

	/* fix missing fields */
	if (strlen(refseqID) == 0)
		{
		/* printf("%3d %s reseqID is empty.\n", leg, kgID); */
		}

	if (strlen(geneSymbol) == 0)
		{
		/* printf("%3d %s geneSymbol is empty.\n", leg, kgID);fflush(stdout);*/
		geneSymbol = strdup(kgID);
		}

	/* Each description lookup above may have returned NULL, so test for
	 * that before taking the length. */
	if (desc == NULL || strlen(desc) == 0)
		{
		/* printf("%3d %s desc is empty.\n", leg, kgID);fflush(stdout); */
		desc = strdup("N/A");
		}
	
	fprintf(o1, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", 
		kgID, kgID, spID, protDisplayId, geneSymbol, refseqID, protAcc, desc);
	row2 = sqlNextRow(sr2);
	}
    sqlFreeResult(&sr2);

    fclose(o1);
    hFreeConn(&conn);
    hFreeConn(&conn2);
    hFreeConn(&conn3);
    mustSystem("cat j.dat|sort|uniq  >kgXref.tab");
    mustSystem("rm j.dat");
    return(0);
    }
Beispiel #11
0
int main(int argc, char *argv[])
/* Build proteinMrna.tab and protein.lis.
 *   argv[1] - protein data date, argv[2] - genome release.
 * For every row of <release>Temp.refGene, the EMBL rows of
 * proteins<date>.spXref2 whose extAC equals the gene name are fetched; each
 * contributes its display id to jj.dat and a (display id, extAC) pair to
 * j.dat.  Both files are sorted, de-duplicated and then removed.  A bioDB
 * index other than 1, 2 or 3 stops the program. */
{
struct sqlConnection *geneConn, *xrefConn;
char geneQuery[256], xrefQuery[256];
struct sqlResult *geneRes, *xrefRes;
char **geneRow, **xrefRow;
char *proteinDataDate, *genomeRelease;
FILE *idFile, *pairFile;

if (argc != 3) usage();
proteinDataDate = argv[1];
genomeRelease   = argv[2];

idFile   = fopen("jj.dat", "w");
pairFile = fopen("j.dat", "w");
geneConn = hAllocConn(hDefaultDb());
xrefConn = hAllocConn(hDefaultDb());

sqlSafef(geneQuery, sizeof geneQuery, "select * from %sTemp.refGene;", genomeRelease);
geneRes = sqlMustGetResult(geneConn, geneQuery);
for (geneRow = sqlNextRow(geneRes); geneRow != NULL; geneRow = sqlNextRow(geneRes))
    {
    /* Only the first column (the gene name) is needed for the lookup. */
    char *name = geneRow[0];

    sqlSafef(xrefQuery, sizeof xrefQuery,  "select * from proteins%s.spXref2 where extAC='%s' and extDB='EMBL';",
    	    proteinDataDate, name);
    xrefRes = sqlMustGetResult(xrefConn, xrefQuery);

    for (xrefRow = sqlNextRow(xrefRes); xrefRow != NULL; xrefRow = sqlNextRow(xrefRes))
	{
	char *displayID = xrefRow[1];
	char *extAC     = xrefRow[4];
	char *bioDBID   = xrefRow[6];

	/* Anything but index 1, 2 or 3 is unexpected and fatal. */
	if (strcmp(bioDBID, "1") != 0 &&
	    strcmp(bioDBID, "2") != 0 &&
	    strcmp(bioDBID, "3") != 0)
	    {
	    printf("non-recognized bioDB index %s encountered.\n", bioDBID);
	    printf("displayId=%s bioDBID=%s\n", displayID, bioDBID);
	    fflush(stdout);
	    exit(1);
	    }

	fprintf(idFile, "%s\n", displayID);
	fprintf(pairFile, "%s\t%s\n", displayID, extAC);
	}
    sqlFreeResult(&xrefRes);
    }
sqlFreeResult(&geneRes);

hFreeConn(&geneConn);
hFreeConn(&xrefConn);
fclose(idFile);
fclose(pairFile);

mustSystem("cat j.dat |sort|uniq >proteinMrna.tab");
mustSystem("cat jj.dat|sort|uniq >protein.lis");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
Beispiel #12
0
void pairedEndQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf)
/* Look for other end,  do a pairwise alignment, and save results in database.
 * Returns without doing anything when there is no other end, when the other
 * end has the larger file id, or when cdwQaPairedEndFastqFromVfs already
 * finds a record for the pair.  Otherwise subsamples of both ends are aligned
 * with bwa, statistics are computed by edwSamPairedEndStats and a row is
 * inserted into cdwQaPairedEndFastq.  All temp files are removed. */
{
verbose(2, "pairedEndQa on %u %s %s\n", ef->id, ef->cdwFileName, ef->submitFileName);
/* Get other end, return if not found. */
struct cdwValidFile *otherVf = cdwOppositePairedEnd(conn, ef, vf);
if (otherVf == NULL)
    return;

/* Skip when the other end has the larger file id. */
if (otherVf->fileId > vf->fileId)
    {
    cdwValidFileFree(&otherVf);
    return;
    }

struct cdwValidFile *vf1, *vf2;
struct cdwQaPairedEndFastq *pair = cdwQaPairedEndFastqFromVfs(conn, vf, otherVf, &vf1, &vf2);
if (pair != NULL)
    {
    /* NOTE(review): pair itself is not released here - confirm ownership. */
    cdwValidFileFree(&otherVf);
    return;
    }

/* Get target assembly and figure out path for BWA index. */
struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, vf->ucscDb);
assert(assembly != NULL);
char genoFile[PATH_LEN];
safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", 
    cdwValDataDir, assembly->ucscDb, assembly->ucscDb);

verbose(1, "aligning subsamples on %u vs. %u paired reads\n", vf1->fileId, vf2->fileId);

/* Make alignments of subsamples. */
char *sample1 = NULL, *sample2 = NULL, *sai1 = NULL, *sai2 = NULL;
makeTmpSai(conn, vf1, genoFile, &sample1, &sai1);
makeTmpSai(conn, vf2, genoFile, &sample2, &sai2);

/* Make paired end alignment */
char *tmpSam = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sam"));
char command[6*PATH_LEN];
safef(command, sizeof(command),
   "bwa sampe -n 1 -N 1 -f %s %s %s %s %s %s"
   , tmpSam, genoFile, sai1, sai2, sample1, sample2);
mustSystem(command);

/* Make ra file with pairing statistics */
char *tmpRa = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".ra"));
safef(command, sizeof(command), 
    "edwSamPairedEndStats -maxInsert=%d %s %s", maxInsert, tmpSam, tmpRa);
mustSystem(command);

/* Read RA file into variables. */
struct cdwQaPairedEndFastq *pe = cdwQaPairedEndFastqOneFromRa(tmpRa);

/* Update database with record.  The insert goes through the connection that
 * is opened here for it, rather than through conn, which has been held open
 * across the alignment steps above. */
struct sqlConnection *freshConn = cdwConnectReadWrite();
char query[256];
sqlSafef(query, sizeof(query),
    "insert into cdwQaPairedEndFastq "
    "(fileId1,fileId2,concordance,distanceMean,distanceStd,distanceMin,distanceMax,recordComplete) "
    " values (%u,%u,%g,%g,%g,%g,%g,1)"
    , vf1->fileId, vf2->fileId, pe->concordance, pe->distanceMean
    , pe->distanceStd, pe->distanceMin, pe->distanceMax);
sqlUpdate(freshConn, query);
sqlDisconnect(&freshConn);

/* Clean up and go home. */
remove(sample1);
remove(sample2);
remove(sai1);
remove(sai2);
remove(tmpSam);
remove(tmpRa);
freez(&sample1);
freez(&sample2);
freez(&sai1);
freez(&sai2);
freez(&tmpSam);
freez(&tmpRa);
cdwQaPairedEndFastqFree(&pe);
cdwValidFileFree(&otherVf);
}
int main(int argc, char *argv[])
/* Filter the candidate records listed in sorted.lis into knownGene.tab and
 * duplicate.tab.
 *   argv[1] - protein data date; appended to "sp" and "proteins" to form db names
 *   argv[2] - genome database name; <genomeDB>Temp.knownGene0 is queried by alignID
 * Each input line is tab-separated: three leading fields that together identify a
 * CDS block (the comments below name them chrom / cds block start / cds block end),
 * then priority score, mRNA length, mRNA date, mRNA ID, protein ID and alignID.
 * A line whose leading three fields equal the previous line's is a duplicate and is
 * written to jj.dat; any other line is looked up in knownGene0 and written to j.dat
 * when its coding length passes the filter.  Returns 0. */
{
char *skippedKgId;
char *lastValidKgId;

struct sqlConnection *conn2, *conn3;
struct sqlResult *sr2;
char query2[256];
char **row2;

char *proteinID;
FILE   *o3, *o7;
char *name, *chrom, *strand, *txStart, *txEnd, *cdsStart, *cdsEnd,
     *exonCount, *exonStarts, *exonEnds;

char *alignID;

char *chp;
int  i, j;

int  isDuplicate;

char *genomeDBname;
char *proteinDataDate;
char proteinsDB[40];
char spDB[40];
char *acc;

#define MAX_EXON 1000
int exStart[MAX_EXON], exEnd[MAX_EXON];
int exCount;

int aaStart[MAX_EXON], aaEnd[MAX_EXON];

char *sp, *ep;

int  aalen;
int  cdsS, cdsE;
int  eS, eE;

if (argc != 3) usage();

proteinDataDate = argv[1];
genomeDBname    = argv[2];

safef(spDB, sizeof(spDB), "sp%s", proteinDataDate);
safef(proteinsDB, sizeof(proteinsDB), "proteins%s", proteinDataDate);

/* mustOpen aborts with a message rather than returning NULL, so a read-only
 * working directory no longer leads to fprintf on a NULL stream. */
o3 = mustOpen("j.dat", "w");
o7 = mustOpen("jj.dat", "w");

conn2= hAllocConn(genomeDBname);
conn3= hAllocConn(genomeDBname);

inf  = mustOpen("sorted.lis", "r");

strcpy(oldInfo, "");

skippedKgId   = cloneString("");
lastValidKgId = cloneString("");

isDuplicate   = 0;
oldMrnaStr    = cloneString("");
oldAlignStr   = cloneString("");
oldProteinStr = cloneString("");

mrnaStr       = cloneString("");
proteinStr    = cloneString("");
alignStr      = cloneString("");

/* NOTE(review): every line is assumed to carry all tab-separated fields; a short
 * line would make strstr() return NULL below. */
while (fgets(line_in, 10000, inf) != NULL)
    {
    strcpy(line, line_in);

    chp = strstr(line, "\t");	/* chrom */
    chp ++;

    chp = strstr(chp, "\t");	/* cds block start position */
    chp ++;

    chp = strstr(chp, "\t");	/* cds block end   position */
    *chp = '\0';
    chp++;
    /* line is now cut after the third field; that prefix is the duplicate key. */
    strcpy(newInfo, line);

    if (sameString(oldInfo, newInfo))
        {
        isDuplicate = 1;
        }
    else
        {
        /* remember previous record as old only if it is not a duplicate */
        if (!isDuplicate)
            {
            oldMrnaStr    = mrnaStr;
            oldProteinStr = proteinStr;
            oldAlignStr   = alignStr;
            }
        strcpy(oldInfo, newInfo);
        isDuplicate = 0;
        }

    chp = strstr(chp, "\t");	/* priority score */
    chp ++;

    chp = strstr(chp, "\t");	/* mRNA transcription length */
    chp ++;

    chp = strstr(chp, "\t");	/* mRNA date */
    chp ++;

    mrnaStr = chp;
    chp = strstr(chp, "\t");	/* mRNA ID */
    *chp = '\0';
    chp ++;
    mrnaStr = cloneString(mrnaStr);

    proteinStr = chp;
    chp = strstr(chp, "\t");	/* protein ID */
    *chp = '\0';
    chp ++;
    proteinStr = cloneString(proteinStr);

    alignID = chp;

    /* get rid of "end-of-line" character at the end of the string */
    alignStr = trimSpaces(alignID);

    if (isDuplicate)
        {
        /* only put out records for valid KG entries */
        if (!sameString(oldMrnaStr, skippedKgId) || sameString(oldMrnaStr, lastValidKgId))
            {
            fprintf(o7, "%s\t%s\t%s\t%s\n", oldMrnaStr, oldProteinStr, mrnaStr, proteinStr);
            }
        }
    else
        {
        safef(query2, sizeof(query2), "select * from %sTemp.knownGene0 where alignID='%s';", genomeDBname, alignID);
        sr2 = sqlMustGetResult(conn2, query2);
        row2 = sqlNextRow(sr2);
        while (row2 != NULL)
            {
            name        = row2[0];
            chrom       = row2[1];
            strand      = row2[2];
            txStart     = row2[3];
            txEnd       = row2[4];
            cdsStart    = row2[5];
            cdsEnd      = row2[6];
            exonCount   = row2[7];
            exonStarts  = row2[8];
            exonEnds    = row2[9];

            proteinID = row2[10];
            alignID   = row2[11];

            sscanf(exonCount, "%d", &exCount);
            /* exStart/exEnd/aaStart/aaEnd are fixed-size stack arrays. */
            if (exCount > MAX_EXON)
                {
                fprintf(stderr, "%s has %d exons, more than the %d supported\n",
                        name, exCount, MAX_EXON);
                exit(1);
                }
            /* Parse copies so exonStarts/exonEnds stay intact for the output row. */
            sp = cloneString(exonStarts);
            ep = cloneString(exonEnds);

            sscanf(cdsStart, "%d", &cdsS);
            sscanf(cdsEnd, "%d", &cdsE);

            aalen = 0;
            j=0;
            for (i=0; i<exCount; i++)
                {
                chp = strstr(sp, ",");
                *chp = '\0';
                sscanf(sp, "%d", &(exStart[i]));
                chp++;
                sp = chp;

                chp = strstr(ep, ",");
                *chp = '\0';
                sscanf(ep, "%d", &(exEnd[i]));

                /* Clip the exon to the CDS range; exons outside it collapse to 0,0. */
                eS = exStart[i];
                eE = exEnd[i];

                if (cdsS > eS)
                    {
                    eS = cdsS;
                    }
                if (cdsE < eE)
                    {
                    eE = cdsE;
                    }
                if (eS > eE)
                    {
                    eS = 0;
                    eE = 0;
                    }
                if (eS != eE)
                    {
                    aaStart[j] = aalen;
                    aaEnd[j] = aaStart[j] + (eE- eS +1)/3 -1;
                    aalen = aalen + (eE- eS +1)/3;

                    j++;
                    }

                chp++;
                ep = chp;
                }

            cdsLen = aalen;

            safef(cond_str, sizeof(cond_str), "val='%s'", proteinID);
            acc = sqlGetField(spDB, "displayId", "acc", cond_str);

            safef(cond_str, sizeof(cond_str), "acc='%s'", acc);
            aaStr=sqlGetField(spDB, "protein", "val", cond_str);
            /* A missing protein record counts as length 0 instead of crashing
             * in strlen(NULL). */
            aaLen = (aaStr != NULL) ? strlen(aaStr) : 0;

            /* The ratio test is skipped for an empty protein to avoid dividing by 0. */
            if ((cdsLen >  50) || ((aaLen > 0) && ((cdsLen * 100)/aaLen > 50)))
                {
                fprintf(o3,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
                        name,
                        chrom,
                        strand,
                        txStart,
                        txEnd,
                        cdsStart,
                        cdsEnd,
                        exonCount,
                        exonStarts,
                        exonEnds,

                        proteinID,
                        alignID);
                lastValidKgId = cloneString(name);
                }
            else
                {
                printf("skipping %s %d \n", name, cdsLen);
                skippedKgId = cloneString(name);
                }
            row2 = sqlNextRow(sr2);
            }
        sqlFreeResult(&sr2);
        }
    }
fclose(inf);
hFreeConn(&conn2);
hFreeConn(&conn3);
fclose(o3);
fclose(o7);

/* Sort and de-duplicate the scratch files into the final tables. */
mustSystem("cat j.dat|sort|uniq  >knownGene.tab");
mustSystem("cat jj.dat|sort|uniq >duplicate.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
Beispiel #14
0
int main(int argc, char *argv[])
/* Build geneAlias.tab and kgAliasM.tab from the hgnc table.
 *   argv[1] - genome database holding kgXref
 *   argv[2] - database holding the hgnc table
 * Two passes are made over <proteinDB>.hgnc: the first reads the aliases column,
 * the second prvSymbols.  For each symbol that has a kgXref entry, the symbol and
 * every comma-separated alias are written against its kgID.  Returns 0. */
{
struct sqlConnection *conn, *conn2;

char query2[256];
struct sqlResult *sr2;
char **row2;

char *chp0, *chp;
char *kgID;
FILE *o1, *o2;
char cond_str[256];
char *database;
char *proteinDB;
boolean doingAlias, bothDone;

char *answer;
char *symbol, *alias, *aliases;

if (argc != 3) usage();
database  = cloneString(argv[1]);
proteinDB = cloneString(argv[2]);

conn = hAllocConn(database);
conn2= hAllocConn(database);
o1 = fopen("j.dat", "w");
o2 = fopen("jj.dat", "w");
if (o1 == NULL || o2 == NULL)
    {
    fprintf(stderr, "Can't create j.dat/jj.dat in the current directory\n");
    exit(1);
    }

doingAlias = TRUE;
bothDone   = FALSE;

while (!bothDone)
    {
    if (doingAlias)
        {
        sqlSafef(query2, sizeof query2, "select symbol, aliases from %s.hgnc;", proteinDB);
        }
    else
        {
        sqlSafef(query2, sizeof query2, "select symbol, prvSymbols from %s.hgnc;", proteinDB);
        }

    sr2 = sqlMustGetResult(conn2, query2);
    row2 = sqlNextRow(sr2);
    while (row2 != NULL)
        {
        symbol  = row2[0];
        aliases = row2[1];

        if ( (symbol  != NULL) && (strlen(symbol) != 0) )
            {
            sqlSafefFrag(cond_str, sizeof cond_str, "geneSymbol = '%s'", symbol);
            answer = sqlGetField(database, "kgXref", "kgID", cond_str);
            if (answer != NULL)
                {
                /* Use the lookup result directly; it is only read before the
                 * next sqlGetField call, so no private copy is needed. */
                kgID = answer;
                fprintf(o2, "%s\t%s\n", kgID, symbol);

                if ( (aliases  != NULL) && (strlen(aliases) != 0) )
                    {
                    /* Split the alias list in place: the row buffer is ours
                     * until the next sqlNextRow, so no per-alias copies. */
                    chp0 = aliases;
                    while (chp0 != NULL)
                        {
                        while (*chp0 == ' ') chp0++;
                        chp = strstr(chp0, ",");
                        if (chp != NULL)
                            *chp = '\0';
                        alias = chp0;

                        /* get rid of quote character in some aliases */
                        if (*alias == '"')
                            {
                            *(alias + strlen(alias) - 1) = '\0';
                            alias++;
                            printf("%s\n", alias);fflush(stdout);
                            }

                        fprintf(o1, "%s\t%s\t%s\n", kgID, symbol, alias);
                        fprintf(o2, "%s\t%s\n", kgID, alias);

                        chp0 = (chp != NULL) ? chp + 1 : NULL;
                        }
                    }
                }
            }
        row2 = sqlNextRow(sr2);
        }
    sqlFreeResult(&sr2);

    if (doingAlias)
        {
        doingAlias = FALSE;
        }
    else
        {
        bothDone = TRUE;
        }
    }
fclose(o1);
fclose(o2);

/* geneAlias.tab has 3 columns: kgID, the hgnc symbol, and one alias or
   previous symbol per row */

mustSystem("cat  j.dat|sort|uniq  >geneAlias.tab");

/*  kgAliasM.tab has 2 columns: kgID, then every symbol, alias and previous
    symbol for it, one per row. */
mustSystem("cat jj.dat|sort|uniq  >kgAliasM.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");

return(0);
}
Beispiel #15
0
int main(int argc, char *argv[])
{
struct sqlConnection *conn, *conn2, *conn3;
struct sqlConnection *connCentral = hConnectCentral();
char query[256], query2[256], query3[256];
struct sqlResult *sr, *sr2;
char **row, **row2;
char buf[128];
char *answer;
char *kgID, *chrom, *txStart, *txEnd;
char *mRNA;
int i;
int geneCnt  = 0;
int pageNum  = 0;
int topLevel = 1;

char *geneSymbol, *proteinID, *spID, *desc;
FILE *outf, *outf2;
char fileName[255];
database = strdup("hg17");
boolean newPage;
int totalKgId, totalKgCnt;
int totalKgPage;
int kgIdCnt = 0;

if (argc != 2) usage();
database = argv[1];

sqlSafef(query, sizeof query, "select genome from dbDb where name = '%s'", database);
answer = sqlQuickQuery(connCentral, query, buf, sizeof(buf));
if (answer == NULL)
    {
    fprintf(stderr,"'%s' is not a valid genome database name.", database);
    exit(1);
    }
else
    {
    genome = strdup(answer);
    }

if (!hTableExists(database, "knownGene"))
    {
    fprintf(stderr,"Database %s currently does not have UCSC Known Genes.", database);
    exit(1);
    }

sqlSafef(query, sizeof query, "select description from dbDb where name = '%s'", database);

genomeDesc = strdup(sqlQuickQuery(connCentral, query, buf, sizeof(buf)));
hDisconnectCentral(&connCentral);

/* create first top level subdirectory */
safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel);
mustSystem(command);

conn = hAllocConn(database);
conn2= hAllocConn(database);
conn3= hAllocConn(database);

newPage  = TRUE;

currentPage = 0;

/* put this in to avoid compiler complaining */
outf = NULL;
geneSymbol = NULL;
char *protAcc = NULL;

/* figure out how many pages in total */
sqlSafef(query2, sizeof(query2), "select count(k.name) from %s.knownGene k, %s.kgXref x where k.name=x.kgId and geneSymbol != ''", database, database);
sr2  = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
totalKgCnt = atoi(row2[0]);
sqlFreeResult(&sr2);

/* figure out how many KG IDs in total */
sqlSafef(query2, sizeof(query2), "select count(*) from %s.kgXref where geneSymbol !=''", database);
sr2  = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
totalKgId = atoi(row2[0]);
sqlFreeResult(&sr2);
totalKgPage = totalKgId/LINKSPERPAGE + 1;

sqlSafef(query2, sizeof(query2),
      "select kgID, geneSymbol, description from %s.kgXref where geneSymbol!= '' order by geneSymbol",
      database);

      /* for debugging */
      /* "select kgID, geneSymbol, description from %s.kgXref order by geneSymbol limit %d",
      database, TESTSIZE);*/
sr2  = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);

/* for debugging */
/* while (kgIdCnt < TESTSIZE) */

while (kgIdCnt < totalKgId)
    {
    kgIdCnt++;

    kgID 	= row2[0];
    geneSymbol  = strdup(row2[1]);
    desc 	= row2[2];
    sqlSafef(query, sizeof(query),
    "select chrom,txSTart,txEnd,proteinID from %s.knownGene where name='%s'", database, kgID);
    sr = sqlMustGetResult(conn, query);
    row = sqlNextRow(sr);
    if (row != NULL)
    	{
	geneCnt++;
    	chrom     = row[0];
    	txStart   = row[1];
    	txEnd     = row[2];
    	proteinID = row[3];

	if (newPage)
	    {
	    /* create a KG links page */
	    pageNum++;
	    currentPage++;

	    /* use mkdir -p to make sure the subdirectory exists */
	    safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel);
	    mustSystem(command);
	    safef(fileName, sizeof(fileName),
	    	  "knownGeneList/%s/%d/kgList%d.html", database, topLevel, pageNum);
  	    outf = fopen(fileName, "w");
	    printHtmlHead(outf);

	    fprintf(outf,"<H2>UCSC %s Known Genes List (page %d of %d)</H2>\n",
	    	    genome, pageNum, totalKgPage);
	    fprintf(outf, "<TABLE BORDER=1=CELLSPACING=1 CELLPADDING=3 BGCOLOR=\"#D9F8E4\"><TR>\n");
	    fprintf(outf,
	    "<TR><TH>Gene Symbol</TH><TH>Known Gene ID</TH><TH>mRNA</TH><TH>UniProt</TH><TH>RefSeq Protein</TH><TH>Description</TH>\n");
	    strcpy(startSymbol[pageNum], geneSymbol);
	    strcpy(pageStartSymbol[currentPage], geneSymbol);
	    newPage = FALSE;
	    }

	fprintf(outf,"<TR>");
    	fprintf(outf,"<TD>%s</TD>", geneSymbol);
    	/*fprintf(outf,"<TD>%d:%s</TD>", geneCnt, geneSymbol);*/
    	fprintf(outf,"<TD>");
    	fprintf(outf,"<A href=\"/cgi-bin/hgGene?db=%s&hgg_gene=%s", database, kgID);
    	fprintf(outf,"&hgg_chrom=%s&hgg_start=%s&hgg_end=%s\">", chrom, txStart, txEnd);
    	fprintf(outf,"%s", kgID);
    	fprintf(outf,"</A>");
    	fprintf(outf,"</TD>\n");

	sqlSafef(query3,sizeof(query3),"select spID from %s.kgXref where kgID = '%s'", database, kgID);
	spID = cloneString(sqlQuickQuery(conn3, query3, buf, sizeof(buf)));
	if (spID == NULL)
	    {
	    spID = emptyString;
	    }
	else
	    {
	    if (sameWord(spID,"")) spID = emptyString;
	    }

	sqlSafef(query3,sizeof(query3),"select mRNA from %s.kgXref where kgID = '%s'", database, kgID);
	mRNA = cloneString(sqlQuickQuery(conn3, query3, buf, sizeof(buf)));
	if (mRNA == NULL)
	    {
	    mRNA = emptyString;
	    }
	else
	    {
	    if (sameWord(mRNA,"")) mRNA = emptyString;
	    }

	sqlSafef(query3,sizeof(query3),"select protAcc from %s.kgXref where kgID = '%s'", database, kgID);
	protAcc = sqlQuickQuery(conn3, query3, buf, sizeof(buf));
	if (protAcc == NULL)
	    {
	    protAcc = emptyString;
	    }
	else
	    {
	    if (sameWord(protAcc,"")) protAcc = emptyString;
	    }

	fprintf(outf,"<TD>%s</TD>", mRNA);
	fprintf(outf,"<TD>%s</TD>", spID);
	fprintf(outf,"<TD>%s</TD>", protAcc);
    	fprintf(outf,"<TD>%s</TD>", desc );
    	fprintf(outf,"</TR>\n");

	if ((geneCnt % LINKSPERPAGE) == 0)
    	    {
	    /* flush out and close the page if a page is filled, and start a new page */
	    fprintf(outf,"</TABLE>");
	    strcpy(endSymbol[pageNum], geneSymbol);
	    strcpy(pageEndSymbol[currentPage], endSymbol[pageNum]);
	    fprintf(outf, "<BR>");
	    fprintf(outf, "<A href=\"/knownGeneList/%s/%d/kgIndex%d.html\">",
	    	    database, topLevel,topLevel);
	    fprintf(outf, "Up");
	    fprintf(outf,"</A><BR>\n");
	    printHtmlEnd(outf);
	    newPage = TRUE;
	    fclose(outf);
	    outf = NULL;

	    if ((pageNum % LINKSPERPAGE) == 0 )
	    	{
	    	printf("Processing topLevel %d ...\n", topLevel);fflush(stdout);
	    	safef(fileName, sizeof(fileName),
	    	      "knownGeneList/%s/%d/kgIndex%d.html", database, topLevel, topLevel);
	    	outf2 = fopen(fileName, "w");
	    	printHtmlHead(outf2);
		//fprintf(outf2,"<H2>UCSC %s Known Genes List</H2>\n", genome);
		fprintf(outf2,"<H2>UCSC %s Known Genes List (Group %d)</H2>\n", genome, topLevel);
	    	for (i=1; i<= currentPage; i++)
	      	    {
	      	    fprintf(outf2, "Page %d: ", (topLevel-1)*LINKSPERPAGE+i);
	            fprintf(outf2,
	      	    	    "<A href=\"/knownGeneList/%s/%d/kgList%d.html\">",
	             	    database, topLevel, (topLevel-1)*LINKSPERPAGE+i);
	      	    fprintf(outf2, "%s to %s", pageStartSymbol[i], pageEndSymbol[i]);
    	      	    fprintf(outf2,"</A><BR>\n");
	      	    }
		fprintf(outf2, "<BR>");
		fprintf(outf2, "<A href=\"/knownGeneList/%s/top.html\">",database);
		fprintf(outf2, "Up");
		fprintf(outf2,"</A><BR>\n");
	    	printHtmlEnd(outf2);
	    	fclose(outf2);

	    strcpy(topStartSymbol[topLevel], pageStartSymbol[1]);
	    strcpy(  topEndSymbol[topLevel], pageEndSymbol[currentPage]);
	    currentPage = 0;
	    topLevel++;
	    }
    	}
	row = sqlNextRow(sr);
    	}
    sqlFreeResult(&sr);
    row2 = sqlNextRow(sr2);
    }
sqlFreeResult(&sr2);

/* flush out and close the last list page */
if (outf != NULL)
    {
    fprintf(outf,"</TABLE>");
    strcpy(endSymbol[pageNum], geneSymbol);
    strcpy(pageEndSymbol[currentPage], endSymbol[pageNum]);
    fprintf(outf, "<BR>");
    fprintf(outf, "<A href=\"/knownGeneList/%s/%d/kgIndex%d.html\">",
    database, topLevel,topLevel);
    fprintf(outf, "Up");
    fprintf(outf,"</A><BR>\n");
    printHtmlEnd(outf);
    fclose(outf);
    }

/* generate the last index page */
safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel);
mustSystem(command);
safef(fileName, sizeof(fileName),
      "knownGeneList/%s/%d/kgIndex%d.html", database, topLevel, topLevel);
outf2 = fopen(fileName, "w");
printHtmlHead(outf2);
fprintf(outf2,"<H2>UCSC %s Known Genes List (Group %d)</H2>\n", genome, topLevel);
for (i=1; i<= currentPage; i++)
    {
    fprintf(outf2, "Page %d: ", (topLevel-1)*LINKSPERPAGE+i);
    fprintf(outf2, "<A href=\"/knownGeneList/%s/%d/kgList%d.html\">",
	    database, topLevel, (topLevel-1)*LINKSPERPAGE+i);
    fprintf(outf2, "%s to %s", pageStartSymbol[i], pageEndSymbol[i]);
    fprintf(outf2,"</A><BR>\n");
    fflush(outf2);
    }

fprintf(outf2, "<BR>");
fprintf(outf2, "<A href=\"/knownGeneList/%s/top.html\">",database);
fprintf(outf2, "Up");
fprintf(outf2,"</A><BR>\n");
strcpy(topStartSymbol[topLevel], pageStartSymbol[1]);
strcpy(  topEndSymbol[topLevel], pageEndSymbol[currentPage]);

fclose(outf2);

currentPage = 0;

/* generate the top HTML page */
safef(fileName, sizeof(fileName), "knownGeneList/%s/top.html", database);
outf2 = fopen(fileName, "w");
printHtmlHead(outf2);
fprintf(outf2,"<H2>UCSC %s Known Genes List</H2>\n", genome);
for (i=1; i<= topLevel; i++)
    {
    fprintf(outf2, "Group %d: ", i);
    fprintf(outf2, "<A href=\"/knownGeneList/%s/%d/kgIndex%d.html\">", database, i, i);
    fprintf(outf2, " %s to %s", topStartSymbol[i], topEndSymbol[i]);
    fprintf(outf2,"</A><BR>\n");
    fflush(outf2);
    }

fprintf(outf2, "<BR>");
fprintf(outf2, "<A href=\"/knownGeneLists.html\">");
fprintf(outf2, "Up");
fprintf(outf2,"</A><BR>\n");

printHtmlEnd(outf2);
printHtmlEnd(outf2);
fclose(outf2);

return(0);
}
Beispiel #16
0
int main(int argc, char *argv[])
/* Build keggPathway.tab (kgID, locusID, mapID) and keggMapDesc.tab (mapID, desc).
 *   argv[1] - database holding the keggList table
 *   argv[2] - database holding kgXref; also the database connected to
 * For every kgXref row the refseq accession is mapped to a gene ID through
 * entrez.entrezRefProt, and every keggList row for that gene ID is written out.
 * Returns 0. */
{
struct sqlConnection *conn, *conn2, *conn3;
char query[256], query3[256];
struct sqlResult *sr, *sr3;
char **row, **row3;
FILE *o1, *o2;

char *locusID;	/* LocusLink ID */
char *refAC;	/* Refseq accession.version */
char *kgTempDbName, *roDbName;
char cond_str[200];
char *kgID;
char *mapID;
char *desc;

if (argc != 3)  usage();
kgTempDbName    = argv[1];
roDbName        = argv[2];

conn = hAllocConn(roDbName);
conn2= hAllocConn(roDbName);
conn3= hAllocConn(roDbName);

o1 = fopen("j.dat",  "w");
o2 = fopen("jj.dat", "w");
if (o1 == NULL || o2 == NULL)
    {
    fprintf(stderr, "Can't create j.dat/jj.dat in the current directory\n");
    exit(1);
    }

sqlSafef(query, sizeof query, "select kgID, refseq from %s.kgXref", roDbName);
sr = sqlMustGetResult(conn, query);
row = sqlNextRow(sr);
while (row != NULL)
    {
    kgID  = row[0];
    refAC = row[1];

    sqlSafefFrag(cond_str, sizeof cond_str, "refseq='%s'", refAC);
    locusID = sqlGetField("entrez", "entrezRefProt", "geneID", cond_str);
    if (locusID != NULL)
        {
        sqlSafef(query3, sizeof query3, "select * from %s.keggList where locusID = '%s'", kgTempDbName, locusID);
        sr3 = sqlGetResult(conn3, query3);
        /* The loop condition is the only place the cursor advances; a second
         * sqlNextRow in the body used to drop every other pathway row. */
        while ((row3 = sqlNextRow(sr3)) != NULL)
            {
            mapID   = row3[1];
            desc    = row3[2];
            fprintf(o1, "%s\t%s\t%s\n", kgID, locusID, mapID);fflush(o1);
            fprintf(o2, "%s\t%s\n", mapID, desc);
            }
        sqlFreeResult(&sr3);
        }
    row = sqlNextRow(sr);
    }
sqlFreeResult(&sr);

fclose(o1);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn2);
hFreeConn(&conn3);

/* Sort and de-duplicate the scratch files into the final tables. */
mustSystem("cat j.dat|sort|uniq >keggPathway.tab");
mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
int main(int argc, char *argv[])
/* Build keggPathway.tab (kgID, locusID, mapID) and keggMapDesc.tab (mapID, desc).
 *   argv[1] - genome database name; the <db>Temp database supplies locus2Ref0,
 *             locus2Acc0 and keggList
 * For each locus2Ref0 row, the mRNA accessions of that locus are taken from
 * locus2Acc0; each accession that is a knownGene name has all keggList rows of
 * its locus written out.  Returns 0; exits 1 if a query does not fit its buffer
 * or a scratch file cannot be created. */
{
struct sqlConnection *conn, *conn2, *conn3;
char query[256], query2[256], query3[256];
struct sqlResult *sr, *sr2, *sr3;
char **row, **row2, **row3;

char *chp;
FILE *o1, *o2;

char *locusID;	/* LocusLink ID */
char *gbAC;		/* GenBank accession.version */
char *locusID2;	/* LocusLink ID */
char *dbName;
char cond_str[200];
char *kgID;
char *mapID;
char *desc;

if (argc != 2) usage();
dbName = argv[1];

conn = hAllocConn(dbName);
conn2= hAllocConn(dbName);
conn3= hAllocConn(dbName);

o1 = fopen("j.dat",  "w");
o2 = fopen("jj.dat", "w");
if (o1 == NULL || o2 == NULL)
    {
    fprintf(stderr, "Can't create j.dat/jj.dat in the current directory\n");
    exit(1);
    }

/* Every query is built with snprintf and rejected if it was truncated, so a
 * long name can neither overrun the buffer nor run as a shortened statement. */
if (snprintf(query2, sizeof query2, "select * from %sTemp.locus2Ref0;", dbName)
        >= (int)sizeof query2)
    {
    fprintf(stderr, "query too long for database %s\n", dbName);
    exit(1);
    }
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    locusID2    = row2[0];

    if (snprintf(query, sizeof query,
                 "select * from %sTemp.locus2Acc0 where locusID=%s and seqType='m';",
                 dbName, locusID2) >= (int)sizeof query)
        {
        fprintf(stderr, "query too long for locusID %s\n", locusID2);
        exit(1);
        }
    sr = sqlMustGetResult(conn, query);
    row = sqlNextRow(sr);
    while (row != NULL)
        {
        locusID = row[0];
        gbAC    = row[1];

        /* Drop the ".version" suffix; knownGene names carry no version. */
        chp = strstr(gbAC, ".");
        if (chp != NULL) *chp = '\0';

        if (snprintf(cond_str, sizeof cond_str, "name='%s'", gbAC)
                >= (int)sizeof cond_str)
            {
            fprintf(stderr, "accession too long: %s\n", gbAC);
            exit(1);
            }
        kgID = sqlGetField(dbName, "knownGene", "name", cond_str);
        if (kgID != NULL)
            {
            if (snprintf(query3, sizeof query3,
                         "select * from %sTemp.keggList where locusID = '%s'",
                         dbName, locusID) >= (int)sizeof query3)
                {
                fprintf(stderr, "query too long for locusID %s\n", locusID);
                exit(1);
                }
            sr3 = sqlGetResult(conn3, query3);
            /* The loop condition is the only place the cursor advances; a second
             * sqlNextRow in the body used to drop every other pathway row. */
            while ((row3 = sqlNextRow(sr3)) != NULL)
                {
                mapID   = row3[1];
                desc    = row3[2];
                fprintf(o1, "%s\t%s\t%s\n", kgID, locusID, mapID);
                fprintf(o2, "%s\t%s\n", mapID, desc);
                }
            sqlFreeResult(&sr3);
            }
        row = sqlNextRow(sr);
        }
    /* Release the per-locus result before the next query reuses conn. */
    sqlFreeResult(&sr);
    row2 = sqlNextRow(sr2);
    }
sqlFreeResult(&sr2);

fclose(o1);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn2);
hFreeConn(&conn3);

/* Sort and de-duplicate the scratch files into the final tables. */
mustSystem("cat j.dat|sort|uniq >keggPathway.tab");
mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
int main(int argc, char *argv[])
/* Build bioCycPathway.tab (kgID, refseqID, mapID).
 *   argv[1] - input file; each line is a RefSeq ID followed by one or more map
 *             IDs, separated by tabs
 *   argv[2] - genome database holding kgAlias and refLink
 * The RefSeq ID is resolved to a kgID through kgAlias, falling back to the
 * refLink gene name when the accession itself is not an alias.  Lines without a
 * tab are skipped.  Returns 0. */
{
struct sqlConnection *conn;
FILE *o1;
char *tab, *cursor, *next;
char *genomeDBname;
char *refseqID, *mapID;
char *kgID, *geneSymbol;

if (argc != 3) usage();

infileName      = argv[1];
genomeDBname    = argv[2];

conn= hAllocConn(genomeDBname);
o1 = mustOpen("j.dat", "w");

inf  = mustOpen(infileName, "r");
while (fgets(line_in, 1000, inf) != NULL)
    {
    strcpy(line, line_in);
    tab = strstr(line, "\t");
    if (tab == NULL) continue;	/* no map IDs on this line */
    *tab = '\0';
    /* The first field is used in place, so its length is not limited by a
     * fixed-size copy. */
    refseqID = line;

    /* Walk the remaining fields; cursor never moves past the terminator, even
     * when the line has no trailing newline. */
    cursor = tab + 1;
    for (;;)
        {
        cursor += strspn(cursor, "\r\t\n");
        if (*cursor == '\0') break;
        mapID = cursor;
        next = cursor + strcspn(cursor, "\r\t\n");
        if (*next != '\0') *next++ = '\0';

        sprintf(cond_str, "alias='%s'", refseqID);
        kgID=sqlGetField(genomeDBname, "kgAlias", "kgID", cond_str);

        // check with refLink if not found in kgAlias
        if (kgID == NULL)
            {
            sprintf(cond_str, "mrnaAcc='%s'", refseqID);
            geneSymbol=sqlGetField(genomeDBname, "refLink", "name", cond_str);
            if (geneSymbol != NULL)
                {
                sprintf(cond_str, "alias='%s'", geneSymbol);
                kgID=sqlGetField(genomeDBname, "kgAlias", "kgID", cond_str);
                }
            }

        if (kgID != NULL)
            {
            fprintf(o1, "%s\t%s\t%s\n", kgID, refseqID, mapID);fflush(stdout);
            }
        else
            {
            printf("%s not found in kgAlias nor in refLink\n", refseqID);
            }

        // process remaining map ID(s) on this line
        cursor = next;
        }
    }

fclose(inf);
fclose(o1);
/* Sort and de-duplicate the scratch file into the final table. */
mustSystem("cat j.dat|sort|uniq >bioCycPathway.tab");
mustSystem("rm j.dat");
return 0;
}
Beispiel #19
0
int main(int argc, char *argv[])
/* Build mrnaRefseq.tab, pairing each mRNA GenBank accession with the RefSeq
 * accession of the same locus.
 *   argv[1] - genome database name; <db>Temp.locus2Ref0 and locus2Acc0 are read
 * Only the first two columns of each table are used (locus ID and accession);
 * ".version" suffixes are stripped from both accessions.  Returns 0. */
{
struct sqlConnection *conn, *conn2, *conn3;
char query[256], query2[256];
struct sqlResult *sr, *sr2;
char **row, **row2;

char *chp;
FILE *o1;

char *gbAC;		/* GenBank accession.version */
char *locusID2;	/* LocusLink ID */
char *refAC;	/* Refseq accession.version */
char *dbName;

if (argc != 2) usage();
dbName = argv[1];

conn = hAllocConn(dbName);
conn2= hAllocConn(dbName);
conn3= hAllocConn(dbName);

o1 = fopen("j.dat", "w");
if (o1 == NULL)
    {
    fprintf(stderr, "Can't create j.dat in the current directory\n");
    exit(1);
    }

sqlSafef(query2, sizeof query2, "select * from %sTemp.locus2Ref0;", dbName);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    locusID2    = row2[0];
    refAC       = row2[1];

    /* Strip the version once per locus rather than once per mRNA row. */
    chp = strstr(refAC, ".");
    if (chp != NULL) *chp = '\0';

    sqlSafef(query, sizeof query, "select * from %sTemp.locus2Acc0 where locusID=%s and seqType='m';", dbName, locusID2);
    sr = sqlMustGetResult(conn, query);
    row = sqlNextRow(sr);
    while (row != NULL)
        {
        gbAC = row[1];

        chp = strstr(gbAC, ".");
        if (chp != NULL) *chp = '\0';

        fprintf(o1, "%s\t%s\n", gbAC, refAC);

        row = sqlNextRow(sr);
        }
    /* Release the per-locus result before the next query reuses conn. */
    sqlFreeResult(&sr);
    row2 = sqlNextRow(sr2);
    }

fclose(o1);
/* Free the result set before giving back the connection it came from. */
sqlFreeResult(&sr2);
hFreeConn(&conn);
hFreeConn(&conn2);
hFreeConn(&conn3);

/* Sort and de-duplicate the scratch file into the final table. */
mustSystem("cat j.dat|sort|uniq >mrnaRefseq.tab");
printf("mrnaRefseq.tab created.\n");
mustSystem("rm j.dat");
return(0);
}
int main(int argc, char *argv[])
/* Build kgProtAliasNCBI.tab (kgID, proteinID, protein accession).
 *   argv[1] - genome database holding knownGene and kgXref
 *   argv[2] - database holding refLink
 * For each known gene this writes its kgXref protAcc when non-empty, then either
 * the refLink protein accessions (when the kgID contains "NM_") or the
 * <db>Temp.locus2Acc0 protein accessions whose gbAC starts with the kgID.
 * Returns 0; exits 1 if a query does not fit its buffer or the scratch file
 * cannot be created. */
{
struct sqlConnection *conn, *conn2;

char query[256], query2[256];
struct sqlResult *sr, *sr2;
char **row, **row2;

char *chp;
char *kgID;
FILE *o2;
char cond_str[256];
char *database;
char *ro_db;

char *proteinID;
char *proteinAC;

if (argc != 3) usage();
database  = cloneString(argv[1]);
ro_db  = cloneString(argv[2]);

conn = hAllocConn(database);
conn2= hAllocConn(database);
o2 = fopen("jj.dat", "w");
if (o2 == NULL)
    {
    fprintf(stderr, "Can't create jj.dat in the current directory\n");
    exit(1);
    }

/* Every query is built with snprintf and rejected if it was truncated, so a
 * long name can neither overrun the buffer nor run as a shortened statement. */
if (snprintf(query2, sizeof query2, "select name, proteinID from %s.knownGene;", database)
        >= (int)sizeof query2)
    {
    fprintf(stderr, "query too long for database %s\n", database);
    exit(1);
    }
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    kgID = row2[0];
    proteinID = row2[1];

    // get RefSeq protein AC numbers (NP_xxxxx) if they exist
    if (snprintf(cond_str, sizeof cond_str, "kgID='%s'", kgID) >= (int)sizeof cond_str)
        {
        fprintf(stderr, "kgID too long: %s\n", kgID);
        exit(1);
        }
    proteinAC = sqlGetField(database, "kgXref", "protAcc", cond_str);
    if (proteinAC != NULL && strlen(proteinAC) > 0)
        {
        fprintf(o2, "%s\t%s\t%s\n", kgID, proteinID, proteinAC);
        }

    // get Genbank protein accession numbers
    if (strstr(kgID, "NM_") != NULL)
        {
        if (snprintf(query, sizeof query,
                     "select protAcc from %s.refLink where mrnaAcc = '%s';",
                     ro_db, kgID) >= (int)sizeof query)
            {
            fprintf(stderr, "query too long for kgID %s\n", kgID);
            exit(1);
            }
        sr = sqlMustGetResult(conn, query);
        row = sqlNextRow(sr);
        while (row != NULL)
            {
            proteinAC = row[0];
            fprintf(o2, "%s\t%s\t%s\n", kgID, proteinID, proteinAC);
            row = sqlNextRow(sr);
            }
        sqlFreeResult(&sr);
        }
    else
        {
        /* The %c argument supplies the SQL wildcard, giving  like '<kgID>%'. */
        if (snprintf(query, sizeof query,
                     "select proteinAC from %sTemp.locus2Acc0 where gbAC like '%s%c';",
                     database, kgID, '%') >= (int)sizeof query)
            {
            fprintf(stderr, "query too long for kgID %s\n", kgID);
            exit(1);
            }
        sr = sqlMustGetResult(conn, query);
        row = sqlNextRow(sr);
        while (row != NULL)
            {
            proteinAC = row[0];

            /* Drop the ".version" suffix. */
            chp = strstr(proteinAC, ".");
            if (chp != NULL)
                {
                *chp = '\0';
                }
            /* Accessions starting with '-' are not written. */
            if (proteinAC[0] != '-')
                {
                fprintf(o2, "%s\t%s\t%s\n", kgID, proteinID, proteinAC);
                }
            row = sqlNextRow(sr);
            }
        sqlFreeResult(&sr);
        }
    row2 = sqlNextRow(sr2);
    }
sqlFreeResult(&sr2);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn2);

/* Sort and de-duplicate the scratch file into the final table. */
mustSystem("cat jj.dat|sort|uniq  >kgProtAliasNCBI.tab");
mustSystem("rm jj.dat");

return(0);
}
void gsBig(char *faName, char *gtfName, 
	   char *suboptName, 
	   char *transName,
	   char *exeName, 
	   char *parName,
	   char *tmpDirName)
/* gsBig - Run Genscan on big input and produce GTF files.
 *   faName     - input fasta file
 *   gtfName    - GTF output file
 *   suboptName - optional output file; when given, genscan is run with -subopt
 *   transName  - optional output file passed to writeSeg
 *   exeName, parName, tmpDirName - when non-NULL, override the file-level
 *                exePath, parPath and tmpDir settings
 * Each sequence is cut into windows of at most winSize bases, stepSize apart.
 * Every window that is not all N is written to a temp fasta, run through
 * genscan and parsed; the per-window results are merged and written out.  With
 * the "prerun" option an existing genscan output file is parsed instead.  Temp
 * files are removed after each window unless "noRemove" is set. */
{
struct dnaSeq seq;
struct lineFile *lf = lineFileOpen(faName, TRUE);
FILE *gtfFile = mustOpen(gtfName, "w");
FILE *subFile = NULL;
FILE *transFile = NULL;
ZeroVar(&seq);

if (suboptName != NULL)
    subFile = mustOpen(suboptName, "w");
if (transName != NULL)
    transFile = mustOpen(transName, "w");
if (exeName != NULL)
    exePath = cloneString(exeName);
if (parName != NULL)
    parPath = cloneString(parName);
if (tmpDirName != NULL)
    tmpDir = cloneString(tmpDirName);

if (optionExists("prerun"))
    {
    char *preFileName = optionVal("prerun", NULL);
    char seqName[128];
    struct segment *seg = parseSegment(preFileName, 0, 100000000, seqName);
    writeSeg(seqName, seg, gtfFile, subFile, transFile);
    }
else
    {
    struct dyString *dy = newDyString(1024);
    char tempFa[512], tempGs[512];
    char dir1[256], root1[128], ext1[64];
    int myPid = (int)getpid();
    boolean keepTemp = optionExists("noRemove");

    splitPath(faName, dir1, root1, ext1);
    while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
        {
        int offset, sizeOne;
        struct segment *segList = NULL, *seg;
        char *seqName = cloneString(seq.name);
        int chunkNum = 0;

        for (offset = 0; offset < seq.size; offset += stepSize)
            {
            boolean allN = TRUE;
            int i;
            /* Temp names carry pid, sequence name and chunk number, so every
             * window gets its own pair of files. */
            safef(tempFa, sizeof(tempFa), "%s/temp_gsBig_%d_%s_%d.fa",
                  tmpDir, myPid, seqName, chunkNum);
            safef(tempGs, sizeof(tempGs), "%s/temp_gsBig_%d_%s_%d.genscan",
                  tmpDir, myPid, seqName, chunkNum);
            sizeOne = seq.size - offset;
            if (sizeOne > winSize) sizeOne = winSize;
            /* Genscan hangs forever if a chunk is all-N's... if so, 
             * then skip this chunk. */
            for (i=offset;  i < (offset+sizeOne);  i++)
                {
                if (seq.dna[i] != 'N' && seq.dna[i] != 'n')
                    {
                    allN = FALSE;
                    break;
                    }
                }
            if (allN)
                {
                printf("\ngsBig: skipping %s[%d:%d] -- it's all N's.\n\n",
                       seqName, offset, (offset+sizeOne-1));
                }
            else
                {
                faWrite(tempFa, "split", seq.dna + offset, sizeOne); 
                dyStringClear(dy);
                dyStringPrintf(dy, "%s %s %s", exePath, parPath, tempFa);
                if (suboptName != NULL)
                    dyStringPrintf(dy, " -subopt");
                dyStringPrintf(dy, " > %s", tempGs);
                verbose(3, "%s\n", dy->string);
                mustSystem(dy->string);
                seg = parseSegment(tempGs, offset, offset+sizeOne, NULL);
                slAddHead(&segList, seg);
                /* Remove this window's files now.  Doing it once after all
                 * sequences removed only the last window's pair, and used
                 * uninitialized names when the input held no sequence.
                 * NOTE(review): assumes parseSegment has finished reading
                 * tempGs by the time it returns - confirm. */
                if (!keepTemp)
                    {
                    remove(tempFa);
                    remove(tempGs);
                    }
                }
            chunkNum++;
            }
        slReverse(&segList);
        seg = mergeSegs(segList);
        writeSeg(seqName, seg, gtfFile, subFile, transFile);
        freez(&seqName);
        }
    }

/* Flush and close the outputs this function opened. */
fclose(gtfFile);
if (subFile != NULL)
    fclose(subFile);
if (transFile != NULL)
    fclose(transFile);
}
Beispiel #22
0
int main(int argc, char *argv[])
/* Build knownGenePep.tab and knownGeneMrna.tab (kgID, sequence).
 *   argv[1] - database holding knownGene, knownGenePep and knownGeneMrna
 *   argv[2] - protein data date; appended to "sp" and "proteins" to form db names
 *   argv[3] - database to connect to
 * For each knownGene name the protein comes from knownGenePep, or failing that
 * from sp<date>.protein via the gene's proteinID; the mRNA comes from
 * knownGeneMrna, or failing that from hGenBankGetMrna.  Missing sequences are
 * reported on stderr.  Returns 0; exits 1 on an over-long date argument or an
 * unwritable scratch file. */
{
struct sqlConnection *conn, *conn2;
char query2[256];
struct sqlResult *sr2;
char **row2;

char cond_str[256];
char *kgID;
char *proteinID;
char *seq;
char *acc;

char protDbName[100];
char spDbName[100];
char *dbName;
char *ro_dbName;

FILE *o1, *o2;

struct dnaSeq *kgSeq;

if (argc != 4) usage();

o1 = fopen("j.dat",  "w");
o2 = fopen("jj.dat", "w");
if (o1 == NULL || o2 == NULL)
    {
    fprintf(stderr, "Can't create j.dat/jj.dat in the current directory\n");
    exit(1);
    }

dbName = argv[1];
ro_dbName = argv[3];
/* Bounded formatting: a long argv[2] must not overrun the 100-byte names. */
if (snprintf(protDbName, sizeof protDbName, "proteins%s", argv[2]) >= (int)sizeof protDbName
    || snprintf(spDbName, sizeof spDbName, "sp%s", argv[2]) >= (int)sizeof spDbName)
    {
    fprintf(stderr, "protein data date too long: %s\n", argv[2]);
    exit(1);
    }

conn= hAllocConn(ro_dbName);
conn2= hAllocConn(ro_dbName);
sqlSafef(query2, sizeof query2, "select name from %s.knownGene;", dbName);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    kgID    = row2[0];

    /* Protein sequence: knownGenePep first, then the sp<date> database. */
    sqlSafefFrag(cond_str, sizeof cond_str, "name = '%s';", kgID);
    seq = sqlGetField(dbName, "knownGenePep", "seq", cond_str);
    if (seq != NULL)
        {
        fprintf(o1, "%s\t%s\n", kgID, seq);fflush(o1);
        }
    else
        {
        sqlSafefFrag(cond_str, sizeof cond_str, "name = '%s';", kgID);
        proteinID=sqlGetField(dbName, "knownGene", "proteinID", cond_str);
        if (proteinID != NULL)
            {
            sqlSafefFrag(cond_str, sizeof cond_str, "val = '%s';", proteinID);
            acc = sqlGetField(spDbName, "displayId", "acc", cond_str);
            if (acc == NULL)
                {
                fprintf(stderr, "NO acc.displayId.%s: %s from name.knownGene.%s: %s\n", spDbName, proteinID, dbName, kgID);
                fflush(stderr);
                }
            else
                {
                sqlSafefFrag(cond_str, sizeof cond_str, "acc = '%s';", acc);
                seq = sqlGetField(spDbName, "protein", "val", cond_str);
                if (seq == NULL)
                    {
                    fprintf(stderr, "NO protein seq for %s\n", kgID);
                    fprintf(stderr, "proteinID.knownGene.%s: %s, acc.displayID.%s: %s\n", dbName, proteinID, spDbName, acc);
                    fflush(stderr);
                    }
                else
                    {
                    fprintf(o1, "%s\t%s\n", kgID, seq);
                    }
                }
            }
        else
            {
            fprintf(stderr, "kgID: %s not in knownGenePep or knownGene\n", kgID);
            }
        }

    /* mRNA sequence: knownGeneMrna first, then hGenBankGetMrna. */
    sqlSafefFrag(cond_str, sizeof cond_str, "name = '%s';", kgID);

    seq = sqlGetField(dbName, "knownGeneMrna", "seq", cond_str);
    if (seq != NULL)
        {
        /* Flush the stream that was just written (was o1). */
        fprintf(o2, "%s\t%s\n", kgID, seq);fflush(o2);
        }
    else
        {
        kgSeq = hGenBankGetMrna(dbName, kgID, NULL);

        if (kgSeq != NULL)
            {
            fprintf(o2, "%s\t%s\n", kgID, kgSeq->dna);fflush(o2);
            }
        else
            {
            fprintf(stderr, "NO mRNA seq for %s\n", kgID);fflush(stderr);
            }
        }
    row2 = sqlNextRow(sr2);
    }

sqlFreeResult(&sr2);
hFreeConn(&conn);
hFreeConn(&conn2);

fclose(o1);
fclose(o2);

/* Sort and de-duplicate the scratch files into the final tables. */
mustSystem("cat j.dat |sort|uniq > knownGenePep.tab");
mustSystem("cat jj.dat|sort|uniq > knownGeneMrna.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");

return(0);
}