void doRnaFoldDisplay(struct sqlConnection *conn, char *geneId, char *geneName) /* Show RNA folding somehow. */ { char *table = cartString(cart, hggMrnaFoldRegion); char *how = cartString(cart, hggDoRnaFoldDisplay); struct rnaFold *fold = loadFold(conn, table, geneId); if (fold == NULL) { warn("Couldn't load %s from %s", geneId, table); return; } if (sameString(how, "text")) { hPrintf("<TT><PRE>"); hPrintf("%s\n%s (%1.2f)\n", fold->seq, fold->fold, fold->energy); hPrintf("</PRE></TT>"); } else if (sameString(how, "picture")) { char *psFile = cartString(cart, hggMrnaFoldPs); char *rootName = cloneString(psFile); char pngName[256]; char pdfName[256]; chopSuffix(rootName); safef(pngName, sizeof(pngName), "%s.png", rootName); safef(pdfName, sizeof(pngName), "%s.pdf", rootName); hPrintf("<H2>%s (%s) %s energy %1.2f</H2>\n", geneName, geneId, table, fold->energy); if (!fileExists(pdfName)) { char command[512]; safef(command, sizeof(command), "ps2pdf %s %s" , psFile, pdfName); mustSystem(command); } hPrintf("Click <A HREF=\"%s\">here for PDF version</A><BR>", pdfName); if (!fileExists(pngName)) { char command[512]; safef(command, sizeof(command), "gs -sDEVICE=png16m -sOutputFile=%s -dBATCH -dNOPAUSE -q %s" , pngName, psFile); mustSystem(command); } hPrintf("<IMG SRC=\"%s\">", pngName); } }
void makeTmpSai(struct sqlConnection *conn, struct cdwValidFile *vf, char *genoFile, char **retSampleFile, char **retSaiFile) /* Given a fastq file, make a subsample of it 100k reads long and align it with * bwa producing a sai file of given name. */ { /* Get fastq record */ long long fileId = vf->fileId; struct cdwFastqFile *fqf = cdwFastqFileFromFileId(conn, fileId); if (fqf == NULL) errAbort("No cdwFastqFile record for file id %lld", fileId); /* Create downsampled fastq in temp directory - downsampled more than default even. */ char sampleFastqName[PATH_LEN]; cdwMakeTempFastqSample(fqf->sampleFileName, FASTQ_SAMPLE_SIZE, sampleFastqName); verbose(1, "downsampled %s into %s\n", vf->licensePlate, sampleFastqName); /* Do alignment */ char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sai")); safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, sampleFastqName, saiName); mustSystem(cmd); /* Save return variables, clean up, and go home. */ *retSampleFile = cloneString(sampleFastqName); *retSaiFile = saiName; cdwFastqFileFree(&fqf); }
void fastqRepeatQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf) /* Do repeat QA if possible on fastq file. */ { /* First see if total repeat content is already in our table, in which case we are done. */ long long fileId = ef->id; char query[512]; sqlSafef(query, sizeof(query), "select count(*) from cdwQaRepeat where fileId=%lld and repeatClass='total'" , fileId); if (sqlQuickNum(conn, query) != 0) return; /* We've done this already */ /* Get sample file name from fastq table. */ struct cdwFastqFile *fqf = cdwFastqFileForFileId(conn, fileId); if (fqf == NULL) errAbort("No edqFastqRecord for %s", vf->licensePlate); char *fastqPath = fqf->sampleFileName; char bwaIndex[PATH_LEN]; safef(bwaIndex, sizeof(bwaIndex), "%s%s/repeatMasker/repeatMasker.fa", cdwValDataDir, vf->ucscDb); char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".sai")); safef(cmd, sizeof(cmd), "bwa aln %s %s > %s", bwaIndex, fastqPath, saiName); mustSystem(cmd); char *samName = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".sam")); safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", bwaIndex, saiName, fastqPath, samName); mustSystem(cmd); remove(saiName); char *raName = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".ra")); safef(cmd, sizeof(cmd), "edwSamRepeatAnalysis %s %s", samName, raName); mustSystem(cmd); verbose(2, "mustSystem(%s)\n", cmd); remove(samName); raIntoCdwRepeatQa(raName, conn, fileId); remove(raName); #ifdef SOON #endif /* SOON */ freez(&saiName); freez(&samName); freez(&raName); cdwFastqFileFree(&fqf); }
void doSystem(char *command)
/* Always echo command to stdout; execute it as well only when doReal is set. */
{
printf("%s\n", command);
if (!doReal)
    return;
mustSystem(command);
}
void edwAlignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly, char *fastqPath, struct edwValidFile *vf, FILE *bedF, double *retMapRatio, double *retDepth, double *retSampleCoverage) /* Take a sample fastq and run bwa on it, and then convert that file to a bed. * bedF and all the ret parameters can be NULL. */ { /* Hmm, tried doing this with Mark's pipeline code, but somehow it would be flaky the * second time it was run in same app. Resorting therefore to temp files. */ char genoFile[PATH_LEN]; safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", edwValDataDir, assembly->ucscDb, assembly->ucscDb); char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(edwTempDir(), "edwSample1", ".sai")); safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, fastqPath, saiName); mustSystem(cmd); char *samName = cloneString(rTempName(edwTempDir(), "ewdSample1", ".sam")); safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", genoFile, saiName, fastqPath, samName); mustSystem(cmd); remove(saiName); /* Scan sam file to calculate vf->mapRatio, vf->sampleCoverage and vf->depth. * and also to produce little bed file for enrichment step. */ struct genomeRangeTree *grt = genomeRangeTreeNew(); long long hitCount=0, missCount=0, totalBasesInHits=0; scanSam(samName, bedF, grt, &hitCount, &missCount, &totalBasesInHits); verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", hitCount, missCount, totalBasesInHits, grt); if (retMapRatio) *retMapRatio = (double)hitCount/(hitCount+missCount); if (retDepth) *retDepth = (double)totalBasesInHits/assembly->baseCount * (double)vf->itemCount/vf->sampleCount; long long basesHitBySample = genomeRangeTreeSumRanges(grt); if (retSampleCoverage) *retSampleCoverage = (double)basesHitBySample/assembly->baseCount; genomeRangeTreeFree(&grt); remove(samName); }
void chainSplit(char *outDir, int inCount, char *inFiles[]) /* chainSplit - Split chains up by target or query sequence. */ { struct hash *hash = newHash(0); int inIx; char tpath[512]; FILE *meta ; bool metaOpen = TRUE; makeDir(outDir); safef(tpath, sizeof(tpath), "%s/meta.tmp", outDir); meta = mustOpen(tpath,"w"); for (inIx = 0; inIx < inCount; ++inIx) { struct lineFile *lf = lineFileOpen(inFiles[inIx], TRUE); struct chain *chain; FILE *f; lineFileSetMetaDataOutput(lf, meta); while ((chain = chainRead(lf)) != NULL) { char *name = (splitOnQ ? chain->qName : chain->tName); if (lump > 0) name = lumpName(name); if ((f = hashFindVal(hash, name)) == NULL) { char path[512], cmd[512]; safef(path, sizeof(path),"%s/%s.chain", outDir, name); if (metaOpen) fclose(meta); metaOpen = FALSE; safef(cmd,sizeof(cmd), "cat %s | sort -u > %s", tpath, path); mustSystem(cmd); f = mustOpen(path, "a"); hashAdd(hash, name, f); } chainWrite(chain, f); chainFree(&chain); } lineFileClose(&lf); } }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; char query2[256]; struct sqlResult *sr2; char **row2; char cond_str[255]; char *proteinDatabaseName; FILE *o1, *o2, *o3; FILE *fh[23]; char temp_str[1000];; char *accession; char *aaSeq; char *chp; int i, j, len; int ihi, ilow; char *answer; char *protDisplayId; int aaResCnt[30]; char aaAlphabet[30]; int aaResFound; float fvalue1, fvalue2; float p1, p2; int icnt, jcnt; char *taxon; char *database; int sortedCnt; if (argc != 4) usage(); strcpy(aaAlphabet, "WCMHYNFIDQKRTVPGEASLXZB"); proteinDatabaseName = argv[1]; taxon = argv[2]; database = argv[3]; o2 = mustOpen("pbResAvgStd.tab", "w"); for (i=0; i<20; i++) { safef(temp_str, sizeof(temp_str), "%c.txt", aaAlphabet[i]); fh[i] = mustOpen(temp_str, "w"); } conn = hAllocConn(hDefaultDb()); conn2 = hAllocConn(hDefaultDb()); safef(query2, sizeof(query2), "select proteinID from %s.knownGene;", database); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); icnt = 0; jcnt = 0; for (j=0; j<MAXRES; j++) { sumJ[j] = 0; } while (row2 != NULL) { protDisplayId = row2[0]; safef(cond_str, sizeof(cond_str), "val='%s'", protDisplayId); accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str); if (accession == NULL) { safef(cond_str, sizeof(cond_str), "acc='%s'", protDisplayId); accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str); if (accession == NULL) { verbose(2, "'%s' not found.\n", protDisplayId); goto skip; } } safef(cond_str, sizeof(cond_str), "accession='%s'", accession); answer = sqlGetField("proteins040115", "spXref2", "biodatabaseID", cond_str); if (answer == NULL) { /* this protein might be a variant splice protein, and then it won't be in spXref2 */ goto skip; } if (answer[0] != '1') { /* printf("%s not in SWISS-PROT\n", protDisplayId);fflush(stdout); */ goto skip; } safef(cond_str, sizeof(cond_str), "acc='%s'", accession); aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str); if (aaSeq 
== NULL) { printf("Can't find peptide sequence for %s, exiting ...\n", protDisplayId); fflush(stdout); exit(1); } len = strlen(aaSeq); if (len < 100) goto skip; lenDouble = (double)len; for (j=0; j<MAXRES; j++) { aaResCnt[j] = 0; } chp = aaSeq; for (i=0; i<len; i++) { aaResFound = 0; for (j=0; j<MAXRES; j++) { if (*chp == aaAlphabet[j]) { aaResFound = 1; aaResCnt[j] ++; } } if (!aaResFound) { fprintf(stderr, "%c %d not a valid AA residue.\n", *chp, *chp); } chp++; } for (j=0; j<MAXRES; j++) { freq[icnt][j] = (double)aaResCnt[j]/lenDouble; sumJ[j] = sumJ[j] + freq[icnt][j]; } for (j=0; j<20; j++) { fprintf(fh[j], "%15.7f\t%s\n", freq[icnt][j], accession); fflush(fh[j]); } icnt++; if (icnt >= MAXN) errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN); skip: row2 = sqlNextRow(sr2); } recordCnt = icnt; recordCntDouble = (double)recordCnt; for (j=0; j<20; j++) { carefulClose(&(fh[j])); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); for (j=0; j<MAXRES; j++) { avg[j] = sumJ[j]/recordCntDouble; } for (j=0; j<20; j++) { sum = 0.0; for (i=0; i<recordCnt; i++) { sum = sum + (freq[i][j] - avg[j]) * (freq[i][j] - avg[j]); } sigma[j] = sqrt(sum/(double)(recordCnt-1)); fprintf(o2, "%c\t%f\t%f\n", aaAlphabet[j], avg[j], sigma[j]); } carefulClose(&o2); o1 = mustOpen("pbAnomLimit.tab", "w"); for (j=0; j<20; j++) { safef(temp_str, sizeof(temp_str), "cat %c.txt|sort|uniq > %c.srt", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); /* figure out how many unique entries */ safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); safef(temp_str, sizeof(temp_str), "%c.tmp", aaAlphabet[j]); o3 = mustOpen(temp_str, "r"); mustGetLine(o3, temp_str, 1000); chp = temp_str; while (*chp == ' ') chp++; while (*chp != ' ') chp++; *chp = '\0'; sscanf(temp_str, "%d", &sortedCnt); safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]); mustSystem(temp_str); /* cal hi and low cutoff threshold */ ilow = 
(int)((float)sortedCnt * 0.025); ihi = (int)((float)sortedCnt * 0.975); safef(temp_str, sizeof(temp_str), "%c.srt", aaAlphabet[j]); o2 = mustOpen(temp_str, "r"); i=0; for (i=0; i<ilow; i++) { mustGetLine(o2, temp_str, 1000); } sscanf(temp_str, "%f", &fvalue1); mustGetLine(o2, temp_str, 1000); sscanf(temp_str, "%f", &fvalue2); p1 = (fvalue1 + fvalue2)/2.0; for (i=ilow+1; i<ihi; i++) { mustGetLine(o2, temp_str, 1000); } sscanf(temp_str, "%f", &fvalue1); mustGetLine(o2, temp_str, 1000); sscanf(temp_str, "%f", &fvalue2); p2 = (fvalue1 + fvalue2)/2.0; carefulClose(&o2); fprintf(o1, "%c\t%f\t%f\n", aaAlphabet[j], p1, p2); fflush(stdout); for (i=0; i<recordCnt; i++) { measure[i] = freq[i][j]; } safef(temp_str, sizeof(temp_str), "pbAaDist%c.tab", aaAlphabet[j]); calDist(measure, recordCnt, 51, 0.0, 0.005, temp_str); } carefulClose(&o1); return(0); }
int main(int argc, char *argv[]) { struct sqlConnection *conn2, *conn3; char query2[256]; struct sqlResult *sr2; char **row2; char *proteinDataDate; FILE *o2; char *entrez; char *chp; char *hgncId, *name, *symbol, *refSeqIds, *uniProt; int j; char *locusType; char *refseq; boolean gotRefseq; if (argc != 2) usage(); proteinDataDate = argv[1]; o2 = fopen("j.dat", "w"); conn2= hAllocConn(hDefaultDb()); conn3= hAllocConn(hDefaultDb()); sprintf(query2, "select hgncId, symbol, name, refSeqMapped, refSeqIds, uniProt, entrezMapped, locusType from proteins%s.hgnc where status not like '%cWithdrawn%c'", proteinDataDate, '%', '%'); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { j=0; hgncId = row2[j];j++; symbol = row2[j];j++; name = row2[j];j++; refseq = row2[j];j++; refSeqIds = row2[j];j++; uniProt = row2[j];j++; entrez = row2[j];j++; locusType = row2[j];j++; chp = strstr(hgncId, "HGNC:"); hgncId = chp+5; gotRefseq = FALSE; /* process refSeqMapped first */ if (!sameWord(refseq, "")) { fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name); gotRefseq = TRUE; } /* process refSeqIds next */ chp = strstr(refSeqIds, ","); if (chp != NULL) { *chp = '\0'; while (chp != NULL) { fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name); chp++; while (*chp == ' ') chp++; refseq = chp; chp = strstr(refseq, ","); if (chp != NULL) *chp = '\0'; } fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name); gotRefseq = TRUE; } else { if (!sameWord(refseq,"")) { fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name); } else { /* output the record if no RefSeq in either refSeqIds or refSeqMapped */ if (!gotRefseq) { fprintf(o2, "%s\t%s\t%s\t%s\t%s\t%s\n", symbol, refseq, uniProt, hgncId, entrez, name); } } } row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); hFreeConn(&conn2); fclose(o2); mustSystem("cat j.dat |sort -u >hgncXref.tab"); 
return(0); }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn3; char query[256], query3[256]; struct sqlResult *sr, *sr3; char **row, **row3; FILE *o1, *o2; char *locusID; /* LocusLink ID */ char *kgTempDbName, *roDbName; char cond_str[200]; char *kgId; char *mapID; char *desc; char *mRNA; optionInit(&argc, argv, options); if (argc != 3) usage(); kgTempDbName = argv[1]; roDbName = argv[2]; conn = hAllocConn(roDbName); conn3= hAllocConn(roDbName); o1 = fopen("j.dat", "w"); o2 = fopen("jj.dat", "w"); table = optionVal("table", "knownGene"); sqlSafef(query, sizeof(query), "select name from %s.%s", roDbName, table); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); while (row != NULL) { kgId = row[0]; sqlSafefFrag(cond_str, sizeof(cond_str), "kgId='%s'", kgId); mRNA = sqlGetField(roDbName, "kgXref", "mRNA", cond_str); sqlSafefFrag(cond_str, sizeof(cond_str), "mrna='%s'", mRNA); locusID = sqlGetField("entrez", "entrezMrna", "geneId", cond_str); /* look for RefSeq if not found in mRNAs */ if (locusID == NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "refseq='%s'", mRNA); locusID = sqlGetField("entrez", "entrezRefseq", "geneId", cond_str); } if (locusID != NULL) { sqlSafef(query3, sizeof(query3), "select * from %s.keggList where locusID = '%s'", kgTempDbName, locusID); sr3 = sqlGetResult(conn3, query3); while ((row3 = sqlNextRow(sr3)) != NULL) { mapID = row3[1]; desc = row3[2]; fprintf(o1, "%s\t%s\t%s\n", kgId, locusID, mapID); fprintf(o2, "%s\t%s\n", mapID, desc); row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); } else { /* printf("%s not found in Entrez.\n", kgId);fflush(stdout);*/ if (differentString(table, "knownGene")) { sqlSafefFrag(cond_str, sizeof(cond_str), "name='%s'", kgId); locusID = sqlGetField(roDbName, table, "name2", cond_str); sqlSafef(query3, sizeof(query3), "select * from %s.keggList where locusID = '%s'", kgTempDbName, kgId); sr3 = sqlGetResult(conn3, query3); while ((row3 = sqlNextRow(sr3)) != NULL) { mapID = row3[1]; desc = row3[2]; 
fprintf(o1, "%s\t%s\t%s\n", kgId, locusID, mapID); fprintf(o2, "%s\t%s\n", mapID, desc); row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); } } row = sqlNextRow(sr); } fclose(o1); fclose(o2); hFreeConn(&conn); mustSystem("cat j.dat|sort|uniq >keggPathway.tab"); mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab"); mustSystem("rm j.dat"); mustSystem("rm jj.dat"); return(0); }
int main(int argc, char *argv[])
/* Build kgXref.tab.
 * Arguments: temp database holding knownGene, protein database date, and
 * the read-only genome database.
 * One line is written per knownGene row:
 *   kgID, kgID, spID, protein display ID, gene symbol, RefSeq ID,
 *   protein accession, description
 * Genes whose ID contains "NM_" take symbol, description and protein
 * accession from refLink; all others take symbol and description from the
 * HUGO columns of spXref3 and the RefSeq ID from mrnaRefseq.  An empty gene
 * symbol falls back to the gene ID, a missing or empty description to
 * "N/A".  Lines go to j.dat, which is sorted and de-duplicated into
 * kgXref.tab and then removed. */
{
struct sqlConnection *conn, *conn2, *conn3;
char query2[256];
struct sqlResult *sr2;
char **row2;
char cond_str[256];
char *protDbDate;
char *kgID;
char *protDisplayId;
FILE *o1;
char *kgTempDb;
char spDb[255],proteinsDb[255];
char *ro_DB;
char *refSeqName;
char *hugoID;
char *protAcc;	/* protein Accession number from NCBI */
char *answer;
char *emptyStr;
char *parSpID;
char *spID, *kgProteinID, *geneSymbol, *refseqID, *desc;

if (argc != 4) usage();

kgTempDb = cloneString(argv[1]);
protDbDate = cloneString(argv[2]);
ro_DB = cloneString(argv[3]);

safef(spDb, sizeof(spDb), "sp%s", protDbDate);
safef(proteinsDb, sizeof(proteinsDb), "proteins%s", protDbDate);

conn = hAllocConn(ro_DB);
conn2= hAllocConn(ro_DB);
conn3= hAllocConn(ro_DB);

o1 = mustOpen("j.dat", "w");
emptyStr = strdup("");

sqlSafef(query2, sizeof query2, "select name, proteinID from %s.knownGene;", kgTempDb);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    kgID = row2[0];
    kgProteinID = row2[1];

    /* The output fields are only ever read, so the shared empty string
     * serves as the per-row default (no allocation per row). */
    refseqID = emptyStr;
    geneSymbol = emptyStr;
    protAcc = emptyStr;

    sqlSafefFrag(cond_str, sizeof cond_str, "displayID='%s'", kgProteinID);
    spID = sqlGetField(proteinsDb, "spXref3", "accession", cond_str);

    /* process variant splice proteins */
    if (spID == NULL)
        {
        sqlSafefFrag(cond_str, sizeof cond_str, "varAcc='%s'", kgProteinID);
        spID = kgProteinID;
        parSpID = sqlGetField(proteinsDb, "splicProt", "parAcc", cond_str);
        if (parSpID != NULL)
            {
            /* NOTE(review): protDisplayId is used unchecked below; confirm
             * every parent accession has an spXref3 row. */
            sqlSafefFrag(cond_str, sizeof cond_str, "accession='%s'", parSpID);
            protDisplayId = sqlGetField(proteinsDb, "spXref3", "displayID", cond_str);
            }
        else
            {
            fprintf(stderr, "%s not found in kgXref3 nor in varProtein.\n", kgProteinID);
            exit(1);
            }
        }
    else
        {
        protDisplayId = kgProteinID;
        }

    /* use description for the protein as default, replace it with HUGO desc if available. */
    sqlSafefFrag(cond_str, sizeof cond_str, "displayID='%s'", protDisplayId);
    desc = sqlGetField(proteinsDb, "spXref3", "description", cond_str);

    if (strstr(kgID, "NM_") != NULL)
        {
        /* special processing for RefSeq DNA based genes */
        sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", kgID);
        refSeqName = sqlGetField(ro_DB, "refLink", "name", cond_str);
        if (refSeqName != NULL)
            {
            geneSymbol = cloneString(refSeqName);
            refseqID = kgID;
            sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", kgID);
            desc = sqlGetField(ro_DB, "refLink", "product", cond_str);

            sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc='%s'", refseqID);
            answer = sqlGetField(ro_DB, "refLink", "protAcc", cond_str);
            if (answer != NULL)
                {
                protAcc = strdup(answer);
                }
            }
        }
    else
        {
        sqlSafefFrag(cond_str, sizeof cond_str, "displayID = '%s'", protDisplayId);
        hugoID = sqlGetField(proteinsDb, "spXref3", "hugoSymbol", cond_str);
        if (!((hugoID == NULL) || (*hugoID == '\0')) )
            {
            geneSymbol = cloneString(hugoID);

            sqlSafefFrag(cond_str, sizeof cond_str, "displayID = '%s'", protDisplayId);
            desc = sqlGetField(proteinsDb, "spXref3", "hugoDesc", cond_str);
            if (desc == NULL)
                {
                printf("%s/%s don't have hugo desc ...\n", kgProteinID, protDisplayId);
                fflush(stdout);
                }
            }

        sqlSafefFrag(cond_str, sizeof cond_str, "mrna = '%s'", kgID);
        answer = sqlGetField(ro_DB, "mrnaRefseq", "refseq", cond_str);
        if (answer != NULL)
            {
            refseqID = answer;
            }

        /* No HUGO symbol: fall back to the refLink name of the RefSeq. */
        if (strlen(geneSymbol) == 0)
            {
            if (strlen(refseqID) != 0)
                {
                sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", refseqID);
                answer = sqlGetField(ro_DB, "refLink", "name", cond_str);
                if (answer != NULL)
                    {
                    geneSymbol = strdup(answer);
                    }
                }
            }
        }

    /* fix missing fields */
    if (strlen(geneSymbol) == 0)
        {
        geneSymbol = kgID;
        }
    /* desc is NULL whenever a description lookup found nothing (reported
     * above for the HUGO case), so test for that before taking its length. */
    if (desc == NULL || strlen(desc) == 0)
        {
        desc = "N/A";
        }

    fprintf(o1, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
        kgID, kgID, spID, protDisplayId, geneSymbol, refseqID, protAcc, desc);
    row2 = sqlNextRow(sr2);
    }

fclose(o1);
hFreeConn(&conn);
hFreeConn(&conn2);
hFreeConn(&conn3);

mustSystem("cat j.dat|sort|uniq >kgXref.tab");
mustSystem("rm j.dat");
return(0);
}
int main(int argc, char *argv[])
/* Build proteinMrna.tab (protein display ID, mRNA accession) and protein.lis
 * (protein display ID).
 * Arguments: protein data date and genome release.
 * For every row of <release>Temp.refGene the gene name is looked up as an
 * EMBL external accession in proteins<date>.spXref2, and one line is written
 * per match.  The program stops if a match carries a bioDB index other than
 * 1, 2 or 3.  j.dat and jj.dat are intermediates and removed at the end. */
{
struct sqlConnection *conn2, *conn3;
char query2[256], query3[256];
struct sqlResult *sr2, *sr3;
char **row2, **row3;
char *displayID;
char *extAC;
char *bioDBID;
char *proteinDataDate;
char *genomeRelease;
FILE *o2, *o3;
char *name;

if (argc != 3) usage();

proteinDataDate = argv[1];
genomeRelease = argv[2];

o2 = fopen("jj.dat", "w");
o3 = fopen("j.dat", "w");
if (o2 == NULL || o3 == NULL)
    {
    fprintf(stderr, "Can't open j.dat or jj.dat for writing.\n");
    exit(1);
    }

conn2= hAllocConn(hDefaultDb());
conn3= hAllocConn(hDefaultDb());

sqlSafef(query2, sizeof query2, "select * from %sTemp.refGene;", genomeRelease);
sr2 = sqlMustGetResult(conn2, query2);
for (row2 = sqlNextRow(sr2); row2 != NULL; row2 = sqlNextRow(sr2))
    {
    /* Only the gene name (column 0) of refGene is needed. */
    name = row2[0];

    sqlSafef(query3, sizeof query3,
        "select * from proteins%s.spXref2 where extAC='%s' and extDB='EMBL';",
        proteinDataDate, name);
    sr3 = sqlMustGetResult(conn3, query3);
    for (row3 = sqlNextRow(sr3); row3 != NULL; row3 = sqlNextRow(sr3))
        {
        displayID = row3[1];
        extAC = row3[4];
        bioDBID = row3[6];

        if (! ( (strcmp(bioDBID, "1") == 0) ||
                (strcmp(bioDBID, "2") == 0) ||
                (strcmp(bioDBID, "3") == 0) ) )
            {
            printf("non-recognized bioDB index %s encountered.\n", bioDBID);
            printf("displayId=%s bioDBID=%s\n", displayID, bioDBID);
            fflush(stdout);
            exit(1);
            }
        fprintf(o2, "%s\n", displayID);
        fprintf(o3, "%s\t%s\n", displayID, extAC);
        }
    sqlFreeResult(&sr3);
    }

sqlFreeResult(&sr2);
hFreeConn(&conn2);
hFreeConn(&conn3);

fclose(o2);
fclose(o3);

mustSystem("cat j.dat |sort|uniq >proteinMrna.tab");
mustSystem("cat jj.dat|sort|uniq >protein.lis");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
void pairedEndQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf) /* Look for other end, do a pairwise alignment, and save results in database. */ { verbose(2, "pairedEndQa on %u %s %s\n", ef->id, ef->cdwFileName, ef->submitFileName); /* Get other end, return if not found. */ struct cdwValidFile *otherVf = cdwOppositePairedEnd(conn, ef, vf); if (otherVf == NULL) return; if (otherVf->fileId > vf->fileId) return; struct cdwValidFile *vf1, *vf2; struct cdwQaPairedEndFastq *pair = cdwQaPairedEndFastqFromVfs(conn, vf, otherVf, &vf1, &vf2); if (pair != NULL) { cdwValidFileFree(&otherVf); return; } /* Get target assembly and figure out path for BWA index. */ struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, vf->ucscDb); assert(assembly != NULL); char genoFile[PATH_LEN]; safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", cdwValDataDir, assembly->ucscDb, assembly->ucscDb); verbose(1, "aligning subsamples on %u vs. %u paired reads\n", vf1->fileId, vf2->fileId); /* Make alignments of subsamples. */ char *sample1 = NULL, *sample2 = NULL, *sai1 = NULL, *sai2 = NULL; makeTmpSai(conn, vf1, genoFile, &sample1, &sai1); makeTmpSai(conn, vf2, genoFile, &sample2, &sai2); /* Make paired end alignment */ char *tmpSam = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sam")); char command[6*PATH_LEN]; safef(command, sizeof(command), "bwa sampe -n 1 -N 1 -f %s %s %s %s %s %s" , tmpSam, genoFile, sai1, sai2, sample1, sample2); mustSystem(command); /* Make ra file with pairing statistics */ char *tmpRa = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".ra")); safef(command, sizeof(command), "edwSamPairedEndStats -maxInsert=%d %s %s", maxInsert, tmpSam, tmpRa); mustSystem(command); /* Read RA file into variables. */ struct cdwQaPairedEndFastq *pe = cdwQaPairedEndFastqOneFromRa(tmpRa); /* Update database with record. 
*/ struct sqlConnection *freshConn = cdwConnectReadWrite(); char query[256]; sqlSafef(query, sizeof(query), "insert into cdwQaPairedEndFastq " "(fileId1,fileId2,concordance,distanceMean,distanceStd,distanceMin,distanceMax,recordComplete) " " values (%u,%u,%g,%g,%g,%g,%g,1)" , vf1->fileId, vf2->fileId, pe->concordance, pe->distanceMean , pe->distanceStd, pe->distanceMin, pe->distanceMax); sqlUpdate(conn, query); sqlDisconnect(&freshConn); /* Clean up and go home. */ cdwValidFileFree(&otherVf); remove(sample1); remove(sample2); remove(sai1); remove(sai2); remove(tmpSam); remove(tmpRa); #ifdef SOON #endif /* SOON */ freez(&sample1); freez(&sample2); freez(&sai1); freez(&sai2); freez(&tmpSam); freez(&tmpRa); cdwQaPairedEndFastqFree(&pe); cdwValidFileFree(&otherVf); }
int main(int argc, char *argv[])
/* Build knownGene.tab and duplicate.tab from sorted.lis.
 * Arguments: protein data date and genome database name.
 * sorted.lis is read line by line as tab separated text.  The first three
 * fields (chrom, cds block start, cds block end) form the key used to detect
 * duplicates; fields seven to nine are the mRNA ID, protein ID and align ID.
 * A line whose key equals the previous key is reported to jj.dat; any other
 * line is looked up by align ID in <genomeDB>Temp.knownGene0 and each row
 * found is either copied to j.dat or reported as skipped, depending on the
 * computed CDS length.  j.dat and jj.dat are sorted and de-duplicated into
 * knownGene.tab and duplicate.tab and then removed.
 * Several buffers and strings used here (inf, line_in, line, oldInfo,
 * newInfo, cond_str, mrnaStr, proteinStr, alignStr, the old* strings,
 * cdsLen, aaStr, aaLen) are not declared in this function. */
{
char *skippedKgId;
char *lastValidKgId;
struct sqlConnection *conn2, *conn3;
struct sqlResult *sr2;
char query2[256];
char **row2;
char *proteinID;
FILE *o3, *o7;
char *name, *chrom, *strand, *txStart, *txEnd, *cdsStart, *cdsEnd, *exonCount, *exonStarts, *exonEnds;
char *alignID;
char *chp;
int i, j;
int isDuplicate;
char *genomeDBname;
char *proteinDataDate;
char proteinsDB[40];
char spDB[40];
char *acc;
#define MAX_EXON 1000
int exStart[MAX_EXON], exEnd[MAX_EXON];
int exCount;
int aaStart[MAX_EXON], aaEnd[MAX_EXON];
char *sp, *ep;
int aalen;
int cdsS, cdsE;
int eS, eE;

if (argc != 3) usage();

proteinDataDate = argv[1];
genomeDBname = argv[2];

safef(spDB, sizeof(spDB), "sp%s", proteinDataDate);
safef(proteinsDB, sizeof(proteinsDB), "proteins%s", proteinDataDate);

/* NOTE(review): neither fopen() result is checked before it is written to. */
o3 = fopen("j.dat", "w");
o7 = fopen("jj.dat", "w");

conn2= hAllocConn(genomeDBname);
conn3= hAllocConn(genomeDBname);

inf = mustOpen("sorted.lis", "r");

/* Start with an empty "previous record" so the first line is never a duplicate. */
strcpy(oldInfo, "");
skippedKgId = cloneString("");
lastValidKgId = cloneString("");
isDuplicate = 0;
oldMrnaStr = cloneString("");
oldAlignStr = cloneString("");
oldProteinStr = cloneString("");
mrnaStr = cloneString("");
proteinStr = cloneString("");
alignStr = cloneString("");

while (fgets(line_in, 10000, inf) != NULL)
    {
    /* Work on a copy; the parse below overwrites tabs with NULs.
     * NOTE(review): every strstr() result below is used unchecked, so a
     * line with fewer tabs than expected dereferences NULL. */
    strcpy(line, line_in);

    chp = strstr(line, "\t");	/* chrom */
    chp ++;

    chp = strstr(chp, "\t");	/* cds block start position */
    chp ++;

    chp = strstr(chp, "\t");	/* cds block end position */
    *chp = '\0';
    chp++;

    /* line now ends after the third field: that prefix is the duplicate key. */
    strcpy(newInfo, line);
    if (sameString(oldInfo, newInfo))
        {
        isDuplicate = 1;
        }
    else
        {
        /* remember previous record as old only if it is not a duplicate */
        if (!isDuplicate)
            {
            oldMrnaStr = mrnaStr;
            oldProteinStr = proteinStr;
            oldAlignStr = alignStr;
            }
        strcpy(oldInfo, newInfo);
        isDuplicate = 0;
        }

    chp = strstr(chp, "\t");	/* priority score */
    chp ++;

    chp = strstr(chp, "\t");	/* mRNA transcription length */
    chp ++;

    chp = strstr(chp, "\t");	/* mRNA date */
    chp ++;

    mrnaStr = chp;
    chp = strstr(chp, "\t");	/* mRNA ID */
    *chp = '\0';
    chp ++;
    /* Cloned because line is overwritten by the next fgets(). */
    mrnaStr = cloneString(mrnaStr);

    proteinStr = chp;
    chp = strstr(chp, "\t");	/* protein ID */
    *chp = '\0';
    chp ++;
    proteinStr = cloneString(proteinStr);

    alignID = chp;

    /* get rid of "end-of-line" character at the end of the string */
    /* NOTE(review): the query below uses alignID, not alignStr, so this
     * presumably relies on trimSpaces() trimming in place - confirm. */
    alignStr = trimSpaces(alignID);

    if (isDuplicate)
        {
        /* only put out records for valid KG entries */
        if (!sameString(oldMrnaStr, skippedKgId) || sameString(oldMrnaStr, lastValidKgId))
            {
            fprintf(o7, "%s\t%s\t%s\t%s\n", oldMrnaStr, oldProteinStr, mrnaStr, proteinStr);
            }
        }
    else
        {
        safef(query2, sizeof(query2), "select * from %sTemp.knownGene0 where alignID='%s';", genomeDBname, alignID);
        sr2 = sqlMustGetResult(conn2, query2);
        row2 = sqlNextRow(sr2);
        while (row2 != NULL)
            {
            name = row2[0];
            chrom = row2[1];
            strand = row2[2];
            txStart = row2[3];
            txEnd = row2[4];
            cdsStart = row2[5];
            cdsEnd = row2[6];
            exonCount = row2[7];
            exonStarts = row2[8];
            exonEnds = row2[9];
            proteinID = row2[10];
            alignID = row2[11];

            sscanf(exonCount, "%d", &exCount);
            /* Parse copies of the exon lists: the parse writes NULs into
             * them and the originals are printed unchanged further down. */
            sp = cloneString(exonStarts);
            ep = cloneString(exonEnds);

            sscanf(cdsStart, "%d", &cdsS);
            sscanf(cdsEnd, "%d", &cdsE);

            /* Sum, over all exons, the length of the part that lies inside
             * the CDS, counted in whole codons.
             * NOTE(review): exCount is not checked against MAX_EXON. */
            aalen = 0;
            j=0;
            for (i=0; i<exCount; i++)
                {
                chp = strstr(sp, ",");
                *chp = '\0';
                sscanf(sp, "%d", &(exStart[i]));
                chp++;
                sp = chp;

                chp = strstr(ep, ",");
                *chp = '\0';
                sscanf(ep, "%d", &(exEnd[i]));

                /* Clip the exon to the CDS; an exon entirely outside it
                 * collapses to an empty range. */
                eS = exStart[i];
                eE = exEnd[i];

                if (cdsS > eS)
                    {
                    eS = cdsS;
                    }
                if (cdsE < eE)
                    {
                    eE = cdsE;
                    }
                if (eS > eE)
                    {
                    eS = 0;
                    eE = 0;
                    }
                if (eS != eE)
                    {
                    aaStart[j] = aalen;
                    aaEnd[j] = aaStart[j] + (eE- eS +1)/3 -1;
                    aalen = aalen + (eE- eS +1)/3;
                    j++;
                    }

                chp++;
                ep = chp;
                }

            cdsLen = aalen;

            /* Fetch the protein sequence to compare its length with cdsLen.
             * NOTE(review): acc and aaStr are used unchecked; a failed
             * lookup reaches strlen(NULL), and an empty sequence makes the
             * division below divide by zero. */
            safef(cond_str, sizeof(cond_str), "val='%s'", proteinID);
            acc = sqlGetField(spDB, "displayId", "acc", cond_str);
            safef(cond_str, sizeof(cond_str), "acc='%s'", acc);
            aaStr=sqlGetField(spDB, "protein", "val", cond_str);
            aaLen = strlen(aaStr);

            /* Keep the gene when its CDS is longer than 50 codons or covers
             * more than 50 percent of the protein length. */
            if ((cdsLen > 50) || ((cdsLen * 100)/aaLen > 50))
                {
                fprintf(o3,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
                    name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, proteinID, alignID);
                lastValidKgId = cloneString(name);
                }
            else
                {
                printf("skipping %s %d \n", name, cdsLen);
                skippedKgId = cloneString(name);
                }
            row2 = sqlNextRow(sr2);
            }
        sqlFreeResult(&sr2);
        }
    }

hFreeConn(&conn2);
hFreeConn(&conn3);
fclose(o3);
fclose(o7);

mustSystem("cat j.dat|sort|uniq >knownGene.tab");
mustSystem("cat jj.dat|sort|uniq >duplicate.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; char query2[256]; struct sqlResult *sr2; char **row2; char *chp0, *chp; char *kgID; FILE *o1, *o2; char cond_str[256]; char *database; char *proteinDB; boolean doingAlias, bothDone; char *answer; char *symbol, *alias, *aliases; if (argc != 3) usage(); database = cloneString(argv[1]); proteinDB = cloneString(argv[2]); conn = hAllocConn(database); conn2= hAllocConn(database); o1 = fopen("j.dat", "w"); o2 = fopen("jj.dat", "w"); doingAlias = TRUE; bothDone = FALSE; while (!bothDone) { if (doingAlias) { sqlSafef(query2, sizeof query2, "select symbol, aliases from %s.hgnc;", proteinDB); } else { sqlSafef(query2, sizeof query2, "select symbol, prvSymbols from %s.hgnc;", proteinDB); } sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { symbol = row2[0]; aliases = row2[1]; if ( (symbol != NULL) && (strlen(symbol) != 0) ) { sqlSafefFrag(cond_str, sizeof cond_str, "geneSymbol = '%s'", symbol); answer = sqlGetField(database, "kgXref", "kgID", cond_str); if (answer != NULL) { kgID = strdup(answer); fprintf(o2, "%s\t%s\n", kgID, symbol); } if ( (aliases != NULL) && (strlen(aliases) != 0) && (answer != NULL) ) { kgID = strdup(answer); chp0 = aliases; while (chp0 != NULL) { while (*chp0 == ' ') chp0++; chp = strstr(chp0, ","); if (chp == NULL) { alias = strdup(chp0); /* get rid of quote character in some aliases */ if (*alias == '"') { *(alias + strlen(alias) - 1) = '\0'; alias++; printf("%s\n", alias);fflush(stdout); } chp0 = NULL; } else { *chp = '\0'; /* get rid of quote character in some aliases */ if (*chp0 == '"') { *(chp0 + strlen(chp0) - 1) = '\0'; chp0++; printf("%s\n", chp0);fflush(stdout); } alias = strdup(chp0); chp0 = chp+1; } if (kgID != NULL) { fprintf(o1, "%s\t%s\t%s\n", kgID, symbol, alias); fprintf(o2, "%s\t%s\n", kgID, alias); } } } } row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); if (doingAlias) { doingAlias = FALSE; } else { bothDone = TRUE; } } 
fclose(o1); fclose(o2); /* geneAlias.tab has 3 columns, the 2nd is HUGO.symbol and 3rd contains aliases and withdraws */ mustSystem("cat j.dat|sort|uniq >geneAlias.tab"); /* kgAliasM.tab has 2 columns, all entries from HUGO.symbol, HUGO.aliass, and HUGO.withdraws are listed in the 2nd column. */ mustSystem("cat jj.dat|sort|uniq >kgAliasM.tab"); mustSystem("rm j.dat"); mustSystem("rm jj.dat"); return(0); }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2, *conn3; struct sqlConnection *connCentral = hConnectCentral(); char query[256], query2[256], query3[256]; struct sqlResult *sr, *sr2; char **row, **row2; char buf[128]; char *answer; char *kgID, *chrom, *txStart, *txEnd; char *mRNA; int i; int geneCnt = 0; int pageNum = 0; int topLevel = 1; char *geneSymbol, *proteinID, *spID, *desc; FILE *outf, *outf2; char fileName[255]; database = strdup("hg17"); boolean newPage; int totalKgId, totalKgCnt; int totalKgPage; int kgIdCnt = 0; if (argc != 2) usage(); database = argv[1]; sqlSafef(query, sizeof query, "select genome from dbDb where name = '%s'", database); answer = sqlQuickQuery(connCentral, query, buf, sizeof(buf)); if (answer == NULL) { fprintf(stderr,"'%s' is not a valid genome database name.", database); exit(1); } else { genome = strdup(answer); } if (!hTableExists(database, "knownGene")) { fprintf(stderr,"Database %s currently does not have UCSC Known Genes.", database); exit(1); } sqlSafef(query, sizeof query, "select description from dbDb where name = '%s'", database); genomeDesc = strdup(sqlQuickQuery(connCentral, query, buf, sizeof(buf))); hDisconnectCentral(&connCentral); /* create first top level subdirectory */ safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel); mustSystem(command); conn = hAllocConn(database); conn2= hAllocConn(database); conn3= hAllocConn(database); newPage = TRUE; currentPage = 0; /* put this in to avoid compiler complaining */ outf = NULL; geneSymbol = NULL; char *protAcc = NULL; /* figure out how many pages in total */ sqlSafef(query2, sizeof(query2), "select count(k.name) from %s.knownGene k, %s.kgXref x where k.name=x.kgId and geneSymbol != ''", database, database); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); totalKgCnt = atoi(row2[0]); sqlFreeResult(&sr2); /* figure out how many KG IDs in total */ sqlSafef(query2, sizeof(query2), "select count(*) from 
%s.kgXref where geneSymbol !=''", database); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); totalKgId = atoi(row2[0]); sqlFreeResult(&sr2); totalKgPage = totalKgId/LINKSPERPAGE + 1; sqlSafef(query2, sizeof(query2), "select kgID, geneSymbol, description from %s.kgXref where geneSymbol!= '' order by geneSymbol", database); /* for debugging */ /* "select kgID, geneSymbol, description from %s.kgXref order by geneSymbol limit %d", database, TESTSIZE);*/ sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); /* for debugging */ /* while (kgIdCnt < TESTSIZE) */ while (kgIdCnt < totalKgId) { kgIdCnt++; kgID = row2[0]; geneSymbol = strdup(row2[1]); desc = row2[2]; sqlSafef(query, sizeof(query), "select chrom,txSTart,txEnd,proteinID from %s.knownGene where name='%s'", database, kgID); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); if (row != NULL) { geneCnt++; chrom = row[0]; txStart = row[1]; txEnd = row[2]; proteinID = row[3]; if (newPage) { /* create a KG links page */ pageNum++; currentPage++; /* use mkdir -p to make sure the subdirectory exists */ safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel); mustSystem(command); safef(fileName, sizeof(fileName), "knownGeneList/%s/%d/kgList%d.html", database, topLevel, pageNum); outf = fopen(fileName, "w"); printHtmlHead(outf); fprintf(outf,"<H2>UCSC %s Known Genes List (page %d of %d)</H2>\n", genome, pageNum, totalKgPage); fprintf(outf, "<TABLE BORDER=1=CELLSPACING=1 CELLPADDING=3 BGCOLOR=\"#D9F8E4\"><TR>\n"); fprintf(outf, "<TR><TH>Gene Symbol</TH><TH>Known Gene ID</TH><TH>mRNA</TH><TH>UniProt</TH><TH>RefSeq Protein</TH><TH>Description</TH>\n"); strcpy(startSymbol[pageNum], geneSymbol); strcpy(pageStartSymbol[currentPage], geneSymbol); newPage = FALSE; } fprintf(outf,"<TR>"); fprintf(outf,"<TD>%s</TD>", geneSymbol); /*fprintf(outf,"<TD>%d:%s</TD>", geneCnt, geneSymbol);*/ fprintf(outf,"<TD>"); fprintf(outf,"<A href=\"/cgi-bin/hgGene?db=%s&hgg_gene=%s", 
database, kgID); fprintf(outf,"&hgg_chrom=%s&hgg_start=%s&hgg_end=%s\">", chrom, txStart, txEnd); fprintf(outf,"%s", kgID); fprintf(outf,"</A>"); fprintf(outf,"</TD>\n"); sqlSafef(query3,sizeof(query3),"select spID from %s.kgXref where kgID = '%s'", database, kgID); spID = cloneString(sqlQuickQuery(conn3, query3, buf, sizeof(buf))); if (spID == NULL) { spID = emptyString; } else { if (sameWord(spID,"")) spID = emptyString; } sqlSafef(query3,sizeof(query3),"select mRNA from %s.kgXref where kgID = '%s'", database, kgID); mRNA = cloneString(sqlQuickQuery(conn3, query3, buf, sizeof(buf))); if (mRNA == NULL) { mRNA = emptyString; } else { if (sameWord(mRNA,"")) mRNA = emptyString; } sqlSafef(query3,sizeof(query3),"select protAcc from %s.kgXref where kgID = '%s'", database, kgID); protAcc = sqlQuickQuery(conn3, query3, buf, sizeof(buf)); if (protAcc == NULL) { protAcc = emptyString; } else { if (sameWord(protAcc,"")) protAcc = emptyString; } fprintf(outf,"<TD>%s</TD>", mRNA); fprintf(outf,"<TD>%s</TD>", spID); fprintf(outf,"<TD>%s</TD>", protAcc); fprintf(outf,"<TD>%s</TD>", desc ); fprintf(outf,"</TR>\n"); if ((geneCnt % LINKSPERPAGE) == 0) { /* flush out and close the page if a page is filled, and start a new page */ fprintf(outf,"</TABLE>"); strcpy(endSymbol[pageNum], geneSymbol); strcpy(pageEndSymbol[currentPage], endSymbol[pageNum]); fprintf(outf, "<BR>"); fprintf(outf, "<A href=\"/knownGeneList/%s/%d/kgIndex%d.html\">", database, topLevel,topLevel); fprintf(outf, "Up"); fprintf(outf,"</A><BR>\n"); printHtmlEnd(outf); newPage = TRUE; fclose(outf); outf = NULL; if ((pageNum % LINKSPERPAGE) == 0 ) { printf("Processing topLevel %d ...\n", topLevel);fflush(stdout); safef(fileName, sizeof(fileName), "knownGeneList/%s/%d/kgIndex%d.html", database, topLevel, topLevel); outf2 = fopen(fileName, "w"); printHtmlHead(outf2); //fprintf(outf2,"<H2>UCSC %s Known Genes List</H2>\n", genome); fprintf(outf2,"<H2>UCSC %s Known Genes List (Group %d)</H2>\n", genome, topLevel); for 
(i=1; i<= currentPage; i++) { fprintf(outf2, "Page %d: ", (topLevel-1)*LINKSPERPAGE+i); fprintf(outf2, "<A href=\"/knownGeneList/%s/%d/kgList%d.html\">", database, topLevel, (topLevel-1)*LINKSPERPAGE+i); fprintf(outf2, "%s to %s", pageStartSymbol[i], pageEndSymbol[i]); fprintf(outf2,"</A><BR>\n"); } fprintf(outf2, "<BR>"); fprintf(outf2, "<A href=\"/knownGeneList/%s/top.html\">",database); fprintf(outf2, "Up"); fprintf(outf2,"</A><BR>\n"); printHtmlEnd(outf2); fclose(outf2); strcpy(topStartSymbol[topLevel], pageStartSymbol[1]); strcpy( topEndSymbol[topLevel], pageEndSymbol[currentPage]); currentPage = 0; topLevel++; } } row = sqlNextRow(sr); } sqlFreeResult(&sr); row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); /* flush out and close the last list page */ if (outf != NULL) { fprintf(outf,"</TABLE>"); strcpy(endSymbol[pageNum], geneSymbol); strcpy(pageEndSymbol[currentPage], endSymbol[pageNum]); fprintf(outf, "<BR>"); fprintf(outf, "<A href=\"/knownGeneList/%s/%d/kgIndex%d.html\">", database, topLevel,topLevel); fprintf(outf, "Up"); fprintf(outf,"</A><BR>\n"); printHtmlEnd(outf); fclose(outf); } /* generate the last index page */ safef(command, sizeof(command), "mkdir -p knownGeneList/%s/%d", database, topLevel); mustSystem(command); safef(fileName, sizeof(fileName), "knownGeneList/%s/%d/kgIndex%d.html", database, topLevel, topLevel); outf2 = fopen(fileName, "w"); printHtmlHead(outf2); fprintf(outf2,"<H2>UCSC %s Known Genes List (Group %d)</H2>\n", genome, topLevel); for (i=1; i<= currentPage; i++) { fprintf(outf2, "Page %d: ", (topLevel-1)*LINKSPERPAGE+i); fprintf(outf2, "<A href=\"/knownGeneList/%s/%d/kgList%d.html\">", database, topLevel, (topLevel-1)*LINKSPERPAGE+i); fprintf(outf2, "%s to %s", pageStartSymbol[i], pageEndSymbol[i]); fprintf(outf2,"</A><BR>\n"); fflush(outf2); } fprintf(outf2, "<BR>"); fprintf(outf2, "<A href=\"/knownGeneList/%s/top.html\">",database); fprintf(outf2, "Up"); fprintf(outf2,"</A><BR>\n"); strcpy(topStartSymbol[topLevel], 
pageStartSymbol[1]); strcpy( topEndSymbol[topLevel], pageEndSymbol[currentPage]); fclose(outf2); currentPage = 0; /* generate the top HTML page */ safef(fileName, sizeof(fileName), "knownGeneList/%s/top.html", database); outf2 = fopen(fileName, "w"); printHtmlHead(outf2); fprintf(outf2,"<H2>UCSC %s Known Genes List</H2>\n", genome); for (i=1; i<= topLevel; i++) { fprintf(outf2, "Group %d: ", i); fprintf(outf2, "<A href=\"/knownGeneList/%s/%d/kgIndex%d.html\">", database, i, i); fprintf(outf2, " %s to %s", topStartSymbol[i], topEndSymbol[i]); fprintf(outf2,"</A><BR>\n"); fflush(outf2); } fprintf(outf2, "<BR>"); fprintf(outf2, "<A href=\"/knownGeneLists.html\">"); fprintf(outf2, "Up"); fprintf(outf2,"</A><BR>\n"); printHtmlEnd(outf2); printHtmlEnd(outf2); fclose(outf2); return(0); }
int main(int argc, char *argv[])
/* Build keggPathway.tab and keggMapDesc.tab.
 *
 * argv[1] is the temporary KG database holding keggList; argv[2] is the
 * read-only database holding kgXref.  For each kgXref row the RefSeq
 * accession is mapped to an Entrez gene ID (entrez.entrezRefProt), and every
 * keggList row for that ID yields one pathway line and one description
 * line.  Results pass through j.dat / jj.dat and sort|uniq. */
{
struct sqlConnection *conn, *conn3;
char query[256], query3[256];
struct sqlResult *sr, *sr3;
char **row, **row3;
FILE *o1, *o2;
char *locusID;      /* LocusLink ID */
char *refAC;        /* Refseq accession.version */
char *kgTempDbName, *roDbName;
char cond_str[200];
char *kgID;
char *mapID;
char *desc;

if (argc != 3) usage();
kgTempDbName = argv[1];
roDbName     = argv[2];

conn = hAllocConn(roDbName);
conn3= hAllocConn(roDbName);

o1 = mustOpen("j.dat",  "w");
o2 = mustOpen("jj.dat", "w");

sqlSafef(query, sizeof query, "select kgID, refseq from %s.kgXref", roDbName);
sr = sqlMustGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    kgID  = row[0];
    refAC = row[1];

    sqlSafefFrag(cond_str, sizeof cond_str, "refseq='%s'", refAC);
    locusID = sqlGetField("entrez", "entrezRefProt", "geneID", cond_str);
    if (locusID == NULL)
        continue;

    sqlSafef(query3, sizeof query3, "select * from %s.keggList where locusID = '%s'",
             kgTempDbName, locusID);
    sr3 = sqlGetResult(conn3, query3);
    /* Advance exactly once per iteration; a second fetch inside the body
     * would silently drop every other pathway. */
    while ((row3 = sqlNextRow(sr3)) != NULL)
        {
        mapID = row3[1];
        desc  = row3[2];
        fprintf(o1, "%s\t%s\t%s\n", kgID, locusID, mapID);fflush(o1);
        fprintf(o2, "%s\t%s\n", mapID, desc);
        }
    sqlFreeResult(&sr3);
    }
sqlFreeResult(&sr);

fclose(o1);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn3);

mustSystem("cat j.dat|sort|uniq >keggPathway.tab");
mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
int main(int argc, char *argv[])
/* Build keggPathway.tab and keggMapDesc.tab from the <db>Temp tables.
 *
 * argv[1] is the genome database name.  Every locus in <db>Temp.locus2Ref0
 * is joined to its mRNA accessions in <db>Temp.locus2Acc0; accessions that
 * are Known Gene names are then joined to <db>Temp.keggList.  Results pass
 * through j.dat / jj.dat and sort|uniq. */
{
struct sqlConnection *conn, *conn2, *conn3;
char query[256], query2[256], query3[256];
struct sqlResult *sr, *sr2, *sr3;
char **row, **row2, **row3;
char *chp;
FILE *o1, *o2;
char *locusID;      /* LocusLink ID */
char *gbAC;         /* GenBank accession.version */
char *locusID2;     /* LocusLink ID */
char *dbName;
char cond_str[200];
char *kgID;
char *mapID;
char *desc;

if (argc != 2) usage();
dbName = argv[1];

conn = hAllocConn(dbName);
conn2= hAllocConn(dbName);
conn3= hAllocConn(dbName);

o1 = mustOpen("j.dat",  "w");
o2 = mustOpen("jj.dat", "w");

sqlSafef(query2, sizeof query2, "select * from %sTemp.locus2Ref0;", dbName);
sr2 = sqlMustGetResult(conn2, query2);
while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    locusID2 = row2[0];

    sqlSafef(query, sizeof query,
             "select * from %sTemp.locus2Acc0 where locusID=%s and seqType='m';",
             dbName, locusID2);
    sr = sqlMustGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        locusID = row[0];
        gbAC    = row[1];

        /* drop the ".version" suffix before matching Known Gene names */
        chp = strstr(gbAC, ".");
        if (chp != NULL) *chp = '\0';

        sqlSafefFrag(cond_str, sizeof cond_str, "name='%s'", gbAC);
        kgID = sqlGetField(dbName, "knownGene", "name", cond_str);
        if (kgID == NULL)
            continue;

        sqlSafef(query3, sizeof query3,
                 "select * from %sTemp.keggList where locusID = '%s'", dbName, locusID);
        sr3 = sqlGetResult(conn3, query3);
        /* Advance exactly once per iteration; a second fetch inside the
         * body would silently drop every other pathway. */
        while ((row3 = sqlNextRow(sr3)) != NULL)
            {
            mapID = row3[1];
            desc  = row3[2];
            fprintf(o1, "%s\t%s\t%s\n", kgID, locusID, mapID);
            fprintf(o2, "%s\t%s\n", mapID, desc);
            }
        sqlFreeResult(&sr3);
        }
    sqlFreeResult(&sr);
    }
sqlFreeResult(&sr2);

fclose(o1);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn2);
hFreeConn(&conn3);

mustSystem("cat j.dat|sort|uniq >keggPathway.tab");
mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
int main(int argc, char *argv[]) { struct sqlConnection *conn; FILE *o1; char *chp0, *chp; char *genomeDBname; char refseqID[40], mapID[40]; char *kgID, *geneSymbol; if (argc != 3) usage(); infileName = argv[1]; genomeDBname = argv[2]; conn= hAllocConn(genomeDBname); o1 = fopen("j.dat", "w"); inf = mustOpen(infileName, "r"); while (fgets(line_in, 1000, inf) != NULL) { strcpy(line, line_in); chp = strstr(line, "\t"); *chp = '\0'; strcpy(refseqID, line); again: chp ++; chp0 = chp; chp = strtok(chp, "\r\t\n"); if (chp == NULL) continue; sprintf(cond_str, "alias='%s'", refseqID); kgID=sqlGetField(genomeDBname, "kgAlias", "kgID", cond_str); // check with refLink if not found in kgAlias if (kgID == NULL) { sprintf(cond_str, "mrnaAcc='%s'", refseqID); geneSymbol=sqlGetField(genomeDBname, "refLink", "name", cond_str); sprintf(cond_str, "alias='%s'", geneSymbol); kgID=sqlGetField(genomeDBname, "kgAlias", "kgID", cond_str); } strcpy(mapID, chp); if (kgID != NULL) { fprintf(o1, "%s\t%s\t%s\n", kgID, refseqID, mapID);fflush(stdout); } else { printf("%s not found in kgAlias nor in refLink\n", refseqID); } chp = chp + strlen(mapID); // process remaing refeqID(s) goto again; } fclose(o1); mustSystem("cat j.dat|sort|uniq >bioCycPathway.tab"); mustSystem("rm j.dat"); return 0; }
int main(int argc, char *argv[])
/* Build mrnaRefseq.tab.
 *
 * argv[1] is the genome database name.  For every locus in
 * <db>Temp.locus2Ref0, each mRNA accession of the same locus in
 * <db>Temp.locus2Acc0 is written next to the locus' RefSeq accession, both
 * with the ".version" suffix removed.  The pairs pass through j.dat and
 * sort|uniq. */
{
struct sqlConnection *conn, *conn2;
char query[256], query2[256];
struct sqlResult *sr, *sr2;
char **row, **row2;
char *chp;
FILE *o1;
char *gbAC;         /* GenBank accession.version */
char *locusID2;     /* LocusLink ID */
char *refAC;        /* Refseq accession.version */
char *dbName;

if (argc != 2) usage();
dbName = argv[1];

conn = hAllocConn(dbName);
conn2= hAllocConn(dbName);

o1 = mustOpen("j.dat", "w");

sqlSafef(query2, sizeof query2, "select * from %sTemp.locus2Ref0;", dbName);
sr2 = sqlMustGetResult(conn2, query2);
while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    locusID2 = row2[0];
    refAC    = row2[1];

    /* strip the version once per locus instead of once per accession */
    chp = strstr(refAC, ".");
    if (chp != NULL) *chp = '\0';

    sqlSafef(query, sizeof query,
             "select * from %sTemp.locus2Acc0 where locusID=%s and seqType='m';",
             dbName, locusID2);
    sr = sqlMustGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        gbAC = row[1];
        chp = strstr(gbAC, ".");
        if (chp != NULL) *chp = '\0';

        fprintf(o1, "%s\t%s\n", gbAC, refAC);
        }
    sqlFreeResult(&sr);
    }
/* release result sets before the connections that produced them */
sqlFreeResult(&sr2);

fclose(o1);
hFreeConn(&conn);
hFreeConn(&conn2);

mustSystem("cat j.dat|sort|uniq >mrnaRefseq.tab");
printf("mrnaRefseq.tab created.\n");
mustSystem("rm j.dat");
return(0);
}
int main(int argc, char *argv[])
/* Build kgProtAliasNCBI.tab.
 *
 * argv[1] is the genome database; argv[2] is the read-only database holding
 * refLink.  For each knownGene row this writes "kgID proteinID proteinAC"
 * for:
 *   - the RefSeq protein accession in kgXref.protAcc, when non-empty;
 *   - for kgIDs containing "NM_", every refLink.protAcc of that mRNA;
 *   - otherwise every <db>Temp.locus2Acc0.proteinAC whose gbAC starts with
 *     the kgID, version suffix removed, skipping values starting with '-'.
 * Lines pass through jj.dat and sort|uniq. */
{
struct sqlConnection *conn, *conn2;
char query[256], query2[256];
struct sqlResult *sr, *sr2;
char **row, **row2;
char *chp;
char *kgID;
FILE *o2;
char cond_str[256];
char *database;
char *ro_db;
char *proteinID;
char *proteinAC;

if (argc != 3) usage();
database = argv[1];
ro_db    = argv[2];

conn = hAllocConn(database);
conn2= hAllocConn(database);

o2 = mustOpen("jj.dat", "w");

sqlSafef(query2, sizeof query2, "select name, proteinID from %s.knownGene;", database);
sr2 = sqlMustGetResult(conn2, query2);
while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    kgID      = row2[0];
    proteinID = row2[1];

    /* get RefSeq protein AC numbers (NP_xxxxx) if they exist */
    sqlSafefFrag(cond_str, sizeof cond_str, "kgID='%s'", kgID);
    proteinAC = sqlGetField(database, "kgXref", "protAcc", cond_str);
    if (proteinAC != NULL && proteinAC[0] != '\0')
        fprintf(o2, "%s\t%s\t%s\n", kgID, proteinID, proteinAC);

    /* get Genbank protein accession numbers */
    if (strstr(kgID, "NM_") != NULL)
        {
        sqlSafef(query, sizeof query,
                 "select protAcc from %s.refLink where mrnaAcc = '%s';", ro_db, kgID);
        sr = sqlMustGetResult(conn, query);
        while ((row = sqlNextRow(sr)) != NULL)
            {
            proteinAC = row[0];
            fprintf(o2, "%s\t%s\t%s\n", kgID, proteinID, proteinAC);
            }
        sqlFreeResult(&sr);
        }
    else
        {
        /* "%%" is a literal '%', i.e. a prefix match on the accession */
        sqlSafef(query, sizeof query,
                 "select proteinAC from %sTemp.locus2Acc0 where gbAC like '%s%%';",
                 database, kgID);
        sr = sqlMustGetResult(conn, query);
        while ((row = sqlNextRow(sr)) != NULL)
            {
            proteinAC = row[0];
            chp = strstr(proteinAC, ".");
            if (chp != NULL)
                *chp = '\0';
            if (proteinAC[0] != '-')
                fprintf(o2, "%s\t%s\t%s\n", kgID, proteinID, proteinAC);
            }
        sqlFreeResult(&sr);
        }
    }
sqlFreeResult(&sr2);

fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn2);

mustSystem("cat jj.dat|sort|uniq >kgProtAliasNCBI.tab");
mustSystem("rm jj.dat");
return(0);
}
void gsBig(char *faName, char *gtfName, char *suboptName, char *transName, char *exeName, char *parName, char *tmpDirName) /* gsBig - Run Genscan on big input and produce GTF files. */ { struct dnaSeq seq; struct lineFile *lf = lineFileOpen(faName, TRUE); FILE *gtfFile = mustOpen(gtfName, "w"); FILE *subFile = NULL; FILE *transFile = NULL; ZeroVar(&seq); if (suboptName != NULL) subFile = mustOpen(suboptName, "w"); if (transName != NULL) transFile = mustOpen(transName, "w"); if (exeName != NULL) exePath = cloneString(exeName); if (parName != NULL) parPath = cloneString(parName); if (tmpDirName != NULL) tmpDir = cloneString(tmpDirName); if (optionExists("prerun")) { char *preFileName = optionVal("prerun", NULL); char seqName[128]; struct segment *seg = parseSegment(preFileName, 0, 100000000, seqName); writeSeg(seqName, seg, gtfFile, subFile, transFile); } else { struct dyString *dy = newDyString(1024); char tempFa[512], tempGs[512]; char dir1[256], root1[128], ext1[64]; int myPid = (int)getpid(); splitPath(faName, dir1, root1, ext1); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { int offset, sizeOne; struct segment *segList = NULL, *seg; char *seqName = cloneString(seq.name); int chunkNum = 0; for (offset = 0; offset < seq.size; offset += stepSize) { boolean allN = TRUE; int i; safef(tempFa, sizeof(tempFa), "%s/temp_gsBig_%d_%s_%d.fa", tmpDir, myPid, seqName, chunkNum); safef(tempGs, sizeof(tempGs), "%s/temp_gsBig_%d_%s_%d.genscan", tmpDir, myPid, seqName, chunkNum); sizeOne = seq.size - offset; if (sizeOne > winSize) sizeOne = winSize; /* Genscan hangs forever if a chunk is all-N's... if so, * then skip this chunk. 
*/ for (i=offset; i < (offset+sizeOne); i++) { if (seq.dna[i] != 'N' && seq.dna[i] != 'n') { allN = FALSE; break; } } if (allN) { printf("\ngsBig: skipping %s[%d:%d] -- it's all N's.\n\n", seqName, offset, (offset+sizeOne-1)); } else { faWrite(tempFa, "split", seq.dna + offset, sizeOne); dyStringClear(dy); dyStringPrintf(dy, "%s %s %s", exePath, parPath, tempFa); if (suboptName != NULL) dyStringPrintf(dy, " -subopt"); dyStringPrintf(dy, " > %s", tempGs); verbose(3, "%s\n", dy->string); mustSystem(dy->string); seg = parseSegment(tempGs, offset, offset+sizeOne, NULL); slAddHead(&segList, seg); } chunkNum++; } slReverse(&segList); seg = mergeSegs(segList); writeSeg(seqName, seg, gtfFile, subFile, transFile); freez(&seqName); } if (! optionExists("noRemove")) { remove(tempFa); remove(tempGs); } } }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; char query2[256]; struct sqlResult *sr2; char **row2; char cond_str[256]; char *kgID; char *proteinID; char *seq; char *acc; char protDbName[100]; char spDbName[100]; char *dbName; char *ro_dbName; FILE *o1, *o2; struct dnaSeq *kgSeq; if (argc != 4) usage(); o1 = fopen("j.dat", "w"); o2 = fopen("jj.dat", "w"); dbName = argv[1]; ro_dbName = argv[3]; sprintf(protDbName, "proteins%s", argv[2]); sprintf(spDbName, "sp%s", argv[2]); conn= hAllocConn(ro_dbName); conn2= hAllocConn(ro_dbName); sqlSafef(query2, sizeof query2, "select name from %s.knownGene;", dbName); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { kgID = row2[0]; sqlSafefFrag(cond_str, sizeof cond_str, "name = '%s';", kgID); seq = sqlGetField(dbName, "knownGenePep", "seq", cond_str); if (seq != NULL) { fprintf(o1, "%s\t%s\n", kgID, seq);fflush(o1); } else { sqlSafefFrag(cond_str, sizeof cond_str, "name = '%s';", kgID); proteinID=sqlGetField(dbName, "knownGene", "proteinID", cond_str); if (proteinID != NULL) { sqlSafefFrag(cond_str, sizeof cond_str, "val = '%s';", proteinID); acc = sqlGetField(spDbName, "displayId", "acc", cond_str); if (acc == NULL) { fprintf(stderr, "NO acc.displayId.%s: %s from name.knownGene.%s: %s\n", spDbName, proteinID, dbName, kgID); fflush(stderr); } else { sqlSafefFrag(cond_str, sizeof cond_str, "acc = '%s';", acc); seq = sqlGetField(spDbName, "protein", "val", cond_str); if (seq == NULL) { fprintf(stderr, "NO protein seq for %s\n", kgID); fprintf(stderr, "proteinID.knownGene.%s: %s, acc.displayID.%s: %s\n", dbName, proteinID, spDbName, acc); fflush(stderr); } else { fprintf(o1, "%s\t%s\n", kgID, seq); } } } else { fprintf(stderr, "kgID: %s not in knownGenePep or knownGene\n", kgID); } } sqlSafefFrag(cond_str, sizeof cond_str, "name = '%s';", kgID); seq = sqlGetField(dbName, "knownGeneMrna", "seq", cond_str); if (seq != NULL) { fprintf(o2, "%s\t%s\n", kgID, seq);fflush(o1); 
} else { kgSeq = hGenBankGetMrna(dbName, kgID, NULL); if (kgSeq != NULL) { fprintf(o2, "%s\t%s\n", kgID, kgSeq->dna);fflush(o1); } else { fprintf(stderr, "NO mRNA seq for %s\n", kgID);fflush(stderr); } } row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); fclose(o1); fclose(o2); mustSystem("cat j.dat |sort|uniq > knownGenePep.tab"); mustSystem("cat jj.dat|sort|uniq > knownGeneMrna.tab"); mustSystem("rm j.dat"); mustSystem("rm jj.dat"); return(0); }