int main(int argc, char *argv[])
/* Entry point: argv[1] is stored in proteinId, argv[2] names the input file.
 * The first two lines of the input are skipped as .rdb headers, then every
 * remaining line is handed, without its trailing newline, to processOneLine().
 * proteinId and previousWord are file-level variables defined outside this
 * block. */
{
FILE *inf;
char line[1000];
int i;
char *infileName;

if (argc != 3) usage();

proteinId = argv[1];
infileName = argv[2];

/* Seed every slot with a placeholder before any line is processed. */
for (i=0; i<8; i++)
    previousWord[i] = strdup("n/a");

inf = mustOpen(infileName, "r");

/* skip initial 2 header lines in .rdb format file */
mustGetLine(inf, line, sizeof(line));
mustGetLine(inf, line, sizeof(line));

/* read and process all lines one by one */
while (fgets(line, sizeof(line), inf) != NULL)
    {
    size_t len = strlen(line);

    /* Only strip a newline that is really there: a final line without one,
     * or a chunk of an over-long line, must keep its last character. */
    if (len > 0 && line[len-1] == '\n')
        line[len-1] = '\0';
    processOneLine(line);
    }
return(0);
}
static boolean findAltSpliceRange(char *name, struct snof *snof, FILE *f, char **retChrom, int *retStart, int *retEnd, char *retStrand) /* Return range of chromosome covered by a gene and all of it's isoforms. */ { char baseName[64]; char bName[64]; int snIx, maxIx; int start = 0x7fffffff; int end = -start; char lineBuf[128]; char *words[3]; int wordCount; int baseNameSize; strcpy(baseName, name); makeIsoformBaseName(baseName); baseNameSize = strlen(baseName); if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx)) return FALSE; maxIx = snofElementCount(snof); for (;snIx < maxIx; ++snIx) { long offset; char *geneName; snofNameOffsetAtIx(snof, snIx, &geneName, &offset); if (strncmp(geneName, baseName, baseNameSize) != 0) break; strcpy(bName, geneName); makeIsoformBaseName(bName); if (sameString(baseName, bName)) { int s, e; fseek(f, offset, SEEK_SET); mustGetLine(f, lineBuf, sizeof(lineBuf)); wordCount = chopLine(lineBuf, words); assert(wordCount == 3); wormParseChromRange(words[0], retChrom, &s, &e); *retStrand = words[1][0]; if (start > s) start = s; if (end < e) end = e; } } *retStart = start; *retEnd = end; return TRUE; }
boolean wormCdnaInfo(char *name, struct wormCdnaInfo *retInfo) /* Get info about cDNA sequence. */ { char commentBuf[512]; char *comment; long offset; wormCdnaCache(); if (!snofFindOffset(cdnaSnof, name, &offset)) return FALSE; fseek(cdnaFa, offset, SEEK_SET); mustGetLine(cdnaFa, commentBuf, sizeof(commentBuf)); if (commentBuf[0] != '>') errAbort("Expecting line starting with > in cDNA fa file.\nGot %s", commentBuf); comment = cloneString(commentBuf); wormFaCommentIntoInfo(comment, retInfo); return TRUE; }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; char query2[256]; struct sqlResult *sr2; char **row2; char cond_str[255]; char *proteinDatabaseName; FILE *o1, *o2, *o3; FILE *fh[23]; char temp_str[1000];; char *accession; char *aaSeq; char *chp; int i, j, len; int ihi, ilow; char *answer; char *protDisplayId; int aaResCnt[30]; char aaAlphabet[30]; int aaResFound; float fvalue1, fvalue2; float p1, p2; int icnt, jcnt; char *taxon; char *database; int sortedCnt; if (argc != 4) usage(); strcpy(aaAlphabet, "WCMHYNFIDQKRTVPGEASLXZB"); proteinDatabaseName = argv[1]; taxon = argv[2]; database = argv[3]; o2 = mustOpen("pbResAvgStd.tab", "w"); for (i=0; i<20; i++) { safef(temp_str, sizeof(temp_str), "%c.txt", aaAlphabet[i]); fh[i] = mustOpen(temp_str, "w"); } conn = hAllocConn(hDefaultDb()); conn2 = hAllocConn(hDefaultDb()); safef(query2, sizeof(query2), "select proteinID from %s.knownGene;", database); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); icnt = 0; jcnt = 0; for (j=0; j<MAXRES; j++) { sumJ[j] = 0; } while (row2 != NULL) { protDisplayId = row2[0]; safef(cond_str, sizeof(cond_str), "val='%s'", protDisplayId); accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str); if (accession == NULL) { safef(cond_str, sizeof(cond_str), "acc='%s'", protDisplayId); accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str); if (accession == NULL) { verbose(2, "'%s' not found.\n", protDisplayId); goto skip; } } safef(cond_str, sizeof(cond_str), "accession='%s'", accession); answer = sqlGetField("proteins040115", "spXref2", "biodatabaseID", cond_str); if (answer == NULL) { /* this protein might be a variant splice protein, and then it won't be in spXref2 */ goto skip; } if (answer[0] != '1') { /* printf("%s not in SWISS-PROT\n", protDisplayId);fflush(stdout); */ goto skip; } safef(cond_str, sizeof(cond_str), "acc='%s'", accession); aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str); if (aaSeq 
== NULL) { printf("Can't find peptide sequence for %s, exiting ...\n", protDisplayId); fflush(stdout); exit(1); } len = strlen(aaSeq); if (len < 100) goto skip; lenDouble = (double)len; for (j=0; j<MAXRES; j++) { aaResCnt[j] = 0; } chp = aaSeq; for (i=0; i<len; i++) { aaResFound = 0; for (j=0; j<MAXRES; j++) { if (*chp == aaAlphabet[j]) { aaResFound = 1; aaResCnt[j] ++; } } if (!aaResFound) { fprintf(stderr, "%c %d not a valid AA residue.\n", *chp, *chp); } chp++; } for (j=0; j<MAXRES; j++) { freq[icnt][j] = (double)aaResCnt[j]/lenDouble; sumJ[j] = sumJ[j] + freq[icnt][j]; } for (j=0; j<20; j++) { fprintf(fh[j], "%15.7f\t%s\n", freq[icnt][j], accession); fflush(fh[j]); } icnt++; if (icnt >= MAXN) errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN); skip: row2 = sqlNextRow(sr2); } recordCnt = icnt; recordCntDouble = (double)recordCnt; for (j=0; j<20; j++) { carefulClose(&(fh[j])); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); for (j=0; j<MAXRES; j++) { avg[j] = sumJ[j]/recordCntDouble; } for (j=0; j<20; j++) { sum = 0.0; for (i=0; i<recordCnt; i++) { sum = sum + (freq[i][j] - avg[j]) * (freq[i][j] - avg[j]); } sigma[j] = sqrt(sum/(double)(recordCnt-1)); fprintf(o2, "%c\t%f\t%f\n", aaAlphabet[j], avg[j], sigma[j]); } carefulClose(&o2); o1 = mustOpen("pbAnomLimit.tab", "w"); for (j=0; j<20; j++) { safef(temp_str, sizeof(temp_str), "cat %c.txt|sort|uniq > %c.srt", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); /* figure out how many unique entries */ safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); safef(temp_str, sizeof(temp_str), "%c.tmp", aaAlphabet[j]); o3 = mustOpen(temp_str, "r"); mustGetLine(o3, temp_str, 1000); chp = temp_str; while (*chp == ' ') chp++; while (*chp != ' ') chp++; *chp = '\0'; sscanf(temp_str, "%d", &sortedCnt); safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]); mustSystem(temp_str); /* cal hi and low cutoff threshold */ ilow = 
(int)((float)sortedCnt * 0.025); ihi = (int)((float)sortedCnt * 0.975); safef(temp_str, sizeof(temp_str), "%c.srt", aaAlphabet[j]); o2 = mustOpen(temp_str, "r"); i=0; for (i=0; i<ilow; i++) { mustGetLine(o2, temp_str, 1000); } sscanf(temp_str, "%f", &fvalue1); mustGetLine(o2, temp_str, 1000); sscanf(temp_str, "%f", &fvalue2); p1 = (fvalue1 + fvalue2)/2.0; for (i=ilow+1; i<ihi; i++) { mustGetLine(o2, temp_str, 1000); } sscanf(temp_str, "%f", &fvalue1); mustGetLine(o2, temp_str, 1000); sscanf(temp_str, "%f", &fvalue2); p2 = (fvalue1 + fvalue2)/2.0; carefulClose(&o2); fprintf(o1, "%c\t%f\t%f\n", aaAlphabet[j], p1, p2); fflush(stdout); for (i=0; i<recordCnt; i++) { measure[i] = freq[i][j]; } safef(temp_str, sizeof(temp_str), "pbAaDist%c.tab", aaAlphabet[j]); calDist(measure, recordCnt, 51, 0.0, 0.005, temp_str); } carefulClose(&o1); return(0); }
int main(int argc, char *argv[])
/* Read a record-oriented text file (argv[1]) and write one "<id>\t<taxon>"
 * line to argv[2] for every NCBI_TaxID value found on the OX lines of each
 * record.  A record starts with a line beginning with "ID " and ends with a
 * line beginning with "//".  Taxon values are separated by ',' and the list
 * is terminated by ';'; a ',' directly followed by end of line means the
 * list continues on the next OX line.
 * Exits with status 8 if a file cannot be opened and 1 on malformed input. */
{
char *id;
char *ox;
char *chp;
char *infName, *outfName;
char line[2000];
FILE *inf, *outf;

if (argc!=3)
    {
    usage();
    }

infName = argv[1];
outfName = argv[2];

if ((inf = fopen(infName, "r")) == NULL)
    {
    fprintf(stderr, "Can't open file %s.\n", infName);
    exit(8);
    }
/* The output file used to be opened without a check. */
if ((outf = fopen(outfName, "w")) == NULL)
    {
    fprintf(stderr, "Can't open file %s.\n", outfName);
    exit(8);
    }

while (fgets(line, sizeof(line), inf) != NULL)
    {
    /* Every record must open with an ID line. */
    chp = strstr(line, "ID ");
    if (chp != line)
        {
        fprintf(stderr, "expected ID line, but got: %s\n", line);
        exit(1);
        }
    id = chp + strlen("ID ");
    chp = strstr(id, " ");
    if (chp == NULL)
        {
        fprintf(stderr, "malformed ID line (no space after identifier): %s\n", line);
        exit(1);
        }
    *chp = '\0';
    /* line is reused below, so keep a private copy of the identifier. */
    id = strdup(id);
    if (id == NULL)
        {
        fprintf(stderr, "out of memory\n");
        exit(1);
        }

    /* Scan the rest of the record; EOF here also ends the outer loop. */
    while (fgets(line, sizeof(line), inf) != NULL)
        {
        /* "//" is the end of record line */
        if ((line[0] == '/') && (line[1] == '/'))
            break;
        if (strstr(line, "OX ") == NULL)
            continue;
        /* "OX " can also occur inside other text; without the tag there is
         * nothing to extract from this line. */
        chp = strstr(line, "NCBI_TaxID=");
        if (chp == NULL)
            continue;
        ox = chp + strlen("NCBI_TaxID=");

        while (ox != NULL)
            {
            chp = strstr(ox, ",");
            if (chp != NULL)
                {
                *chp = '\0';
                while (*ox == ' ') ox++;
                fprintf(outf, "%s\t%s\n", id, ox);
                ox = chp + 1;
                if (*ox == '\n')
                    {
                    /* Trailing comma: the list continues on the next line. */
                    mustGetLine(inf, line, sizeof(line));
                    if (strstr(line, "OX ") == NULL)
                        {
                        fprintf(stderr, "no OX line after OX continuation line!\n");
                        exit(1);
                        }
                    ox = line + strlen("OX ");
                    }
                }
            else
                {
                chp = strstr(ox, ";");
                if (chp != NULL)
                    {
                    *chp = '\0';
                    while (*ox == ' ') ox++;
                    fprintf(outf, "%s\t%s\n", id, ox);
                    }
                else
                    {
                    /* Neither ',' nor ';': this used to spin forever. */
                    fprintf(stderr, "no ';' terminating OX entry for %s\n", id);
                    }
                ox = NULL;
                }
            }
        }
    free(id);
    }

fclose(inf);
if (fclose(outf) != 0)
    {
    fprintf(stderr, "Error writing file %s.\n", outfName);
    exit(1);
    }
return 0;
}
boolean wormGeneRange(char *name, char **retChrom, char *retStrand, int *retStart, int *retEnd) /* Return chromosome position of a chrom range, gene, ORF, cosmid, or nameless cluster. */ { static struct snof *c2gSnof = NULL, *c2cSnof = NULL; static FILE *c2gFile = NULL, *c2cFile = NULL; long offset; char fileName[512]; struct slName *syn = NULL; boolean ok; if (wormIsChromRange(name)) { *retStrand = '.'; return wormParseChromRange(name, retChrom, retStart, retEnd); } getDirs(); /* Translate biologist type name to cosmid.N name */ if (wormIsGeneName(name)) { syn = wormGeneToOrfNames(name); if (syn != NULL) { name = syn->name; } } if (wormFixupOrfName(name)) /* See if ORF, and if so make nice. */ { if (c2gSnof == NULL) { sprintf(fileName, "%sc2g", featDir); c2gSnof = snofMustOpen(fileName); sprintf(fileName, "%sc2g", featDir); c2gFile = mustOpen(fileName, "rb"); } ok = findAltSpliceRange(name, c2gSnof, c2gFile, retChrom, retStart, retEnd, retStrand); } else /* Lets say it's a cosmid. */ { char lineBuf[128]; char *words[3]; int wordCount; touppers(name); if (c2cSnof == NULL) { sprintf(fileName, "%sc2c", featDir); c2cSnof = snofMustOpen(fileName); sprintf(fileName, "%sc2c", featDir); c2cFile = mustOpen(fileName, "rb"); } if (!snofFindOffset(c2cSnof, name, &offset) ) return FALSE; fseek(c2cFile, offset, SEEK_SET); mustGetLine(c2cFile, lineBuf, sizeof(lineBuf)); wordCount = chopLine(lineBuf, words); assert(wordCount == 3); assert(strcmp(words[2], name) == 0); assert(wormIsChromRange(words[0])); *retStrand = words[1][0]; ok = wormParseChromRange(words[0], retChrom, retStart, retEnd); } slFreeList(&syn); return ok; }