예제 #1
0
파일: samHit.c 프로젝트: blumroy/kentUtils
int main(int argc, char *argv[])
/* Read an .rdb format file and pass each data line to processOneLine().
 * Expects exactly two arguments: argv[1] is stored in proteinId and
 * argv[2] is the name of the input file.  Returns 0 on completion. */
{
FILE *inf;
char line[1000];

int i;
char *infileName;
size_t len;

if (argc != 3) usage();

proteinId  = argv[1];
infileName = argv[2];

/* Give the 8 previousWord slots used here a placeholder value. */
for (i=0; i<8; i++) previousWord[i] = strdup("n/a");
    
inf   = mustOpen(infileName, "r");

/* skip initial 2 header lines in .rdb format file */
mustGetLine(inf, line, sizeof(line));
mustGetLine(inf, line, sizeof(line));

/* read and process all lines one by one */
while (fgets(line, sizeof(line), inf) != NULL)
    {
    /* Strip the trailing newline only when one is actually present.  The
     * last line of a file (or an over-long line) may not end in '\n', and
     * blindly chopping the final character would lose data. */
    len = strlen(line);
    if (len > 0 && line[len-1] == '\n')
        line[len-1] = '\0';
    processOneLine(line);
    }

fclose(inf);
return(0);
}
예제 #2
0
static boolean findAltSpliceRange(char *name, struct snof *snof, FILE *f, 
    char **retChrom, int *retStart, int *retEnd, char *retStrand)
/* Return range of chromosome covered by a gene and all of its isoforms.
 * The index snof is scanned from the first entry starting with the isoform
 * base name of 'name'; for every entry whose own base name is identical, the
 * line at the indexed offset of f is read and its range merged into the
 * result.  *retChrom and *retStrand are set from the matching lines, and
 * *retStart/*retEnd receive the smallest start and largest end seen.
 * Returns FALSE (leaving the outputs untouched) if the name is too long
 * for the local buffers or if no matching entry is found. */
{
char baseName[64];
char bName[64];
int snIx, maxIx;
int start = 0x7fffffff;
int end = -start;
char lineBuf[128];
char *words[3];
int wordCount;
int baseNameSize;
boolean gotAny = FALSE;

/* Refuse names that would overflow baseName rather than smashing the stack. */
if (strlen(name) >= sizeof(baseName))
    return FALSE;
strcpy(baseName, name);
makeIsoformBaseName(baseName);
baseNameSize = strlen(baseName);
if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
    return FALSE;
maxIx = snofElementCount(snof);
for (;snIx < maxIx; ++snIx)
    {
    long offset;
    char *geneName;

    snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
    /* Stop at the first entry that no longer shares the prefix. */
    if (strncmp(geneName, baseName, baseNameSize) != 0)
        break;
    /* An index name too long for bName is skipped instead of overflowing. */
    if (strlen(geneName) >= sizeof(bName))
        continue;
    strcpy(bName, geneName);
    makeIsoformBaseName(bName);
    if (sameString(baseName, bName))
        {
        int s, e;
        fseek(f, offset, SEEK_SET);
        mustGetLine(f, lineBuf, sizeof(lineBuf));
        wordCount = chopLine(lineBuf, words);
        assert(wordCount == 3);
        wormParseChromRange(words[0], retChrom, &s, &e);
        *retStrand = words[1][0];
        if (start > s)
            start = s;
        if (end < e)
            end = e;
        gotAny = TRUE;
        }
    }
/* A shared prefix is not enough (the base names must be equal), so the
 * loop can finish without a single hit.  Report that as a failure instead
 * of handing back the sentinel start/end values. */
if (!gotAny)
    return FALSE;
*retStart = start;
*retEnd = end;
return TRUE;
}
예제 #3
0
boolean wormCdnaInfo(char *name, struct wormCdnaInfo *retInfo)
/* Look up the cDNA called 'name' in the cDNA index and fill in retInfo from
 * the comment line stored at the indexed position of the cDNA fa file.
 * Returns FALSE if the name is not in the index, TRUE otherwise.  Aborts if
 * the line found does not start with '>'. */
{
char headerLine[512];
long faPos;

wormCdnaCache();
if (snofFindOffset(cdnaSnof, name, &faPos))
    {
    fseek(cdnaFa, faPos, SEEK_SET);
    mustGetLine(cdnaFa, headerLine, sizeof(headerLine));
    if (headerLine[0] != '>')
        errAbort("Expecting line starting with > in cDNA fa file.\nGot %s", headerLine);
    /* A heap copy of the line is handed over; it is not freed here. */
    wormFaCommentIntoInfo(cloneString(headerLine), retInfo);
    return TRUE;
    }
return FALSE;
}
int main(int argc, char *argv[])
/* Compute amino acid residue frequency statistics for the proteins listed in
 * <database>.knownGene.
 *
 * Arguments: argv[1] protein database name, argv[2] taxon (stored but not
 * otherwise used here), argv[3] genome database name.
 *
 * Output files written to the current directory:
 *   <residue>.txt    - "frequency<TAB>accession" for each protein used
 *   pbResAvgStd.tab  - residue, average frequency, standard deviation
 *   pbAnomLimit.tab  - residue, low and high cutoff (2.5% / 97.5% positions
 *                      in the sorted, de-duplicated frequency list)
 *   pbAaDist<residue>.tab - written by calDist() for each residue
 * Temporary <residue>.srt and <residue>.tmp files are produced with shell
 * commands; the .tmp file is removed again.
 *
 * Proteins are skipped when no accession is found, when the spXref2 lookup
 * yields nothing or a value not starting with '1', or when the sequence is
 * shorter than 100 residues. */
{
    struct sqlConnection *conn, *conn2;
    char query2[256];
    struct sqlResult *sr2;
    char **row2;
    char cond_str[255];
    char *proteinDatabaseName;
    FILE *o1, *o2, *o3;
    FILE *fh[23];
    char temp_str[1000];
    char *accession;
    char *aaSeq;
    char *chp;
    int i, j, len;
    int ihi, ilow;
    char *answer;
    char *protDisplayId;
    int aaResCnt[30];
    /* Initialized in the declaration so bytes past the 23 residue letters are
     * zero rather than indeterminate (matters only if MAXRES exceeds 23). */
    char aaAlphabet[30] = "WCMHYNFIDQKRTVPGEASLXZB";
    int aaResFound;
    float fvalue1, fvalue2;
    float p1, p2;
    int icnt;
    char *taxon;
    char *database;
    int sortedCnt;

    if (argc != 4) usage();

    proteinDatabaseName = argv[1];
    taxon = argv[2];
    database = argv[3];

    o2 = mustOpen("pbResAvgStd.tab", "w");

    /* One frequency file per residue, for the first 20 letters of the alphabet. */
    for (i=0; i<20; i++)
    {
        safef(temp_str, sizeof(temp_str), "%c.txt", aaAlphabet[i]);
        fh[i] = mustOpen(temp_str, "w");
    }

    conn  = hAllocConn(hDefaultDb());
    conn2 = hAllocConn(hDefaultDb());

    safef(query2, sizeof(query2), "select proteinID from %s.knownGene;", database);
    sr2 = sqlMustGetResult(conn2, query2);
    row2 = sqlNextRow(sr2);
    icnt = 0;

    for (j=0; j<MAXRES; j++)
    {
        sumJ[j] = 0;
    }

    while (row2 != NULL)
    {
        protDisplayId = row2[0];
        /* Try the id as a display id first, then as an accession. */
        safef(cond_str, sizeof(cond_str),  "val='%s'", protDisplayId);
        accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str);

        if (accession == NULL)
        {
            safef(cond_str, sizeof(cond_str),  "acc='%s'", protDisplayId);
            accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str);
            if (accession == NULL)
            {
                verbose(2, "'%s' not found.\n", protDisplayId);
                goto skip;
            }
        }

        /* NOTE(review): this lookup uses a hard-coded database name rather
         * than proteinDatabaseName - confirm that is intended. */
        safef(cond_str, sizeof(cond_str),  "accession='%s'", accession);
        answer = sqlGetField("proteins040115", "spXref2", "biodatabaseID", cond_str);
        if (answer == NULL)
        {
            /* this protein might be a variant splice protein, and then it won't be in spXref2 */
            goto skip;
        }
        if (answer[0] != '1')
        {
            /* printf("%s not in SWISS-PROT\n", protDisplayId);fflush(stdout); */
            goto skip;
        }

        safef(cond_str, sizeof(cond_str),  "acc='%s'", accession);
        aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str);
        if (aaSeq == NULL)
        {
            printf("Can't find peptide sequence for %s, exiting ...\n", protDisplayId);
            fflush(stdout);
            exit(1);
        }

        len  = strlen(aaSeq);
        if (len < 100) goto skip;

        lenDouble = (double)len;

        for (j=0; j<MAXRES; j++)
        {
            aaResCnt[j] = 0;
        }

        /* Count occurrences of each residue letter in the sequence. */
        chp = aaSeq;
        for (i=0; i<len; i++)
        {
            aaResFound = 0;
            for (j=0; j<MAXRES; j++)
            {
                if (*chp == aaAlphabet[j])
                {
                    aaResFound = 1;
                    aaResCnt[j] ++;
                }
            }
            if (!aaResFound)
            {
                fprintf(stderr, "%c %d not a valid AA residue.\n", *chp, *chp);
            }
            chp++;
        }

        for (j=0; j<MAXRES; j++)
        {
            freq[icnt][j] = (double)aaResCnt[j]/lenDouble;
            sumJ[j] = sumJ[j] + freq[icnt][j];
        }

        for (j=0; j<20; j++)
        {
            fprintf(fh[j], "%15.7f\t%s\n", freq[icnt][j], accession);
            fflush(fh[j]);
        }
        icnt++;
        if (icnt >= MAXN)
            errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN);

skip:
        row2 = sqlNextRow(sr2);
    }

    recordCnt = icnt;
    recordCntDouble = (double)recordCnt;

    for (j=0; j<20; j++)
    {
        carefulClose(&(fh[j]));
    }

    sqlFreeResult(&sr2);
    hFreeConn(&conn);
    hFreeConn(&conn2);

    /* The sample standard deviation below divides by (recordCnt-1) and the
     * average by recordCnt; with fewer than two records the results would be
     * NaN/undefined, so stop with an explicit message instead. */
    if (recordCnt < 2)
        errAbort("Need at least 2 qualifying proteins to compute statistics, got %d\n", recordCnt);

    for (j=0; j<MAXRES; j++)
    {
        avg[j] = sumJ[j]/recordCntDouble;
    }

    for (j=0; j<20; j++)
    {
        sum = 0.0;
        for (i=0; i<recordCnt; i++)
        {
            sum = sum + (freq[i][j] - avg[j]) * (freq[i][j] - avg[j]);
        }
        sigma[j] = sqrt(sum/(double)(recordCnt-1));
        fprintf(o2, "%c\t%f\t%f\n", aaAlphabet[j], avg[j], sigma[j]);
    }

    carefulClose(&o2);

    o1 = mustOpen("pbAnomLimit.tab", "w");
    for (j=0; j<20; j++)
    {
        safef(temp_str, sizeof(temp_str), "cat %c.txt|sort|uniq > %c.srt",  aaAlphabet[j], aaAlphabet[j]);
        mustSystem(temp_str);

        /* figure out how many unique entries */
        safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp",  aaAlphabet[j], aaAlphabet[j]);
        mustSystem(temp_str);
        safef(temp_str, sizeof(temp_str), "%c.tmp",  aaAlphabet[j]);
        o3 = mustOpen(temp_str, "r");
        mustGetLine(o3, temp_str, sizeof(temp_str));
        /* Close the handle right away; previously it leaked once per residue. */
        carefulClose(&o3);
        /* The first blank-delimited field of the wc output is the line count. */
        chp = temp_str;
        while (*chp == ' ') chp++;
        while (*chp != ' ' && *chp != '\0') chp++;
        *chp = '\0';
        sortedCnt = 0;
        sscanf(temp_str, "%d", &sortedCnt);
        safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]);
        mustSystem(temp_str);

        /* cal hi and low cutoff threshold */
        ilow = (int)((float)sortedCnt * 0.025);
        ihi  = (int)((float)sortedCnt * 0.975);

        /* Start from defined values so a failed parse cannot leave stale or
         * indeterminate numbers in the cutoffs. */
        fvalue1 = 0.0;
        fvalue2 = 0.0;

        safef(temp_str, sizeof(temp_str), "%c.srt",  aaAlphabet[j]);
        o2 = mustOpen(temp_str, "r");
        for (i=0; i<ilow; i++)
        {
            mustGetLine(o2, temp_str, sizeof(temp_str));
        }
        /* When ilow is 0 no line has been read yet and temp_str still holds
         * the file name, so there is nothing to parse for fvalue1. */
        if (ilow > 0)
            sscanf(temp_str, "%f", &fvalue1);

        mustGetLine(o2, temp_str, sizeof(temp_str));
        sscanf(temp_str, "%f", &fvalue2);
        /* Low cutoff: midpoint between entries ilow and ilow+1, or simply the
         * first entry when there is no entry before it. */
        if (ilow > 0)
            p1 = (fvalue1 + fvalue2)/2.0;
        else
            p1 = fvalue2;

        for (i=ilow+1; i<ihi; i++)
        {
            mustGetLine(o2, temp_str, sizeof(temp_str));
        }
        sscanf(temp_str, "%f", &fvalue1);

        mustGetLine(o2, temp_str, sizeof(temp_str));
        sscanf(temp_str, "%f", &fvalue2);
        p2 = (fvalue1 + fvalue2)/2.0;
        carefulClose(&o2);

        fprintf(o1, "%c\t%f\t%f\n", aaAlphabet[j], p1, p2);
        fflush(stdout);

        for (i=0; i<recordCnt; i++)
        {
            measure[i] = freq[i][j];
        }
        safef(temp_str, sizeof(temp_str), "pbAaDist%c.tab", aaAlphabet[j]);
        calDist(measure,  recordCnt,    51,     0.0, 0.005, temp_str);
    }

    carefulClose(&o1);

    return(0);
}
예제 #5
0
int main(int argc, char *argv[])
/* Extract (record id, NCBI taxonomy id) pairs from a flat file of records.
 * argv[1] is the input file, argv[2] the output file.
 *
 * Each record must begin with a line starting with "ID   "; the id is the
 * text up to the next space.  The record ends at a line starting with "//".
 * For every line in the record that contains "OX   " and "NCBI_TaxID=", one
 * "<id><TAB><taxId>" row is written per taxonomy id.  Ids are separated by
 * ',' and the list is terminated by ';'.  A ',' directly followed by the end
 * of line means the list continues on the next line, which must again
 * contain "OX   ".
 *
 * Exits with status 8 if a file cannot be opened and 1 on malformed input. */
{
char *id;
char *ox;
char *chp;
char *infName, *outfName;

char line[2000];
FILE *inf, *outf;
int atEof = 0;

if (argc!=3)
   {
   usage();
   }

infName  = argv[1];
outfName = argv[2];

if ((inf = fopen(infName, "r")) == NULL)
    {
    fprintf(stderr, "Can't open file %s.\n", infName);
    exit(8);
    }

if ((outf = fopen(outfName, "w")) == NULL)
    {
    fprintf(stderr, "Can't open file %s.\n", outfName);
    exit(8);
    }

while (!atEof && fgets(line, sizeof(line), inf) != NULL)
    {
    chp = strstr(line, "ID   ");
    if (chp != line)
        {
        fprintf(stderr, "expected ID line, but got: %s\n", line);
        exit(1);
        }
    id = line + strlen("ID   ");
    chp = strstr(id, " ");
    if (chp == NULL)
        {
        fprintf(stderr, "malformed ID line (no space after identifier): %s\n", line);
        exit(1);
        }
    *chp = '\0';
    /* line is reused for the rest of the record, so keep a private copy. */
    id = strdup(id);
    if (id == NULL)
        {
        fprintf(stderr, "out of memory\n");
        exit(1);
        }

    /* Read the remaining lines of this record. */
    for (;;)
        {
        if (fgets(line, sizeof(line), inf) == NULL)
            {
            atEof = 1;
            break;
            }

        /* "//" is the end of record line */
        if ((line[0] == '/') && (line[1] == '/')) break;

        if (strstr(line, "OX   ") == NULL) continue;

        /* "OX   " may also occur inside unrelated text; only lines that
         * really carry a taxonomy id list are processed. */
        chp = strstr(line, "NCBI_TaxID=");
        if (chp == NULL) continue;
        ox = chp + strlen("NCBI_TaxID=");

        while (ox != NULL)
            {
            chp = strstr(ox, ",");
            if (chp != NULL)
                {
                *chp = '\0';
                while (*ox == ' ') ox++;
                fprintf(outf, "%s\t%s\n", id, ox);
                ox = chp + 1;
                if (*ox == '\n')
                    {
                    /* List continues on the following OX line. */
                    mustGetLine(inf, line, sizeof(line));
                    if (strstr(line, "OX   ") == NULL)
                        {
                        fprintf(stderr, "no OX line after OX continuation line!\n");
                        exit(1);
                        }
                    ox = line + strlen("OX   ");
                    }
                continue;
                }

            chp = strstr(ox, ";");
            if (chp != NULL)
                {
                *chp = '\0';
                while (*ox == ' ') ox++;
                fprintf(outf, "%s\t%s\n", id, ox);
                }
            else
                {
                /* Neither ',' nor ';' left: stop instead of looping forever. */
                fprintf(stderr, "unterminated OX line for %s\n", id);
                }
            ox = NULL;
            }
        }
    free(id);
    }
fclose(inf);
fclose(outf);
return 0;
}
예제 #6
0
boolean wormGeneRange(char *name, char **retChrom, char *retStrand, int *retStart, int *retEnd)
/* Return chromosome position of a chrom range, gene, ORF, cosmid, or nameless cluster.
 * If 'name' is itself a chromosome range it is parsed directly and the strand
 * is reported as '.'.  Otherwise a gene name is first translated to its first
 * ORF synonym (if any); ORF names are resolved through the c2g index with
 * findAltSpliceRange(), and anything else is upper-cased in place and looked
 * up as a cosmid in the c2c index.  The index and data files are opened on
 * first use and kept in static variables.
 * Returns TRUE on success, FALSE if the name could not be resolved. */
{
static struct snof *c2gSnof = NULL, *c2cSnof = NULL;
static FILE *c2gFile = NULL, *c2cFile = NULL;
long offset;
char fileName[512];
struct slName *syn = NULL;
boolean ok;

if (wormIsChromRange(name))
    {
    *retStrand = '.';
    return wormParseChromRange(name, retChrom, retStart, retEnd);
    }

getDirs();

/* Translate biologist type name to cosmid.N name */
if (wormIsGeneName(name))
    {
    syn = wormGeneToOrfNames(name);
    if (syn != NULL)
        {
        /* name now points into the syn list, which must therefore stay
         * alive until the lookup below has finished. */
        name = syn->name;
        }
    }
if (wormFixupOrfName(name)) /* See if ORF, and if so make nice. */
    {
    if (c2gSnof == NULL)
        {
        snprintf(fileName, sizeof(fileName), "%sc2g", featDir);
        c2gSnof = snofMustOpen(fileName);
        snprintf(fileName, sizeof(fileName), "%sc2g", featDir);
        c2gFile = mustOpen(fileName, "rb");
        }
    ok = findAltSpliceRange(name, c2gSnof, c2gFile, retChrom, retStart, retEnd, retStrand);
    }
else    /* Lets say it's a cosmid. */
    {
    char lineBuf[128];
    char *words[3];
    int wordCount;
    touppers(name);
    if (c2cSnof == NULL)
        {
        snprintf(fileName, sizeof(fileName), "%sc2c", featDir);
        c2cSnof = snofMustOpen(fileName);
        snprintf(fileName, sizeof(fileName), "%sc2c", featDir);
        c2cFile = mustOpen(fileName, "rb");
        }
    if (!snofFindOffset(c2cSnof, name, &offset) )
        {
        /* Fall through to the common exit so the synonym list is freed;
         * returning here directly would leak it. */
        ok = FALSE;
        }
    else
        {
        fseek(c2cFile, offset, SEEK_SET);
        mustGetLine(c2cFile, lineBuf, sizeof(lineBuf));
        wordCount = chopLine(lineBuf, words);
        assert(wordCount == 3);
        assert(strcmp(words[2], name) == 0);
        assert(wormIsChromRange(words[0]));
        *retStrand = words[1][0];
        ok = wormParseChromRange(words[0], retChrom, retStart, retEnd);
        }
    }
slFreeList(&syn);
return ok;
}