struct segment *parseSegment(char *fileName, int start, int end, char *retSeqName)
/* Read a GENSCAN output file into a newly allocated segment.
 *   fileName   - GENSCAN output file to parse.
 *   start, end - copied unchanged into the segment's start and end fields.
 *   retSeqName - if non-NULL, receives the second word of the "Sequence"
 *                line.  It is filled with strcpy and no length is checked
 *                here, so the caller must supply a large enough buffer.
 * The returned segment has geneList set from the predicted exon table,
 * suboptList set when a "Suboptimal exons" section follows it, and each
 * gene's translation set when peptide records are found.  Problems with
 * the input are reported through errAbort. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct segment *seg;
char *line;
struct genScanFeature *gsfList = NULL, *gsf;
struct genScanGene *gsg;
char *words[2];

/* The first line must identify the file as GENSCAN output. */
if (!lineFileNext(lf, &line, NULL))
    errAbort("%s is empty", fileName);
if (!startsWith("GENSCAN ", line))
    errAbort("%s is not a GENSCAN output file", fileName);

/* Only look for the sequence name when the caller asked for it. */
if (retSeqName != NULL)
    {
    line = mustSkipTo(lf, "Sequence");
    if (chopLine(line, words) < 2)
        errAbort("Expecting sequence name line %d of %s", lf->lineIx, lf->fileName);
    strcpy(retSeqName, words[1]);
    }

/* Move past the headers of the predicted exon table. */
mustSkipTo(lf, "Predicted genes/exons");
mustSkipTo(lf, "Gn.Ex");
mustSkipTo(lf, "-----");
AllocVar(seg);
seg->start = start;
seg->end = end;

/* Exon rows begin with a digit.  Blank lines are skipped; the first
 * other line ends the table and is pushed back for the code below. */
for (;;)
    {
    if (!lineFileNext(lf, &line, NULL))
        break;
    line = skipLeadingSpaces(line);
    if (line == NULL || line[0] == 0)
        continue;
    /* Cast keeps the isdigit argument in the unsigned char range, so a
     * byte with the high bit set cannot become a negative argument. */
    if (!isdigit((unsigned char)line[0]))
        {
        lineFileReuse(lf);
        break;
        }
    gsf = parseGenscanLine(lf, line);
    slAddHead(&gsfList, gsf);
    }
/* Features were added at the head, so restore file order. */
slReverse(&gsfList);
printf("Got %d exons\n", slCount(gsfList));
seg->geneList = bundleGenes(gsfList);
seg->geneList = filterEmptyGenes(seg->geneList);
gsfList = NULL;     /* Start the suboptimal list from empty. */
printf("Got %d genes\n", slCount(seg->geneList));

/* The line that ended the exon table is read again here. */
if (!lineFileNext(lf, &line, NULL))
    errAbort("Unexpected end of file in %s", lf->fileName);
if (startsWith("Suboptimal exons", line))
    {
    mustSkipTo(lf, "-----");
    /* Suboptimal exon rows begin with "S."; anything else ends them. */
    for (;;)
        {
        if (!lineFileNext(lf, &line, NULL))
            break;
        line = skipLeadingSpaces(line);
        if (line == NULL || line[0] == 0)
            continue;
        if (!startsWith("S.", line))
            break;
        gsf = parseGenscanLine(lf, line);
        slAddHead(&gsfList, gsf);
        }
    slReverse(&gsfList);
    seg->suboptList = gsfList;
    printf("Got %d suboptimal exons\n", slCount(seg->suboptList));
    }
/* Push back the last line read so the search below also examines it. */
lineFileReuse(lf);

/* Read one peptide record per remaining gene, in list order. */
mustSkipTo(lf, "Predicted peptide sequence");
if ((line = skipTo(lf, ">")) != NULL)
    {
    /* Hand the '>' line back so the sequence reader sees it. */
    lineFileReuse(lf);
    for (gsg = seg->geneList; gsg != NULL; gsg = gsg->next)
        {
        aaSeq seq;
        if (!faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
            errAbort("Not enough predicted peptides in %s\n", lf->fileName);
        /* Keep a private copy of the sequence text for this gene. */
        gsg->translation = cloneString(seq.dna);
        }
    }

lineFileClose(&lf);
return seg;
}
Exemplo n.º 2
0
void spDbAddVarSplice(char *database, char *inFile, char *outDir)
/* spDbAddVarSplice - This adds information on the variant splices to the sp/uniProt database.
 *   database - database to connect to for taxon, description, gene logic
 *              and gene lookups on the base accession.
 *   inFile   - peptide sequence file whose record names have the form
 *              accession-N|DISP_ID, optionally prefixed with "sp|".
 *   outDir   - directory, created with makeDir, that receives the var*.txt
 *              tab-separated output files.
 * Records that fail the format checks stop the run through errAbort. */
{
struct sqlConnection *conn = sqlConnect(database);
char query[256];
makeDir(outDir);
FILE *varProtein = openToWrite(outDir, "varProtein.txt");
FILE *varAcc = openToWrite(outDir, "varAcc.txt");
FILE *varDisplayId = openToWrite(outDir, "varDisplayId.txt");
FILE *varAccToTaxon = openToWrite(outDir, "varAccToTaxon.txt");
FILE *varDescription = openToWrite(outDir, "varDescription.txt");
FILE *varGene = openToWrite(outDir, "varGene.txt");
FILE *varGeneLogic = openToWrite(outDir, "varGeneLogic.txt");
struct lineFile *lf = lineFileOpen(inFile, TRUE);
aaSeq seq;
ZeroVar(&seq);
while (faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    char *row[4];
    char *name = seq.name;
    if (startsWith("sp|", name))	// Skip over sp| introduced Aug 2009
        name += 3;

    /* Split the name on '-' and '|' into accession, version, display ID. */
    /* NOTE(review): if chopString splits name in place, the message below
     * shows only the part before the first separator - confirm. */
    int rowSize = chopString(name, "-|", row, ArraySize(row));
    if (rowSize != 3)
        errAbort("Expecting name to be in format accession-N|DISP_ID, got %s\n", name);
    char *acc = row[0];
    char *version = row[1];
    char *displayId = row[2];
    int accLen = strlen(acc);
    int verLen = strlen(version);
    int displayIdLen = strlen(displayId);

    /* Do some tests.  The length check comes first, so acc[accLen-1] is
     * only evaluated when accLen is 6 or 10.  The casts keep the isdigit
     * arguments in the unsigned char range. */
    if ((accLen != 6 && accLen != 10) || isdigit((unsigned char)acc[0])
        || !isdigit((unsigned char)acc[accLen-1]))
        errAbort("wierd accession %s before line %d of %s", acc, lf->lineIx, lf->fileName);
    if (!isdigit((unsigned char)version[0]) || verLen > 4)
        errAbort("wierd version %s before line %d of %s", version, lf->lineIx, lf->fileName);
    if (countChars(displayId, '_') != 1 || displayIdLen < 6 || displayIdLen > 16)
        errAbort("wierd displayId %s before line %d of %s", displayId, lf->lineIx, lf->fileName);
    /* "acc-version" plus its terminating zero must fit in an SpAcc.
     * sizeof is cast so the argument matches the %lu specifier. */
    if (accLen + 1 + verLen >= sizeof(SpAcc))
        errAbort("Need to increase size of SpAcc in spDb.h because of %s-%s - need %d characters but only have %lu", acc, version, accLen + 1 + verLen, (unsigned long)sizeof(SpAcc));

    /* Print out parsed results.  The display ID file gets "acc-version" in
     * both columns; the parsed displayId is only validated above. */
    fprintf(varAcc, "%s-%s\t%s\t%s\n", acc, version, acc, version);
    fprintf(varProtein, "%s-%s\t%s\n", acc, version, seq.dna);
    fprintf(varDisplayId, "%s-%s\t%s-%s\n", acc, version, acc, version);

    /* Look up taxon of base protein and use it to write to varAccToTaxon table. */
    int taxon = spTaxon(conn, acc);
    fprintf(varAccToTaxon, "%s-%s\t%d\n", acc, version, taxon);

    /* Transfer description. */
    char *description = spDescription(conn, acc);
    fprintf(varDescription, "%s-%s\t%s\n", acc, version, description);
    freez(&description);

    /* Transfer gene logic.  No line is written when the lookup returns NULL. */
    sqlSafef(query, sizeof(query), "select val from geneLogic where acc = '%s'", acc);
    char *geneLogic = sqlQuickString(conn, query);
    if (geneLogic != NULL)
        fprintf(varGeneLogic, "%s-%s\t%s\n", acc, version, geneLogic);
    freez(&geneLogic);

    /* Transfer genes, one output line per gene name. */
    struct slName *gene, *geneList = spGenes(conn, acc);
    for (gene = geneList; gene != NULL; gene = gene->next)
        fprintf(varGene, "%s-%s\t%s\n", acc, version, gene->name);
    slFreeList(&geneList);

    }

/* Release the input file as well as the outputs and the connection. */
lineFileClose(&lf);
carefulClose(&varAcc);
carefulClose(&varProtein);
carefulClose(&varDisplayId);
carefulClose(&varAccToTaxon);
carefulClose(&varDescription);
carefulClose(&varGene);
carefulClose(&varGeneLogic);
sqlDisconnect(&conn);
}