struct segment *parseSegment(char *fileName, int start, int end, char *retSeqName) /* Read in a genscan file into segment. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct segment *seg; char *line; struct genScanFeature *gsfList = NULL, *gsf; struct genScanGene *gsg; char *words[2]; if (!lineFileNext(lf, &line, NULL)) errAbort("%s is empty", fileName); if (!startsWith("GENSCAN ", line)) errAbort("%s is not a GENSCAN output file", fileName); if (retSeqName != NULL) { line = mustSkipTo(lf, "Sequence"); if (chopLine(line, words) < 2) errAbort("Expecting sequence name line %d of %s", lf->lineIx, lf->fileName); strcpy(retSeqName, words[1]); } mustSkipTo(lf, "Predicted genes/exons"); mustSkipTo(lf, "Gn.Ex"); mustSkipTo(lf, "-----"); AllocVar(seg); seg->start = start; seg->end = end; for (;;) { if (!lineFileNext(lf, &line, NULL)) break; line = skipLeadingSpaces(line); if (line == NULL || line[0] == 0) continue; if (!isdigit(line[0])) { lineFileReuse(lf); break; } gsf = parseGenscanLine(lf, line); slAddHead(&gsfList, gsf); } slReverse(&gsfList); printf("Got %d exons\n", slCount(gsfList)); seg->geneList = bundleGenes(gsfList); seg->geneList = filterEmptyGenes(seg->geneList); gsfList = NULL; printf("Got %d genes\n", slCount(seg->geneList)); if (!lineFileNext(lf, &line, NULL)) errAbort("Unexpected end of file in %s", lf->fileName); if (startsWith("Suboptimal exons", line)) { mustSkipTo(lf, "-----"); for (;;) { if (!lineFileNext(lf, &line, NULL)) break; line = skipLeadingSpaces(line); if (line == NULL || line[0] == 0) continue; if (!startsWith("S.", line)) break; gsf = parseGenscanLine(lf, line); slAddHead(&gsfList, gsf); } slReverse(&gsfList); seg->suboptList = gsfList; printf("Got %d suboptimal exons\n", slCount(seg->suboptList)); } lineFileReuse(lf); mustSkipTo(lf, "Predicted peptide sequence"); if ((line = skipTo(lf, ">")) != NULL) { lineFileReuse(lf); for (gsg = seg->geneList; gsg != NULL; gsg = gsg->next) { aaSeq seq; if (!faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) errAbort("Not enough predicted peptides in %s\n", lf->fileName); gsg->translation = cloneString(seq.dna); } } lineFileClose(&lf); return seg; }
void spDbAddVarSplice(char *database, char *inFile, char *outDir) /* spDbAddVarSplice - This adds information on the varient splices to the sp/uniProt database. */ { struct sqlConnection *conn = sqlConnect(database); char query[256]; makeDir(outDir); FILE *varProtein = openToWrite(outDir, "varProtein.txt"); FILE *varAcc = openToWrite(outDir, "varAcc.txt"); FILE *varDisplayId = openToWrite(outDir, "varDisplayId.txt"); FILE *varAccToTaxon = openToWrite(outDir, "varAccToTaxon.txt"); FILE *varDescription = openToWrite(outDir, "varDescription.txt"); FILE *varGene = openToWrite(outDir, "varGene.txt"); FILE *varGeneLogic = openToWrite(outDir, "varGeneLogic.txt"); struct lineFile *lf = lineFileOpen(inFile, TRUE); aaSeq seq; ZeroVar(&seq); while (faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { char *row[4]; char *name = seq.name; if (startsWith("sp|", name)) // Skip over sp| introduced Aug 2009 name += 3; int rowSize = chopString(name, "-|", row, ArraySize(row)); if (rowSize != 3) errAbort("Expecting name to be in format accession-N|DISP_ID, got %s\n", name); char *acc = row[0]; char *version = row[1]; char *displayId = row[2]; int accLen = strlen(acc); int verLen = strlen(version); int displayIdLen = strlen(displayId); /* Do some tests. */ if ((accLen != 6 && accLen != 10) || isdigit(acc[0]) || !isdigit(acc[accLen-1])) errAbort("wierd accession %s before line %d of %s", acc, lf->lineIx, lf->fileName); if (!isdigit(version[0]) || verLen > 4) errAbort("wierd version %s before line %d of %s", version, lf->lineIx, lf->fileName); if (countChars(displayId, '_') != 1 || displayIdLen < 6 || displayIdLen > 16) errAbort("wierd displayId %s before line %d of %s", displayId, lf->lineIx, lf->fileName); if (accLen + 1 + verLen >= sizeof(SpAcc)) errAbort("Need to increase size of SpAcc in spDb.h because of %s-%s - need %d characters but only have %lu", acc, version, accLen + 1 + verLen, sizeof(SpAcc)); /* Print out parsed results. */ fprintf(varAcc, "%s-%s\t%s\t%s\n", acc, version, acc, version); fprintf(varProtein, "%s-%s\t%s\n", acc, version, seq.dna); fprintf(varDisplayId, "%s-%s\t%s-%s\n", acc, version, acc, version); /* Look up taxon of base protein and use it to write to varAccToTaxon table. */ int taxon = spTaxon(conn, acc); fprintf(varAccToTaxon, "%s-%s\t%d\n", acc, version, taxon); /*Transfer description. */ char *description = spDescription(conn, acc); fprintf(varDescription, "%s-%s\t%s\n", acc, version, description); freez(&description); /* Transfer gene logic. */ sqlSafef(query, sizeof(query), "select val from geneLogic where acc = '%s'", acc); char *geneLogic = sqlQuickString(conn, query); if (geneLogic != NULL) fprintf(varGeneLogic, "%s-%s\t%s\n", acc, version, geneLogic); freez(&geneLogic); /* Transfer genes. */ struct slName *gene, *geneList = spGenes(conn, acc); for (gene = geneList; gene != NULL; gene = gene->next) fprintf(varGene, "%s-%s\t%s\n", acc, version, gene->name); slFreeList(&geneList); } carefulClose(&varAcc); carefulClose(&varProtein); carefulClose(&varDisplayId); carefulClose(&varAccToTaxon); carefulClose(&varDescription); carefulClose(&varGene); carefulClose(&varGeneLogic); sqlDisconnect(&conn); }