static void processBlock(struct blastBlock *bb, unsigned flags, FILE* pslFh, FILE* scoreFh) /* process one gapped alignment block */ { struct blastGappedAli* ba = bb->gappedAli; struct blastQuery *bq = ba->query; struct psl *psl = pslBuildFromHsp(firstWordInLine(bq->query), bq->queryBaseCount, bb->qStart, bb->qEnd, ((bb->qStrand > 0) ? '+' : '-'), bb->qSym, firstWordInLine(ba->targetName), ba->targetSize, bb->tStart, bb->tEnd, ((bb->tStrand > 0) ? '+' : '-'), bb->tSym, flags); if (psl->blockCount > 0 && (bb->eVal <= eVal || eVal == -1)) outputPsl(bb, flags, psl, pslFh, scoreFh); pslFree(&psl); }
struct hash *hashFaNames(char *fileName) /* Return a hash full of the names (but not sequence) of all * records in fa file. */ { struct hash *hash = hashNew(0); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { line = skipLeadingSpaces(line); if (line[0] == '>') { line += 1; char *name = firstWordInLine(line); if (name == NULL || name[0] == 0) errAbort("Empty name in fasta file line %d of %s", lf->lineIx, lf->fileName); if (hashLookup(hash, name)) errAbort("Duplicate name %s in fasta file line %d of %s", name, lf->lineIx, lf->fileName); hashAdd(hash, name, NULL); } } lineFileClose(&lf); return hash; }
void oneGenieFile(char *fileName, struct hash *uniq, FILE *f)
/* Process one genie peptide prediction file into known and alt tab files.
 * Output format: one record per line, "name<TAB>sequence".  The uniq hash
 * (keyed by upper-cased name) suppresses case-insensitive duplicates across
 * calls: later duplicates are warned about and their sequence is skipped.
 * NOTE(review): relies on global 'abbr' — presumably a command-line prefix
 * to strip from record names; confirm against main(). */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
int lineSize;
boolean firstTime = TRUE;
char *trans;
boolean skip = FALSE;

/* Do cursory sanity check. */
if (!lineFileNext(lf, &line, &lineSize))
    errAbort("%s is empty", fileName);
if (line[0] != '>')
    errAbort("%s is badly formatted, doesn't begin with '>'", fileName);
lineFileReuse(lf);

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == '>')
        {
        /* End last line. */
        if (firstTime)
            firstTime = FALSE;
        else
            fputc('\n', f);
        trans = firstWordInLine(line+1);
        /* Strip optional prefix from the name. */
        if (abbr != NULL && startsWith(abbr, trans))
            trans += strlen(abbr);
        if (hashLookupUpperCase(uniq, trans) != NULL)
            {
            /* Case-insensitive duplicate: keep first occurrence only. */
            warn("Duplicate (case insensitive) '%s' line %d of %s. Ignoring all but first.",
                 trans, lf->lineIx, lf->fileName);
            skip = TRUE;
            }
        else
            {
            /* Record upper-cased name so later duplicates are caught. */
            char *upperCase;
            upperCase = cloneString(trans);
            touppers(upperCase);
            hashAdd(uniq, upperCase, NULL);
            freeMem(upperCase);
            fprintf(f, "%s\t", trans);
            skip = FALSE;
            }
        }
    else if (!skip)
        {
        /* Sequence line: append without trailing terminator so the whole
         * sequence lands on one output line. */
        mustWrite(f, line, lineSize-1);
        }
    }
fputc('\n', f);   /* terminate the final record */
lineFileClose(&lf);
}
void genericOne(char *fileName, struct hash *uniq, FILE *f) /* Process one ensemble peptide prediction file into tab delimited * output f, using uniq hash to make sure no dupes. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; int lineSize; boolean firstTime = TRUE; char *trans, transBuf[128]; /* Do cursory sanity check. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s is empty", fileName); if (line[0] != '>') errAbort("%s is badly formatted, doesn't begin with '>'", fileName); lineFileReuse(lf); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '>') { char *upperCase; /* End last line. */ if (firstTime) firstTime = FALSE; else fputc('\n', f); trans = firstWordInLine(line+1); if (abbr != NULL && startsWith(abbr, trans)) trans += strlen(abbr); if (suffix != NULL) { safef(transBuf, sizeof(transBuf), "%s%s", trans, suffix); trans = transBuf; } if (hashLookupUpperCase(uniq, trans) != NULL) errAbort("Duplicate (case insensitive) '%s' line %d of %s", trans, lf->lineIx, lf->fileName); upperCase = cloneString(trans); touppers(upperCase); hashAdd(uniq, upperCase, NULL); freeMem(upperCase); fprintf(f, "%s\t", trans); } else { mustWrite(f, line, lineSize-1); } } fputc('\n', f); lineFileClose(&lf); }
static Fastq* fastq_processNextSequence (int freeMemory, int truncateName) { char *line; static Fastq* currFQ = NULL; int count; Seq* currSeq = NULL; if (ls_isEof (lsFastq)) { if (freeMemory) { fastq_freeFastq (currFQ); } return NULL; } count = 0; while ( (line=ls_nextLine (lsFastq)) && (count<4) ) { if (line[0] == '\0') { continue; } if (line[0] == '@') { if (freeMemory) { fastq_freeFastq (currFQ); } count++; AllocVar (currFQ); AllocVar (currFQ->seq); currSeq = currFQ->seq; currSeq->name = hlr_strdup (line + 1); if (truncateName) { currSeq->name = firstWordInLine (skipLeadingSpaces (currSeq->name)); } line = ls_nextLine (lsFastq); // reading sequence currSeq->sequence = hlr_strdup ( line ); currSeq->size = strlen (currSeq->sequence); count++; line = ls_nextLine (lsFastq); // reading quality ID if( line[0] != '+' ) die("Expected quality ID: '+' or '+%s'", currSeq->name ); count++; line = ls_nextLine (lsFastq); // reading quality currFQ->quality = hlr_strdup( line ); count++; } } ls_back (lsFastq,1); return currFQ; }
boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in DNA or Peptide FA record in mixed case. Allow any upper or lower case
 * letter, or the dash character in.
 * Returns FALSE at EOF.  The returned sequence lives in the shared static
 * buffer faFastBuf and the name in a static array, so both are overwritten
 * by the next call (not thread-safe; caller must copy if it needs to keep them). */
{
char c;
int bufIx = 0;
static char name[512];
int lineSize, i;
char *line;

dnaUtilOpen();

/* Read first line, make sure it starts with '>', and read first word
 * as name of sequence. */
name[0] = 0;
if (!lineFileNext(lf, &line, &lineSize))
    {
    *retDna = NULL;
    *retSize = 0;
    return FALSE;
    }
if (line[0] == '>')
    {
    line = firstWordInLine(skipLeadingSpaces(line+1));
    if (line == NULL)
        errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
    strncpy(name, line, sizeof(name));
    name[sizeof(name)-1] = '\0'; /* Just to make sure name is NULL terminated. */
    }
else
    {
    errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
    }

/* Read until next '>' */
for (;;)
    {
    if (!lineFileNext(lf, &line, &lineSize))
        break;
    if (line[0] == '>')
        {
        /* Start of next record: push line back so the next call sees it. */
        lineFileReuse(lf);
        break;
        }
    /* Grow the shared buffer before copying this line's residues. */
    if (bufIx + lineSize >= faFastBufSize)
        expandFaFastBuf(bufIx, lineSize);
    for (i=0; i<lineSize; ++i)
        {
        c = line[i];
        /* Keep letters and '-'; silently drop digits, whitespace, etc. */
        if (isalpha(c) || c == '-')
            faFastBuf[bufIx++] = c;
        }
    }
/* Make room for the terminating zero if the buffer is exactly full. */
if (bufIx >= faFastBufSize)
    expandFaFastBuf(bufIx, 0);
faFastBuf[bufIx] = 0;
*retDna = faFastBuf;
*retSize = bufIx;
*retName = name;
if (bufIx == 0)
    {
    /* Empty record is reported but still returned as success. */
    warn("Invalid fasta format: sequence size == 0 for element %s",name);
    }
return TRUE;
}
struct trackHubSettingSpec *trackHubSettingsForVersion(char *specHost, char *version)
/* Return list of settings with support level. Version can be version string or spec url.
 * Scrapes the versioned trackDbHub.html spec page; when the same setting appears
 * more than once, the occurrence with the highest support level wins.
 * errAborts if the spec page or any settings cannot be retrieved. */
{
struct htmlPage *page = NULL;
if (version == NULL)
    {
    version = trackHubVersionDefault(specHost, &page);
    if (version == NULL)
        errAbort("Can't get default spec from host %s", specHost);
    }
/* Retrieve specs from file url.
 * Settings are the first text word within any <code> tag having class="level-" attribute.
 * The level represents the level of support for the setting (e.g. base, full, deprecated)
 * The support level ('level-*') is the class value of the <code> tag.
 * E.g. <code class="level-full">boxedConfig on</code> produces:
 *      setting=boxedConfig, class=full */
if (page == NULL)
    page = trackHubVersionSpecMustGet(specHost, version);
if (page == NULL)
    errAbort("Can't get settings spec for version %s from host %s", version, specHost);
verbose(5, "Found %d tags\n", slCount(page->tags));
struct trackHubSettingSpec *spec, *savedSpec;
struct hash *specHash = hashNew(0);
struct htmlTag *tag;
struct htmlAttribute *attr;
char buf[256];
for (tag = page->tags; tag != NULL; tag = tag->next)
    {
    if (differentWord(tag->name, "code"))
        continue;
    /* Only <code> tags whose first attribute is class="level-..." count. */
    attr = tag->attributes;
    if (attr == NULL || differentString(attr->name, "class") || !startsWith("level-", attr->val))
        continue;
    AllocVar(spec);
    /* Copy the text between this tag and the next into buf (clipped to buf size). */
    int len = min(tag->next->start - tag->end, sizeof buf - 1);
    memcpy(buf, tag->end, len);
    buf[len] = 0;
    verbose(6, "Found spec: %s\n", buf);
    spec->name = cloneString(firstWordInLine(buf));
    if (spec->name == NULL || strlen(spec->name) == 0)
        {
        warn("ERROR: format problem with trackDbHub.html -- contact UCSC.");
        continue;
        }
    /* Level is whatever follows "level-" in the class attribute. */
    spec->level = cloneString(chopPrefixAt(attr->val, '-'));
    verbose(6, "spec: name=%s, level=%s\n", spec->name, spec->level);
    savedSpec = (struct trackHubSettingSpec *)hashFindVal(specHash, spec->name);
    if (savedSpec != NULL)
        verbose(6, "found spec %s level %s in hash\n", savedSpec->name, savedSpec->level);
    if (savedSpec == NULL)
        {
        hashAdd(specHash, spec->name, spec);
        verbose(6, "added spec %s at level %s\n", spec->name, spec->level);
        }
    else if (trackHubSettingLevelCmp(spec, savedSpec) > 0)
        {
        /* Keep the occurrence with the higher support level. */
        hashReplace(specHash, spec->name, spec);
        verbose(6, "replaced spec %s at level %s, was %s\n", spec->name, spec->level, savedSpec->level);
        }
    }
struct hashEl *el, *list = hashElListHash(specHash);
int settingsCt = slCount(list);
verbose(5, "Found %d settings's\n", slCount(list));
if (settingsCt == 0)
    errAbort("Can't find hub setting info for version %s (host %s)."
             " Use -version to indicate a different version number or url.", version, specHost);
/* Sort by name and build the result list, tallying levels for the verbose report. */
slSort(&list, hashElCmp);
struct trackHubSettingSpec *specs = NULL;
int baseCt = 0;
int requiredCt = 0;
int deprecatedCt = 0;
for (el = list; el != NULL; el = el->next)
    {
    if (sameString(((struct trackHubSettingSpec *)el->val)->level, "base"))
        baseCt++;
    else if (sameString(((struct trackHubSettingSpec *)el->val)->level, "required"))
        requiredCt++;
    else if (sameString(((struct trackHubSettingSpec *)el->val)->level, "deprecated"))
        deprecatedCt++;
    slAddHead(&specs, el->val);
    }
slReverse(&specs);
verbose(3, "Found %d supported settings for this version (%d required, %d base, %d deprecated)\n",
        slCount(specs), requiredCt, baseCt, deprecatedCt);
return specs;
}
void ctgFaToFa(char *ctgFa, char *ctgCoords, char *ntDir)
/* ctgFaToFa - Convert from one big file with all NT contigs to one contig per file..
 * Pass 1 loads ctgCoords to count clones per NT contig; pass 2 streams the big
 * fasta, writing one <ntDir>/<ntName>.fa per multi-clone contig.  Contigs with
 * a single clone are skipped (f stays NULL, so their sequence is discarded). */
{
struct lineFile *lf;
char fileName[512], *line;
char *ntName, *hsName;
char *parts[6];
int lineSize, partCount;
struct hash *uniqHash = newHash(0);   /* NT names already written, to fake uniques */
FILE *f = NULL;                       /* current output file; NULL = skip sequence */
int dotMod = 0;                       /* progress-dot counter */
struct hash *ntHash = newHash(0);     /* NT name -> ntContig */
struct hash *hsHash = newHash(0);     /* Hs name -> ntContig */
struct ntContig *nt;
char *words[8];

/* Pass 1: tally clone counts per contig from the coords file. */
printf("Loading %s\n", ctgCoords);
lf = lineFileOpen(ctgCoords, TRUE);
while (lineFileRow(lf, words))
    {
    ntName = words[0];
    if ((nt = hashFindVal(ntHash, ntName)) != NULL)
        ++nt->cloneCount;
    else
        {
        AllocVar(nt);
        hashAddSaveName(ntHash, ntName, nt, &nt->name);
        hashAddSaveName(hsHash, words[1], nt, &nt->hsName);
        nt->cloneCount = 1;
        }
    }
lineFileClose(&lf);

/* Pass 2: split the big fasta (opened binary/unparsed, FALSE) by header. */
lf = lineFileOpen(ctgFa, FALSE);
makeDir(ntDir);
while (lineFileNext(lf, &line, &lineSize))
    {
    if ((++dotMod&0x1ffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }
    if (line[0] == '>')
        {
        carefulClose(&f);
        line[lineSize-1] = 0;   /* chop trailing newline before parsing */
        partCount = chopByChar(line, '|',parts,ArraySize(parts));
        if (partCount < 3)
            {
            uglyf("partCount = %d\n", partCount);
            errAbort("Expecting | separated header line %d of %s", lf->lineIx, lf->fileName);
            }
        ntName = parts[1];
        nt = hashFindVal(ntHash, ntName);
        hsName = parts[2];
        if (nt == NULL)
            {
            /* Not keyed by NT name: fall back to looking up by Hs name. */
            hsName = firstWordInLine(ntName);
            nt = hashMustFindVal(hsHash, hsName);
            ntName = nt->name;
            }
        if (nt->cloneCount > 1)
            {
            if (!startsWith("Hs", hsName))
                errAbort("Expecting %s to start with 'Hs' line %d of %s", hsName, lf->lineIx, lf->fileName);
            /* Same NT name seen twice: synthesize a fake unique name. */
            if (hashLookup(uniqHash, ntName))
                ntName = nextFakeNtName(hsName, ntName);
            hashAddUnique(uniqHash, ntName, NULL);
            if (!startsWith("NT_", ntName))
                errAbort("Expecting NT_ name line %d of %s", lf->lineIx, lf->fileName);
            sprintf(fileName, "%s/%s.fa", ntDir, ntName);
            f = mustOpen(fileName, "w");
            fprintf(f, ">%s.1_1\n", ntName);
            }
        }
    else
        {
        /* Sequence data: copy through only when a contig file is open. */
        if (f != NULL)
            mustWrite(f, line, lineSize);
        }
    }
printf("\n");
carefulClose(&f);
lineFileClose(&lf);
}
boolean faSpeedReadNextKeepCase(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in next FA entry as fast as we can. Faster than that old,
 * pokey faFastReadNext. Return FALSE at EOF.
 * The returned DNA and name will be overwritten by the next call
 * to this function. */
{
int c;
int bufIx = 0;
static char name[256];
int lineSize, i;
char *line;

dnaUtilOpen();

/* Read first line, make sure it starts with '>', and read first word
 * as name of sequence. */
name[0] = 0;
if (!lineFileNext(lf, &line, &lineSize))
    {
    *retDna = NULL;
    *retSize = 0;
    return FALSE;
    }
if (line[0] == '>')
    {
    line = firstWordInLine(skipLeadingSpaces(line+1));
    if (line == NULL)
        errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
    strncpy(name, line, sizeof(name));
    /* FIX: strncpy does not terminate when the source fills the buffer;
     * terminate explicitly (matches faMixedSpeedReadNext). */
    name[sizeof(name)-1] = '\0';
    }
else
    {
    errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
    }

/* Read until next '>' */
for (;;)
    {
    if (!lineFileNext(lf, &line, &lineSize))
        break;
    if (line[0] == '>')
        {
        /* Start of next record: push line back for the next call. */
        lineFileReuse(lf);
        break;
        }
    if (bufIx + lineSize >= faFastBufSize)
        expandFaFastBuf(bufIx);
    for (i=0; i<lineSize; ++i)
        {
        c = line[i];
        /* Keep only letters; digits/whitespace are silently dropped. */
        if (isalpha(c))
            {
            faFastBuf[bufIx++] = c;
            }
        }
    }
/* Make room for the terminating zero if the buffer is exactly full. */
if (bufIx >= faFastBufSize)
    expandFaFastBuf(bufIx);
faFastBuf[bufIx] = 0;
*retDna = faFastBuf;
*retSize = bufIx;
*retName = name;
return TRUE;
}
static struct jsonWrite *rTdbToJw(struct trackDb *tdb, struct hash *fieldHash, struct hash *excludeTypesHash, int depth, int maxDepth)
/* Recursively build and return a new jsonWrite object with JSON for tdb and its children,
 * or NULL if tdb or all children have been filtered out by excludeTypesHash.
 * If excludeTypesHash is non-NULL, omit any tracks/views/subtracks with type in excludeTypesHash.
 * If fieldHash is non-NULL, include only the field names indexed in fieldHash.
 * Caller owns the returned jsonWrite (free with jsonWriteFree). */
{
if (maxDepth >= 0 && depth > maxDepth)
    return NULL;
boolean doSubtracks = (tdb->subtracks && fieldOk("subtracks", fieldHash));
// If excludeTypesHash is given and tdb is a leaf track/subtrack, look up the first word
// of tdb->type in excludeTypesHash; if found, return NULL.
if (excludeTypesHash && !doSubtracks)
    {
    // Work on a copy: firstWordInLine truncates its argument in place.
    char typeCopy[PATH_LEN];
    safecpy(typeCopy, sizeof(typeCopy), tdb->type);
    if (hashLookup(excludeTypesHash, firstWordInLine(typeCopy)))
        return NULL;
    }
// A leaf that survived filtering always counts; a container only counts
// if at least one child survives (checked below).
boolean gotSomething = !doSubtracks;
struct jsonWrite *jwNew = jsonWriteNew();
jsonWriteObjectStart(jwNew, NULL);
writeTdbSimple(jwNew, tdb, fieldHash);
if (tdb->parent && fieldOk("parent", fieldHash))
    {
    // We can't link to an object in JSON and better not recurse here or else infinite loop.
    if (tdbIsSuperTrackChild(tdb))
        {
        // Supertracks have been omitted from fullTrackList, so add the supertrack object's
        // non-parent/child info here.
        jsonWriteObjectStart(jwNew, "parent");
        writeTdbSimple(jwNew, tdb->parent, fieldHash);
        jsonWriteObjectEnd(jwNew);
        }
    else
        // Just the name so we don't have infinite loops.
        jsonWriteString(jwNew, "parent", tdb->parent->track);
    }
if (doSubtracks)
    {
    jsonWriteListStart(jwNew, "subtracks");
    slSort(&tdb->subtracks, trackDbViewCmp);
    struct trackDb *subTdb;
    for (subTdb = tdb->subtracks; subTdb != NULL; subTdb = subTdb->next)
        {
        struct jsonWrite *jwSub = rTdbToJw(subTdb, fieldHash, excludeTypesHash, depth+1, maxDepth);
        if (jwSub)
            {
            gotSomething = TRUE;
            jsonWriteAppend(jwNew, NULL, jwSub);
            jsonWriteFree(&jwSub);
            }
        }
    jsonWriteListEnd(jwNew);
    }
jsonWriteObjectEnd(jwNew);
if (! gotSomething)
    // All children were excluded; clean up and null out jwNew.
    jsonWriteFree(&jwNew);
return jwNew;
}
void splitByBase(char *inName, int splitCount, char *outRoot, off_t estSize)
/* Split into a file base by base: stream all sequence characters from inName
 * into splitCount output files of roughly equal size, rewrapped at
 * outLineSize columns.  Extra FASTA records are skipped with a single
 * warning; their bases are NOT included. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
int lineSize;
char *line;
char c;
char dir[PATH_LEN], seqName[128], outFile[128], outPathName[PATH_LEN];
int digits = digitsBaseTen(splitCount);
boolean warnedMultipleRecords = FALSE;
int fileCount = 0;
off_t nextEnd = 0;
off_t curPos = 0;
FILE *f = NULL;
int linePos = 0;
int outLineSize = 50;

if (!lineFileNext(lf, &line, &lineSize))
    errAbort("%s is empty", inName);
if (line[0] == '>')
    {
    line = firstWordInLine(line+1);
    if (line == NULL)
        errAbort("Empty initial '>' line in %s", inName);
    strncpy(seqName, line, sizeof(seqName));
    /* FIX: strncpy does not terminate on truncation. */
    seqName[sizeof(seqName)-1] = '\0';
    }
else
    {
    splitPath(inName, dir, seqName, NULL);
    lineFileReuse(lf);
    }
splitPath(outRoot, NULL, outFile, NULL);

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == '>')
        {
        /* FIX: skip EVERY extra header line, warning only once.  Previously
         * 'continue' was inside the !warnedMultipleRecords branch, so the
         * second extra header fell through to the base loop and errAborted
         * on the '>' character ("Weird > ..."). */
        if (!warnedMultipleRecords)
            {
            warnedMultipleRecords = TRUE;
            warn("More than one record in FA file line %d of %s",
                 lf->lineIx, lf->fileName);
            }
        continue;
        }
    while ((c = *line++) != 0)
        {
        if (isdigit(c) || isspace(c))
            continue;
        if (!isalpha(c))
            errAbort("Weird %c (0x%x) line %d of %s", c, c, lf->lineIx, lf->fileName);
        /* Crossed the size boundary: finish current file and open the next. */
        if (++curPos >= nextEnd)
            {
            if (f != NULL)
                {
                if (linePos != 0)
                    fputc('\n', f);
                fclose(f);
                }
            mkOutPath(outPathName, outRoot, digits, fileCount);
            verbose(2, "writing %s\n", outPathName);
            f = mustOpen(outPathName, "w");
            fprintf(f, ">%s%0*d\n", outFile, digits, fileCount);
            ++fileCount;
            linePos = 0;
            nextEnd = calcNextEnd(fileCount, splitCount, estSize);
            }
        fputc(c, f);
        if (++linePos >= outLineSize)
            {
            fputc('\n', f);
            linePos = 0;
            }
        }
    }
if (f != NULL)
    {
    if (linePos != 0)
        fputc('\n', f);
    fclose(f);
    }
lineFileClose(&lf);
}
void faNcbiToUcsc(char *inFile, char *out) /* faNcbiToUcsc - Convert FA file from NCBI to UCSC format.. */ { struct lineFile *lf = lineFileOpen(inFile, TRUE); char outName[512]; char *line; boolean split = cgiBoolean("split"); boolean ntLast = cgiBoolean("ntLast"); boolean encode = cgiBoolean("encode"); struct dnaSeq seq; FILE *f = NULL; char *wordBefore = cgiUsualString("wordBefore", "gb"); int wordIx = cgiUsualInt("wordIx", -1); char *e = NULL; char *nt = NULL; ZeroVar(&seq); if (split) makeDir(out); else f = mustOpen(out, "w"); while (lineFileNext(lf, &line, NULL)) { if (line[0] == '>') { if (ntLast || encode) { nt = NULL; if (ntLast) { e = NULL; nt = stringIn("NT_", line); if (nt == NULL) nt = stringIn("NG_", line); if (nt == NULL) nt = stringIn("NC_", line); if (nt == NULL) errAbort("Expecting NT_ NG_ or NC_in '%s'", line); e = strchr(nt, '|'); if (e != NULL) *e = 0; e = strchr(nt, ' '); if (e != NULL) *e = 0; } else { nt = stringIn("|EN", line); if (nt == NULL) errAbort("Expecting EN in %s", line); nt++; nt = firstWordInLine(nt); } if (split) { sprintf(outName, "%s/%s.fa", out, nt); carefulClose(&f); f = mustOpen(outName, "w"); } fprintf(f, ">%s\n", nt); } else { char *words[32]; int wordCount, i; char *accession = NULL; wordCount = chopString(line+1, "|", words, ArraySize(words)); if (wordIx >= 0) { if (wordIx >= wordCount) errAbort("Sorry only %d words", wordCount); accession = words[wordIx]; } else { for (i=0; i<wordCount-1; ++i) { if (sameString(words[i], wordBefore)) { accession = words[i+1]; break; } } if (accession == NULL) errAbort("Couldn't find '%s' line %d of %s", wordBefore, lf->lineIx, lf->fileName); } chopSuffix(accession); fprintf(f, ">%s\n", accession); } } else { fprintf(f, "%s\n", line); } } }
void hgFlyBase(char *database, char *genesFile) /* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of * tables. */ { char *tGene = "fbGene"; char *tSynonym = "fbSynonym"; char *tAllele = "fbAllele"; char *tRef = "fbRef"; char *tRole = "fbRole"; char *tPhenotype = "fbPhenotype"; char *tTranscript = "fbTranscript"; char *tGo = "fbGo"; char *tUniProt = "fbUniProt"; FILE *fGene = hgCreateTabFile(tabDir, tGene); FILE *fSynonym = hgCreateTabFile(tabDir, tSynonym); FILE *fAllele = hgCreateTabFile(tabDir, tAllele); FILE *fRef = hgCreateTabFile(tabDir, tRef); FILE *fRole = hgCreateTabFile(tabDir, tRole); FILE *fPhenotype = hgCreateTabFile(tabDir, tPhenotype); FILE *fTranscript = NULL; FILE *fGo = hgCreateTabFile(tabDir, tGo); FILE *fUniProt = hgCreateTabFile(tabDir, tUniProt); struct lineFile *lf = lineFileOpen(genesFile, TRUE); struct hash *refHash = newHash(19); int nextRefId = 0; int nextAlleleId = 0; char *line, sub, type, *rest, *s; char *geneSym = NULL, *geneName = NULL, *geneId = NULL; int recordCount = 0; struct slName *synList = NULL, *syn; int curAllele = 0, curRef = 0; struct ref *ref = NULL; struct sqlConnection *conn; struct hash *goUniqHash = newHash(18); /* Make table from flybase genes to BGDP transcripts. */ if (doTranscript) { fTranscript = hgCreateTabFile(tabDir, tTranscript); getAllSplices(database, fTranscript); } /* Make dummy reference for flybase itself. */ fprintf(fRef, "0\tFlyBase\n"); /* Loop through parsing and writing tab files. */ while (lineFileNext(lf, &line, NULL)) { sub = line[0]; if (sub == '#') { /* End of record. */ ++recordCount; if (geneId == NULL) errAbort("Record without *z line ending line %d of %s", lf->lineIx, lf->fileName); /* Write out synonyms. */ s = naForNull(geneSym); geneSym = ungreek(s); freeMem(s); s = naForNull(geneName); geneName = ungreek(s); if (! 
sameString(s, "n/a")) freeMem(s); if (geneSym != NULL && !sameString(geneSym, "n/a")) slNameStore(&synList, geneSym); if (geneName != NULL && !sameString(geneName, "n/a")) slNameStore(&synList, geneName); for (syn = synList; syn != NULL; syn = syn->next) { s = ungreek(syn->name); fprintf(fSynonym, "%s\t%s\n", geneId, s); freeMem(s); } /* Write out gene record. */ fprintf(fGene, "%s\t%s\t%s\n", geneId, geneSym, geneName); /* Clean up. */ freez(&geneSym); freez(&geneName); freez(&geneId); slFreeList(&synList); ref = NULL; curRef = curAllele = 0; continue; } else if (sub == 0) errAbort("blank line %d of %s, not allowed in gene.txt", lf->lineIx, lf->fileName); else if (isalnum(sub)) errAbort("line %d of %s begins with %c, not allowed", lf->lineIx, lf->fileName, sub); type = line[1]; rest = trimSpaces(line+2); if (sub == '*' && type == 'a') geneSym = cloneString(rest); else if (sub == '*' && type == 'e') geneName = cloneString(rest); else if (sub == '*' && type == 'z') { geneId = cloneString(rest); if (!startsWith("FBgn", geneId)) errAbort("Bad FlyBase gene ID %s line %d of %s", geneId, lf->lineIx, lf->fileName); } else if (type == 'i' && (sub == '*' || sub == '$')) { if (strlen(rest) > 2) /* Avoid short useless ones. 
*/ slNameStore(&synList, rest); } else if (sub == '*' && type == 'A') { if (geneId == NULL) errAbort("Allele before geneId line %d of %s", lf->lineIx, lf->fileName); curAllele = ++nextAlleleId; fprintf(fAllele, "%d\t%s\t%s\n", curAllele, geneId, rest); if (!sameString(rest, "classical") && !sameString(rest, "in vitro") && !sameString(rest, "wild-type") ) { slNameStore(&synList, rest); } } else if (sub == '*' && type == 'm') { if (geneId == NULL) errAbort("*m protein ID before geneId line %d of %s", lf->lineIx, lf->fileName); if (startsWith("UniProt", rest)) { char *ptr = strchr(rest, ':'); if (ptr != NULL) ptr++; else errAbort("Trouble parsing UniProt ID %s like %d of %s", rest, lf->lineIx, lf->fileName); fprintf(fUniProt, "%s\t%s\n", geneId, ptr); } } else if (type == 'E') { ref = hashFindVal(refHash, rest); if (ref == NULL) { AllocVar(ref); ref->id = ++nextRefId; hashAdd(refHash, rest, ref); subChar(rest, '\t', ' '); fprintf(fRef, "%d\t%s\n", ref->id, rest); } curRef = ref->id; } else if ((type == 'k' || type == 'r' || type == 'p') && sub != '@') { FILE *f = (type == 'r' ? fRole : fPhenotype); struct dyString *dy = suckSameLines(lf, line); subChar(dy->string, '\t', ' '); if (geneId == NULL) errAbort("Expecting *z in record before line %d of %s", lf->lineIx, lf->fileName); fprintf(f, "%s\t%d\t%d\t%s\n", geneId, curAllele, curRef, dy->string); dyStringFree(&dy); } else if (type == 'd' || type == 'f' || type == 'F') { FILE *f = fGo; char aspect = (type == 'd') ? 'P' : (type == 'f') ? 
'C' : 'F'; char *goId = rest; char *p = strstr(goId, " ; "); char assoc[128]; if (p == NULL) continue; else goId = firstWordInLine(p + 3); safef(assoc, sizeof(assoc), "%s.%s", geneId, goId); if (hashLookup(goUniqHash, assoc) == NULL) { hashAddInt(goUniqHash, assoc, 1); fprintf(f, "%s\t%s\t%c\n", geneId, goId, aspect); } } } printf("Processed %d records in %d lines\n", recordCount, lf->lineIx); lineFileClose(&lf); conn = sqlConnect(database); remakeTables(conn); if (doLoad) { printf("Loading %s\n", tGene); hgLoadTabFile(conn, tabDir, tGene, &fGene); if (doTranscript) { printf("Loading %s\n", tTranscript); hgLoadTabFile(conn, tabDir, tTranscript, &fTranscript); } printf("Loading %s\n", tSynonym); hgLoadTabFile(conn, tabDir, tSynonym, &fSynonym); printf("Loading %s\n", tAllele); hgLoadTabFile(conn, tabDir, tAllele, &fAllele); printf("Loading %s\n", tRef); hgLoadTabFile(conn, tabDir, tRef, &fRef); printf("Loading %s\n", tRole); hgLoadTabFile(conn, tabDir, tRole, &fRole); printf("Loading %s\n", tPhenotype); hgLoadTabFile(conn, tabDir, tPhenotype, &fPhenotype); printf("Loading %s\n", tGo); hgLoadTabFile(conn, tabDir, tGo, &fGo); printf("Loading %s\n", tUniProt); hgLoadTabFile(conn, tabDir, tUniProt, &fUniProt); hgRemoveTabFile(tabDir, tGene); if (doTranscript) hgRemoveTabFile(tabDir, tTranscript); hgRemoveTabFile(tabDir, tSynonym); hgRemoveTabFile(tabDir, tAllele); hgRemoveTabFile(tabDir, tRef); hgRemoveTabFile(tabDir, tRole); hgRemoveTabFile(tabDir, tPhenotype); hgRemoveTabFile(tabDir, tGo); hgRemoveTabFile(tabDir, tUniProt); } }