static void parseWarnings() /* check for various clone warning cases and flag. */ { if (isAthersysRageEntry) kvtAdd(kvt, "wrn", "athRage"); else if (isOrestesEntry) kvtAdd(kvt, "wrn", "orestes"); }
static void addMiscDiff(int iDiff, char *subField, char *val)
/* Add one misc diff value to the kvt.  The key has the form
 * "mdiff.<iDiff><subField>"; subField may be an empty string.
 * The key is built in a local buffer and then cloned via kvtMem before
 * being handed to the kvt (presumably so the key outlives this call). */
{
char keyBuf[256];

safef(keyBuf, sizeof(keyBuf), "mdiff.%d%s", iDiff, subField);
kvtAdd(kvt, lmCloneString(kvtMem, keyBuf), val);
}
static void parseSourceOrganism() /* parse source /organism fields, output as srcOrg if different from org */ { int numOrgs, i; char **orgs; if (gbSourceOrganism->val->stringSize == 0) return; if (srcOrgBuf == NULL) srcOrgBuf = dyStringNew(256); dyStringClear(srcOrgBuf); numOrgs = chopString(gbSourceOrganism->val->string, ";", NULL, 0); AllocArray(orgs, numOrgs); chopString(gbSourceOrganism->val->string, ";", orgs, numOrgs); for (i = 0; i < numOrgs; i++) { if (!sameString(orgs[i], gbOrganismField->val->string)) { if (srcOrgBuf->stringSize > 0) dyStringAppendC(srcOrgBuf, ';'); dyStringAppend(srcOrgBuf, orgs[i]); } } freeMem(orgs); if (srcOrgBuf->stringSize > 0) kvtAdd(kvt, "srcOrg", srcOrgBuf->string); }
static void updateKvt(struct keyVal **kvPtr, char* name, char *val)
/* Add or update a kvt value.  If *kvPtr is NULL a new entry called name is
 * added to the kvt and *kvPtr is set to it; otherwise the existing entry's
 * value pointer is replaced by val (name is not used in that case). */
{
if (*kvPtr == NULL)
    *kvPtr = kvtAdd(kvt, name, val);
else
    (*kvPtr)->val = val;
}
static void writePepSeq() /* If information is available, write the peptide sequence and * save offset and size in kvt */ { if ((gPepFa != NULL) && (gbProteinIdField->val->stringSize > 0) && (gbTranslationField->val->stringSize > 0)) { int faSize; gbFaWriteSeq(gPepFa, gbProteinIdField->val->string, NULL, gbTranslationField->val->string, -1); safef(pepSizeStr, sizeof(pepSizeStr), "%u", gbTranslationField->val->stringSize); kvtAdd(kvt, "prs", pepSizeStr); safef(pepFaOffStr, sizeof(pepFaOffStr), "%lld", (long long)gPepFa->recOff); kvtAdd(kvt, "pfo", pepFaOffStr); faSize = gPepFa->off - gPepFa->recOff; safef(pepFaSizeStr, sizeof(pepFaSizeStr), "%d", faSize); kvtAdd(kvt, "pfs", pepFaSizeStr); } }
static char *findSyntheticTarget() /* for a synthetic sequence, attempt to find the targeted organism. This was * added to support the MGC/ORFeome clones. In general, there is no defined way to * determine an organism that a synthenic clone targets. */ { struct keyVal *kv; if (synOrgBuf == NULL) synOrgBuf = dyStringNew(256); dyStringClear(synOrgBuf); kv = kvtGet(kvt, "srcOrg"); if (kv != NULL) dyStringAppend(synOrgBuf, kv->val); if (synOrgBuf->stringSize > 0) { kvtAdd(kvt, "synOrg", synOrgBuf->string); return synOrgBuf->string; } else return NULL; }
void kvtParseAdd(struct kvt *kvt, char *text)
/* Add keys parsed from text to kvt.  Text is in the format
 *     key val
 * one pair per line.  The text is modified in place: the line is cut at
 * the first space (which becomes a 0), the part before it is the key and
 * everything after it is the value.  Lines without a space are ignored.
 * The kvt is handed pointers into text, so text should persist until
 * call to keysClear() (as stated by the original note).
 * Line pointers are gathered into a fixed array of 256 entries. */
{
char *lineStarts[256];
int lineTotal;
int ix;

lineTotal = chopString(text, "\n\r", lineStarts, ArraySize(lineStarts));
for (ix = 0; ix < lineTotal; ++ix)
    {
    char *key = lineStarts[ix];
    char *sep = strchr(key, ' ');

    if (sep == NULL)
        continue;   /* no value on this line */
    *sep = 0;       /* terminate the key */
    kvtAdd(kvt, key, sep + 1);
    }
}
static void procGbEntry(struct lineFile *lf, struct hash *estAuthorHash) /* process one entry in the genbank file . readGbInfo should be called * first */ { char *words[16]; char date[64]; int wordCount; DNA *dna = NULL; int dnaSize; char sizeString[16]; char accVer[64]; int faSize; char *locus = gbLocusField->val->string; char *accession = gbAccessionField->val->string; int version = 0; char *gi = NULL; char *verChar = gbVersionField->val->string; char *s; char *org = gbOrganismField->val->string; char *synOrg = NULL; struct keyVal *seqKey, *sizeKey, *commentKey; boolean isEst = FALSE; char verNum[8]; char *com = gbCommentField->val->string; if (locus == NULL || accession == NULL) errAbort("No LOCUS or no ACCESSION line near %d of %s", lf->lineIx, lf->fileName); lmCleanup(&kvtMem); /* Chop off all but first word of accession. */ s = skipLeadingSpaces(accession); if (s != NULL) s = skipLeadingNonSpaces(s); if (s != NULL) *s = 0; /* Get version field (defaults to zero) */ if (verChar != NULL) { char *parts[2]; char *accVer; int partCount; partCount = chopByWhite(verChar, parts, ArraySize(parts)); /* Version is number after dot. */ accVer = parts[0]; if ((accVer = strchr(accVer, '.')) != NULL) version = atoi(accVer+1); if (partCount >= 2 && startsWith("GI:", parts[1])) gi = parts[1]+3; } gbfFlatten(kvt); /* Get additional keys. */ if (com != NULL) { if (startsWith("REVIEWED", com)) kvtAdd(kvt, "cur", "yes"); } safef(verNum, sizeof(verNum), "%d", version); kvtAdd(kvt, "ver", verNum); if (gi != NULL) kvtAdd(kvt, "ngi", gi); wordCount = chopLine(locus, words); if (wordCount >= 6) { kvtAdd(kvt, "mol", words[3]); kvtAdd(kvt, "cat", words[wordCount-2]); ncbiDateToSqlDate(words[wordCount-1], date, sizeof(date), accession); kvtAdd(kvt, "dat", date); } else if (wordCount == 5 && sameString(words[2], "bp") && isdigit(words[1][0])) { /* Check carefully. Probably it's just missing the molecule type... 
*/ if (!isNcbiDate(words[4])) { errAbort("Strange LOCUS line in %s accession %s", lf->fileName, accession); } kvtAdd(kvt, "cat", words[3]); ncbiDateToSqlDate(words[4], date, sizeof(date), accession); kvtAdd(kvt, "dat", date); } else if (wordCount == 5 && sameString(words[2], "bp") && isdigit(words[1][0])) { kvtAdd(kvt, "mol", words[3]); } else { errAbort("Short LOCUS line in %s accession %s", lf->fileName, accession); } if (((wordCount >= 5) && sameString(words[4], "EST")) || ((wordCount >= 6) && sameString(words[5], "EST"))) { /* Try and figure out if it's a 3' or 5' EST */ char *dir = getEstDir(gbDefinitionField->val->string, com); if (dir != NULL) kvtAdd(kvt, "dir", dir); isEst = TRUE; } /* Handle other fields */ parseDbXrefs(); parseGene(); parseSourceOrganism(); parseMiscDiffs(); parseWarnings(); if (startsWith("synthetic construct", gbOrganismField->val->string)) { synOrg = findSyntheticTarget(); if (synOrg != NULL) hackSynClone(); } if (keepGbEntry(isEst)) { /* Handle sequence part of read. */ dna = gbfReadSequence(lf, &dnaSize); } /* just discard if no sequence */ if (dna != NULL) { seqKey = kvtAdd(kvt, "seq", dna); safef(sizeString, sizeof(sizeString), "%d", dnaSize); sizeKey = kvtAdd(kvt, "siz", sizeString); if (isEst) { char *author = gbAuthorsField->val->string; if (author != NULL) { struct authorExample *ae; struct hashEl *hel; if ((hel = hashLookup(estAuthorHash, author)) == NULL) { AllocVar(ae); hel = hashAdd(estAuthorHash, author, ae); ae->name = hel->name; ae->count = 1; strncpy(ae->accession, accession, sizeof(ae->accession)); slAddHead(&estAuthorList, ae); } else { ae = hel->val; ae->count += 1; } } } seqKey->val = NULL; /* Don't write out sequence here. */ commentKey = kvtGet(kvt, "com"); if (commentKey != NULL) commentKey->val = NULL; /* Don't write out comment either. 
*/ setupOutputFiles(accession, org); if (faFile != NULL) { /* save fasta offset, size in ra */ safef(accVer, sizeof(accVer), "%s.%d", accession, version); gbFaWriteSeq(faFile, accVer, NULL, dna, -1); faSize = faFile->off - faFile->recOff; safef(faOffStr, sizeof(faOffStr), "%lld", (long long)faFile->recOff); kvtAdd(kvt, "fao", faOffStr); safef(faSizeStr, sizeof(faSizeStr), "%d", faSize); kvtAdd(kvt, "fas", faSizeStr); } if (gPepFa != NULL) { /* must write before writing kvt */ writePepSeq(); } kvtWriteAll(kvt, raFile, NULL); if (gbIdxFile != NULL) { /* use synthetic target if it was determined */ struct keyVal *molkv = kvtGet(kvt, "mol"); enum molType molType = (molkv->val != NULL) ? gbParseMolType(molkv->val) : mol_mRNA; gbProcessedWriteIdxRec(gbIdxFile, accession, version, kvtLookup(kvt, "dat"), ((synOrg != NULL) ? synOrg : org), molType); } } else gbfSkipSequence(lf); }