static void hackSynClone()
/* Adjust the molecule type of a synthetic clone entry.  Such entries are
 * normally marked as DNA; when the current entry has a "gen" key, its
 * existing "mol" value (if any) is overwritten with "mRNA". */
{
struct keyVal *molKv;

/* Entries without a "gen" key are left untouched. */
if (kvtGet(kvt, "gen") == NULL)
    return;

/* Only rewrite an existing "mol" entry; never create one. */
molKv = kvtGet(kvt, "mol");
if (molKv == NULL)
    return;
molKv->val = "mRNA";
}
char *kvtLookup(struct kvt *kvt, char *key)
/* Look up key in the table.  Returns the value stored for key, or NULL
 * when the table has no entry for it. */
{
struct keyVal *entry = kvtGet(kvt, key);

return (entry != NULL) ? entry->val : NULL;
}
static boolean keepGbEntry(boolean isEst)
/* Should the current entry in the kvt be kept?
 *
 * The checks are applied in this order:
 *   1. accession is rejected by the black list            -> drop
 *   2. an organism filter (gbOrg) is set and the entry's
 *      "org" value is missing or different                -> drop
 *   3. RefSeq accession: keep NM_ and NR_, and XM_ only when inclXMs is set
 *   4. "cat" is GSS/HTG/STS/CON and DNA was not requested -> drop
 *   5. "cat" is EST: keep only when ESTs were requested
 *   6. mRNA requested: keep when "mol" contains "RNA" (case-insensitive)
 *   7. DNA requested: keep when "mol" contains "DNA" (case-insensitive)
 *   8. otherwise                                          -> drop
 *
 * The "cat", "org" and "mol" keys are looked up NULL-safely.  In particular
 * procGbEntry does not add "mol" for a five-word LOCUS line, so a missing
 * molecule type must not be dereferenced; it is treated as matching neither
 * RNA nor DNA.  A missing "cat" is treated as an empty category.
 *
 * The isEst parameter is not referenced by this function. */
{
char *acc = gbAccessionField->val->string;
struct keyVal *catKv = kvtGet(kvt, "cat");
struct keyVal *orgKv = kvtGet(kvt, "org");
struct keyVal *molKv = kvtGet(kvt, "mol");
char *cat = (catKv != NULL && catKv->val != NULL) ? catKv->val : "";
char *org = (orgKv != NULL) ? orgKv->val : NULL;
char *mol = (molKv != NULL) ? molKv->val : NULL;

if (genbankBlackListFail(acc, blackListRanges))
    return FALSE;

/* With an organism filter, an entry without an organism cannot match. */
if (gbOrg && (org == NULL || differentString(org, gbOrg)))
    return FALSE;

if (gbGuessSrcDb(acc) == GB_REFSEQ)
    {
    return (startsWith("NM_", acc) || startsWith("NR_", acc)
            || ((startsWith("XM_", acc) && inclXMs)));
    }

if ((sameString(cat, "GSS") || sameString(cat, "HTG")
     || sameString(cat, "STS") || sameString(cat, "CON"))
    && !(gbType & GB_DNA))
    return FALSE;  // division to ignore

if (sameString(cat, "EST"))
    return (gbType & GB_EST) != 0;

if (gbType & GB_MRNA)
    {
    // not an EST, keep any type of RNA
    return (mol != NULL) && (containsStringNoCase(mol, "RNA") != NULL);
    }

if (gbType & GB_DNA)
    {
    // keep any type of DNA if DNA filter set
    return (mol != NULL) && (containsStringNoCase(mol, "DNA") != NULL);
    }

return FALSE;
}
static char *findSyntheticTarget() /* for a synthetic sequence, attempt to find the targeted organism. This was * added to support the MGC/ORFeome clones. In general, there is no defined way to * determine an organism that a synthenic clone targets. */ { struct keyVal *kv; if (synOrgBuf == NULL) synOrgBuf = dyStringNew(256); dyStringClear(synOrgBuf); kv = kvtGet(kvt, "srcOrg"); if (kv != NULL) dyStringAppend(synOrgBuf, kv->val); if (synOrgBuf->stringSize > 0) { kvtAdd(kvt, "synOrg", synOrgBuf->string); return synOrgBuf->string; } else return NULL; }
static void procGbEntry(struct lineFile *lf, struct hash *estAuthorHash) /* process one entry in the genbank file . readGbInfo should be called * first */ { char *words[16]; char date[64]; int wordCount; DNA *dna = NULL; int dnaSize; char sizeString[16]; char accVer[64]; int faSize; char *locus = gbLocusField->val->string; char *accession = gbAccessionField->val->string; int version = 0; char *gi = NULL; char *verChar = gbVersionField->val->string; char *s; char *org = gbOrganismField->val->string; char *synOrg = NULL; struct keyVal *seqKey, *sizeKey, *commentKey; boolean isEst = FALSE; char verNum[8]; char *com = gbCommentField->val->string; if (locus == NULL || accession == NULL) errAbort("No LOCUS or no ACCESSION line near %d of %s", lf->lineIx, lf->fileName); lmCleanup(&kvtMem); /* Chop off all but first word of accession. */ s = skipLeadingSpaces(accession); if (s != NULL) s = skipLeadingNonSpaces(s); if (s != NULL) *s = 0; /* Get version field (defaults to zero) */ if (verChar != NULL) { char *parts[2]; char *accVer; int partCount; partCount = chopByWhite(verChar, parts, ArraySize(parts)); /* Version is number after dot. */ accVer = parts[0]; if ((accVer = strchr(accVer, '.')) != NULL) version = atoi(accVer+1); if (partCount >= 2 && startsWith("GI:", parts[1])) gi = parts[1]+3; } gbfFlatten(kvt); /* Get additional keys. */ if (com != NULL) { if (startsWith("REVIEWED", com)) kvtAdd(kvt, "cur", "yes"); } safef(verNum, sizeof(verNum), "%d", version); kvtAdd(kvt, "ver", verNum); if (gi != NULL) kvtAdd(kvt, "ngi", gi); wordCount = chopLine(locus, words); if (wordCount >= 6) { kvtAdd(kvt, "mol", words[3]); kvtAdd(kvt, "cat", words[wordCount-2]); ncbiDateToSqlDate(words[wordCount-1], date, sizeof(date), accession); kvtAdd(kvt, "dat", date); } else if (wordCount == 5 && sameString(words[2], "bp") && isdigit(words[1][0])) { /* Check carefully. Probably it's just missing the molecule type... 
*/ if (!isNcbiDate(words[4])) { errAbort("Strange LOCUS line in %s accession %s", lf->fileName, accession); } kvtAdd(kvt, "cat", words[3]); ncbiDateToSqlDate(words[4], date, sizeof(date), accession); kvtAdd(kvt, "dat", date); } else if (wordCount == 5 && sameString(words[2], "bp") && isdigit(words[1][0])) { kvtAdd(kvt, "mol", words[3]); } else { errAbort("Short LOCUS line in %s accession %s", lf->fileName, accession); } if (((wordCount >= 5) && sameString(words[4], "EST")) || ((wordCount >= 6) && sameString(words[5], "EST"))) { /* Try and figure out if it's a 3' or 5' EST */ char *dir = getEstDir(gbDefinitionField->val->string, com); if (dir != NULL) kvtAdd(kvt, "dir", dir); isEst = TRUE; } /* Handle other fields */ parseDbXrefs(); parseGene(); parseSourceOrganism(); parseMiscDiffs(); parseWarnings(); if (startsWith("synthetic construct", gbOrganismField->val->string)) { synOrg = findSyntheticTarget(); if (synOrg != NULL) hackSynClone(); } if (keepGbEntry(isEst)) { /* Handle sequence part of read. */ dna = gbfReadSequence(lf, &dnaSize); } /* just discard if no sequence */ if (dna != NULL) { seqKey = kvtAdd(kvt, "seq", dna); safef(sizeString, sizeof(sizeString), "%d", dnaSize); sizeKey = kvtAdd(kvt, "siz", sizeString); if (isEst) { char *author = gbAuthorsField->val->string; if (author != NULL) { struct authorExample *ae; struct hashEl *hel; if ((hel = hashLookup(estAuthorHash, author)) == NULL) { AllocVar(ae); hel = hashAdd(estAuthorHash, author, ae); ae->name = hel->name; ae->count = 1; strncpy(ae->accession, accession, sizeof(ae->accession)); slAddHead(&estAuthorList, ae); } else { ae = hel->val; ae->count += 1; } } } seqKey->val = NULL; /* Don't write out sequence here. */ commentKey = kvtGet(kvt, "com"); if (commentKey != NULL) commentKey->val = NULL; /* Don't write out comment either. 
*/ setupOutputFiles(accession, org); if (faFile != NULL) { /* save fasta offset, size in ra */ safef(accVer, sizeof(accVer), "%s.%d", accession, version); gbFaWriteSeq(faFile, accVer, NULL, dna, -1); faSize = faFile->off - faFile->recOff; safef(faOffStr, sizeof(faOffStr), "%lld", (long long)faFile->recOff); kvtAdd(kvt, "fao", faOffStr); safef(faSizeStr, sizeof(faSizeStr), "%d", faSize); kvtAdd(kvt, "fas", faSizeStr); } if (gPepFa != NULL) { /* must write before writing kvt */ writePepSeq(); } kvtWriteAll(kvt, raFile, NULL); if (gbIdxFile != NULL) { /* use synthetic target if it was determined */ struct keyVal *molkv = kvtGet(kvt, "mol"); enum molType molType = (molkv->val != NULL) ? gbParseMolType(molkv->val) : mol_mRNA; gbProcessedWriteIdxRec(gbIdxFile, accession, version, kvtLookup(kvt, "dat"), ((synOrg != NULL) ? synOrg : org), molType); } } else gbfSkipSequence(lf); }