Esempio n. 1
0
static void hackSynClone()
/* Patch the molecule type of a synthetic clone entry.  When the global kvt
 * holds a "gen" key, the value of an existing "mol" key is replaced with the
 * literal "mRNA".  Nothing is changed when either key is absent.
 * NOTE(review): "gen" presumably marks an annotated gene, which is what makes
 * the entry "appear to be an mRNA" - confirm against the parser. */
{
struct keyVal *molKv;

/* Without a "gen" key the entry is left untouched. */
if (kvtGet(kvt, "gen") == NULL)
    return;

molKv = kvtGet(kvt, "mol");
if (molKv != NULL)
    molKv->val = "mRNA";
}
Esempio n. 2
0
char *kvtLookup(struct kvt *kvt, char *key)
/* Look up key in the table and return the value stored for it.
 * Returns NULL when kvtGet() finds no entry for key. */
{
    struct keyVal *entry = kvtGet(kvt, key);

    return (entry != NULL) ? entry->val : NULL;
}
Esempio n. 3
0
static boolean keepGbEntry(boolean isEst)
/* Should the current entry in the global kvt be kept?
 *
 * Checks, in order:
 *   - accession rejected by genbankBlackListFail()      -> drop
 *   - gbOrg is set and the entry's "org" differs        -> drop
 *   - accession is RefSeq per gbGuessSrcDb()            -> keep only NM_, NR_,
 *                                                          and XM_ when inclXMs
 *   - "cat" is GSS/HTG/STS/CON and GB_DNA not requested -> drop
 *   - "cat" is EST                                      -> keep iff GB_EST set
 *   - GB_MRNA set                                       -> keep iff "mol" contains "RNA"
 *   - GB_DNA set                                        -> keep iff "mol" contains "DNA"
 *   - otherwise                                         -> drop
 *
 * Missing keys are tolerated rather than dereferenced: an absent "org" never
 * matches gbOrg, an absent "cat" is treated as an empty division, and an
 * absent "mol" matches neither the RNA nor the DNA filter.  The isEst
 * parameter is not consulted; the decision is made from the "cat" key. */
{
char *acc = gbAccessionField->val->string;
struct keyVal *catKv = kvtGet(kvt, "cat");
struct keyVal *orgKv = kvtGet(kvt, "org");
char *cat = (catKv != NULL && catKv->val != NULL) ? catKv->val : "";
char *org = (orgKv != NULL) ? orgKv->val : NULL;

if (genbankBlackListFail(acc, blackListRanges))
    return FALSE;
else if (gbOrg && (org == NULL || differentString(org, gbOrg)))
    return FALSE;   // organism filter set and entry does not match it
else if (gbGuessSrcDb(acc) == GB_REFSEQ)
    {
    return (startsWith("NM_", acc) || startsWith("NR_", acc)
            || ((startsWith("XM_", acc) && inclXMs)));
    }
else if ((sameString(cat, "GSS") ||
          sameString(cat, "HTG") ||
          sameString(cat, "STS") ||
          sameString(cat, "CON")) && !(gbType & GB_DNA))
    return FALSE;   // division to ignore
else
    {
    if (sameString(cat, "EST"))
        {
        return (gbType & GB_EST) != 0;
        }
    else if (gbType & (GB_MRNA | GB_DNA))
        {
        /* Not an EST: keep any type of RNA when mRNAs are requested,
         * otherwise any type of DNA when the DNA filter is set.  The
         * molecule type may be missing (e.g. a LOCUS line without it),
         * in which case the entry cannot match either filter. */
        struct keyVal *molKv = kvtGet(kvt, "mol");
        char *wanted = (gbType & GB_MRNA) ? "RNA" : "DNA";
        if (molKv == NULL || molKv->val == NULL)
            return FALSE;
        return containsStringNoCase(molKv->val, wanted) != NULL;
        }
    else
        return FALSE;
    }
}
Esempio n. 4
0
static char *findSyntheticTarget()
/* for a synthetic sequence, attempt to find the targeted organism.  This was
 * added to support the MGC/ORFeome clones.  In general, there is no defined way to
 * determine an organism that a synthenic clone targets. */
{
struct keyVal *kv;
if (synOrgBuf == NULL)
    synOrgBuf = dyStringNew(256);
dyStringClear(synOrgBuf);

kv = kvtGet(kvt, "srcOrg");
if (kv != NULL)
    dyStringAppend(synOrgBuf, kv->val);

if (synOrgBuf->stringSize > 0)
    {
    kvtAdd(kvt, "synOrg", synOrgBuf->string);
    return synOrgBuf->string;
    }
else
    return NULL;
}
Esempio n. 5
0
static void procGbEntry(struct lineFile *lf, struct hash *estAuthorHash)
/* Process one entry in the genbank file.  readGbInfo should be called
 * first.
 *
 * lf            - input file, positioned so the sequence can be read or
 *                 skipped; also used for error locations.
 * estAuthorHash - hash of EST author strings to authorExample records;
 *                 updated for each EST entry that is kept.
 *
 * The parsed header fields are flattened into the global kvt, extra keys
 * (ver, ngi, mol, cat, dat, dir, cur, ...) are derived from the LOCUS,
 * VERSION and COMMENT fields, and keepGbEntry() decides whether the entry
 * is wanted.  Kept entries have their sequence read and written to the fasta
 * file (if open), the kvt written to raFile, and an index record written to
 * gbIdxFile (if open).  Entries that are not kept, or have no sequence, have
 * their sequence skipped.  Aborts via errAbort() on a missing or malformed
 * LOCUS/ACCESSION line. */
{
char *words[16];
char date[64];
int wordCount;
DNA *dna = NULL;
int dnaSize = 0;
char sizeString[16];
char accVer[64];
int faSize;
char *locus = gbLocusField->val->string;
char *accession = gbAccessionField->val->string;
int version = 0;
char *gi = NULL;
char *verChar = gbVersionField->val->string;
char *s;
char *org = gbOrganismField->val->string;
char *synOrg = NULL;
struct keyVal *seqKey, *commentKey;
boolean isEst = FALSE;
char verNum[8];
char *com = gbCommentField->val->string;

if (locus == NULL || accession == NULL)
    errAbort("No LOCUS or no ACCESSION line near %d of %s",
             lf->lineIx, lf->fileName);
lmCleanup(&kvtMem);

/* Chop off all but first word of accession. */
s = skipLeadingSpaces(accession);
if (s != NULL)
    s = skipLeadingNonSpaces(s);
if (s != NULL)
    *s = 0;

/* Get version field (defaults to zero) */
if (verChar != NULL)
    {
    char *parts[2];
    int partCount = chopByWhite(verChar, parts, ArraySize(parts));

    /* Version is number after dot.  Only look at parts[0] when the
     * VERSION line actually produced a word; otherwise it is unset. */
    if (partCount >= 1)
        {
        char *dot = strchr(parts[0], '.');
        if (dot != NULL)
            version = atoi(dot+1);
        }
    if (partCount >= 2 && startsWith("GI:", parts[1]))
        gi = parts[1]+3;
    }

gbfFlatten(kvt);

/* Get additional keys. */
if (com != NULL)
    {
    if (startsWith("REVIEWED", com))
        kvtAdd(kvt, "cur", "yes");
    }
safef(verNum, sizeof(verNum), "%d", version);
kvtAdd(kvt, "ver", verNum);
if (gi != NULL)
    kvtAdd(kvt, "ngi", gi);
wordCount = chopLine(locus, words);
if (wordCount >= 6)
    {
    kvtAdd(kvt, "mol", words[3]);
    kvtAdd(kvt, "cat", words[wordCount-2]);
    ncbiDateToSqlDate(words[wordCount-1], date, sizeof(date), accession);
    kvtAdd(kvt, "dat", date);
    }
else if (wordCount == 5 && sameString(words[2], "bp")
         && isdigit((unsigned char)words[1][0]))
    {
    /* Check carefully.  Probably it's just missing the molecule type...
     * Note that no "mol" key is added on this path. */
    if (!isNcbiDate(words[4]))
        {
        errAbort("Strange LOCUS line in %s accession %s",
                 lf->fileName, accession);
        }
    kvtAdd(kvt, "cat", words[3]);
    ncbiDateToSqlDate(words[4], date, sizeof(date), accession);
    kvtAdd(kvt, "dat", date);
    }
else
    {
    errAbort("Short LOCUS line in %s accession %s",
             lf->fileName, accession);
    }
if (((wordCount >= 5) && sameString(words[4], "EST")) ||
    ((wordCount >= 6) && sameString(words[5], "EST")))
    {
    /* Try and figure out if it's a 3' or 5' EST */
    char *dir = getEstDir(gbDefinitionField->val->string, com);
    if (dir != NULL)
        kvtAdd(kvt, "dir", dir);
    isEst = TRUE;
    }

/* Handle other fields */
parseDbXrefs();
parseGene();
parseSourceOrganism();
parseMiscDiffs();
parseWarnings();

if (startsWith("synthetic construct", gbOrganismField->val->string))
    {
    synOrg = findSyntheticTarget();
    if (synOrg != NULL)
        hackSynClone();
    }

if (keepGbEntry(isEst))
    {
    /* Handle sequence part of read. */
    dna = gbfReadSequence(lf, &dnaSize);
    }
/* just discard if no sequence */
if (dna != NULL)
    {
    seqKey = kvtAdd(kvt, "seq", dna);
    safef(sizeString, sizeof(sizeString), "%d", dnaSize);
    kvtAdd(kvt, "siz", sizeString);

    if (isEst)
        {
        /* Track how often each author string is seen on ESTs, remembering
         * the first accession as an example. */
        char *author = gbAuthorsField->val->string;
        if (author != NULL)
            {
            struct authorExample *ae;
            struct hashEl *hel;
            if ((hel = hashLookup(estAuthorHash, author)) == NULL)
                {
                AllocVar(ae);
                hel = hashAdd(estAuthorHash, author, ae);
                ae->name = hel->name;
                ae->count = 1;
                /* strncpy does not terminate when the source fills the
                 * buffer, so copy one less and terminate explicitly. */
                strncpy(ae->accession, accession, sizeof(ae->accession)-1);
                ae->accession[sizeof(ae->accession)-1] = 0;
                slAddHead(&estAuthorList, ae);
                }
            else
                {
                ae = hel->val;
                ae->count += 1;
                }
            }
        }
    seqKey->val = NULL; /* Don't write out sequence here. */
    commentKey = kvtGet(kvt, "com");
    if (commentKey != NULL)
        commentKey->val = NULL;  /* Don't write out comment either. */

    setupOutputFiles(accession, org);

    if (faFile != NULL)
        {
        /* save fasta offset, size in ra */
        safef(accVer, sizeof(accVer), "%s.%d", accession, version);
        gbFaWriteSeq(faFile, accVer, NULL, dna, -1);
        faSize = faFile->off - faFile->recOff;
        safef(faOffStr, sizeof(faOffStr), "%lld", (long long)faFile->recOff);
        kvtAdd(kvt, "fao", faOffStr);
        safef(faSizeStr, sizeof(faSizeStr), "%d", faSize);
        kvtAdd(kvt, "fas", faSizeStr);
        }
    if (gPepFa != NULL)
        {
        /* must write before writing kvt */
        writePepSeq();
        }
    kvtWriteAll(kvt, raFile, NULL);
    if (gbIdxFile != NULL)
        {
        /* Use synthetic target if it was determined.  The molecule type
         * falls back to mRNA when the entry has no usable "mol" value
         * (the key is absent for LOCUS lines lacking a molecule type). */
        struct keyVal *molkv = kvtGet(kvt, "mol");
        enum molType molType = (molkv != NULL && molkv->val != NULL)
                               ? gbParseMolType(molkv->val) : mol_mRNA;
        gbProcessedWriteIdxRec(gbIdxFile, accession, version,
                               kvtLookup(kvt, "dat"),
                               ((synOrg != NULL) ? synOrg : org),
                               molType);
        }
    }
else
    gbfSkipSequence(lf);
}