static void getFastaOffsets(struct brokenRefPepTbl *brpTbl, struct sqlConnection *conn, struct extFileTbl* extFileTbl, char *faPath) /* parse fasta file to get offsets of proteins */ { struct gbFa *fa = gbFaOpen(faPath, "r"); char acc[GB_ACC_BUFSZ]; struct brokenRefPep *brp; HGID extId = extFileTblGet(extFileTbl, conn, faPath); gbVerbMsg(5, "scanning fasta: %s", faPath); while (gbFaReadNext(fa)) { gbVerbMsg(5, " %s: %lld", fa->id, (long long)fa->recOff); /* save only if same acecss, version, and file (to match mrna fa) */ short ver = gbSplitAccVer(fa->id, acc); brp = hashFindVal(brpTbl->protAccHash, acc); if ((brp != NULL) && (ver == brp->protVer) && sameString(faPath, brp->newFaPath)) { gbFaGetSeq(fa); /* force read of sequence data */ brp->newFaId = extId; brp->newFaOff = fa->recOff; brp->newSeqSize = fa->seqLen; brp->newRecSize = fa->off-fa->recOff; gbVerbMsg(5, " save: %s %lld for %lld\n", fa->id, (long long)fa->recOff, (long long)fa->off); } } gbFaClose(&fa); }
static boolean isValidMrnaSeq(struct gbFa* inFa)
/* Check if the sequence appears to be a valid mRNA sequence.  The count
 * returned by numAllowedRNABases() is compared against a limit of
 * MAX_INVALID_MRNA_BASES times the sequence length; the sequence is accepted
 * when the count does not exceed that limit. */
{
    char *bases = gbFaGetSeq(inFa);

    /* NOTE(review): the result is used as a count of *invalid* bases although
     * the helper's name suggests it counts allowed ones -- confirm against
     * the definition of numAllowedRNABases(). */
    int invalidCnt = numAllowedRNABases(bases);
    int invalidLimit = MAX_INVALID_MRNA_BASES * inFa->seqLen;

    /* round up: a non-zero fraction must allow at least one base even when
     * the product truncates to zero */
    if ((invalidLimit == 0) && (MAX_INVALID_MRNA_BASES > 0.0))
        invalidLimit = 1;

    return (invalidCnt <= invalidLimit);
}
boolean copyFastaRec(struct gbSelect* select, struct gbFa* inFa, struct outFa* nativeFa, struct outFa* xenoFa) /* Read and copy a record to one of the output files, if selected */ { char acc[GB_ACC_BUFSZ]; unsigned version; struct gbEntry* entry; if (!gbFaReadNext(inFa)) return FALSE; /* EOF */ version = gbSplitAccVer(inFa->id, acc); entry = gbReleaseFindEntry(select->release, acc); if (entry != NULL) { char* seq = gbFaGetSeq(inFa); if (strlen(seq) < MIN_SEQ_SIZE) { if (gbVerbose >= 3) gbVerbPr(3, "skip %s, less than minimum sequence size", inFa->id); } else if ((version == entry->selectVer) && (entry->clientFlags & ALIGN_FLAG)) { outFaWrite(((entry->orgCat == GB_NATIVE) ? nativeFa : xenoFa), inFa); if (gbVerbose >= 3) gbVerbPr(3, "aligning %s %s", inFa->id, gbOrgCatName(entry->orgCat)); } else if ((version == entry->selectVer) && (entry->clientFlags & MIGRATE_FLAG)) { if (gbVerbose >= 3) gbVerbPr(3, "migrating %s %s", inFa->id, gbOrgCatName(entry->orgCat)); } else { assert(version != entry->selectVer); if (gbVerbose >= 3) gbVerbPr(3, "skip %s, wrong version %s != %d", gbOrgCatName(entry->orgCat), inFa->id, entry->selectVer); } } else { if (gbVerbose >= 3) gbVerbPr(3, "skip %s, no entry", inFa->id); } return TRUE; }
static void copySeq(struct gbFa *fa, struct gbFa *inFa)
/* Copy the sequence part of the record from inFa to fa's file handle, broken
 * into lines of at most SEQ_LINE_LEN characters, each terminated by a
 * newline.  fa->off is advanced by the number of bytes emitted. */
{
    char *cursor = gbFaGetSeq(inFa);
    int remaining = inFa->seqLen;
    unsigned written = 0;

    while (remaining > 0)
        {
        /* full line while enough bases remain, otherwise the short tail */
        int chunk = (remaining >= SEQ_LINE_LEN) ? SEQ_LINE_LEN : remaining;

        fwrite(cursor, 1, chunk, fa->fh);
        fputc('\n', fa->fh);
        written += chunk + 1; /* bases plus the newline */

        cursor += chunk;
        remaining -= chunk;
        }

    fa->off += written;
}