Пример #1
0
static void getFastaOffsets(struct brokenRefPepTbl *brpTbl,
                            struct sqlConnection *conn,
                            struct extFileTbl* extFileTbl,
                            char *faPath)
/* Scan the fasta file at faPath.  For each record whose accession is found
 * in brpTbl->protAccHash with a matching version and fasta path, store the
 * id obtained from extFileTblGet, the record offset, the sequence length and
 * the record size in that table entry. */
{
char accBuf[GB_ACC_BUFSZ];
struct gbFa *faIn = gbFaOpen(faPath, "r");
HGID faExtId = extFileTblGet(extFileTbl, conn, faPath);

gbVerbMsg(5, "scanning fasta: %s", faPath);
while (gbFaReadNext(faIn))
    {
    short version;
    struct brokenRefPep *entry;

    gbVerbMsg(5, "   %s: %lld", faIn->id, (long long)faIn->recOff);
    version = gbSplitAccVer(faIn->id, accBuf);
    entry = hashFindVal(brpTbl->protAccHash, accBuf);

    /* save only if same accession, version, and file (to match mrna fa) */
    if (entry == NULL)
        continue;
    if (version != entry->protVer)
        continue;
    if (!sameString(faPath, entry->newFaPath))
        continue;

    gbFaGetSeq(faIn); /* force read of sequence data */
    entry->newFaId = faExtId;
    entry->newFaOff = faIn->recOff;
    entry->newSeqSize = faIn->seqLen;
    /* record size is the distance from the record start to the current offset */
    entry->newRecSize = faIn->off-faIn->recOff;
    gbVerbMsg(5, "      save: %s %lld for %lld\n", faIn->id, (long long)faIn->recOff, (long long)faIn->off);
    }
gbFaClose(&faIn);
}
Пример #2
0
static void openByAccPrefix(char* accPrefix)
/* Close the currently open ra, fasta and (if gbIdxName is set) gbidx output
 * files, then open the set of files for accPrefix.  All files are opened
 * with mode "w" when isFirstOpen() is true for the ra path and with "a"
 * otherwise.  accPrefix is copied into gCurAccPrefix on completion. */
{
char raPath[PATH_LEN];
char faPath[PATH_LEN];
char gbIdxPath[PATH_LEN];
char *openMode;

/* shut down the files for the previous prefix */
carefulClose(&raFile);
gbFaClose(&faFile);
if (gbIdxName != NULL)
    carefulClose(&gbIdxFile);

/* the ra file decides the mode that is used for every file in the set */
makeAccPrefixedFile(accPrefix, raName, raPath);
if (isFirstOpen(raPath))
    openMode = "w";
else
    openMode = "a";
raFile = mustOpen(raPath, openMode);

makeAccPrefixedFile(accPrefix, faName, faPath);
faFile = gbFaOpen(faPath, openMode);

if (gbIdxName != NULL)
    {
    makeAccPrefixedFile(accPrefix, gbIdxName, gbIdxPath);
    gbIdxFile = mustOpen(gbIdxPath, openMode);
    }

/* remember which prefix the open files belong to */
strcpy(gCurAccPrefix, accPrefix);
}
void copySelectedFasta(struct gbSelect* select)
/* Copy the FASTA records that were selected for alignment out of the
 * processed "fa" file for select.  Output objects are created only for the
 * organism categories (native and/or xeno) set in select->orgCats; the
 * per-record work is done by copyFastaRec(). */
{
char srcPath[PATH_LEN];
struct gbFa* srcFa;
struct outFa* nativeOut = NULL;
struct outFa* xenoOut = NULL;

if (select->orgCats & GB_NATIVE)
    nativeOut = outFaNew(select, GB_NATIVE);
if (select->orgCats & GB_XENO)
    xenoOut = outFaNew(select, GB_XENO);

gbProcessedGetPath(select, "fa", srcPath);
gbVerbEnter(2, "copying from %s", srcPath);
srcFa = gbFaOpen(srcPath, "r");

/* keep copying until copyFastaRec reports there is nothing more to do */
for (;;)
    {
    if (!copyFastaRec(select, srcFa, nativeOut, xenoOut))
        break;
    }

outFaFree(&nativeOut);
outFaFree(&xenoOut);
gbFaClose(&srcFa);
gbVerbLeave(2, "copying from %s", srcPath);
}
Пример #4
0
void seqDataProcessUpdate(struct gbSelect* select)
/* Read every record of the processed "fa" file for select and hand each one
 * to processSeq().  The partition's processed index should already be loaded
 * with the selected versions flagged. */
{
char faPath[PATH_LEN];
struct gbFa* faIn;

gbProcessedGetPath(select, "fa", faPath);
for (faIn = gbFaOpen(faPath, "r"); ; )
    {
    if (!gbFaReadNext(faIn))
        break;
    processSeq(select, faIn);
    }
gbFaClose(&faIn);
}
Пример #5
0
int main(int argc, char *argv[])
/* Check parameters, set up, loop through each GenBank file. */
{
char *gbName;
int argi = 1;
struct hash *estAuthorHash = NULL;
char *pepFa;

optionInit(&argc, argv, optionSpecs);
if (argc < 4)
    usage();

gByAccPrefixSize = optionInt("byAccPrefix", 0);
gbIdxName = optionVal("gbidx", NULL);
pepFa = optionVal("pepFa", NULL);
gbType = gbParseType(optionVal("type", "mrna,est"));
gbOrg  = optionVal("org", NULL);
inclXMs = optionExists("inclXMs");

if (gByAccPrefixSize > 4)  /* keep small to avoid tons of open files */
    errAbort("max value of -byAccPrefix is 4");

gCurAccPrefix[0] = '\0';

faName = argv[argi++];
raName = argv[argi++];

estAuthorHash = newHash(23);
kvt = newKvt(5*1024);
gbfInit();

if (pepFa != NULL)
    gPepFa = gbFaOpen(pepFa,"w");

char *blackList = optionVal("blackList", NULL);
if (blackList != NULL)
    blackListRanges = genbankBlackListParse(blackList);

while (argi < argc)
    {
    gbName = argv[argi++];
    printf("Processing %s into %s and %s\n", gbName, faName, raName);
    procOneGbFile(gbName, estAuthorHash);
    }

gbFaClose(&faFile);
gbFaClose(&gPepFa);
carefulClose(&raFile);
carefulClose(&gbIdxFile);

return 0;
}
void outFaOpen(struct outFa* outFa)
/* Open the next numbered fasta output file for outFa and reset its sequence
 * and base counters.  When createPolyASizes is set, a matching ".polya" file
 * is opened as well.  outFa->fa must be NULL on entry; nextPartNum is
 * advanced by one on exit. */
{
char outPath[PATH_LEN];
char partExt[64];

assert(outFa->fa == NULL);

/* fasta file for this part number */
safef(partExt, sizeof(partExt), "%d.fa", outFa->nextPartNum);
gbAlignedGetPath(&outFa->select, partExt, workDir, outPath);
outFa->fa = gbFaOpen(outPath, "w");
outFa->numSeqs = 0;
outFa->numBases = 0;

/* optional polyA sizes file, same part number */
if (createPolyASizes)
    {
    safef(partExt, sizeof(partExt), "%d.polya", outFa->nextPartNum);
    gbAlignedGetPath(&outFa->select, partExt, workDir, outPath);
    outFa->polyAFh = mustOpen(outPath, "w");
    }

outFa->nextPartNum += 1;
}
Пример #7
0
static void setupOutputFiles(char *acc, char *org)
/* Get the output files (in globals) for a sequence, opening as needed. */
{
if (gByAccPrefixSize > 0)
    {
    char accPrefix[32];
    strncpy(accPrefix, acc, gByAccPrefixSize);
    accPrefix[gByAccPrefixSize] = '\0';
    tolowers(accPrefix);
    if (!sameString(accPrefix, gCurAccPrefix))
        openByAccPrefix(accPrefix);
    }
else
    {
    /* output to a single set of files */
    if (raFile == NULL)
        {
        raFile = mustOpen(raName, "w");
        faFile = gbFaOpen(faName, "w");
        if (gbIdxName != NULL)
            gbIdxFile = mustOpen(gbIdxName, "w");
        }
    }
}
Пример #8
0
void seqDataOpen(boolean inclVersion, char *outFile)
/* Store inclVersion in gInclVersion and open outFile for writing, keeping
 * the resulting fasta handle in gOutFa. */
{
struct gbFa *opened;

gInclVersion = inclVersion;
opened = gbFaOpen(outFile, "w");
gOutFa = opened;
}