Beispiel #1
0
void gbAlignInstall(struct gbSelect* select, struct gbSelect* prevSelect)
/* Install the alignments for one update and accPrefix, handling whichever of
 * the native and xeno categories are flagged in select->orgCats.  When
 * prevSelect is not NULL it is handed on so that unchanged alignments can be
 * migrated from that previous release. */
{
struct gbAlignInfo alignInfo;
char nativeAlignIdx[PATH_LEN];
char xenoAlignIdx[PATH_LEN];
int doNative, doXeno;

gbVerbEnter(1, "gbAlignInstall: %s", gbSelectDesc(select));

/* entry data has to be loaded before deciding what to migrate or align */
gbReleaseLoadProcessed(select);
if (prevSelect != NULL)
    {
    gbReleaseLoadProcessed(prevSelect);
    gbReleaseLoadAligned(prevSelect);
    }

/* flag the entries and updates that need migrating or aligning */
alignInfo = gbAlignFindNeedAligned(select, prevSelect);

/* installOrgCatAligned puts select->orgCats back before it returns, so the
 * category flags only need to be sampled once */
doNative = ((select->orgCats & GB_NATIVE) != 0);
doXeno = ((select->orgCats & GB_XENO) != 0);

/* first pass: build the alignment files and index for each category */
if (doNative)
    {
    installOrgCatAligned(select, GB_NATIVE, prevSelect, &alignInfo,
                         nativeAlignIdx);
    }
if (doXeno)
    {
    installOrgCatAligned(select, GB_XENO, prevSelect, &alignInfo,
                         xenoAlignIdx);
    }

/* second pass: rename the indices into place only after every category
 * has been built; not completely atomic, but good enough */
if (doNative)
    {
    gbOutputRename(nativeAlignIdx, NULL);
    }
if (doXeno)
    {
    gbOutputRename(xenoAlignIdx, NULL);
    }

/* the message is printed while the entries are still loaded */
gbVerbLeave(1, "gbAlignInstall: %s", gbSelectDesc(select));

/* drop the loaded entries to release memory */
gbReleaseUnload(select->release);
if (prevSelect != NULL)
    {
    gbReleaseUnload(prevSelect->release);
    }
}
Beispiel #2
0
void copyRefSeqPepFa(struct gbUpdate* update,
                     char* outDir, char *gbFile)
/* Copy a subset of the RefSeq peptide fasta file for the selected genes.
 *
 * update - update whose release is searched for each peptide's gene accession
 * outDir - directory prefix for the output file
 * gbFile - name of the .gbff.Z file; the peptide fasta is expected at the same
 *          name with the suffix changed to .fsa.Z
 *
 * The output path is outDir + "/" + the derived .fsa.Z name.  Aborts if gbFile
 * does not end in .gbff.Z, if either path does not fit in PATH_LEN, or if a
 * write to the output fails. */
{
struct gbRelease* release = update->release;
char faInPath[PATH_LEN];
char faOutPath[PATH_LEN];
struct lineFile* inLf;
boolean copying = FALSE;
FILE* outFh;
char* line;
int baseLen, len;

/* change the .gbff.Z suffix to .fsa.Z, checking that the result fits */
if (!endsWith(gbFile, ".gbff.Z"))
    errAbort("expected a file ending in .gbff.Z, got: %s", gbFile);
baseLen = (int)(strlen(gbFile) - strlen(".gbff.Z"));
len = snprintf(faInPath, sizeof(faInPath), "%.*s.fsa.Z", baseLen, gbFile);
if ((len < 0) || ((size_t)len >= sizeof(faInPath)))
    errAbort("path too long: %s", gbFile);

len = snprintf(faOutPath, sizeof(faOutPath), "%s/%s", outDir, faInPath);
if ((len < 0) || ((size_t)len >= sizeof(faOutPath)))
    errAbort("path too long: %s/%s", outDir, faInPath);

verbose(1, "copying from %s\n", faInPath);

/* copy selected, don't bother with fa readers */
inLf = gzLineFileOpen(faInPath);
outFh = gbMustOpenOutput(faOutPath);

while (lineFileNext(inLf, &line, NULL))
    {
    if (line[0] == '>')
        {
        /* a header line decides whether this record, including the sequence
         * lines that follow it, gets copied */
        char *geneAcc = parsePepGeneAcc(line);
        struct gbEntry* entry = NULL;
        if (geneAcc != NULL)
            entry = gbReleaseFindEntry(release, geneAcc);
        copying = ((entry != NULL) && (entry->selectVer > 0));
        /* geneAcc may be NULL; never hand a NULL pointer to %s */
        verbose(2, "acc for pep: %s: %s\n",
                ((geneAcc != NULL) ? geneAcc : "(null)"),
                (copying ? "yes" : "no"));
        }
    if (copying)
        {
        fputs(line, outFh);
        fputc('\n', outFh);
        if (ferror(outFh))
            errnoAbort("write failed: %s: ", faOutPath);
        }
    }

gbOutputRename(faOutPath, &outFh);
gzLineFileClose(&inLf);
}
void markAligns(struct gbSelect* select, unsigned orgCat)
/* Create an empty "aligns" marker file for this partition and orgCat,
 * indicating that sequences either need to be aligned or migrated.  The
 * marker is used to determine what needs to be installed after the
 * alignment; the fasta can't serve as that indicator because every sequence
 * might be migrated. */
{
unsigned savedOrgCats = select->orgCats;
char markerPath[PATH_LEN];
FILE* markerFh;

/* the path is built from select, so narrow it to the requested category
 * for the duration of the call */
select->orgCats = orgCat;
gbAlignedGetPath(select, "aligns", workDir, markerPath);

/* nothing is written; the file is opened and immediately installed */
markerFh = gbMustOpenOutput(markerPath);
gbOutputRename(markerPath, &markerFh);

select->orgCats = savedOrgCats;
}
void gbFaClose(struct gbFa **faPtr)
/* close a fasta file, check for any undetected I/O errors. */
{
struct gbFa *fa = *faPtr;
if (fa != NULL)
    {
    if (ferror(fa->fh))
        errAbort("%s error on %s", ((fa->mode[0] != 'r') ? "write" : "read"),
                 fa->fileName);
    if (fa->mode[0] == 'w')
        gbOutputRename(fa->fileName, &fa->fh);
    else
        gzClose(&fa->fh);
    freez(&fa->headerBuf);
    freez(&fa->seqBuf);
    freez(&fa->fhBuf);
    freez(faPtr);  /* NULLs var */
    }
}
Beispiel #5
0
void installOrgCatAligned(struct gbSelect* select, unsigned orgCat,
                          struct gbSelect* prevSelect,
                          struct gbAlignInfo* alignInfo,
                          char* alignIdx)
/* Install alignments for either native or xeno.
 *
 * select     - selection being installed; its orgCats is temporarily set to
 *              orgCat and restored before returning
 * orgCat     - the single category to process (compared against GB_NATIVE)
 * prevSelect - previous release to migrate alignments from, or NULL; its
 *              orgCats is likewise temporarily set to orgCat and restored
 * alignInfo  - passed through to migrateAligned
 * alignIdx   - passed to createAlignedIndex, which creates the alignment
 *              index and returns its name here; the index is not renamed
 *              until both native and xeno are processed */
{
unsigned holdOrgCats = select->orgCats;
/* prevSelect has its own orgCats; remember it separately so the restore at
 * the end does not overwrite it with select's value */
unsigned holdPrevOrgCats = (prevSelect != NULL) ? prevSelect->orgCats : 0;
struct outputFiles out;
struct recCounts recCounts;
ZeroVar(&out);
ZeroVar(&recCounts);

select->orgCats = orgCat;
if (prevSelect != NULL)
    prevSelect->orgCats = orgCat;

/* setup out PSL and orientInfo files */
gbAlignedGetPath(select, "psl.gz", NULL, out.psl.path);
out.psl.fh = openSortOutput(out.psl.path, PSL_SORT_SPEC);
if (select->orgCats == GB_NATIVE)
    {
    gbAlignedGetPath(select, "oi.gz", NULL, out.oi.path);
    out.oi.fh = openSortOutput(out.oi.path, OI_SORT_SPEC);
    if (select->type == GB_EST)
        {
        gbAlignedGetPath(select, "intronPsl.gz", NULL, out.intronPsl.path);
        out.intronPsl.fh = openSortOutput(out.intronPsl.path, PSL_SORT_SPEC);
        }
    }
if (select->type == GB_MRNA)
    {
    /* we don't bother sorting raw psl */
    gbAlignedGetPath(select, "rawPsl.gz", NULL, out.rawPsl.path);
    out.rawPsl.fh = gbMustOpenOutput(out.rawPsl.path);
    }

/* previous aligned if this is a full update */
if (prevSelect != NULL)
    migrateAligned(select, prevSelect, alignInfo, &out, &recCounts);

/* copy currently aligned, if they exist */
copyPsls(select, MAIN_PSL_FILE, out.psl.fh, &recCounts.pslCnts);
if (select->type == GB_MRNA)
    copyPsls(select, RAW_PSL_FILE, out.rawPsl.fh, &recCounts.rawPslCnts);
if ((select->orgCats == GB_NATIVE) && (recCounts.pslCnts.recTotalCnt > 0))
    {
    /* copy new OI and intronPsls */
    copyOrientInfos(select, out.oi.fh, &recCounts);
    if (select->type == GB_EST)
        copyPsls(select, INTRON_PSL_FILE, out.intronPsl.fh,
                 &recCounts.intronPslCnts);
    }

/* Install or remove files.  Done separately from the copy due to the
 * possibility of all being migrated.  Outputs that were never opened above
 * still have the zeroed path and file handle from ZeroVar and a zero count,
 * so they take the remove branch. */
if (recCounts.intronPslCnts.recTotalCnt > 0)
    gbOutputRename(out.intronPsl.path, &out.intronPsl.fh);
else
    gbOutputRemove(out.intronPsl.path, &out.intronPsl.fh);

if (recCounts.oiCnts.recTotalCnt > 0)
    gbOutputRename(out.oi.path, &out.oi.fh);
else
    gbOutputRemove(out.oi.path, &out.oi.fh);

if (recCounts.rawPslCnts.recTotalCnt > 0)
    gbOutputRename(out.rawPsl.path, &out.rawPsl.fh);
else
    gbOutputRemove(out.rawPsl.path, &out.rawPsl.fh);

if (recCounts.pslCnts.recTotalCnt > 0)
    gbOutputRename(out.psl.path, &out.psl.fh);
else
    gbOutputRemove(out.psl.path, &out.psl.fh);

createAlignedIndex(select, alignIdx);

/* put each selection back to its own original categories */
select->orgCats = holdOrgCats;
if (prevSelect != NULL)
    prevSelect->orgCats = holdPrevOrgCats;
}
Beispiel #6
0
void extractAccFromGb(char *inName, char* outName, struct hash *accTbl)
/* Parse records of a genBank file and copy the ones whose accession is in
 * accTbl to outName.
 * (yanked from gbOneAcc, changed to use stdio so we can access compressed).
 *
 * inName  - input file, opened with gzMustOpen
 * outName - output file; only created if at least one record matches
 * accTbl  - hash keyed by the accessions to extract
 *
 * Aborts if a LOCUS line is not followed by an ACCESSION line within
 * maxHeadLines lines. */
{
enum {maxHeadLines=20, headLineSize=256 };
/* Store stuff between locus and accession.  A fixed automatic array is used
 * so that nothing is left allocated when the function returns. */
char headLines[maxHeadLines][headLineSize];
char line[headLineSize];
FILE *inFh;
FILE *outFh = NULL;
int lineNum = 0;
int i;
char* acc;

verbose(1, "copying from %s\n", inName);

inFh = gzMustOpen(inName, "r");

while (TRUE)
    {
    boolean gotAcc = FALSE;
    boolean gotMyAcc = FALSE;
    int headLineCount = 0;
    /* Seek to LOCUS */
    for (;;)
        {
        if (!readData(inFh, inName, line, headLineSize, FALSE))
            break;
        lineNum++;
        if (startsWith("LOCUS", line))
            break;
        }
    if (feof(inFh))
        break;
    /* Buffer the header lines from LOCUS up to, but not including, ACCESSION;
     * whether they are wanted is only known once the accession is seen. */
    for (i=0; i<maxHeadLines; ++i)
        {
        ++headLineCount;
        strcpy(headLines[i], line);
        readData(inFh, inName, line, headLineSize, TRUE);
        lineNum++;
        if (startsWith("ACCESSION", line))
            {
            gotAcc = TRUE;
            break;
            }
        }
    if (!gotAcc)
        errAbort("LOCUS without ACCESSION in %d lines at line %d of %s",
                 maxHeadLines, lineNum, inName);
    acc = lastWordInLine(line);
    gotMyAcc = (hashLookup(accTbl, acc) != NULL);
    if (gotMyAcc)
        {
        /* output is opened lazily, on the first matching record */
        if (outFh == NULL)
            outFh = gbMustOpenOutput(outName);
        for (i=0; i<headLineCount; ++i)
            {
            fputs(headLines[i], outFh);
            fputc('\n', outFh);
            }
        fputs(line, outFh);
        fputc('\n', outFh);
        }
    /* Consume the rest of the record through the "//" terminator, copying it
     * if this accession was selected. */
    for (;;)
        {
        readData(inFh, inName, line, headLineSize, TRUE);
        lineNum++;
        if (gotMyAcc)
            {
            fputs(line, outFh);
            fputc('\n', outFh);
            }
        if (startsWith("//", line))
            break;
        }
    if ((outFh != NULL) && ferror(outFh))
        break;  /* write error */
    }
if (outFh != NULL)
    gbOutputRename(outName, &outFh);
gzClose(&inFh);
}