Beispiel #1
0
void fastqRepeatQa(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf)
/* Do repeat QA if possible on fastq file. */
{
/* First see if total repeat content is already in our table, in which case we are done. */
long long fileId = ef->id;
char query[512];
sqlSafef(query, sizeof(query), 
    "select count(*) from edwQaRepeat where fileId=%lld and repeatClass='total'" , fileId);
if (sqlQuickNum(conn, query) != 0)
    return;	/* We've done this already */

/* Get sample file name from fastq table. */
struct edwFastqFile *fqf = edwFastqFileForFileId(conn, fileId);
if (fqf == NULL)
    errAbort("No edqFastqRecord for %s",  vf->licensePlate);
char *fastqPath = fqf->sampleFileName;

char bwaIndex[PATH_LEN];
safef(bwaIndex, sizeof(bwaIndex), "%s%s/repeatMasker/repeatMasker.fa", 
    edwValDataDir, vf->ucscDb);

char cmd[3*PATH_LEN];
char *saiName = cloneString(rTempName(edwTempDir(), "edwQaRepeat", ".sai"));
safef(cmd, sizeof(cmd), "bwa aln %s %s > %s", bwaIndex, fastqPath, saiName);
mustSystem(cmd);

char *samName = cloneString(rTempName(edwTempDir(), "edwQaRepeat", ".sam"));
safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", bwaIndex, saiName, fastqPath, samName);
mustSystem(cmd);
remove(saiName);

char *raName = cloneString(rTempName(edwTempDir(), "edwQaRepeat", ".ra"));
safef(cmd, sizeof(cmd), "edwSamRepeatAnalysis %s %s", samName, raName);
mustSystem(cmd);
verbose(2, "mustSystem(%s)\n", cmd);
remove(samName);

raIntoEdwRepeatQa(raName, conn, fileId);
remove(raName);
#ifdef SOON
#endif /* SOON */

freez(&saiName);
freez(&samName);
freez(&raName);
edwFastqFileFree(&fqf);
}
Beispiel #2
0
void fetchFdToTempFile(int remoteFd, char tempFileName[PATH_LEN])
/* This will fetch remote data to a temporary file. It fills in tempFileName with the name. */
{
/* Now make temp file name with XXXXXX name at end */
safef(tempFileName, PATH_LEN, "%sedwSubmitXXXXXX", edwTempDir());

/* Get open file handle, copy file, and close. */
int localFd = mustMkstemp(tempFileName);
cpFile(remoteFd, localFd);
mustCloseFd(&localFd);
}
Beispiel #3
0
void edwAlignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly,
    char *fastqPath, struct edwValidFile *vf, FILE *bedF,
    double *retMapRatio,  double *retDepth,  double *retSampleCoverage)
/* Take a sample fastq and run bwa on it, and then convert that file to a bed. 
 * bedF and all the ret parameters can be NULL. */
{
/* Hmm, tried doing this with Mark's pipeline code, but somehow it would be flaky the
 * second time it was run in same app.  Resorting therefore to temp files. */
char genoFile[PATH_LEN];
safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", 
    edwValDataDir, assembly->ucscDb, assembly->ucscDb);

char cmd[3*PATH_LEN];
char *saiName = cloneString(rTempName(edwTempDir(), "edwSample1", ".sai"));
safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, fastqPath, saiName);
mustSystem(cmd);

char *samName = cloneString(rTempName(edwTempDir(), "ewdSample1", ".sam"));
safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", genoFile, saiName, fastqPath, samName);
mustSystem(cmd);
remove(saiName);

/* Scan sam file to calculate vf->mapRatio, vf->sampleCoverage and vf->depth. 
 * and also to produce little bed file for enrichment step. */
struct genomeRangeTree *grt = genomeRangeTreeNew();
long long hitCount=0, missCount=0, totalBasesInHits=0;
scanSam(samName, bedF, grt, &hitCount, &missCount, &totalBasesInHits);
verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", 
    hitCount, missCount, totalBasesInHits, grt);
if (retMapRatio)
    *retMapRatio = (double)hitCount/(hitCount+missCount);
if (retDepth)
    *retDepth = (double)totalBasesInHits/assembly->baseCount 
	    * (double)vf->itemCount/vf->sampleCount;
long long basesHitBySample = genomeRangeTreeSumRanges(grt);
if (retSampleCoverage)
    *retSampleCoverage = (double)basesHitBySample/assembly->baseCount;
genomeRangeTreeFree(&grt);
remove(samName);
}
Beispiel #4
0
int edwFileFetch(struct sqlConnection *conn, struct edwFile *ef, int fd, 
	char *submitFileName, unsigned submitId, unsigned submitDirId, unsigned hostId)
/* Fetch file and if successful update a bunch of the fields in ef with the result. 
 * Returns fileId. */
{
ef->id = makeNewEmptyFileRecord(conn, submitId, submitDirId, ef->submitFileName, ef->size);

/* Update edwSubmit with file in transit info */
char query[256];
sqlSafef(query, sizeof(query), "update edwSubmit set fileIdInTransit=%lld where id=%u",
    (long long)ef->id, submitId);
sqlUpdate(conn, query);

sqlSafef(query, sizeof(query), "select paraFetchStreams from edwHost where id=%u", hostId);
int paraFetchStreams = sqlQuickNum(conn, query);
struct paraFetchInterruptContext interruptContext = {.conn=conn, .submitId=submitId};

/* Wrap getting the file, the actual data transfer, with an error catcher that
 * will remove partly uploaded files.  Perhaps some day we'll attempt to rescue
 * ones that are just truncated by downloading the rest,  but not now. */
struct errCatch *errCatch = errCatchNew();
char tempName[PATH_LEN] = "";
char edwFile[PATH_LEN] = "", edwPath[PATH_LEN];
if (errCatchStart(errCatch))
    {
    /* Now make temp file name and open temp file in an atomic operation */
    char *tempDir = edwTempDir();
    safef(tempName, PATH_LEN, "%sedwSubmitXXXXXX", tempDir);
    int localFd = mustMkstemp(tempName);

    /* Update file name in database with temp file name so web app can track us. */
    char query[PATH_LEN+128];
    sqlSafef(query, sizeof(query), 
	"update edwFile set edwFileName='%s' where id=%lld", 
	tempName + strlen(edwRootDir), (long long)ef->id);
    sqlUpdate(conn, query);

    /* Do actual upload tracking how long it takes. */
    ef->startUploadTime = edwNow();

    mustCloseFd(&localFd);
    if (!parallelFetchInterruptable(submitFileName, tempName, paraFetchStreams, 4, FALSE, FALSE,
	paraFetchInterruptFunction, &interruptContext))
	{
	if (interruptContext.isInterrupted)
	    errAbort("Submission stopped by user.");
	else
	    errAbort("parallel fetch of %s failed", submitFileName);
	}

    ef->endUploadTime = edwNow();

    /* Rename file both in file system and (via ef) database. */
    edwMakeFileNameAndPath(ef->id, submitFileName, edwFile, edwPath);
    mustRename(tempName, edwPath);
    if (endsWith(edwPath, ".gz") && !encode3IsGzipped(edwPath))
         errAbort("%s has .gz suffix, but is not gzipped", submitFileName);
    ef->edwFileName = cloneString(edwFile);
    }
errCatchEnd(errCatch);
if (errCatch->gotError)
    {
    /* Attempt to remove any partial file. */
    if (tempName[0] != 0)
	{
	verbose(1, "Removing partial %s\n", tempName);
	parallelFetchRemovePartial(tempName);
	remove(tempName);
	}
    handleSubmitError(conn, submitId, errCatch->message->string);  // Throws further
    assert(FALSE);  // We never get here
    }
errCatchFree(&errCatch);

/* Now we got the file.  We'll go ahead and save the file name and stuff. */
sqlSafef(query, sizeof(query),
       "update edwFile set"
       "  edwFileName='%s', startUploadTime=%lld, endUploadTime=%lld"
       "  where id = %d"
       , ef->edwFileName, ef->startUploadTime, ef->endUploadTime, ef->id);
sqlUpdate(conn, query);

/* Wrap the validations in an error catcher that will save error to file table in database */
errCatch = errCatchNew();
boolean success = FALSE;
if (errCatchStart(errCatch))
    {
    /* Check MD5 sum here.  */
    unsigned char md5bin[16];
    md5ForFile(edwPath, md5bin);
    char md5[33];
    hexBinaryString(md5bin, sizeof(md5bin), md5, sizeof(md5));
    if (!sameWord(md5, ef->md5))
        errAbort("%s has md5 mismatch: %s != %s.  File may be corrupted in upload, or file may have "
	         "been changed since validateManifest was run.  Please check that md5 of file "
		 "before upload is really %s.  If it is then try submitting again,  otherwise "
		 "rerun validateManifest and then try submitting again. \n", 
		 ef->submitFileName, ef->md5, md5, ef->md5);

    /* Finish updating a bunch more of edwFile record. Note there is a requirement in 
     * the validFile section that ef->updateTime be updated last.  A nonzero ef->updateTime
     * is used as a sign of record complete. */
    struct dyString *dy = dyStringNew(0);  /* Includes tag so query may be long */
    sqlDyStringPrintf(dy, "update edwFile set md5='%s',size=%lld,updateTime=%lld",
	    md5, ef->size, ef->updateTime);
    dyStringAppend(dy, ", tags='");
    dyStringAppend(dy, ef->tags);
    dyStringPrintf(dy, "' where id=%d", ef->id);
    sqlUpdate(conn, dy->string);
    dyStringFree(&dy);

    /* Update edwSubmit so file no longer shown as in transit */
    sqlSafef(query, sizeof(query), "update edwSubmit set fileIdInTransit=0 where id=%u", submitId);
    sqlUpdate(conn, query);

    success = TRUE;
    }
errCatchEnd(errCatch);
if (errCatch->gotError)
    {
    handleFileError(conn, submitId, ef->id, errCatch->message->string);
    }
return ef->id;
}