void fastqRepeatQa(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf) /* Do repeat QA if possible on fastq file. */ { /* First see if total repeat content is already in our table, in which case we are done. */ long long fileId = ef->id; char query[512]; sqlSafef(query, sizeof(query), "select count(*) from edwQaRepeat where fileId=%lld and repeatClass='total'" , fileId); if (sqlQuickNum(conn, query) != 0) return; /* We've done this already */ /* Get sample file name from fastq table. */ struct edwFastqFile *fqf = edwFastqFileForFileId(conn, fileId); if (fqf == NULL) errAbort("No edqFastqRecord for %s", vf->licensePlate); char *fastqPath = fqf->sampleFileName; char bwaIndex[PATH_LEN]; safef(bwaIndex, sizeof(bwaIndex), "%s%s/repeatMasker/repeatMasker.fa", edwValDataDir, vf->ucscDb); char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(edwTempDir(), "edwQaRepeat", ".sai")); safef(cmd, sizeof(cmd), "bwa aln %s %s > %s", bwaIndex, fastqPath, saiName); mustSystem(cmd); char *samName = cloneString(rTempName(edwTempDir(), "edwQaRepeat", ".sam")); safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", bwaIndex, saiName, fastqPath, samName); mustSystem(cmd); remove(saiName); char *raName = cloneString(rTempName(edwTempDir(), "edwQaRepeat", ".ra")); safef(cmd, sizeof(cmd), "edwSamRepeatAnalysis %s %s", samName, raName); mustSystem(cmd); verbose(2, "mustSystem(%s)\n", cmd); remove(samName); raIntoEdwRepeatQa(raName, conn, fileId); remove(raName); #ifdef SOON #endif /* SOON */ freez(&saiName); freez(&samName); freez(&raName); edwFastqFileFree(&fqf); }
void fetchFdToTempFile(int remoteFd, char tempFileName[PATH_LEN])
/* Copy everything readable from remoteFd into a newly created temporary file.
 *   remoteFd     - already-open descriptor to read from; it is not closed here
 *   tempFileName - caller-supplied buffer of PATH_LEN chars; on return it holds the
 *                  name of the temp file that was created
 * The local descriptor opened for the temp file is closed before returning. */
{
/* Build the name template: temp dir followed by "edwSubmit" and the XXXXXX
 * placeholder that mustMkstemp is given to work with. */
char *tempDir = edwTempDir();
safef(tempFileName, PATH_LEN, "%sedwSubmitXXXXXX", tempDir);

/* Create and open the temp file, stream the remote data into it, then close it. */
int tempFd = mustMkstemp(tempFileName);
cpFile(remoteFd, tempFd);
mustCloseFd(&tempFd);
}
void edwAlignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly, char *fastqPath, struct edwValidFile *vf, FILE *bedF, double *retMapRatio, double *retDepth, double *retSampleCoverage) /* Take a sample fastq and run bwa on it, and then convert that file to a bed. * bedF and all the ret parameters can be NULL. */ { /* Hmm, tried doing this with Mark's pipeline code, but somehow it would be flaky the * second time it was run in same app. Resorting therefore to temp files. */ char genoFile[PATH_LEN]; safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", edwValDataDir, assembly->ucscDb, assembly->ucscDb); char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(edwTempDir(), "edwSample1", ".sai")); safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, fastqPath, saiName); mustSystem(cmd); char *samName = cloneString(rTempName(edwTempDir(), "ewdSample1", ".sam")); safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", genoFile, saiName, fastqPath, samName); mustSystem(cmd); remove(saiName); /* Scan sam file to calculate vf->mapRatio, vf->sampleCoverage and vf->depth. * and also to produce little bed file for enrichment step. */ struct genomeRangeTree *grt = genomeRangeTreeNew(); long long hitCount=0, missCount=0, totalBasesInHits=0; scanSam(samName, bedF, grt, &hitCount, &missCount, &totalBasesInHits); verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", hitCount, missCount, totalBasesInHits, grt); if (retMapRatio) *retMapRatio = (double)hitCount/(hitCount+missCount); if (retDepth) *retDepth = (double)totalBasesInHits/assembly->baseCount * (double)vf->itemCount/vf->sampleCount; long long basesHitBySample = genomeRangeTreeSumRanges(grt); if (retSampleCoverage) *retSampleCoverage = (double)basesHitBySample/assembly->baseCount; genomeRangeTreeFree(&grt); remove(samName); }
int edwFileFetch(struct sqlConnection *conn, struct edwFile *ef, int fd, char *submitFileName, unsigned submitId, unsigned submitDirId, unsigned hostId) /* Fetch file and if successful update a bunch of the fields in ef with the result. * Returns fileId. */ { ef->id = makeNewEmptyFileRecord(conn, submitId, submitDirId, ef->submitFileName, ef->size); /* Update edwSubmit with file in transit info */ char query[256]; sqlSafef(query, sizeof(query), "update edwSubmit set fileIdInTransit=%lld where id=%u", (long long)ef->id, submitId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "select paraFetchStreams from edwHost where id=%u", hostId); int paraFetchStreams = sqlQuickNum(conn, query); struct paraFetchInterruptContext interruptContext = {.conn=conn, .submitId=submitId}; /* Wrap getting the file, the actual data transfer, with an error catcher that * will remove partly uploaded files. Perhaps some day we'll attempt to rescue * ones that are just truncated by downloading the rest, but not now. */ struct errCatch *errCatch = errCatchNew(); char tempName[PATH_LEN] = ""; char edwFile[PATH_LEN] = "", edwPath[PATH_LEN]; if (errCatchStart(errCatch)) { /* Now make temp file name and open temp file in an atomic operation */ char *tempDir = edwTempDir(); safef(tempName, PATH_LEN, "%sedwSubmitXXXXXX", tempDir); int localFd = mustMkstemp(tempName); /* Update file name in database with temp file name so web app can track us. */ char query[PATH_LEN+128]; sqlSafef(query, sizeof(query), "update edwFile set edwFileName='%s' where id=%lld", tempName + strlen(edwRootDir), (long long)ef->id); sqlUpdate(conn, query); /* Do actual upload tracking how long it takes. 
*/ ef->startUploadTime = edwNow(); mustCloseFd(&localFd); if (!parallelFetchInterruptable(submitFileName, tempName, paraFetchStreams, 4, FALSE, FALSE, paraFetchInterruptFunction, &interruptContext)) { if (interruptContext.isInterrupted) errAbort("Submission stopped by user."); else errAbort("parallel fetch of %s failed", submitFileName); } ef->endUploadTime = edwNow(); /* Rename file both in file system and (via ef) database. */ edwMakeFileNameAndPath(ef->id, submitFileName, edwFile, edwPath); mustRename(tempName, edwPath); if (endsWith(edwPath, ".gz") && !encode3IsGzipped(edwPath)) errAbort("%s has .gz suffix, but is not gzipped", submitFileName); ef->edwFileName = cloneString(edwFile); } errCatchEnd(errCatch); if (errCatch->gotError) { /* Attempt to remove any partial file. */ if (tempName[0] != 0) { verbose(1, "Removing partial %s\n", tempName); parallelFetchRemovePartial(tempName); remove(tempName); } handleSubmitError(conn, submitId, errCatch->message->string); // Throws further assert(FALSE); // We never get here } errCatchFree(&errCatch); /* Now we got the file. We'll go ahead and save the file name and stuff. */ sqlSafef(query, sizeof(query), "update edwFile set" " edwFileName='%s', startUploadTime=%lld, endUploadTime=%lld" " where id = %d" , ef->edwFileName, ef->startUploadTime, ef->endUploadTime, ef->id); sqlUpdate(conn, query); /* Wrap the validations in an error catcher that will save error to file table in database */ errCatch = errCatchNew(); boolean success = FALSE; if (errCatchStart(errCatch)) { /* Check MD5 sum here. */ unsigned char md5bin[16]; md5ForFile(edwPath, md5bin); char md5[33]; hexBinaryString(md5bin, sizeof(md5bin), md5, sizeof(md5)); if (!sameWord(md5, ef->md5)) errAbort("%s has md5 mismatch: %s != %s. File may be corrupted in upload, or file may have " "been changed since validateManifest was run. Please check that md5 of file " "before upload is really %s. 
If it is then try submitting again, otherwise " "rerun validateManifest and then try submitting again. \n", ef->submitFileName, ef->md5, md5, ef->md5); /* Finish updating a bunch more of edwFile record. Note there is a requirement in * the validFile section that ef->updateTime be updated last. A nonzero ef->updateTime * is used as a sign of record complete. */ struct dyString *dy = dyStringNew(0); /* Includes tag so query may be long */ sqlDyStringPrintf(dy, "update edwFile set md5='%s',size=%lld,updateTime=%lld", md5, ef->size, ef->updateTime); dyStringAppend(dy, ", tags='"); dyStringAppend(dy, ef->tags); dyStringPrintf(dy, "' where id=%d", ef->id); sqlUpdate(conn, dy->string); dyStringFree(&dy); /* Update edwSubmit so file no longer shown as in transit */ sqlSafef(query, sizeof(query), "update edwSubmit set fileIdInTransit=0 where id=%u", submitId); sqlUpdate(conn, query); success = TRUE; } errCatchEnd(errCatch); if (errCatch->gotError) { handleFileError(conn, submitId, ef->id, errCatch->message->string); } return ef->id; }