void makeTmpSai(struct sqlConnection *conn, struct cdwValidFile *vf, char *genoFile, char **retSampleFile, char **retSaiFile) /* Given a fastq file, make a subsample of it 100k reads long and align it with * bwa producing a sai file of given name. */ { /* Get fastq record */ long long fileId = vf->fileId; struct cdwFastqFile *fqf = cdwFastqFileFromFileId(conn, fileId); if (fqf == NULL) errAbort("No cdwFastqFile record for file id %lld", fileId); /* Create downsampled fastq in temp directory - downsampled more than default even. */ char sampleFastqName[PATH_LEN]; cdwMakeTempFastqSample(fqf->sampleFileName, FASTQ_SAMPLE_SIZE, sampleFastqName); verbose(1, "downsampled %s into %s\n", vf->licensePlate, sampleFastqName); /* Do alignment */ char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sai")); safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, sampleFastqName, saiName); mustSystem(cmd); /* Save return variables, clean up, and go home. */ *retSampleFile = cloneString(sampleFastqName); *retSaiFile = saiName; cdwFastqFileFree(&fqf); }
void fastqRepeatQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf) /* Do repeat QA if possible on fastq file. */ { /* First see if total repeat content is already in our table, in which case we are done. */ long long fileId = ef->id; char query[512]; sqlSafef(query, sizeof(query), "select count(*) from cdwQaRepeat where fileId=%lld and repeatClass='total'" , fileId); if (sqlQuickNum(conn, query) != 0) return; /* We've done this already */ /* Get sample file name from fastq table. */ struct cdwFastqFile *fqf = cdwFastqFileForFileId(conn, fileId); if (fqf == NULL) errAbort("No edqFastqRecord for %s", vf->licensePlate); char *fastqPath = fqf->sampleFileName; char bwaIndex[PATH_LEN]; safef(bwaIndex, sizeof(bwaIndex), "%s%s/repeatMasker/repeatMasker.fa", cdwValDataDir, vf->ucscDb); char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".sai")); safef(cmd, sizeof(cmd), "bwa aln %s %s > %s", bwaIndex, fastqPath, saiName); mustSystem(cmd); char *samName = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".sam")); safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", bwaIndex, saiName, fastqPath, samName); mustSystem(cmd); remove(saiName); char *raName = cloneString(rTempName(cdwTempDir(), "cdwQaRepeat", ".ra")); safef(cmd, sizeof(cmd), "edwSamRepeatAnalysis %s %s", samName, raName); mustSystem(cmd); verbose(2, "mustSystem(%s)\n", cmd); remove(samName); raIntoCdwRepeatQa(raName, conn, fileId); remove(raName); #ifdef SOON #endif /* SOON */ freez(&saiName); freez(&samName); freez(&raName); cdwFastqFileFree(&fqf); }
void pairedEndQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf) /* Look for other end, do a pairwise alignment, and save results in database. */ { verbose(2, "pairedEndQa on %u %s %s\n", ef->id, ef->cdwFileName, ef->submitFileName); /* Get other end, return if not found. */ struct cdwValidFile *otherVf = cdwOppositePairedEnd(conn, ef, vf); if (otherVf == NULL) return; if (otherVf->fileId > vf->fileId) return; struct cdwValidFile *vf1, *vf2; struct cdwQaPairedEndFastq *pair = cdwQaPairedEndFastqFromVfs(conn, vf, otherVf, &vf1, &vf2); if (pair != NULL) { cdwValidFileFree(&otherVf); return; } /* Get target assembly and figure out path for BWA index. */ struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, vf->ucscDb); assert(assembly != NULL); char genoFile[PATH_LEN]; safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", cdwValDataDir, assembly->ucscDb, assembly->ucscDb); verbose(1, "aligning subsamples on %u vs. %u paired reads\n", vf1->fileId, vf2->fileId); /* Make alignments of subsamples. */ char *sample1 = NULL, *sample2 = NULL, *sai1 = NULL, *sai2 = NULL; makeTmpSai(conn, vf1, genoFile, &sample1, &sai1); makeTmpSai(conn, vf2, genoFile, &sample2, &sai2); /* Make paired end alignment */ char *tmpSam = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sam")); char command[6*PATH_LEN]; safef(command, sizeof(command), "bwa sampe -n 1 -N 1 -f %s %s %s %s %s %s" , tmpSam, genoFile, sai1, sai2, sample1, sample2); mustSystem(command); /* Make ra file with pairing statistics */ char *tmpRa = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".ra")); safef(command, sizeof(command), "edwSamPairedEndStats -maxInsert=%d %s %s", maxInsert, tmpSam, tmpRa); mustSystem(command); /* Read RA file into variables. */ struct cdwQaPairedEndFastq *pe = cdwQaPairedEndFastqOneFromRa(tmpRa); /* Update database with record. */ struct sqlConnection *freshConn = cdwConnectReadWrite(); char query[256]; sqlSafef(query, sizeof(query), "insert into cdwQaPairedEndFastq " "(fileId1,fileId2,concordance,distanceMean,distanceStd,distanceMin,distanceMax,recordComplete) " " values (%u,%u,%g,%g,%g,%g,%g,1)" , vf1->fileId, vf2->fileId, pe->concordance, pe->distanceMean , pe->distanceStd, pe->distanceMin, pe->distanceMax); sqlUpdate(conn, query); sqlDisconnect(&freshConn); /* Clean up and go home. */ cdwValidFileFree(&otherVf); remove(sample1); remove(sample2); remove(sai1); remove(sai2); remove(tmpSam); remove(tmpRa); #ifdef SOON #endif /* SOON */ freez(&sample1); freez(&sample2); freez(&sai1); freez(&sai2); freez(&tmpSam); freez(&tmpRa); cdwQaPairedEndFastqFree(&pe); cdwValidFileFree(&otherVf); }