int countAllReads(char *fileName1, char *fileName2, int compressed, unsigned char pairedEnd) { char dummy[SEQ_MAX_LENGTH]; int maxCnt = 0; if (!compressed) { _r_fp1 = fileOpen( fileName1, "r"); if (_r_fp1 == NULL) return 0; if ( pairedEnd && fileName2 != NULL ) { _r_fp2 = fileOpen ( fileName2, "r" ); if (_r_fp2 == NULL) return 0; } else { _r_fp2 = _r_fp1; } readFirstSeq = &readFirstSeqTXT; readSecondSeq = &readSecondSeqTXT; } else { _r_gzfp1 = fileOpenGZ (fileName1, "r"); if (_r_gzfp1 == NULL) return 0; if ( pairedEnd && fileName2 != NULL ) { _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); if (_r_gzfp2 == NULL) return 0; } else { _r_gzfp2 = _r_gzfp1; } readFirstSeq = &readFirstSeqGZ; readSecondSeq = &readSecondSeqGZ; } // Counting the number of lines in the file while (readFirstSeq(dummy)) { if(dummy[0] != '#' && dummy[0]!='>' && dummy[0] != ' ' && dummy[0] != '\r' && dummy[0] != '\n') maxCnt++; } if (!compressed) rewind(_r_fp1); else gzrewind(_r_gzfp1); // Return the Maximum # of sequences return maxCnt * 2; }
/*
 * Allocates the single backing buffer of one read and points the fields of
 * `rd` into it, then stores the read name.
 *
 * The buffer is carved up in this order: hits header (one element), seq,
 * rseq, qual, cseq, crseq, name, alphCnt.  `size` is the total number of
 * bytes to request from getMem() and is also added to _r_readMemUsage.
 * `namelen` characters are copied from rawName starting at rawName[1], i.e.
 * the first character of the raw name line is skipped (presumably the
 * '@' / '>' record marker -- confirm against the reader).
 */
static void layoutReadRecord(Read *rd, int size, int namelen, const char *rawName)
{
	int i;

	rd->hits = getMem(size);
	_r_readMemUsage += size;

	rd->seq     = (char *)(rd->hits + 1);
	rd->rseq    = (char *)(rd->seq + SEQ_LENGTH + 1);
	rd->qual    = (char *)(rd->rseq + SEQ_LENGTH + 1);
	rd->cseq    = (CompressedSeq *)(rd->qual + QUAL_LENGTH + 1);
	rd->crseq   = (CompressedSeq *)(rd->cseq + CMP_SEQ_LENGTH);
	rd->name    = (char *)(rd->crseq + CMP_SEQ_LENGTH);
	rd->alphCnt = (unsigned char *)(rd->name + namelen);
	rd->hits[0] = 0;

	for (i = 0; i < namelen; i++)
		rd->name[i] = rawName[i + 1];
}

/*
 * Loads the next chunk of reads into the global _r_seq array.
 *
 * Reads records until either the input is exhausted or the array cannot hold
 * another record (one slot in single-end mode, two in paired-end mode).  In
 * paired-end mode a trailing "/1" on the first mate's name is dropped and the
 * same name is stored for both mates; the second mate's own name line is read
 * and discarded.
 *
 * seqList      receives _r_seq
 * seqListSize  receives the number of reads loaded in this chunk
 *
 * Returns 1 when the chunk was filled to capacity (more input may remain) and
 * 0 when the input ran out first.  Terminates the process if a read's length
 * differs from SEQ_LENGTH, or if the very first chunk contains no reads.
 */
int readChunk(Read **seqList, unsigned int *seqListSize)
{
	double startTime = getTime();
	char name1[SEQ_MAX_LENGTH];
	char name2[SEQ_MAX_LENGTH];
	char dummy[SEQ_MAX_LENGTH];
	int size;
	int namelen;
	int chunkFull = 0;
	const int readsPerRecord = pairedEndMode ? 2 : 1;
	Read *rd;

	_r_seqCnt = 0;
	_r_readMemUsage = 0;

	for (;;)
	{
		// Check for room *before* consuming input, and for both mates at
		// once, so an odd capacity in paired-end mode cannot overrun _r_seq.
		if (_r_seqCnt + readsPerRecord > _r_maxSeqCnt)
		{
			chunkFull = 1;
			break;
		}

		namelen = readFirstSeq(name1, 1);
		if (namelen == 0)
			break;			// end of input

		// Strip a trailing "/1"; the length guard keeps the index in bounds
		// for one-character name lines.
		if (pairedEndMode && namelen >= 2 &&
		    name1[namelen - 2] == '/' && name1[namelen - 1] == '1')
		{
			namelen -= 2;
			name1[namelen] = '\0';
		}

		size = sizeof(uint16_t) + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH << 4) + namelen + 4;

		// First mate (or the only read in single-end mode).
		rd = &_r_seq[_r_seqCnt];
		layoutReadRecord(rd, size, namelen, name1);

		if (readFirstSeq(rd->seq, 2) != SEQ_LENGTH)
		{
			fprintf(stdout, "ERR: Inconsistent read length for %s\n", name1);
			exit(EXIT_FAILURE);
		}
		if (_r_fastq)
		{
			readFirstSeq(dummy, 3);		// separator line, ignored
			readFirstSeq(rd->qual, 4);
		}
		else
		{
			rd->qual = "*";
		}
		_r_seqCnt++;

		if (pairedEndMode)
		{
			// Second mate: same layout and same (stripped) name as mate one.
			rd = &_r_seq[_r_seqCnt];
			layoutReadRecord(rd, size, namelen, name1);

			readSecondSeq(name2, 1);	// mate's own name line, discarded

			if (readSecondSeq(rd->seq, 2) != SEQ_LENGTH)
			{
				fprintf(stdout, "ERR: Inconsistent read length for %s\n", name1);
				exit(EXIT_FAILURE);
			}
			if (_r_fastq)
			{
				readSecondSeq(dummy, 3);	// separator line, ignored
				readSecondSeq(rd->qual, 4);
			}
			else
			{
				rd->qual = "*";
			}
			_r_seqCnt++;
		}
	}

	*seqList = _r_seq;
	*seqListSize = _r_seqCnt;

	if (_r_seqCnt > 0)
	{
		preProcessReadsMT();
		fprintf(stdout, "| *Reading Input* | %15.2f | XXXXXXXXXXXXXXX | %15.2f | XXXXXXXXXXXXXXX %15d |\n", (getTime()-startTime), getMemUsage(), _r_seqCnt );
		_r_firstIteration = 0;
	}
	else if (_r_firstIteration)
	{
		fprintf(stdout, "ERR: No reads for mapping\n");
		exit(EXIT_FAILURE);
	}

	// 0: reached end of file, 1: chunk is full and more input may follow.
	return chunkFull;
}
/*
 * Populates one Read from a half of a split input line.
 *
 * rd          slot to fill
 * seq, qual   NUL-terminated bases / qualities of this half (bufLen - 1 chars)
 * bufLen      per-field buffer length including the terminator
 * readNumber  index stored in rd->readNumber
 * pairId      number appended to the generated read name
 * mateSuffix  "/1" or "/2"; only appended when pairedEndMode is off
 *
 * One getMem() block backs hits, seq, rseq, qual and name (in that order);
 * hashValue and rhashValue get their own blocks.  The generated name is
 * written with snprintf() bounded by the space left in the block, so a long
 * mappingOutput is truncated instead of overflowing the allocation.
 */
static void fillSplitRead(Read *rd, char *seq, char *qual, int bufLen,
                          int readNumber, int pairId, const char *mateSuffix)
{
	char rseq[SEQ_MAX_LENGTH];
	const int blockSize = 1 + 3 * bufLen + 3 + bufLen;
	size_t nameCap;
	int i;

	rd->hits = getMem (blockSize, "list.hits @readAllReads()");
	rd->seq = rd->hits + 1;
	rd->rseq = rd->seq + bufLen + 1;
	rd->qual = rd->rseq + bufLen + 1;
	rd->name = rd->qual + bufLen + 1;
	rd->hashValue = getMem(sizeof(short) * bufLen, "list.hashValue @readAllReads()");
	rd->rhashValue = getMem(sizeof(short) * bufLen, "list.rhashValue @readAllReads()");
	rd->readNumber = readNumber;
	rd->hits[0] = 0;

	reverseComplement(seq, rseq, bufLen - 1);
	rseq[bufLen - 1] = '\0';

	for (i = 0; i < bufLen - 1; i++)
	{
		rd->seq[i] = seq[i];
		rd->rseq[i] = rseq[i];
		rd->qual[i] = qual[i];
	}
	rd->seq[bufLen - 1] = rd->rseq[bufLen - 1] = rd->qual[bufLen - 1] = '\0';

	// Bytes remaining in the shared block from the start of the name field.
	nameCap = (size_t)blockSize - (size_t)((char *)rd->name - (char *)rd->hits);
	if (!pairedEndMode)
		snprintf(rd->name, nameCap, "%s_%d%s", mappingOutput, pairId, mateSuffix);
	else
		snprintf(rd->name, nameCap, "%s_%d", mappingOutput, pairId);
}

/*
 * Reads up to ListSize reads from the already-opened input via readFirstSeq.
 *
 * Every data line is tokenised on tabs/spaces; the second token is taken as
 * the bases and the third as the qualities.  Each line is split into two
 * 35-character halves that become two consecutive Read entries.  Lines that
 * start with '#', '>', ' ', '\r' or '\n' are skipped.  A pair is discarded
 * when either half has more than errThreshold 'N' bases, or when the line is
 * missing a field or is too short to supply both halves.
 *
 * If errThreshold is 255 when the first accepted pair is seen, it is replaced
 * by 4% of cropSize (when cropSize > 0) or of the half length, rounded up.
 *
 * fileName1, fileName2, compressed, fastq  not used by this function
 * pairedEnd    when non-zero the discarded count is doubled for the report
 * seqList      receives the newly allocated list
 * seqListSize  receives the number of reads stored
 * ListSize     capacity of the list to allocate
 * AccListSize  offset added to the read index when numbering generated names
 *
 * Returns the number of reads stored, or 0 when none were read (seqList and
 * seqListSize are left untouched in that case).
 *
 * NOTE(review): the list allocated here is not released on the 0 path.
 * NOTE(review): readFirstSeq is called with one argument here but with two
 * in readChunk() -- confirm against its declaration.
 */
int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pairedEnd, Read **seqList, unsigned int *seqListSize, unsigned int ListSize, unsigned int AccListSize)
{
	double startTime = getTime();
	const int bufLen = 36;			// per-half buffer length incl. terminator
	const int halfLen = bufLen - 1;		// bases per half
	char *seq;
	char *qual;
	char seq1[SEQ_MAX_LENGTH];
	char qual1[SEQ_MAX_LENGTH];
	char seq2[SEQ_MAX_LENGTH];
	char qual2[SEQ_MAX_LENGTH];
	char dummy[SEQ_MAX_LENGTH];
	int discarded = 0;
	int seqCnt = 0;
	Read *list = NULL;
	int nCnt1;
	int nCnt2;
	int i;

	list = getMem(sizeof(Read)*ListSize, "list @readAllReads()");

	// Each accepted line fills two slots, so require room for both; with an
	// odd ListSize the last slot stays unused rather than being overrun.
	while ((unsigned int)seqCnt + 2 <= ListSize && readFirstSeq(dummy))
	{
		if (dummy[0] == '#' || dummy[0] == '>' || dummy[0] == ' ' || dummy[0] == '\r' || dummy[0] == '\n')
			continue;

		strtok(dummy, "\t ");
		seq = strtok(NULL, "\t ");
		qual = strtok(NULL, "\t ");

		// Malformed line: missing field or too short to supply both halves.
		if (seq == NULL || qual == NULL ||
		    strlen(seq) < (size_t)(2 * halfLen) || strlen(qual) < (size_t)(2 * halfLen))
		{
			discarded += 2;
			continue;
		}

		for (i = 0; i < halfLen; i++)
		{
			seq1[i] = toupper((unsigned char)seq[i]);
			qual1[i] = qual[i];
			seq2[i] = toupper((unsigned char)seq[i + halfLen]);
			qual2[i] = qual[i + halfLen];
		}
		seq1[halfLen] = seq2[halfLen] = qual1[halfLen] = qual2[halfLen] = '\0';

		nCnt1 = 0;
		nCnt2 = 0;
		for (i = 0; i < halfLen; i++)
		{
			if (seq1[i] == 'N')
				nCnt1++;
			if (seq2[i] == 'N')
				nCnt2++;
		}

		if (nCnt1 > errThreshold || nCnt2 > errThreshold)
		{
			discarded += 2;
			continue;
		}

		if (errThreshold == 255)
		{
			if (cropSize > 0)
			{
				errThreshold = (int) ceil(cropSize * 0.04);
				fprintf(stdout, "Sequence length: %d bp. Error threshold is set to %d bp.\n", cropSize, errThreshold);
			}
			else
			{
				errThreshold = (int) ceil((strlen(seq1)) * 0.04);
				fprintf(stdout, "Sequence length: %d bp. Error threshold is set to %d bp.\n", ((int)strlen(seq1)), errThreshold);
			}
			fprintf(stdout, "You can override this value using the -e parameter.\n");
		}

		// First half of the line.
		fillSplitRead(&list[seqCnt], seq1, qual1, bufLen, seqCnt,
		              (int)((seqCnt + AccListSize) / 2), "/1");
		seqCnt++;

		// Second half of the line.
		fillSplitRead(&list[seqCnt], seq2, qual2, bufLen, seqCnt,
		              (int)((seqCnt + AccListSize) / 2), "/2");
		seqCnt++;
	}

	if (seqCnt <= 0)
	{
		fprintf(stdout, "==== End of Input Reads ====\n");	// DHL: read slice
		return 0;
	}

	adjustQual(list, seqCnt);

	*seqList = list;
	*seqListSize = seqCnt;
	_r_seq = list;
	_r_seqCnt = seqCnt;

	if (pairedEnd)
		discarded *= 2;

	if (seqCnt > 1)
	{
		fprintf(stdout, "==== %d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M] ====\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
	}
	else
	{
		fprintf(stdout, "==== %d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M] ====\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
	}

	return seqCnt;
}
int initRead(char *fileName1, char *fileName2) { char dummy[SEQ_MAX_LENGTH]; char ch; int i, maxCnt=0; _r_buf1 = getMem(10000000); _r_buf1_pos = getMem(sizeof(int)); _r_buf1_size = getMem(sizeof(int)); *_r_buf1_size = *_r_buf1_pos = 0; if ( pairedEndMode && fileName2 != NULL ) { _r_buf2 = getMem(10000000); _r_buf2_pos = getMem(sizeof(int)); _r_buf2_size = getMem(sizeof(int)); } else { _r_buf2 = _r_buf1; _r_buf2_pos = _r_buf1_pos; _r_buf2_size = _r_buf1_size; } if (!seqCompressed) { _r_fp1 = fileOpen( fileName1, "r"); if (_r_fp1 == NULL) return 0; ch = fgetc(_r_fp1); if ( pairedEndMode) { if ( fileName2 == NULL ) { _r_fp2 = _r_fp1; } else { _r_fp2 = fileOpen ( fileName2, "r" ); if (_r_fp2 == NULL) return 0; } } readBuffer1 = &readBufferTxT1; readBuffer2 = &readBufferTxT2; } else { _r_gzfp1 = fileOpenGZ (fileName1, "r"); if (_r_gzfp1 == NULL) { return 0; } ch = gzgetc(_r_gzfp1); if ( pairedEndMode && fileName2 != NULL ) { _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); if (_r_gzfp2 == NULL) { return 0; } } else { _r_gzfp2 = _r_gzfp1; } readBuffer1 = &readBufferGZ1; readBuffer2 = &readBufferGZ2; } if (!seqCompressed) rewind(_r_fp1); else gzrewind(_r_gzfp1); if (ch == '>') _r_fastq = 0; else _r_fastq = 1; readFirstSeq(dummy,1); int nameLen = strlen(dummy); readFirstSeq(dummy,2); *_r_buf1_pos = 0; int seqLen = strlen(dummy); SEQ_LENGTH = 0; i = 0; while (i<seqLen && !isspace(dummy[i])) { i++; SEQ_LENGTH++; } if (cropSize > 0) SEQ_LENGTH = cropSize; if ( SEQ_LENGTH >= SEQ_MAX_LENGTH ) { fprintf(stdout, "ERR: Read Length is greater than the MAX length we can process (Current Max: %d).\n", SEQ_MAX_LENGTH); exit(EXIT_FAILURE); } if (_r_fastq) { QUAL_LENGTH = SEQ_LENGTH; } else { QUAL_LENGTH = 1; } CMP_SEQ_LENGTH = calculateCompressedLen(SEQ_LENGTH); //TODO MEMORY CALCULATION FIX double readMem = sizeof(Read) + (2 + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH * 2 * 8) + (nameLen+10) + 4); readMem += ((bestMappingMode) ?(sizeof(FullMappingInfo)) :0); if (pairedEndMode) 
readMem += sizeof(MappingInfo) + sizeof(MappingLocations); _r_maxSeqCnt = (int)(((MAX_MEMORY-1.2) * (1 << 30))/readMem); if ( pairedEndMode && _r_maxSeqCnt % 2 ) _r_maxSeqCnt ++; _r_maxSeqCnt -= _r_maxSeqCnt % THREAD_COUNT; //_r_maxSeqCnt = 500000; _r_seq = getMem(sizeof(Read)*_r_maxSeqCnt); int maxErrThreshold = (SEQ_LENGTH/WINDOW_SIZE) - 1; if (errThreshold == -1) { errThreshold = SEQ_LENGTH*6/100; fprintf(stdout, "# Errors: %d\n", errThreshold); } if (errThreshold > maxErrThreshold && SEQ_LENGTH>0) { errThreshold = maxErrThreshold; fprintf(stdout, "# Error: %d (full sensitivity)\n", errThreshold); } checkSumLength = (SEQ_LENGTH / (errThreshold+1)) - WINDOW_SIZE; if (checkSumLength > sizeof(CheckSumType)*4) checkSumLength = sizeof(CheckSumType)*4; calculateSamplingLocations(); if (!nohitDisabled) { _r_umfp = fileOpen(unmappedOutput, "w"); } _r_alphIndex = getMem(128); // used in readChunk() _r_alphIndex['A'] = 0; _r_alphIndex['C'] = 1; _r_alphIndex['G'] = 2; _r_alphIndex['T'] = 3; _r_alphIndex['N'] = 4; return 1; }