int countAllReads(char *fileName1, char *fileName2, int compressed, unsigned char pairedEnd) { char dummy[SEQ_MAX_LENGTH]; int maxCnt = 0; if (!compressed) { _r_fp1 = fileOpen( fileName1, "r"); if (_r_fp1 == NULL) return 0; if ( pairedEnd && fileName2 != NULL ) { _r_fp2 = fileOpen ( fileName2, "r" ); if (_r_fp2 == NULL) return 0; } else { _r_fp2 = _r_fp1; } readFirstSeq = &readFirstSeqTXT; readSecondSeq = &readSecondSeqTXT; } else { _r_gzfp1 = fileOpenGZ (fileName1, "r"); if (_r_gzfp1 == NULL) return 0; if ( pairedEnd && fileName2 != NULL ) { _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); if (_r_gzfp2 == NULL) return 0; } else { _r_gzfp2 = _r_gzfp1; } readFirstSeq = &readFirstSeqGZ; readSecondSeq = &readSecondSeqGZ; } // Counting the number of lines in the file while (readFirstSeq(dummy)) { if(dummy[0] != '#' && dummy[0]!='>' && dummy[0] != ' ' && dummy[0] != '\r' && dummy[0] != '\n') maxCnt++; } if (!compressed) rewind(_r_fp1); else gzrewind(_r_gzfp1); // Return the Maximum # of sequences return maxCnt * 2; }
int initOutput ( char *fileName, int compressed) { if (compressed) { char newFileName[strlen(mappingOutputPath)+strlen(fileName)+4]; sprintf(newFileName, "%s%s.sam.gz", mappingOutputPath, fileName); _out_gzfp = fileOpenGZ(newFileName, "w1f"); if (_out_gzfp == Z_NULL) { return 0; } finalizeOutput = &finalizeGZOutput; output = &gzOutputQ; outputMeta =&gzOutputMetaQ; outputBuffer = &outputBufferGZ; } else { char newFileName[strlen(mappingOutputPath)+strlen(fileName)+strlen(".sam")+1]; if ( !strcmp(mappingOutputPath, "/dev/") && !strcmp(fileName, "null") ) { sprintf(newFileName, "%s%s", mappingOutputPath, fileName); nohitDisabled = 1; } else { //sprintf(newFileName, "%s%s.sam", mappingOutputPath, fileName); sprintf(newFileName, "%s%s", mappingOutputPath, fileName); } _out_fp = fileOpen(newFileName, "w"); if (_out_fp == NULL) { return 0; } finalizeOutput = &finalizeTXOutput; output = &outputQ; outputMeta = &outputMetaQ; outputBuffer = &outputBufferTxT; } if (noSamHeader) outputMeta = &noMetaOutput; outputMeta("@HD\tVN:1.4\tSO:unsorted"); return 1; }
int initRead(char *fileName1, char *fileName2) { char dummy[SEQ_MAX_LENGTH]; char ch; int i, maxCnt=0; _r_buf1 = getMem(10000000); _r_buf1_pos = getMem(sizeof(int)); _r_buf1_size = getMem(sizeof(int)); *_r_buf1_size = *_r_buf1_pos = 0; if ( pairedEndMode && fileName2 != NULL ) { _r_buf2 = getMem(10000000); _r_buf2_pos = getMem(sizeof(int)); _r_buf2_size = getMem(sizeof(int)); } else { _r_buf2 = _r_buf1; _r_buf2_pos = _r_buf1_pos; _r_buf2_size = _r_buf1_size; } if (!seqCompressed) { _r_fp1 = fileOpen( fileName1, "r"); if (_r_fp1 == NULL) return 0; ch = fgetc(_r_fp1); if ( pairedEndMode) { if ( fileName2 == NULL ) { _r_fp2 = _r_fp1; } else { _r_fp2 = fileOpen ( fileName2, "r" ); if (_r_fp2 == NULL) return 0; } } readBuffer1 = &readBufferTxT1; readBuffer2 = &readBufferTxT2; } else { _r_gzfp1 = fileOpenGZ (fileName1, "r"); if (_r_gzfp1 == NULL) { return 0; } ch = gzgetc(_r_gzfp1); if ( pairedEndMode && fileName2 != NULL ) { _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); if (_r_gzfp2 == NULL) { return 0; } } else { _r_gzfp2 = _r_gzfp1; } readBuffer1 = &readBufferGZ1; readBuffer2 = &readBufferGZ2; } if (!seqCompressed) rewind(_r_fp1); else gzrewind(_r_gzfp1); if (ch == '>') _r_fastq = 0; else _r_fastq = 1; readFirstSeq(dummy,1); int nameLen = strlen(dummy); readFirstSeq(dummy,2); *_r_buf1_pos = 0; int seqLen = strlen(dummy); SEQ_LENGTH = 0; i = 0; while (i<seqLen && !isspace(dummy[i])) { i++; SEQ_LENGTH++; } if (cropSize > 0) SEQ_LENGTH = cropSize; if ( SEQ_LENGTH >= SEQ_MAX_LENGTH ) { fprintf(stdout, "ERR: Read Length is greater than the MAX length we can process (Current Max: %d).\n", SEQ_MAX_LENGTH); exit(EXIT_FAILURE); } if (_r_fastq) { QUAL_LENGTH = SEQ_LENGTH; } else { QUAL_LENGTH = 1; } CMP_SEQ_LENGTH = calculateCompressedLen(SEQ_LENGTH); //TODO MEMORY CALCULATION FIX double readMem = sizeof(Read) + (2 + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH * 2 * 8) + (nameLen+10) + 4); readMem += ((bestMappingMode) ?(sizeof(FullMappingInfo)) :0); if (pairedEndMode) readMem += sizeof(MappingInfo) + sizeof(MappingLocations); _r_maxSeqCnt = (int)(((MAX_MEMORY-1.2) * (1 << 30))/readMem); if ( pairedEndMode && _r_maxSeqCnt % 2 ) _r_maxSeqCnt ++; _r_maxSeqCnt -= _r_maxSeqCnt % THREAD_COUNT; //_r_maxSeqCnt = 500000; _r_seq = getMem(sizeof(Read)*_r_maxSeqCnt); int maxErrThreshold = (SEQ_LENGTH/WINDOW_SIZE) - 1; if (errThreshold == -1) { errThreshold = SEQ_LENGTH*6/100; fprintf(stdout, "# Errors: %d\n", errThreshold); } if (errThreshold > maxErrThreshold && SEQ_LENGTH>0) { errThreshold = maxErrThreshold; fprintf(stdout, "# Error: %d (full sensitivity)\n", errThreshold); } checkSumLength = (SEQ_LENGTH / (errThreshold+1)) - WINDOW_SIZE; if (checkSumLength > sizeof(CheckSumType)*4) checkSumLength = sizeof(CheckSumType)*4; calculateSamplingLocations(); if (!nohitDisabled) { _r_umfp = fileOpen(unmappedOutput, "w"); } _r_alphIndex = getMem(128); // used in readChunk() _r_alphIndex['A'] = 0; _r_alphIndex['C'] = 1; _r_alphIndex['G'] = 2; _r_alphIndex['T'] = 3; _r_alphIndex['N'] = 4; return 1; }
int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pairedEnd, Read **seqList, unsigned int *seqListSize) { double startTime=getTime(); char seq1[SEQ_MAX_LENGTH]; char rseq1[SEQ_MAX_LENGTH]; char name1[SEQ_MAX_LENGTH]; char qual1[SEQ_MAX_LENGTH]; char seq2[SEQ_MAX_LENGTH]; char rseq2[SEQ_MAX_LENGTH]; char name2[SEQ_MAX_LENGTH]; char qual2[SEQ_MAX_LENGTH]; char dummy[SEQ_MAX_LENGTH]; char ch; int err1, err2; int nCnt; int discarded = 0; int seqCnt = 0; int maxCnt = 0; int i; Read *list = NULL; // new vars char * BUFF_1 = NULL; char * BUFF_2 = NULL; unsigned long int BUFF_1_pos = 0; unsigned long int BUFF_2_pos = 0; char READ_BUFFER[SEQ_MAX_LENGTH]; size_t read_len; size_t curr_pos; void * _r_fp1; void * _r_fp2; if (!compressed) { _r_fp1 = fileOpen( fileName1, "r"); if (_r_fp1 == NULL) return 0; if ( pairedEnd && fileName2 != NULL ){ _r_fp2 = fileOpen ( fileName2, "r" ); if (_r_fp2 == NULL) return 0; } else{ _r_fp2 = _r_fp1; } readSeq = &readline_TXT; } else{ _r_fp1 = fileOpenGZ (fileName1, "r"); if (_r_fp1 == NULL){ return 0; } if ( pairedEnd && fileName2 != NULL ){ _r_fp2 = fileOpenGZ ( fileName2, "r" ); if (_r_fp2 == NULL) return 0; } else{ _r_fp2 = _r_fp1; } readSeq = &readline_GZ; } //READ INTO 1 or 2 buffers ///READ IN read 1 into the buffer curr_pos=0; while (readSeq(_r_fp1,READ_BUFFER)){ read_len = strlen(READ_BUFFER); BUFF_1=(char*)(realloc(BUFF_1,curr_pos+read_len)); //strcpy(&BUFF_1[curr_pos],READ_BUFFER); memcpy(&BUFF_1[curr_pos],READ_BUFFER,read_len); curr_pos+=read_len; maxCnt++; //printf("%s",READ_BUFFER); } if (pairedEnd){ curr_pos=0; while (readSeq(_r_fp2,READ_BUFFER)){ read_len = strlen(READ_BUFFER); BUFF_2=(char*)(realloc(BUFF_2,curr_pos+read_len)); //strcpy(&BUFF_2[curr_pos],READ_BUFFER); memcpy(&BUFF_1[curr_pos],READ_BUFFER,read_len); curr_pos+=read_len; } } //printf("%s",BUFF_1); //exit(1); printf("read in complete\n"); printf("%d lines\n",maxCnt); //printf("%s",BUFF_1); if (BUFF_1[0] == '>') *fastq = 0; else *fastq = 1; // Counting the number of lines in the file //while (readSeq(dummy)) maxCnt++; if (!compressed){ fclose(_r_fp1); if (pairedEnd) fclose(_r_fp2); }else{ gzclose(_r_fp1); if (pairedEnd) gzclose(_r_fp2); } ///AFTER HERE, no changes except reads in from stream // Calculating the Maximum # of sequences if (*fastq) { maxCnt /= 4; } else { maxCnt /= 2; } if (pairedEnd && fileName2 != NULL ) maxCnt *= 2; list = getMem(sizeof(Read)*maxCnt); //while( readSeq(name1) ) while(scanBUF(BUFF_1,name1,&BUFF_1_pos)==1) { err1 = 0; err2 = 0; //readSeq(seq1); scanBUF(BUFF_1,seq1,&BUFF_1_pos); name1[strlen(name1)-1] = '\0'; for (i=0; i<strlen(name1);i++) { if (name1[i] == ' ') { name1[i] = '\0'; break; } } if ( *fastq ) { scanBUF(BUFF_1,dummy,&BUFF_1_pos); scanBUF(BUFF_1,qual1,&BUFF_1_pos); //readSeq(dummy); //readSeq(qual1); qual1[strlen(qual1)-1] = '\0'; } else { sprintf(qual1, "*"); } // Cropping if (cropSize > 0) { seq1[cropSize] = '\0'; if ( *fastq ) qual1[cropSize] = '\0'; } nCnt = 0; for (i=0; i<strlen(seq1); i++) { seq1[i] = toupper (seq1[i]); if (seq1[i] == 'N') { nCnt++; } else if (isspace(seq1[i])) { seq1[i] = '\0'; break; } } if (nCnt > errThreshold) { err1 = 1; } // Reading the second seq of pair-ends if (pairedEnd) { scanBUF(BUFF_2,name2,&BUFF_2_pos); scanBUF(BUFF_2,seq2,&BUFF_2_pos); //readSeq(name2); //readSeq(seq2); name2[strlen(name2)-1] = '\0'; for (i=0; i<strlen(name2);i++) { if (name2[i] == ' ') { name2[i] = '\0'; break; } } if ( *fastq ) { //readSeq(dummy); //readSeq(qual2); scanBUF(BUFF_2,dummy,&BUFF_2_pos); scanBUF(BUFF_2,qual2,&BUFF_2_pos); qual2[strlen(qual2)-1] = '\0'; } else { sprintf(qual2, "*"); } // Cropping if (cropSize > 0) { seq2[cropSize] = '\0'; if ( *fastq ) qual2[cropSize] = '\0'; } nCnt = 0; for (i=0; i<strlen(seq2); i++) { seq2[i] = toupper (seq2[i]); if (seq2[i] == 'N') { nCnt++; } else if (isspace(seq2[i])) { seq2[i] = '\0'; } } if (nCnt > errThreshold) { err2 = 1; } } if (!pairedEnd && !err1) { int _mtmp = strlen(seq1); list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); list[seqCnt].seq = list[seqCnt].hits + 1; list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; list[seqCnt].name = list[seqCnt].qual + _mtmp+1; reverseComplete(seq1, rseq1, _mtmp); int i; list[seqCnt].hits[0] = 0; for (i=0; i<=_mtmp; i++) { list[seqCnt].seq[i] = seq1[i]; list[seqCnt].rseq[i] = rseq1[i] ; list[seqCnt].qual[i] = qual1[i]; } list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0'; sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); seqCnt++; } else if (pairedEnd && !err1 && !err2) { // Naming Conventions X/1, X/2 OR X int tmplen = strlen(name1); if (strcmp(name1, name2) != 0) { tmplen = strlen(name1)-2; } //first seq int _mtmp = strlen(seq1); list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); list[seqCnt].seq = list[seqCnt].hits + 1; list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; list[seqCnt].name = list[seqCnt].qual + _mtmp+1; reverseComplete(seq1, rseq1, _mtmp); int i; list[seqCnt].hits[0] = 0; for (i=0; i<=_mtmp; i++) { list[seqCnt].seq[i] = seq1[i]; list[seqCnt].rseq[i] = rseq1[i] ; list[seqCnt].qual[i] = qual1[i]; } name1[tmplen]='\0'; list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0'; sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); seqCnt++; //second seq list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); list[seqCnt].seq = list[seqCnt].hits + 1; list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; list[seqCnt].name = list[seqCnt].qual + _mtmp+1; reverseComplete(seq2, rseq2, _mtmp); list[seqCnt].hits[0] = 0; for (i=0; i<=_mtmp; i++) { list[seqCnt].seq[i] = seq2[i]; list[seqCnt].rseq[i] = rseq2[i]; list[seqCnt].qual[i] = qual2[i]; } name2[tmplen]='\0'; list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0'; sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); seqCnt++; } else { discarded++; } } if (seqCnt > 0) { QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq); if (! *fastq) { QUAL_LENGTH = 1; } //fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH); } else { fprintf(stdout, "ERR: No reads can be found for mapping, %d discarded\n", discarded); return 1; } if (pairedEnd) { // seqCnt /= 2; } *seqList = list; *seqListSize = seqCnt; _r_seq = list; _r_seqCnt = seqCnt; free(BUFF_1); if (pairedEnd) free(BUFF_2); fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); //totalLoadingTime+=getTime()-startTime; /* fprintf(stdout,"HERE\n"); int j; for (j=0;j<maxCnt;j++){ fprintf(stdout,"%s\n",list[j].seq); } exit(1); */ return 1; }