/*
 * readAllReads (ZeroMQ variant)
 *
 * Pulls reads from the ventilator socket named by the global `ventNode`,
 * stores every usable read in a freshly allocated Read array and then
 * reports the totals to the controller over the global `requester` socket.
 *
 * Wire format, as consumed by this function:
 *   - first message : decimal number of reads that will follow (capacity);
 *   - read messages : "<name> <sequence>", separated by a single space;
 *   - terminator    : a message whose first token is exactly "DONE".
 *
 * Parameters:
 *   fileName1, fileName2, compressed, pairedEnd
 *                - not used by this variant; kept for interface compatibility.
 *   fastq        - out: always set to 0 (no quality strings are transferred).
 *   seqList      - out: the array of stored reads (only set on success paths
 *                  that found at least one read).
 *   seqListSize  - out: number of stored reads.
 *
 * Globals read   : context, ventNode, requester, subsocket, mapperID,
 *                  errThreshold.
 * Globals written: SEQ_LENGTH, QUAL_LENGTH, _r_seq, _r_seqCnt.
 *
 * Returns 1 when the controller lets the mapper proceed, 0 when the socket
 * could not be set up, no read was stored, or the controller asked for a
 * restart.
 */
int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pairedEnd, Read **seqList, unsigned int *seqListSize)
{
	double startTime = getTime();
	char seq1[SEQ_MAX_LENGTH];
	char rseq1[SEQ_MAX_LENGTH];
	char name1[SEQ_MAX_LENGTH];
	char out_msg[255];
	char killsig[255];
	int discarded = 0;
	int seqCnt = 0;
	int maxCnt = 0;
	int recvdCnt = 0;
	int nCnt;
	int i;
	Read *list = NULL;
	void *readreceiver;
	char *message;
	char *msg;

	/* These parameters only matter for the file based loader. */
	(void)fileName1;
	(void)fileName2;
	(void)compressed;
	(void)pairedEnd;

	// connect pair socket
	printf("attempting to connect to ventilator @ %s\n", ventNode);
	readreceiver = zmq_socket(context, ZMQ_PULL);
	if (readreceiver == NULL) {
		fprintf(stdout, "ERR: could not create the read receiver socket\n");
		return 0;
	}
	if (zmq_connect(readreceiver, ventNode) != 0) {
		fprintf(stdout, "ERR: could not connect to ventilator @ %s\n", ventNode);
		zmq_close(readreceiver);
		return 0;
	}

	/* The first message announces how many reads will be sent. */
	message = s_recv(readreceiver);
	if (message == NULL) {
		fprintf(stdout, "ERR: no read count received from ventilator\n");
		zmq_close(readreceiver);
		return 0;
	}
	maxCnt = atoi(message);
	if (maxCnt < 0) {
		maxCnt = 0;	/* a negative count must not reach the allocator */
	}
	printf("... succesfull!\n");
	printf("ready to receive %d reads\n", maxCnt);
	free(message);

	*fastq = 0;
	list = getMem(sizeof(Read) * maxCnt);

	// start receiving
	for (;;) {
		char *packet = s_recv(readreceiver);
		char *nameTok;
		char *seqTok;
		int isDone;
		int usable;

		if (packet == NULL) {
			/* Treat a failed receive as end of stream instead of crashing. */
			fprintf(stdout, "ERR: receiving a read from the ventilator failed\n");
			break;
		}

		nameTok = strtok(packet, " ");
		seqTok = strtok(NULL, " ");
		isDone = (nameTok != NULL && strcmp(nameTok, "DONE") == 0);
		usable = (!isDone && nameTok != NULL && seqTok != NULL);
		if (usable) {
			/* Never use received data as a format string; bound the copies. */
			snprintf(name1, sizeof name1, "%s", nameTok);
			snprintf(seq1, sizeof seq1, "%s", seqTok);
		}
		free(packet);

		if (isDone) {
			break;	// get out of loop if the last read is received
		}
		recvdCnt++;

		if (!usable) {
			/* Message without both a name and a sequence. */
			discarded++;
			continue;
		}

		/* Cut both fields at the first newline, if any. */
		name1[strcspn(name1, "\n")] = '\0';
		seq1[strcspn(seq1, "\n")] = '\0';

		// count all the Ns in the sequence
		nCnt = 0;
		for (i = 0; seq1[i] != '\0'; i++) {
			if (seq1[i] == 'N') {
				nCnt++;
			}
		}

		if (nCnt > errThreshold || seqCnt >= maxCnt) {
			/* Too many Ns, or more reads than the ventilator announced. */
			discarded++;
			continue;
		}

		{
			int _mtmp = strlen(seq1);
			/* The stored name drops the leading marker character. */
			const char *bareName = (name1[0] != '\0') ? name1 + 1 : name1;

			/* One allocation holds: hits flag | seq | rseq | qual | name. */
			list[seqCnt].hits = getMem(1 + 3 * _mtmp + 3 + strlen(name1) + 1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp + 1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp + 1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp + 1;

			reverseComplete(seq1, rseq1, _mtmp);
			rseq1[_mtmp] = '\0';

			list[seqCnt].hits[0] = 0;
			memcpy(list[seqCnt].seq, seq1, (size_t)_mtmp + 1);
			memcpy(list[seqCnt].rseq, rseq1, (size_t)_mtmp + 1);

			/* Qualities are ignored for now: store "*" and zero the rest. */
			memset(list[seqCnt].qual, '\0', (size_t)_mtmp + 1);
			if (_mtmp > 0) {
				list[seqCnt].qual[0] = '*';
			}

			strcpy(list[seqCnt].name, bareName);
			seqCnt++;
		}
	}

	// CLOSE PAIR SOCKET
	zmq_close(readreceiver);

	if (seqCnt > 0) {
		QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
		if (!*fastq) {
			QUAL_LENGTH = 1;
		}
	} else {
		fprintf(stdout, "ERR: No reads can be found for mapping\n");
		fprintf(stdout, "recvdCnt: %d\n", recvdCnt);
		fprintf(stdout, "seqCnt: %d\n", seqCnt);
		fprintf(stdout, "discarded: %d\n", discarded);
		// tell controller that no reads were received!
		snprintf(out_msg, sizeof out_msg, "MAPPER INPUT %s %d %d", mapperID, seqCnt, discarded);
		s_send(requester, out_msg);
		/* The reply carries no information here, but it must still be released. */
		free(s_recv(requester));
		/* NOTE(review): `list` is not released here because the deallocator
		 * matching getMem is not visible from this function. */
		return 0;
	}

	*seqList = list;
	*seqListSize = seqCnt;
	_r_seq = list;
	_r_seqCnt = seqCnt;

	fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime() - startTime), discarded, getMemUsage());

	snprintf(out_msg, sizeof out_msg, "MAPPER INPUT %s %d %d", mapperID, seqCnt, discarded);
	s_send(requester, out_msg);

	msg = s_recv(requester);
	if (msg == NULL) {
		fprintf(stdout, "ERR: no reply received from controller\n");
		return 0;
	}

	if (strcmp(msg, "RESTART") == 0) {
		printf("RESTART received from controller. Terminating!\n");
		free(msg);
		return 0;	// terminate the mapper
	}

	if (strcmp(msg, "WAIT") == 0) {
		// wait for appropriate signal on subsocket. not using request-reply socket so that system does not hang!
		char *msg2;

		printf("WAIT received from controller via subsocket.\n");
		msg2 = s_recv(subsocket);
		snprintf(killsig, sizeof killsig, "%s RESTART", mapperID);
		if (msg2 != NULL && strcmp(msg2, killsig) == 0) {
			free(msg2);
			free(msg);
			printf("RESTART received from controller via subsocket. Terminating!\n");
			return 0;	// terminate the mapper
		}
		free(msg2);
	}

	/* Any other reply is treated as permission to start. */
	free(msg);
	printf("GO received from controller. STARTING MAPPING!\n");
	return 1;
}
/* Appends `line` to the heap buffer *buf (current length *len) and keeps the
 * buffer NUL-terminated. Returns 0 on success, -1 when realloc fails (in
 * which case *buf is still valid and unchanged). */
static int bufRead_append(char **buf, size_t *len, const char *line)
{
	const size_t add = strlen(line);
	char *grown = realloc(*buf, *len + add + 1);

	if (grown == NULL) {
		return -1;
	}
	memcpy(grown + *len, line, add + 1);
	*buf = grown;
	*len += add;
	return 0;
}

/* Removes the last character of `s`; does nothing for an empty string. */
static void bufRead_dropLast(char *s)
{
	const size_t n = strlen(s);

	if (n > 0) {
		s[n - 1] = '\0';
	}
}

/* Drops the last character of a header line and cuts it at the first space. */
static void bufRead_trimName(char *name)
{
	char *space;

	bufRead_dropLast(name);
	space = strchr(name, ' ');
	if (space != NULL) {
		*space = '\0';
	}
}

/* Upper-cases `seq` in place, cuts it at the first whitespace character and
 * returns how many 'N' characters were seen before that point. */
static int bufRead_normalizeSeq(char *seq)
{
	int nCount = 0;
	size_t k;

	for (k = 0; seq[k] != '\0'; k++) {
		seq[k] = (char)toupper((unsigned char)seq[k]);
		if (seq[k] == 'N') {
			nCount++;
		} else if (isspace((unsigned char)seq[k])) {
			seq[k] = '\0';
			break;
		}
	}
	return nCount;
}

/* Copies at most `len` characters of `src` into `dst` and zero-fills the
 * remainder, so that dst[0..len] is fully initialised and dst[len] == 0. */
static void bufRead_copyPadded(char *dst, const char *src, size_t len)
{
	size_t n = strlen(src);

	if (n > len) {
		n = len;
	}
	memcpy(dst, src, n);
	memset(dst + n, '\0', len + 1 - n);
}

/*
 * readAllReads (file variant)
 *
 * Loads the whole input (plain text or gzip, selected by `compressed`) into
 * memory, then parses it record by record with scanBUF and stores every
 * usable read in a freshly allocated Read array.
 *
 * Parameters:
 *   fileName1   - first (or only) input file.
 *   fileName2   - second mate file, or NULL. With pairedEnd set and
 *                 fileName2 == NULL, both mates are taken alternately from
 *                 fileName1.
 *   compressed  - non-zero: open with fileOpenGZ / read with readline_GZ.
 *   fastq       - out: 0 when the first byte of the input is '>', 1 otherwise.
 *   pairedEnd   - non-zero: reads are stored as consecutive mate pairs.
 *   seqList     - out: array of stored reads.
 *   seqListSize - out: number of stored reads.
 *
 * Globals read   : cropSize, errThreshold.
 * Globals written: readSeq, SEQ_LENGTH, QUAL_LENGTH, _r_seq, _r_seqCnt.
 *
 * Returns 0 when a file cannot be opened or memory runs out while loading;
 * 1 otherwise. NOTE(review): the "no reads" case also returns 1 without
 * setting seqList/seqListSize, exactly as before - confirm callers expect it.
 */
int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pairedEnd, Read **seqList, unsigned int *seqListSize)
{
	double startTime = getTime();
	char seq1[SEQ_MAX_LENGTH];
	char rseq1[SEQ_MAX_LENGTH];
	char name1[SEQ_MAX_LENGTH];
	char qual1[SEQ_MAX_LENGTH];
	char seq2[SEQ_MAX_LENGTH];
	char rseq2[SEQ_MAX_LENGTH];
	char name2[SEQ_MAX_LENGTH];
	char qual2[SEQ_MAX_LENGTH];
	char dummy[SEQ_MAX_LENGTH];
	char READ_BUFFER[SEQ_MAX_LENGTH];
	int discarded = 0;
	int seqCnt = 0;
	int maxCnt = 0;
	int outOfMemory = 0;
	Read *list = NULL;

	char *BUFF_1 = NULL;
	char *BUFF_2 = NULL;
	size_t buff1Len = 0;
	size_t buff2Len = 0;
	unsigned long int BUFF_1_pos = 0;
	unsigned long int BUFF_2_pos = 0;
	char *mateBuf;
	unsigned long int *matePos;

	void *_r_fp1 = NULL;
	void *_r_fp2 = NULL;

	/* A second handle exists only when a separate mate file was given. */
	const int twoFiles = (pairedEnd && fileName2 != NULL);
	const int nMates = pairedEnd ? 2 : 1;

	if (!compressed) {
		_r_fp1 = fileOpen(fileName1, "r");
		if (_r_fp1 == NULL) {
			return 0;
		}
		if (twoFiles) {
			_r_fp2 = fileOpen(fileName2, "r");
			if (_r_fp2 == NULL) {
				fclose(_r_fp1);	/* do not leak the first handle */
				return 0;
			}
		}
		readSeq = &readline_TXT;
	} else {
		_r_fp1 = fileOpenGZ(fileName1, "r");
		if (_r_fp1 == NULL) {
			return 0;
		}
		if (twoFiles) {
			_r_fp2 = fileOpenGZ(fileName2, "r");
			if (_r_fp2 == NULL) {
				gzclose(_r_fp1);
				return 0;
			}
		}
		readSeq = &readline_GZ;
	}

	/* Slurp file 1; its line count later bounds the number of reads. */
	while (readSeq(_r_fp1, READ_BUFFER)) {
		if (bufRead_append(&BUFF_1, &buff1Len, READ_BUFFER) != 0) {
			outOfMemory = 1;
			break;
		}
		maxCnt++;
	}

	/* Slurp file 2 into its own buffer. */
	if (!outOfMemory && twoFiles) {
		while (readSeq(_r_fp2, READ_BUFFER)) {
			if (bufRead_append(&BUFF_2, &buff2Len, READ_BUFFER) != 0) {
				outOfMemory = 1;
				break;
			}
		}
	}

	/* Each handle is closed exactly once. */
	if (!compressed) {
		fclose(_r_fp1);
		if (twoFiles) {
			fclose(_r_fp2);
		}
	} else {
		gzclose(_r_fp1);
		if (twoFiles) {
			gzclose(_r_fp2);
		}
	}

	if (outOfMemory) {
		fprintf(stdout, "ERR: out of memory while loading reads\n");
		free(BUFF_1);
		free(BUFF_2);
		return 0;
	}

	printf("read in complete\n");
	printf("%d lines\n", maxCnt);

	/* Empty input: nothing to parse, and BUFF_x must not be dereferenced. */
	if (BUFF_1 == NULL || (twoFiles && BUFF_2 == NULL)) {
		fprintf(stdout, "ERR: No reads can be found for mapping, %d discarded\n", discarded);
		free(BUFF_1);
		free(BUFF_2);
		return 1;
	}

	if (BUFF_1[0] == '>') {
		*fastq = 0;
	} else {
		*fastq = 1;
	}

	// Calculating the Maximum # of sequences
	if (*fastq) {
		maxCnt /= 4;
	} else {
		maxCnt /= 2;
	}
	if (twoFiles) {
		maxCnt *= 2;
	}

	list = getMem(sizeof(Read) * maxCnt);

	/* With a single interleaved file the second mate is simply the next
	 * record of the same buffer, so it shares buffer and cursor. */
	mateBuf = twoFiles ? BUFF_2 : BUFF_1;
	matePos = twoFiles ? &BUFF_2_pos : &BUFF_1_pos;

	while (scanBUF(BUFF_1, name1, &BUFF_1_pos) == 1) {
		int err1 = 0;
		int err2 = 0;
		int m;
		size_t nameCap;
		char *mateSeq[2];
		char *mateRseq[2];
		char *mateQual[2];
		char *mateName[2];

		scanBUF(BUFF_1, seq1, &BUFF_1_pos);
		/* presumably scanBUF keeps the line terminator, which is dropped here */
		bufRead_trimName(name1);

		if (*fastq) {
			scanBUF(BUFF_1, dummy, &BUFF_1_pos);
			scanBUF(BUFF_1, qual1, &BUFF_1_pos);
			bufRead_dropLast(qual1);
		} else {
			strcpy(qual1, "*");
		}

		// Cropping
		if (cropSize > 0 && cropSize < SEQ_MAX_LENGTH) {
			seq1[cropSize] = '\0';
			if (*fastq) {
				qual1[cropSize] = '\0';
			}
		}

		if (bufRead_normalizeSeq(seq1) > errThreshold) {
			err1 = 1;
		}

		// Reading the second seq of pair-ends
		if (pairedEnd) {
			scanBUF(mateBuf, name2, matePos);
			scanBUF(mateBuf, seq2, matePos);
			bufRead_trimName(name2);

			if (*fastq) {
				scanBUF(mateBuf, dummy, matePos);
				scanBUF(mateBuf, qual2, matePos);
				bufRead_dropLast(qual2);
			} else {
				strcpy(qual2, "*");
			}

			// Cropping
			if (cropSize > 0 && cropSize < SEQ_MAX_LENGTH) {
				seq2[cropSize] = '\0';
				if (*fastq) {
					qual2[cropSize] = '\0';
				}
			}

			if (bufRead_normalizeSeq(seq2) > errThreshold) {
				err2 = 1;
			}
		}

		/* Reject bad reads, and never write past the end of list[]. */
		if (err1 || (pairedEnd && err2) || seqCnt + nMates > maxCnt) {
			discarded++;
			continue;
		}

		// Naming Conventions X/1, X/2 OR X
		nameCap = strlen(name1);
		if (pairedEnd && strcmp(name1, name2) != 0) {
			/* strip the two-character mate suffix */
			nameCap = (nameCap >= 2) ? nameCap - 2 : 0;
		}

		mateSeq[0] = seq1;   mateSeq[1] = seq2;
		mateRseq[0] = rseq1; mateRseq[1] = rseq2;
		mateQual[0] = qual1; mateQual[1] = qual2;
		mateName[0] = name1; mateName[1] = name2;

		for (m = 0; m < nMates; m++) {
			/* Each mate uses its own length, not the first mate's. */
			const int _mtmp = (int)strlen(mateSeq[m]);
			const char *bareName;

			/* One allocation holds: hits flag | seq | rseq | qual | name. */
			list[seqCnt].hits = getMem(1 + 3 * _mtmp + 3 + nameCap + 1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp + 1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp + 1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp + 1;

			reverseComplete(mateSeq[m], mateRseq[m], _mtmp);

			list[seqCnt].hits[0] = 0;
			memcpy(list[seqCnt].seq, mateSeq[m], (size_t)_mtmp + 1);
			memcpy(list[seqCnt].rseq, mateRseq[m], (size_t)_mtmp);
			list[seqCnt].rseq[_mtmp] = '\0';
			bufRead_copyPadded(list[seqCnt].qual, mateQual[m], (size_t)_mtmp);

			/* Limit the name to nameCap, then store it without the
			 * leading marker character. */
			mateName[m][nameCap] = '\0';
			bareName = (mateName[m][0] != '\0') ? mateName[m] + 1 : mateName[m];
			strcpy(list[seqCnt].name, bareName);

			seqCnt++;
		}
	}

	if (seqCnt > 0) {
		QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
		if (!*fastq) {
			QUAL_LENGTH = 1;
		}
	} else {
		fprintf(stdout, "ERR: No reads can be found for mapping, %d discarded\n", discarded);
		free(BUFF_1);
		free(BUFF_2);
		/* NOTE(review): `list` is not released here because the deallocator
		 * matching getMem is not visible from this function. */
		return 1;
	}

	*seqList = list;
	*seqListSize = seqCnt;
	_r_seq = list;
	_r_seqCnt = seqCnt;

	free(BUFF_1);
	free(BUFF_2);

	fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime() - startTime), discarded, getMemUsage());

	return 1;
}