Esempio n. 1
0
/*
 * Receives this mapper's reads from the ventilator over a ZeroMQ PULL socket,
 * stores the usable ones in a newly allocated Read array and then reports the
 * result to the controller.
 *
 * Protocol as implemented below:
 *   - the first message is the number of reads to expect (decimal text);
 *   - every following message is "<name> <sequence>";
 *   - a message whose name token is "DONE" ends the stream.
 *
 * fileName1, fileName2, compressed and pairedEnd are not used by this
 * implementation.
 *
 * fastq        is always set to 0 (qualities are ignored, each read gets "*").
 * seqList      receives the array of stored reads (only when reads were stored).
 * seqListSize  receives the number of stored reads (only when reads were stored).
 *
 * Returns 1 when reads were loaded and the controller did not ask for a
 * restart; returns 0 when no read could be stored, when the socket or the
 * controller could not be reached, or when the controller sent RESTART.
 */
int readAllReads(char *fileName1,
						char *fileName2,
						int compressed,
						unsigned char *fastq,
						unsigned char pairedEnd,
						Read **seqList,
						unsigned int *seqListSize)
{
	double startTime=getTime();

	char seq1[SEQ_MAX_LENGTH];
	char rseq1[SEQ_MAX_LENGTH];
	char name1[SEQ_MAX_LENGTH];
	char out_msg[255];
	char killsig[255];
	int discarded = 0;
	int seqCnt = 0;
	int maxCnt = 0;
	int recvdCnt = 0;
	int overflowWarned = 0;
	int i;
	Read *list = NULL;

	(void)fileName1;
	(void)fileName2;
	(void)compressed;
	(void)pairedEnd;

	// connect pair socket
	printf("attempting to connect to ventilator @ %s\n", ventNode);

	void *readreceiver = zmq_socket (context, ZMQ_PULL);
	if (readreceiver == NULL)
	{
		fprintf(stdout, "ERR: could not create the socket for receiving reads\n");
		return 0;
	}
	if (zmq_connect (readreceiver, ventNode) != 0)
	{
		fprintf(stdout, "ERR: could not connect to ventilator @ %s\n", ventNode);
		zmq_close (readreceiver);
		return 0;
	}

	// the first message announces how many reads will follow
	char *countMsg = s_recv (readreceiver);
	if (countMsg == NULL)
	{
		fprintf(stdout, "ERR: no read count received from ventilator\n");
		zmq_close (readreceiver);
		return 0;
	}
	maxCnt = atoi(countMsg);
	free (countMsg);
	if (maxCnt < 0)
	{
		maxCnt = 0;
	}
	printf("... succesfull!\n");
	printf("ready to receive %d reads\n", maxCnt);

	*fastq = 0;

	list = getMem(sizeof(Read)*maxCnt);

	// start receiving
	while (1)
	{
		char *message = s_recv (readreceiver);
		if (message == NULL)
		{
			fprintf(stdout, "ERR: receiving a read failed, stopping early\n");
			break;
		}

		// Either token may be missing; copy with "%s" and a size bound so the
		// received text is never interpreted as a format string and cannot
		// overrun the local buffers.
		char *nameTok = strtok (message, " ");
		char *seqTok = strtok (NULL, " ");
		snprintf(name1, sizeof name1, "%s", nameTok != NULL ? nameTok : "");
		snprintf(seq1, sizeof seq1, "%s", seqTok != NULL ? seqTok : "");
		free(message);

		if (strcmp(name1,"DONE")==0){ break;} // get out of loop if the last read is received

		recvdCnt++;

		// cut both strings at the first line break
		name1[strcspn(name1, "\n")] = '\0';
		seq1[strcspn(seq1, "\n")] = '\0';

		// count all the Ns in the sequence
		int len = (int)strlen(seq1);
		int nCnt = 0;
		for (i=0; i<len; i++)
		{
			if (seq1[i] == 'N') {nCnt++;}
		}

		// malformed message or sequence with too many Ns in it
		if (name1[0] == '\0' || len == 0 || nCnt > errThreshold)
		{
			discarded++;
			continue;
		}

		// never write past the array sized from the announced count
		if (seqCnt >= maxCnt)
		{
			if (!overflowWarned)
			{
				fprintf(stdout, "ERR: more reads received than the %d announced, extra reads are discarded\n", maxCnt);
				overflowWarned = 1;
			}
			discarded++;
			continue;
		}

		// One block per read: [hits byte][seq\0][rseq\0][qual\0][name\0]
		Read *r = &list[seqCnt];
		r->hits = getMem (1+3*len+3+strlen(name1)+1);
		r->seq = r->hits + 1;
		r->rseq = r->seq + len+1;
		r->qual = r->rseq + len+1;
		r->name = r->qual + len+1;

		reverseComplete(seq1, rseq1, len);

		r->hits[0] = 0;
		memcpy(r->seq, seq1, len+1);
		memcpy(r->rseq, rseq1, len);
		r->rseq[len] = '\0';
		memset(r->qual, 0, len+1);	// ignore all quality for now: "*" followed by zeros
		r->qual[0] = '*';
		strcpy(r->name, name1+1);	// the first character of the received name is not stored

		seqCnt++;
	}

	// CLOSE PAIR SOCKET
	zmq_close (readreceiver);

	snprintf (out_msg, sizeof out_msg, "MAPPER INPUT %s %d %d",mapperID, seqCnt, discarded);

	if (seqCnt == 0)
	{
		fprintf(stdout, "ERR: No reads can be found for mapping\n");
		fprintf(stdout, "recvdCnt: %d\n", recvdCnt);
		fprintf(stdout, "seqCnt: %d\n", seqCnt);
		fprintf(stdout, "discarded: %d\n", discarded);
		// tell controller that no reads were received!
		s_send (requester, out_msg);
		char *reply = s_recv (requester);
		free(reply);
		return 0;
	}

	QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
	if (! *fastq)
	{
		QUAL_LENGTH = 1;
	}

	*seqList = list;
	*seqListSize = seqCnt;

	_r_seq = list;
	_r_seqCnt = seqCnt;

	fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());

	s_send (requester, out_msg);
	char *msg = s_recv (requester);
	if (msg == NULL)
	{
		fprintf(stdout, "ERR: no reply received from controller\n");
		return 0;
	}

	if (strcmp(msg,"RESTART")==0){
		free(msg);
		printf("RESTART received from controller. Terminating!\n");
		return 0; // terminate the mapper
	}
	else if (strcmp(msg,"WAIT")==0){
		// wait for appropriate signal on subsocket. not using request-reply socket so that system does not hang!
		printf("WAIT received from controller via subsocket.\n");
		char *msg2 = s_recv (subsocket);
		if (msg2 == NULL)
		{
			free(msg);
			fprintf(stdout, "ERR: no signal received on subsocket\n");
			return 0;
		}
		snprintf(killsig, sizeof killsig, "%s RESTART", mapperID);
		if (strcmp(msg2,killsig)==0){
			free(msg2);
			free(msg);
			printf("RESTART received from controller via subsocket. Terminating!\n");
			return 0; //  terminate the mapper
		}
		free(msg2);
	}

	free(msg);
	printf("GO received from controller. STARTING MAPPING!\n");
	return 1;

}
Esempio n. 2
0
int readAllReads(char *fileName1,
						char *fileName2,
						int compressed,
						unsigned char *fastq,
						unsigned char pairedEnd,
						Read **seqList,
						unsigned int *seqListSize)
{
	double startTime=getTime();

	char seq1[SEQ_MAX_LENGTH];
	char rseq1[SEQ_MAX_LENGTH];
	char name1[SEQ_MAX_LENGTH];
	char qual1[SEQ_MAX_LENGTH];
	char seq2[SEQ_MAX_LENGTH];
	char rseq2[SEQ_MAX_LENGTH];
	char name2[SEQ_MAX_LENGTH];
	char qual2[SEQ_MAX_LENGTH];

	char dummy[SEQ_MAX_LENGTH];
	char ch;
	int err1, err2;
	int nCnt;
	int discarded = 0;
	int seqCnt = 0;
	int maxCnt = 0;
	int i;
	Read *list = NULL;
	// new vars
	
	char * BUFF_1 = NULL;
	char * BUFF_2 = NULL;
	unsigned long int BUFF_1_pos = 0;		
	unsigned long int BUFF_2_pos = 0;		
	
	char READ_BUFFER[SEQ_MAX_LENGTH];
	size_t read_len;
	size_t curr_pos;

	void * _r_fp1;
	void * _r_fp2;

	if (!compressed)
	{
		_r_fp1 = fileOpen( fileName1, "r");

		if (_r_fp1 == NULL) return 0;

		if ( pairedEnd && fileName2 != NULL ){
			_r_fp2 = fileOpen ( fileName2, "r" );
			if (_r_fp2 == NULL)	return 0;
		}
		else{
			_r_fp2 = _r_fp1;
		}
		readSeq = &readline_TXT;
	}
	else{
		_r_fp1 = fileOpenGZ (fileName1, "r");
		if (_r_fp1 == NULL){
			return 0;
		}

		if ( pairedEnd && fileName2 != NULL ){
			_r_fp2 = fileOpenGZ ( fileName2, "r" );
			if (_r_fp2 == NULL)	return 0;
		}
		else{
			_r_fp2 = _r_fp1;
		}
		readSeq = &readline_GZ;
	}
	
	//READ INTO 1 or 2 buffers
		
	///READ IN read 1 into the buffer
	curr_pos=0;

	while (readSeq(_r_fp1,READ_BUFFER)){
		read_len = strlen(READ_BUFFER);
		BUFF_1=(char*)(realloc(BUFF_1,curr_pos+read_len));
		//strcpy(&BUFF_1[curr_pos],READ_BUFFER);
		memcpy(&BUFF_1[curr_pos],READ_BUFFER,read_len);
		curr_pos+=read_len;
		maxCnt++;	
		//printf("%s",READ_BUFFER);
	}
	
	if (pairedEnd){
		curr_pos=0;
		while (readSeq(_r_fp2,READ_BUFFER)){
			read_len = strlen(READ_BUFFER);
			BUFF_2=(char*)(realloc(BUFF_2,curr_pos+read_len));
			//strcpy(&BUFF_2[curr_pos],READ_BUFFER);
			memcpy(&BUFF_1[curr_pos],READ_BUFFER,read_len);
			curr_pos+=read_len;
		}
	}	
	//printf("%s",BUFF_1);
	//exit(1);
	printf("read in complete\n");
	printf("%d lines\n",maxCnt);
	//printf("%s",BUFF_1);
	if (BUFF_1[0] == '>')
		*fastq = 0;
	else
		*fastq = 1;

	// Counting the number of lines in the file
	//while (readSeq(dummy)) maxCnt++;
	
	if (!compressed){
		fclose(_r_fp1);
		if (pairedEnd) fclose(_r_fp2);
	}else{
		gzclose(_r_fp1);
		if (pairedEnd) gzclose(_r_fp2);
	}
	
	///AFTER HERE, no changes except reads in from stream
	// Calculating the Maximum # of sequences
	if (*fastq)
	{
		maxCnt /= 4;
	}
	else
	{
		maxCnt /= 2;
	}

	if (pairedEnd && fileName2 != NULL )
		maxCnt *= 2;

	list = getMem(sizeof(Read)*maxCnt);

	
	//while( readSeq(name1) )
	while(scanBUF(BUFF_1,name1,&BUFF_1_pos)==1)
	{
		err1 = 0;
		err2 = 0;
		//readSeq(seq1);
		scanBUF(BUFF_1,seq1,&BUFF_1_pos);
		name1[strlen(name1)-1] = '\0';
		for (i=0; i<strlen(name1);i++)
		{
			if (name1[i] == ' ')
			{
				name1[i] = '\0';
				break;
			}

		}

		if ( *fastq )
		{
			scanBUF(BUFF_1,dummy,&BUFF_1_pos);
			scanBUF(BUFF_1,qual1,&BUFF_1_pos);
			//readSeq(dummy);
			//readSeq(qual1);
			qual1[strlen(qual1)-1] = '\0';
		}
		else
		{
			sprintf(qual1, "*");
		}


		// Cropping
		if (cropSize > 0)
		{
			seq1[cropSize] = '\0';
			if ( *fastq )
				qual1[cropSize] = '\0';
		}


		nCnt = 0;
		for (i=0; i<strlen(seq1); i++)
		{
			seq1[i] = toupper (seq1[i]);
			if (seq1[i] == 'N')
			{
				nCnt++;
			}
			else if (isspace(seq1[i]))
			{

				seq1[i] = '\0';
				break;
			}
		}

		if (nCnt > errThreshold)
		{
			err1 = 1;
		}

		// Reading the second seq of pair-ends
		if (pairedEnd)
		{
			scanBUF(BUFF_2,name2,&BUFF_2_pos);
			scanBUF(BUFF_2,seq2,&BUFF_2_pos);
			//readSeq(name2);
			//readSeq(seq2);
			name2[strlen(name2)-1] = '\0';
			for (i=0; i<strlen(name2);i++)
			{
				if (name2[i] == ' ')
				{
					name2[i] = '\0';
					break;
				}
			}

			if ( *fastq )
			{
				//readSeq(dummy);
				//readSeq(qual2);
				scanBUF(BUFF_2,dummy,&BUFF_2_pos);
				scanBUF(BUFF_2,qual2,&BUFF_2_pos);

				qual2[strlen(qual2)-1] = '\0';
			}
			else
			{
				sprintf(qual2, "*");
			}


			// Cropping
			if (cropSize > 0)
			{
				seq2[cropSize] = '\0';
				if ( *fastq )
					qual2[cropSize] = '\0';
			}

			nCnt = 0;
			for (i=0; i<strlen(seq2); i++)
			{
				seq2[i] = toupper (seq2[i]);
				if (seq2[i] == 'N')
				{
					nCnt++;

				}
				else if (isspace(seq2[i]))
				{
					seq2[i] = '\0';
				}
			}
			if (nCnt > errThreshold)
			{
				err2 = 1;
			}
		}

		if (!pairedEnd && !err1)
		{

			int _mtmp = strlen(seq1);
			list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp+1;

			reverseComplete(seq1, rseq1, _mtmp);
			int i;

			list[seqCnt].hits[0] = 0;

			for (i=0; i<=_mtmp; i++)
			{
				list[seqCnt].seq[i] = seq1[i];
				list[seqCnt].rseq[i] = rseq1[i] ;
				list[seqCnt].qual[i] = qual1[i];
			}
			list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0';
			sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
			seqCnt++;
		}
		else if (pairedEnd && !err1 && !err2)
		{
			// Naming Conventions X/1, X/2 OR X
			int tmplen = strlen(name1);
			if (strcmp(name1, name2) != 0)
			{
				tmplen = strlen(name1)-2;
			}
		
			//first seq
			int _mtmp = strlen(seq1);
			list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp+1;

			reverseComplete(seq1, rseq1, _mtmp);
			int i;

			list[seqCnt].hits[0] = 0;

			for (i=0; i<=_mtmp; i++)
			{
				list[seqCnt].seq[i] = seq1[i];
				list[seqCnt].rseq[i] = rseq1[i] ;
				list[seqCnt].qual[i] = qual1[i];
			}

			name1[tmplen]='\0';
			list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0';
			sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
			
			seqCnt++;

			//second seq
			list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp+1;

			reverseComplete(seq2, rseq2, _mtmp);

			list[seqCnt].hits[0] = 0;

			for (i=0; i<=_mtmp; i++)
			{
				list[seqCnt].seq[i] = seq2[i];
				list[seqCnt].rseq[i] = rseq2[i];
				list[seqCnt].qual[i] = qual2[i];
			}

			name2[tmplen]='\0';
			list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0';
			sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0');

			seqCnt++;
		}
		else
		{
			discarded++;
		}
	}

	if (seqCnt > 0)
	{
		QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
		if (! *fastq)
		{
			QUAL_LENGTH = 1;
		}
		//fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH);
	}
	else
	{
		fprintf(stdout, "ERR: No reads can be found for mapping, %d discarded\n", discarded);
		return 1;
	}


	if (pairedEnd)
	{
//		seqCnt /= 2;
	}


	*seqList = list;
	*seqListSize = seqCnt;

	_r_seq = list;
	_r_seqCnt = seqCnt;
	

	free(BUFF_1);
	if (pairedEnd) free(BUFF_2);
	
	fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
	//totalLoadingTime+=getTime()-startTime;
	/*	
	fprintf(stdout,"HERE\n");
	int j;
	for (j=0;j<maxCnt;j++){
		fprintf(stdout,"%s\n",list[j].seq);
	}
	exit(1);
	*/	
	return 1;
}