Beispiel #1
0
/*
 * Opens the read file(s) and counts the lines delivered by readFirstSeq
 * whose first character is not '#', '>', ' ', '\r' or '\n'.
 *
 * fileName1   first (or only) input file
 * fileName2   second input file; only opened when pairedEnd is set and this
 *             is non-NULL, otherwise the second handle aliases the first
 * compressed  non-zero selects the gz handles and GZ line readers,
 *             zero selects the plain-text ones
 * pairedEnd   non-zero when a separate mate file may be supplied
 *
 * Side effects: sets the global handles (_r_fp1/_r_fp2 or
 * _r_gzfp1/_r_gzfp2) and the readFirstSeq/readSecondSeq callbacks, and
 * rewinds the first handle before returning.
 *
 * Returns 0 if a file cannot be opened, otherwise twice the number of
 * counted lines.
 */
int countAllReads(char *fileName1, char *fileName2, int compressed, 
		unsigned char pairedEnd) {
	char lineBuf[SEQ_MAX_LENGTH];
	int counted = 0;
	const int separateMate = (pairedEnd && fileName2 != NULL);

	if (compressed) {
		_r_gzfp1 = fileOpenGZ(fileName1, "r");
		if (_r_gzfp1 == NULL) {
			return 0;
		}

		if (separateMate) {
			_r_gzfp2 = fileOpenGZ(fileName2, "r");
			if (_r_gzfp2 == NULL) {
				return 0;
			}
		} else {
			_r_gzfp2 = _r_gzfp1;
		}

		readFirstSeq = &readFirstSeqGZ;
		readSecondSeq = &readSecondSeqGZ;
	} else {
		_r_fp1 = fileOpen(fileName1, "r");
		if (_r_fp1 == NULL) {
			return 0;
		}

		if (separateMate) {
			_r_fp2 = fileOpen(fileName2, "r");
			if (_r_fp2 == NULL) {
				return 0;
			}
		} else {
			_r_fp2 = _r_fp1;
		}

		readFirstSeq = &readFirstSeqTXT;
		readSecondSeq = &readSecondSeqTXT;
	}

	/* Tally every line except comment/header lines and blank lines. */
	while (readFirstSeq(lineBuf)) {
		switch (lineBuf[0]) {
		case '#':
		case '>':
		case ' ':
		case '\r':
		case '\n':
			break;
		default:
			counted++;
			break;
		}
	}

	/* Leave the first input positioned at its start again. */
	if (compressed) {
		gzrewind(_r_gzfp1);
	} else {
		rewind(_r_fp1);
	}

	/* Upper bound on the number of sequences reported to the caller. */
	return counted * 2;
}
Beispiel #2
0
/*
 * Opens the mapping output file and installs the matching output callbacks.
 *
 * fileName    output file name, appended to the global mappingOutputPath
 * compressed  non-zero: write "<path><name>.sam.gz" through the gz writer;
 *             zero: write "<path><name>" as plain text
 *
 * Side effects: sets _out_gzfp or _out_fp, and the finalizeOutput, output,
 * outputMeta and outputBuffer callbacks. When the plain-text target is
 * exactly "/dev/" + "null", nohitDisabled is set to 1. When noSamHeader is
 * set, outputMeta is replaced by noMetaOutput before the @HD line is
 * emitted through it.
 *
 * Returns 1 on success and 0 if the output file cannot be opened.
 */
int initOutput ( char *fileName, int compressed)
{
	if (compressed)
	{
		/* Room for both parts, the ".sam.gz" suffix and the terminating NUL. */
		char newFileName[strlen(mappingOutputPath)+strlen(fileName)+strlen(".sam.gz")+1];
		snprintf(newFileName, sizeof newFileName, "%s%s.sam.gz", mappingOutputPath, fileName);
		_out_gzfp = fileOpenGZ(newFileName, "w1f");
		if (_out_gzfp == Z_NULL)
		{
			return 0;
		}
	
		finalizeOutput = &finalizeGZOutput;

		output = &gzOutputQ;
		outputMeta =&gzOutputMetaQ;
		outputBuffer = &outputBufferGZ;
	}
	else
	{
		/* No suffix is appended in text mode; the name is used as given. */
		char newFileName[strlen(mappingOutputPath)+strlen(fileName)+1];
		snprintf(newFileName, sizeof newFileName, "%s%s", mappingOutputPath, fileName);

		/* Output discarded to /dev/null: flag nohitDisabled as well. */
		if ( !strcmp(mappingOutputPath, "/dev/") && !strcmp(fileName, "null") )
		{
			nohitDisabled = 1;
		}

		_out_fp = fileOpen(newFileName, "w");
		if (_out_fp == NULL)
		{
			return 0;
		}

		finalizeOutput = &finalizeTXOutput;
		output = &outputQ;
		outputMeta = &outputMetaQ;
		outputBuffer = &outputBufferTxT;
	}
	
	if (noSamHeader)
		outputMeta = &noMetaOutput;

	outputMeta("@HD\tVN:1.4\tSO:unsorted");
	
	return 1;
}
Beispiel #3
0
int initRead(char *fileName1, char *fileName2)
{
	char dummy[SEQ_MAX_LENGTH];
	char ch;
	int i, maxCnt=0;

	_r_buf1 = getMem(10000000);
	_r_buf1_pos = getMem(sizeof(int));
	_r_buf1_size = getMem(sizeof(int));
	*_r_buf1_size = *_r_buf1_pos = 0; 
	if ( pairedEndMode && fileName2 != NULL )
	{
		_r_buf2 = getMem(10000000);
		_r_buf2_pos = getMem(sizeof(int));
		_r_buf2_size = getMem(sizeof(int));
	}
	else
	{
		_r_buf2 = _r_buf1;
		_r_buf2_pos = _r_buf1_pos;
		_r_buf2_size = _r_buf1_size;
	}


	if (!seqCompressed)
	{
		_r_fp1 = fileOpen( fileName1, "r");

		if (_r_fp1 == NULL)
			return 0;

		ch = fgetc(_r_fp1);

		if ( pairedEndMode) 
		{
			if ( fileName2 == NULL )
			{
				_r_fp2 = _r_fp1;
			}
			else
			{
				_r_fp2 = fileOpen ( fileName2, "r" );
				if (_r_fp2 == NULL)
					return 0;
			}
		}

		readBuffer1 = &readBufferTxT1;
		readBuffer2 = &readBufferTxT2;
	}
	else
	{

		_r_gzfp1 = fileOpenGZ (fileName1, "r");

		if (_r_gzfp1 == NULL)
		{
			return 0;
		}

		ch = gzgetc(_r_gzfp1);

		if ( pairedEndMode && fileName2 != NULL )
		{
			_r_gzfp2 = fileOpenGZ ( fileName2, "r" );
			if (_r_gzfp2 == NULL)
			{
				return 0;
			}
		}
		else
		{
			_r_gzfp2 = _r_gzfp1;
		}

		readBuffer1 = &readBufferGZ1;
		readBuffer2 = &readBufferGZ2;
	}

	if (!seqCompressed)
		rewind(_r_fp1);
	else
		gzrewind(_r_gzfp1);

	if (ch == '>')
		_r_fastq = 0;
	else
		_r_fastq = 1;
	
	readFirstSeq(dummy,1);
	int nameLen = strlen(dummy);
	readFirstSeq(dummy,2);
	*_r_buf1_pos = 0;
	int seqLen = strlen(dummy);
	SEQ_LENGTH = 0;
	i = 0;
	while (i<seqLen && !isspace(dummy[i]))
	{
		i++;
		SEQ_LENGTH++;
	}
	
	if (cropSize > 0)
		SEQ_LENGTH = cropSize;

	if ( SEQ_LENGTH >= SEQ_MAX_LENGTH )
	{
		fprintf(stdout, "ERR: Read Length is greater than the MAX length we can process (Current Max: %d).\n", SEQ_MAX_LENGTH);
		exit(EXIT_FAILURE);
	}

	if (_r_fastq)
	{
		QUAL_LENGTH = SEQ_LENGTH;
	}
	else
	{
		QUAL_LENGTH = 1;
	}

	CMP_SEQ_LENGTH = calculateCompressedLen(SEQ_LENGTH);

	//TODO MEMORY CALCULATION FIX
	double readMem = sizeof(Read) + (2 + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH * 2 * 8) + (nameLen+10) + 4);
	readMem += ((bestMappingMode) ?(sizeof(FullMappingInfo)) :0);
	if (pairedEndMode)
		readMem += sizeof(MappingInfo) + sizeof(MappingLocations);

	_r_maxSeqCnt = (int)(((MAX_MEMORY-1.2) * (1 << 30))/readMem);
	if ( pairedEndMode && _r_maxSeqCnt % 2 )
		_r_maxSeqCnt ++;
	_r_maxSeqCnt -= _r_maxSeqCnt % THREAD_COUNT;

//_r_maxSeqCnt = 500000;

	_r_seq = getMem(sizeof(Read)*_r_maxSeqCnt);

	int maxErrThreshold = (SEQ_LENGTH/WINDOW_SIZE) - 1;
	if (errThreshold == -1)
	{
		errThreshold = SEQ_LENGTH*6/100;
		fprintf(stdout, "# Errors: %d\n", errThreshold);
	}
	if (errThreshold > maxErrThreshold && SEQ_LENGTH>0)
	{
		errThreshold = maxErrThreshold;
		fprintf(stdout, "# Error: %d (full sensitivity)\n", errThreshold);
	}


	checkSumLength = (SEQ_LENGTH / (errThreshold+1)) - WINDOW_SIZE;
	if (checkSumLength > sizeof(CheckSumType)*4)
		checkSumLength = sizeof(CheckSumType)*4;

	calculateSamplingLocations();


	if (!nohitDisabled)
	{
		_r_umfp = fileOpen(unmappedOutput, "w");
	}

	_r_alphIndex = getMem(128);		// used in readChunk()
	_r_alphIndex['A'] = 0;
	_r_alphIndex['C'] = 1;
	_r_alphIndex['G'] = 2;
	_r_alphIndex['T'] = 3;
	_r_alphIndex['N'] = 4;

	return 1;
}
Beispiel #4
0
int readAllReads(char *fileName1,
						char *fileName2,
						int compressed,
						unsigned char *fastq,
						unsigned char pairedEnd,
						Read **seqList,
						unsigned int *seqListSize)
{
	double startTime=getTime();

	char seq1[SEQ_MAX_LENGTH];
	char rseq1[SEQ_MAX_LENGTH];
	char name1[SEQ_MAX_LENGTH];
	char qual1[SEQ_MAX_LENGTH];
	char seq2[SEQ_MAX_LENGTH];
	char rseq2[SEQ_MAX_LENGTH];
	char name2[SEQ_MAX_LENGTH];
	char qual2[SEQ_MAX_LENGTH];

	char dummy[SEQ_MAX_LENGTH];
	char ch;
	int err1, err2;
	int nCnt;
	int discarded = 0;
	int seqCnt = 0;
	int maxCnt = 0;
	int i;
	Read *list = NULL;
	// new vars
	
	char * BUFF_1 = NULL;
	char * BUFF_2 = NULL;
	unsigned long int BUFF_1_pos = 0;		
	unsigned long int BUFF_2_pos = 0;		
	
	char READ_BUFFER[SEQ_MAX_LENGTH];
	size_t read_len;
	size_t curr_pos;

	void * _r_fp1;
	void * _r_fp2;

	if (!compressed)
	{
		_r_fp1 = fileOpen( fileName1, "r");

		if (_r_fp1 == NULL) return 0;

		if ( pairedEnd && fileName2 != NULL ){
			_r_fp2 = fileOpen ( fileName2, "r" );
			if (_r_fp2 == NULL)	return 0;
		}
		else{
			_r_fp2 = _r_fp1;
		}
		readSeq = &readline_TXT;
	}
	else{
		_r_fp1 = fileOpenGZ (fileName1, "r");
		if (_r_fp1 == NULL){
			return 0;
		}

		if ( pairedEnd && fileName2 != NULL ){
			_r_fp2 = fileOpenGZ ( fileName2, "r" );
			if (_r_fp2 == NULL)	return 0;
		}
		else{
			_r_fp2 = _r_fp1;
		}
		readSeq = &readline_GZ;
	}
	
	//READ INTO 1 or 2 buffers
		
	///READ IN read 1 into the buffer
	curr_pos=0;

	while (readSeq(_r_fp1,READ_BUFFER)){
		read_len = strlen(READ_BUFFER);
		BUFF_1=(char*)(realloc(BUFF_1,curr_pos+read_len));
		//strcpy(&BUFF_1[curr_pos],READ_BUFFER);
		memcpy(&BUFF_1[curr_pos],READ_BUFFER,read_len);
		curr_pos+=read_len;
		maxCnt++;	
		//printf("%s",READ_BUFFER);
	}
	
	if (pairedEnd){
		curr_pos=0;
		while (readSeq(_r_fp2,READ_BUFFER)){
			read_len = strlen(READ_BUFFER);
			BUFF_2=(char*)(realloc(BUFF_2,curr_pos+read_len));
			//strcpy(&BUFF_2[curr_pos],READ_BUFFER);
			memcpy(&BUFF_1[curr_pos],READ_BUFFER,read_len);
			curr_pos+=read_len;
		}
	}	
	//printf("%s",BUFF_1);
	//exit(1);
	printf("read in complete\n");
	printf("%d lines\n",maxCnt);
	//printf("%s",BUFF_1);
	if (BUFF_1[0] == '>')
		*fastq = 0;
	else
		*fastq = 1;

	// Counting the number of lines in the file
	//while (readSeq(dummy)) maxCnt++;
	
	if (!compressed){
		fclose(_r_fp1);
		if (pairedEnd) fclose(_r_fp2);
	}else{
		gzclose(_r_fp1);
		if (pairedEnd) gzclose(_r_fp2);
	}
	
	///AFTER HERE, no changes except reads in from stream
	// Calculating the Maximum # of sequences
	if (*fastq)
	{
		maxCnt /= 4;
	}
	else
	{
		maxCnt /= 2;
	}

	if (pairedEnd && fileName2 != NULL )
		maxCnt *= 2;

	list = getMem(sizeof(Read)*maxCnt);

	
	//while( readSeq(name1) )
	while(scanBUF(BUFF_1,name1,&BUFF_1_pos)==1)
	{
		err1 = 0;
		err2 = 0;
		//readSeq(seq1);
		scanBUF(BUFF_1,seq1,&BUFF_1_pos);
		name1[strlen(name1)-1] = '\0';
		for (i=0; i<strlen(name1);i++)
		{
			if (name1[i] == ' ')
			{
				name1[i] = '\0';
				break;
			}

		}

		if ( *fastq )
		{
			scanBUF(BUFF_1,dummy,&BUFF_1_pos);
			scanBUF(BUFF_1,qual1,&BUFF_1_pos);
			//readSeq(dummy);
			//readSeq(qual1);
			qual1[strlen(qual1)-1] = '\0';
		}
		else
		{
			sprintf(qual1, "*");
		}


		// Cropping
		if (cropSize > 0)
		{
			seq1[cropSize] = '\0';
			if ( *fastq )
				qual1[cropSize] = '\0';
		}


		nCnt = 0;
		for (i=0; i<strlen(seq1); i++)
		{
			seq1[i] = toupper (seq1[i]);
			if (seq1[i] == 'N')
			{
				nCnt++;
			}
			else if (isspace(seq1[i]))
			{

				seq1[i] = '\0';
				break;
			}
		}

		if (nCnt > errThreshold)
		{
			err1 = 1;
		}

		// Reading the second seq of pair-ends
		if (pairedEnd)
		{
			scanBUF(BUFF_2,name2,&BUFF_2_pos);
			scanBUF(BUFF_2,seq2,&BUFF_2_pos);
			//readSeq(name2);
			//readSeq(seq2);
			name2[strlen(name2)-1] = '\0';
			for (i=0; i<strlen(name2);i++)
			{
				if (name2[i] == ' ')
				{
					name2[i] = '\0';
					break;
				}
			}

			if ( *fastq )
			{
				//readSeq(dummy);
				//readSeq(qual2);
				scanBUF(BUFF_2,dummy,&BUFF_2_pos);
				scanBUF(BUFF_2,qual2,&BUFF_2_pos);

				qual2[strlen(qual2)-1] = '\0';
			}
			else
			{
				sprintf(qual2, "*");
			}


			// Cropping
			if (cropSize > 0)
			{
				seq2[cropSize] = '\0';
				if ( *fastq )
					qual2[cropSize] = '\0';
			}

			nCnt = 0;
			for (i=0; i<strlen(seq2); i++)
			{
				seq2[i] = toupper (seq2[i]);
				if (seq2[i] == 'N')
				{
					nCnt++;

				}
				else if (isspace(seq2[i]))
				{
					seq2[i] = '\0';
				}
			}
			if (nCnt > errThreshold)
			{
				err2 = 1;
			}
		}

		if (!pairedEnd && !err1)
		{

			int _mtmp = strlen(seq1);
			list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp+1;

			reverseComplete(seq1, rseq1, _mtmp);
			int i;

			list[seqCnt].hits[0] = 0;

			for (i=0; i<=_mtmp; i++)
			{
				list[seqCnt].seq[i] = seq1[i];
				list[seqCnt].rseq[i] = rseq1[i] ;
				list[seqCnt].qual[i] = qual1[i];
			}
			list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0';
			sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
			seqCnt++;
		}
		else if (pairedEnd && !err1 && !err2)
		{
			// Naming Conventions X/1, X/2 OR X
			int tmplen = strlen(name1);
			if (strcmp(name1, name2) != 0)
			{
				tmplen = strlen(name1)-2;
			}
		
			//first seq
			int _mtmp = strlen(seq1);
			list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp+1;

			reverseComplete(seq1, rseq1, _mtmp);
			int i;

			list[seqCnt].hits[0] = 0;

			for (i=0; i<=_mtmp; i++)
			{
				list[seqCnt].seq[i] = seq1[i];
				list[seqCnt].rseq[i] = rseq1[i] ;
				list[seqCnt].qual[i] = qual1[i];
			}

			name1[tmplen]='\0';
			list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0';
			sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
			
			seqCnt++;

			//second seq
			list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
			list[seqCnt].seq = list[seqCnt].hits + 1;
			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
			list[seqCnt].name = list[seqCnt].qual + _mtmp+1;

			reverseComplete(seq2, rseq2, _mtmp);

			list[seqCnt].hits[0] = 0;

			for (i=0; i<=_mtmp; i++)
			{
				list[seqCnt].seq[i] = seq2[i];
				list[seqCnt].rseq[i] = rseq2[i];
				list[seqCnt].qual[i] = qual2[i];
			}

			name2[tmplen]='\0';
			list[seqCnt].rseq[_mtmp]=list[seqCnt].qual[_mtmp]='\0';
			sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0');

			seqCnt++;
		}
		else
		{
			discarded++;
		}
	}

	if (seqCnt > 0)
	{
		QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
		if (! *fastq)
		{
			QUAL_LENGTH = 1;
		}
		//fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH);
	}
	else
	{
		fprintf(stdout, "ERR: No reads can be found for mapping, %d discarded\n", discarded);
		return 1;
	}


	if (pairedEnd)
	{
//		seqCnt /= 2;
	}


	*seqList = list;
	*seqListSize = seqCnt;

	_r_seq = list;
	_r_seqCnt = seqCnt;
	

	free(BUFF_1);
	if (pairedEnd) free(BUFF_2);
	
	fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
	//totalLoadingTime+=getTime()-startTime;
	/*	
	fprintf(stdout,"HERE\n");
	int j;
	for (j=0;j<maxCnt;j++){
		fprintf(stdout,"%s\n",list[j].seq);
	}
	exit(1);
	*/	
	return 1;
}