Esempio n. 1
0
/*
 * Opens the input read file(s), installs the matching reader callbacks and
 * counts the data lines of the first file.
 *
 * fileName1  - path of the first (or only) input file
 * fileName2  - path of the second input file; it is only opened when
 *              pairedEnd is non-zero and the pointer is not NULL, otherwise
 *              the second handle aliases the first one
 * compressed - zero selects the plain-text open/read/rewind routines,
 *              non-zero the GZ ones
 * pairedEnd  - non-zero when a second file may be supplied
 *
 * Returns twice the number of counted lines, or 0 if a file cannot be opened.
 * The first file is rewound before returning.
 */
int countAllReads(char *fileName1, char *fileName2, int compressed, 
		unsigned char pairedEnd) {
	char line[SEQ_MAX_LENGTH];
	int counted = 0;
	int useSecondFile = (pairedEnd && fileName2 != NULL);

	if (compressed) {
		_r_gzfp1 = fileOpenGZ (fileName1, "r");
		if (_r_gzfp1 == NULL)
			return 0;

		if (!useSecondFile) {
			_r_gzfp2 = _r_gzfp1;
		}
		else {
			_r_gzfp2 = fileOpenGZ ( fileName2, "r" );
			if (_r_gzfp2 == NULL)
				return 0;
		}

		readFirstSeq = &readFirstSeqGZ;
		readSecondSeq = &readSecondSeqGZ;
	}
	else {
		_r_fp1 = fileOpen( fileName1, "r");
		if (_r_fp1 == NULL)
			return 0;

		if (!useSecondFile) {
			_r_fp2 = _r_fp1;
		}
		else {
			_r_fp2 = fileOpen ( fileName2, "r" );
			if (_r_fp2 == NULL)
				return 0;
		}

		readFirstSeq = &readFirstSeqTXT;
		readSecondSeq = &readSecondSeqTXT;
	}

	// Count every line of the first file whose first character does not
	// mark it as a comment, header or blank line.
	while (readFirstSeq(line)) {
		switch (line[0]) {
		case '#':
		case '>':
		case ' ':
		case '\r':
		case '\n':
			break;
		default:
			counted++;
			break;
		}
	}

	// Leave the first file positioned at its start for the actual load.
	if (compressed)
		gzrewind(_r_gzfp1);
	else
		rewind(_r_fp1);

	// Maximum # of sequences: two per counted line
	return counted * 2;
}
Esempio n. 2
0
/*
 * Reads the next chunk of reads into the global _r_seq array.
 *
 * Records are read through readFirstSeq() (and readSecondSeq() for the mate
 * when pairedEndMode is set) until either the reader returns 0 or
 * _r_seqCnt reaches _r_maxSeqCnt. For every read one block is obtained from
 * getMem() and carved up, in this order, into: hits, seq, rseq, qual, cseq,
 * crseq, name, alphCnt.
 *
 * seqList     - receives _r_seq
 * seqListSize - receives the number of reads stored in this chunk
 *
 * Returns 1 when the chunk was filled up to _r_maxSeqCnt (more input may
 * follow) and 0 otherwise (end of input reached).
 *
 * Exits the process if a sequence line does not have length SEQ_LENGTH, or
 * if no read at all is found on the first call.
 */
int readChunk(Read **seqList, unsigned int *seqListSize)
{
	double startTime=getTime();

	char name1[SEQ_MAX_LENGTH];
	char name2[SEQ_MAX_LENGTH];	// mate's name line; read only to advance the second stream
	char dummy[SEQ_MAX_LENGTH];	// receives record line 3, whose content is not used
	int  size;
	int i;
	int namelen;

	_r_seqCnt = 0;
	_r_readMemUsage = 0;

	// The reader's return value for line 1 is used as the length of the name line.
	while( (namelen = readFirstSeq(name1,1)) )
	{

		if (pairedEndMode)
		{
			// Drop a trailing "/1" so both mates share the same name.
			// The length guard keeps the look-behind inside name1.
			if (namelen >= 2 && name1[namelen-2]=='/' && name1[namelen-1]=='1')
			{
				namelen -= 2;
				name1[namelen]='\0';
			}
		}

		// One allocation holds every per-read buffer; the pointers below
		// are successive offsets into it.
		size = sizeof(uint16_t) + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH << 4) + namelen + 4;
		_r_seq[_r_seqCnt].hits	= getMem(size);
		_r_readMemUsage += size;
		_r_seq[_r_seqCnt].seq	= (char *)(_r_seq[_r_seqCnt].hits + 1);
		_r_seq[_r_seqCnt].rseq	= (char *)(_r_seq[_r_seqCnt].seq + SEQ_LENGTH + 1);
		_r_seq[_r_seqCnt].qual	= (char *)(_r_seq[_r_seqCnt].rseq + SEQ_LENGTH + 1);
		_r_seq[_r_seqCnt].cseq	= (CompressedSeq *)(_r_seq[_r_seqCnt].qual + QUAL_LENGTH + 1);
		_r_seq[_r_seqCnt].crseq	= (CompressedSeq *)(_r_seq[_r_seqCnt].cseq + CMP_SEQ_LENGTH);
		_r_seq[_r_seqCnt].name	= (char *)(_r_seq[_r_seqCnt].crseq + CMP_SEQ_LENGTH);
		_r_seq[_r_seqCnt].alphCnt = (unsigned char *)(_r_seq[_r_seqCnt].name + namelen);
		_r_seq[_r_seqCnt].hits[0] = 0;

		// Copy the name without its first character; the last byte copied
		// is name1[namelen].
		for (i=1; i<namelen+1; i++)
			_r_seq[_r_seqCnt].name[i-1] = name1[i];

		if ( readFirstSeq(_r_seq[_r_seqCnt].seq,2) != SEQ_LENGTH)
		{
			fprintf(stdout, "ERR: Inconsistent read length for %s\n", name1);
			exit(EXIT_FAILURE);
		}

		if ( _r_fastq )
		{
			readFirstSeq(dummy,3);
			readFirstSeq(_r_seq[_r_seqCnt].qual,4);
		}
		else
		{
			// No quality line in this format: point at a constant placeholder.
			_r_seq[_r_seqCnt].qual = "*";
		}
		_r_seqCnt++;


		if (pairedEndMode)
		{
			// The mate gets an identically sized and laid-out block.
			_r_seq[_r_seqCnt].hits	= getMem(size);
			_r_readMemUsage += size;
			_r_seq[_r_seqCnt].seq	= (char *) (_r_seq[_r_seqCnt].hits + 1);
			_r_seq[_r_seqCnt].rseq	= (char *)(_r_seq[_r_seqCnt].seq + SEQ_LENGTH + 1);
			_r_seq[_r_seqCnt].qual	= (char *)(_r_seq[_r_seqCnt].rseq + SEQ_LENGTH + 1);
			_r_seq[_r_seqCnt].cseq	= (CompressedSeq *)(_r_seq[_r_seqCnt].qual + QUAL_LENGTH + 1);
			_r_seq[_r_seqCnt].crseq	= (CompressedSeq *)(_r_seq[_r_seqCnt].cseq + CMP_SEQ_LENGTH);
			_r_seq[_r_seqCnt].name	= (char *)(_r_seq[_r_seqCnt].crseq + CMP_SEQ_LENGTH);
			_r_seq[_r_seqCnt].alphCnt = (unsigned char *)(_r_seq[_r_seqCnt].name + namelen);
			_r_seq[_r_seqCnt].hits[0] = 0;

			readSecondSeq(name2, 1);

			// The mate is stored under the first read's (suffix-stripped) name.
			for (i=1; i<namelen+1; i++)
				_r_seq[_r_seqCnt].name[i-1] = name1[i];

			if ( readSecondSeq(_r_seq[_r_seqCnt].seq,2) != SEQ_LENGTH)
			{
				fprintf(stdout, "ERR: Inconsistent read length for %s\n", name1);
				exit(EXIT_FAILURE);
			}

			if ( _r_fastq )
			{
				readSecondSeq(dummy,3);
				readSecondSeq(_r_seq[_r_seqCnt].qual,4);
			}
			else
			{
				_r_seq[_r_seqCnt].qual = "*";
			}
			_r_seqCnt++;
		}

		if (_r_seqCnt >= _r_maxSeqCnt)
			break;
	}
	*seqList = _r_seq;
	*seqListSize = _r_seqCnt;

	if (_r_seqCnt > 0)
	{
		preProcessReadsMT();
		fprintf(stdout, "| *Reading Input* | %15.2f | XXXXXXXXXXXXXXX | %15.2f | XXXXXXXXXXXXXXX %15d |\n", (getTime()-startTime), getMemUsage(), _r_seqCnt );
		_r_firstIteration = 0;
	}
	else if (_r_firstIteration)
	{
		// An empty chunk is only an error on the very first call.
		fprintf(stdout, "ERR: No reads for mapping\n");
		exit(EXIT_FAILURE);
	}

	if (_r_seqCnt < _r_maxSeqCnt)		// reached end of file
		return 0;
	else
		return 1;
}
Esempio n. 3
0
/*
 * Loads reads from the first input stream into a newly allocated Read array.
 *
 * Each accepted input line is split on tabs/spaces; the second field is used
 * as the sequence and the third as the quality string. The first 35
 * characters of both fields form one read and the following 35 characters a
 * second read, so every accepted line yields two consecutive list entries.
 *
 * Lines starting with '#', '>', ' ', '\r' or '\n' are skipped. Lines that
 * lack a sequence or quality field, or whose fields are shorter than 70
 * characters, are counted as discarded instead of being read past their end.
 * A line is also discarded when either half contains more than errThreshold
 * 'N' characters.
 *
 * fileName1, fileName2, compressed, fastq - not used by this function
 * pairedEnd   - when non-zero the reported discard count is doubled
 * seqList     - receives the allocated list
 * seqListSize - receives the number of reads stored
 * ListSize    - capacity of the list in reads; because reads are stored in
 *               pairs, an odd ListSize leaves the last slot unused
 * AccListSize - offset added to the running index when naming reads
 *
 * Returns the number of reads stored, or 0 when none could be read (in which
 * case seqList/seqListSize and the globals are left untouched).
 */
int readAllReads(char *fileName1, char *fileName2, int compressed,
		 unsigned char *fastq, unsigned char pairedEnd, Read **seqList,
		 unsigned int *seqListSize, unsigned int ListSize, unsigned int AccListSize) {

	double startTime=getTime();
	
	char * seq;
	char * qual;
	char seq1[SEQ_MAX_LENGTH];
	char rseq1[SEQ_MAX_LENGTH];
	char qual1[SEQ_MAX_LENGTH];
	char seq2[SEQ_MAX_LENGTH];
	char rseq2[SEQ_MAX_LENGTH];
	char qual2[SEQ_MAX_LENGTH];
	char dummy[SEQ_MAX_LENGTH];
	int discarded = 0;
	int seqCnt = 0;
	Read *list = NULL;
	int nCnt1;
	int nCnt2;

	list = getMem(sizeof(Read)*ListSize, "list @readAllReads()");

	// Two entries are written per line, so require room for both before
	// reading the next line.
	while((unsigned int)seqCnt + 1 < ListSize && readFirstSeq(dummy)) {
		int i = 0;
		int _mtmp = 36;			// per-read buffer length: 35 characters + terminator
		size_t needed = (size_t)(2 * (_mtmp - 1));

		if(dummy[0] == '#' || dummy[0] == '>' || dummy[0] == ' ' || 
			dummy[0] == '\r' || dummy[0] == '\n')
			continue;

		strtok(dummy, "\t ");		// first field is ignored
		seq = strtok(NULL, "\t ");
		qual = strtok(NULL, "\t ");

		// Reject lines that cannot supply both halves; indexing them
		// below would run past the end of the field.
		if (seq == NULL || qual == NULL ||
			strlen(seq) < needed || strlen(qual) < needed)
		  {
		    discarded += 2;
		    continue;
		  }

		for(i = 0; i < _mtmp - 1; i++) {
			// <ctype.h> functions require an unsigned char value
			seq1[i] = toupper((unsigned char)seq[i]);
			qual1[i] = qual[i];
		}

		for(i = 0; i < _mtmp - 1; i++) {
			seq2[i] = toupper((unsigned char)seq[i + _mtmp - 1]);
			qual2[i] = qual[i + _mtmp - 1];
		}

		seq1[_mtmp - 1] = seq2[_mtmp - 1] = qual1[_mtmp - 1] = qual2[_mtmp - 1] = '\0';
		
		nCnt1 = 0; nCnt2 = 0;
		for (i=0; i<_mtmp; i++)
		  {
		    if (seq1[i] == 'N')		      
			nCnt1++;
		    if (seq2[i] == 'N')		      
			nCnt2++;		    
		  }
		
		if (nCnt1 > errThreshold || nCnt2 > errThreshold)
		  {
		    discarded += 2;
		    continue;
		  }
	

		// 255 means "not set yet": derive the threshold from the crop
		// size, or from the length of the first accepted read.
		if (errThreshold == 255) {
			if (cropSize > 0) {
				errThreshold = (int) ceil(cropSize * 0.04);
				fprintf(stdout, "Sequence length: %d bp. Error threshold is set to %d bp.\n", 
					cropSize, errThreshold);
				}
			else {
				errThreshold = (int) ceil((strlen(seq1)) * 0.04);
				fprintf(stdout, "Sequence length: %d bp. Error threshold is set to %d bp.\n", 
					((int)strlen(seq1)), errThreshold);
			}
				fprintf(stdout, "You can override this value using the -e parameter.\n");
		}

		// First half. One block holds hits, seq, rseq, qual and name back
		// to back; with byte-sized pointer steps (NOTE(review): assumes the
		// Read pointer fields are char pointers) the name region is the
		// final _mtmp bytes of the block.
		list[seqCnt].hits = getMem (1 + 3 * _mtmp + 3 + _mtmp, "list.hits @readAllReads()");
		list[seqCnt].seq = list[seqCnt].hits + 1;
		list[seqCnt].rseq = list[seqCnt].seq + _mtmp + 1;
		list[seqCnt].qual = list[seqCnt].rseq + _mtmp + 1;
		list[seqCnt].name = list[seqCnt].qual + _mtmp + 1;
		list[seqCnt].hashValue = getMem(sizeof(short) * _mtmp, "list.hashValue @readAllReads()");
		list[seqCnt].rhashValue = getMem(sizeof(short) * _mtmp, "list.rhashValue @readAllReads()");
		list[seqCnt].readNumber = seqCnt;			
		list[seqCnt].hits[0] = 0;

		reverseComplement(seq1, rseq1, _mtmp - 1);	// DHL Modify
		rseq1[_mtmp - 1] = '\0';

		for (i=0; i<_mtmp-1; i++) {
			list[seqCnt].seq[i] = seq1[i];
			list[seqCnt].rseq[i] = rseq1[i];
			list[seqCnt].qual[i] = qual1[i];
		}

		// Bounded write: a long mappingOutput is truncated instead of
		// overrunning the name region.
		if (!pairedEndMode)
		  snprintf(list[seqCnt].name, _mtmp, "%s_%d/1", mappingOutput, (seqCnt + AccListSize) / 2);
		else
		  snprintf(list[seqCnt].name, _mtmp, "%s_%d", mappingOutput, (seqCnt + AccListSize) / 2);

		list[seqCnt].seq[_mtmp - 1] = list[seqCnt].rseq[_mtmp - 1] = list[seqCnt].qual[_mtmp - 1]='\0';

		seqCnt++;

		// Second half, stored in the next slot with the same layout.
		list[seqCnt].hits = getMem (1 + 3 * _mtmp + 3 + _mtmp, "list.hits @readAllReads()");
		list[seqCnt].seq = list[seqCnt].hits + 1;
		list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
		list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
		list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
		list[seqCnt].hashValue = getMem(sizeof(short) * _mtmp, "list.hashValue @readAllReads()");
		list[seqCnt].rhashValue = getMem(sizeof(short) * _mtmp, "list.rhashValue @readAllReads()");
		list[seqCnt].readNumber = seqCnt;				 
		list[seqCnt].hits[0] = 0;

		reverseComplement(seq2, rseq2, _mtmp - 1);	// DHL Modify
		rseq2[_mtmp - 1] = '\0';

		// The terminators are written explicitly below, so only the 35
		// payload characters need copying here.
		for (i=0; i<_mtmp-1; i++) {
			list[seqCnt].seq[i] = seq2[i];
			list[seqCnt].rseq[i] = rseq2[i];
			list[seqCnt].qual[i] = qual2[i];
		}

		if (!pairedEndMode)
		  snprintf(list[seqCnt].name, _mtmp, "%s_%d/2", mappingOutput, (seqCnt + AccListSize) / 2);
		else
		  snprintf(list[seqCnt].name, _mtmp, "%s_%d", mappingOutput, (seqCnt + AccListSize) / 2);

		list[seqCnt].seq[_mtmp - 1] = list[seqCnt].rseq[_mtmp - 1] = list[seqCnt].qual[_mtmp - 1]='\0';

		seqCnt++;
	}

	if (seqCnt <= 0) {
		//fprintf(stdout, "ERROR: No reads can be found for mapping\n");
		fprintf(stdout, "==== End of Input Reads    					====\n");	// DHL: read slice
		return 0;
	}

	//qsort(list, seqCnt, sizeof(Read), toCompareRead);
	adjustQual(list, seqCnt);
	*seqList = list;
	*seqListSize = seqCnt;
	_r_seq = list;
	_r_seqCnt = seqCnt;

	if (pairedEnd) 
		discarded *= 2;

	if (seqCnt > 1) {
		fprintf(stdout, "==== %d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]	====\n", 
			seqCnt, (getTime()-startTime), discarded, getMemUsage());
	}
	else {
		fprintf(stdout, "==== %d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]	====\n", 
			seqCnt, (getTime()-startTime), discarded, getMemUsage());
	}

	return seqCnt;
}
Esempio n. 4
0
int initRead(char *fileName1, char *fileName2)
{
	char dummy[SEQ_MAX_LENGTH];
	char ch;
	int i, maxCnt=0;

	_r_buf1 = getMem(10000000);
	_r_buf1_pos = getMem(sizeof(int));
	_r_buf1_size = getMem(sizeof(int));
	*_r_buf1_size = *_r_buf1_pos = 0; 
	if ( pairedEndMode && fileName2 != NULL )
	{
		_r_buf2 = getMem(10000000);
		_r_buf2_pos = getMem(sizeof(int));
		_r_buf2_size = getMem(sizeof(int));
	}
	else
	{
		_r_buf2 = _r_buf1;
		_r_buf2_pos = _r_buf1_pos;
		_r_buf2_size = _r_buf1_size;
	}


	if (!seqCompressed)
	{
		_r_fp1 = fileOpen( fileName1, "r");

		if (_r_fp1 == NULL)
			return 0;

		ch = fgetc(_r_fp1);

		if ( pairedEndMode) 
		{
			if ( fileName2 == NULL )
			{
				_r_fp2 = _r_fp1;
			}
			else
			{
				_r_fp2 = fileOpen ( fileName2, "r" );
				if (_r_fp2 == NULL)
					return 0;
			}
		}

		readBuffer1 = &readBufferTxT1;
		readBuffer2 = &readBufferTxT2;
	}
	else
	{

		_r_gzfp1 = fileOpenGZ (fileName1, "r");

		if (_r_gzfp1 == NULL)
		{
			return 0;
		}

		ch = gzgetc(_r_gzfp1);

		if ( pairedEndMode && fileName2 != NULL )
		{
			_r_gzfp2 = fileOpenGZ ( fileName2, "r" );
			if (_r_gzfp2 == NULL)
			{
				return 0;
			}
		}
		else
		{
			_r_gzfp2 = _r_gzfp1;
		}

		readBuffer1 = &readBufferGZ1;
		readBuffer2 = &readBufferGZ2;
	}

	if (!seqCompressed)
		rewind(_r_fp1);
	else
		gzrewind(_r_gzfp1);

	if (ch == '>')
		_r_fastq = 0;
	else
		_r_fastq = 1;
	
	readFirstSeq(dummy,1);
	int nameLen = strlen(dummy);
	readFirstSeq(dummy,2);
	*_r_buf1_pos = 0;
	int seqLen = strlen(dummy);
	SEQ_LENGTH = 0;
	i = 0;
	while (i<seqLen && !isspace(dummy[i]))
	{
		i++;
		SEQ_LENGTH++;
	}
	
	if (cropSize > 0)
		SEQ_LENGTH = cropSize;

	if ( SEQ_LENGTH >= SEQ_MAX_LENGTH )
	{
		fprintf(stdout, "ERR: Read Length is greater than the MAX length we can process (Current Max: %d).\n", SEQ_MAX_LENGTH);
		exit(EXIT_FAILURE);
	}

	if (_r_fastq)
	{
		QUAL_LENGTH = SEQ_LENGTH;
	}
	else
	{
		QUAL_LENGTH = 1;
	}

	CMP_SEQ_LENGTH = calculateCompressedLen(SEQ_LENGTH);

	//TODO MEMORY CALCULATION FIX
	double readMem = sizeof(Read) + (2 + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH * 2 * 8) + (nameLen+10) + 4);
	readMem += ((bestMappingMode) ?(sizeof(FullMappingInfo)) :0);
	if (pairedEndMode)
		readMem += sizeof(MappingInfo) + sizeof(MappingLocations);

	_r_maxSeqCnt = (int)(((MAX_MEMORY-1.2) * (1 << 30))/readMem);
	if ( pairedEndMode && _r_maxSeqCnt % 2 )
		_r_maxSeqCnt ++;
	_r_maxSeqCnt -= _r_maxSeqCnt % THREAD_COUNT;

//_r_maxSeqCnt = 500000;

	_r_seq = getMem(sizeof(Read)*_r_maxSeqCnt);

	int maxErrThreshold = (SEQ_LENGTH/WINDOW_SIZE) - 1;
	if (errThreshold == -1)
	{
		errThreshold = SEQ_LENGTH*6/100;
		fprintf(stdout, "# Errors: %d\n", errThreshold);
	}
	if (errThreshold > maxErrThreshold && SEQ_LENGTH>0)
	{
		errThreshold = maxErrThreshold;
		fprintf(stdout, "# Error: %d (full sensitivity)\n", errThreshold);
	}


	checkSumLength = (SEQ_LENGTH / (errThreshold+1)) - WINDOW_SIZE;
	if (checkSumLength > sizeof(CheckSumType)*4)
		checkSumLength = sizeof(CheckSumType)*4;

	calculateSamplingLocations();


	if (!nohitDisabled)
	{
		_r_umfp = fileOpen(unmappedOutput, "w");
	}

	_r_alphIndex = getMem(128);		// used in readChunk()
	_r_alphIndex['A'] = 0;
	_r_alphIndex['C'] = 1;
	_r_alphIndex['G'] = 2;
	_r_alphIndex['T'] = 3;
	_r_alphIndex['N'] = 4;

	return 1;
}