Beispiel #1
0
/**
 * Fills @p hdr with the experiment header of this DAT file.
 *
 * The header bytes are fetched through GetFileData(), starting at the offset
 * sizeof(struct dat_file_header), copied verbatim into @p hdr and then every
 * multi-byte field is run through ByteSwap4/ByteSwap2 (presumably converting
 * from the on-disk byte order to the host's - confirm against the ByteSwap
 * definitions).
 *
 * If the uncompressed frame count is smaller than the stored frame count, or
 * at least four times as large, the function prints "Unknown compression" to
 * stdout and terminates the process with exit status 55.
 *
 * @param hdr  Destination structure; overwritten completely.
 */
void DatFile::GetExperimentHeader(struct experiment_header &hdr)
{
    // Request exactly one experiment header located behind the file header.
    const size_t headerOffset = sizeof(struct dat_file_header);
    const size_t headerBytes = sizeof(struct experiment_header);
    char *rawHeader = this->GetFileData(headerOffset, headerBytes);
    memcpy(&hdr, rawHeader, sizeof(hdr));

    // One 32-bit field ...
    ByteSwap4(hdr.first_frame_time);
    // ... followed by the 16-bit fields.
    ByteSwap2(hdr.rows);
    ByteSwap2(hdr.cols);
    ByteSwap2(hdr.x_region_size);
    ByteSwap2(hdr.y_region_size);
    ByteSwap2(hdr.frames_in_file);
    ByteSwap2(hdr.uncomp_frames_in_file);
    ByteSwap2(hdr.interlaceType);

    // Accept only frame counts with stored <= uncompressed < 4 * stored.
    const bool notFewerThanStored = hdr.uncomp_frames_in_file >= hdr.frames_in_file;
    const bool belowFourfold = hdr.uncomp_frames_in_file < hdr.frames_in_file * 4;
    if (notFewerThanStored && belowFourfold) {
        return;
    }

    std::cout << "Unknown compression" << std::endl;
    exit(55);
}
Beispiel #2
0
// Prints `reason` followed by the SFFFilter usage text on stdout, then
// terminates the process with exit status 1.
static void sffFilterUsageAndExit(const char *reason, const char *progName)
{
	fprintf (stdout, "%s\n", reason);
	fprintf (stdout, "Usage: %s [-f filename] [-d] sff-filename\n", progName);
	fprintf (stdout, "\t-f Specify input file list.\n");
	fprintf (stdout, "\t-o Specify output sff file name.\n");
	fprintf (stdout, "\t-d Prints debug information.\n");
	fprintf (stdout, "\t-q Take qualities from 4th field of file specified by -f.\n");
	fprintf (stdout, "\t-s To use in conjunction with -q option, specifies an offset to be applied to quality scores.\n");
	exit (1);
}

// Returns the value that follows option argv[*idx] and advances *idx onto it.
// Exits with status 1 when the option is the last word on the command line.
static char *sffFilterOptionValue(int argc, char *argv[], int *idx)
{
	if (*idx + 1 >= argc) {
		fprintf (stderr, "Option %s requires an argument\n", argv[*idx]);
		exit (1);
	}
	*idx += 1;
	return argv[*idx];
}

// malloc() wrapper that never returns NULL: exits with status 1 on failure.
// A zero-byte request is rounded up to one byte so the result is always a
// valid pointer that can be passed to free().
static void *sffFilterAlloc(size_t bytes)
{
	void *mem = malloc (bytes > 0 ? bytes : 1);
	if (!mem) {
		fprintf (stderr, "Out of memory\n");
		exit (1);
	}
	return mem;
}

// Reads exactly `len` bytes from `fp` into `dst`. A zero-length request is a
// no-op. On a short read the program exits with status 1, naming `what`.
static void sffFilterReadBytes(void *dst, size_t len, FILE *fp, const char *what)
{
	if (len == 0)
		return;
	if (fread (dst, 1, len, fp) != len) {
		fprintf (stderr, "Unexpected end of file or read error while reading %s\n", what);
		exit (1);
	}
}

/*
 * SFF filter: copies to a new SFF file only those reads whose (row, column),
 * as decoded from the read name by ion_readname_to_rowcol(), appears in a
 * list file.
 *
 * Command line:
 *   -f <file>  list file with one "row col length [qualities]" entry per line (required)
 *   -o <file>  output SFF name (default: "filtered_" + input name, same directory)
 *   -q         replace quality scores with the 4th field of the list file
 *   -s <n>     non-zero offset added to each quality character taken with -q
 *   -d         print debug information about the header and every read
 *
 * For each selected read, bases from the listed length onward are masked
 * (base 'N', quality 0, flow index 0) and clip_qual_right is limited to that
 * length. Listed positions never seen in the SFF are reported in
 * ./SFFFilter_err.txt.
 *
 * Returns 0 on success; all error paths call exit() with a non-zero status.
 *
 * NOTE(review): index_offset/index_length are copied from the input header
 * although no index is written to the output - confirm whether they should
 * be zeroed.
 */
int main(int argc, char *argv[])
{
	FILE *inSFF = NULL;
	FILE *outSFF = NULL;
	FILE *listFP = NULL;
	char *inFileName = NULL;
	char *outFileName = NULL;
	char *listFileName = NULL;
	const char *errFileName = "./SFFFilter_err.txt";
	int numReads;
	int matchCnt = 0;
	int got = 0;
	bool debugflag = false;
	bool qualflag = false;
	int qual_offset = 33;
	bool listMatch = false;
	
	char		name[256];
	uint16_t	*flowgram_values; // [NUMBER_OF_FLOWS_PER_READ];
	uint8_t		*flow_index_per_base; // * number_of_bases;
	char		*bases; // * number_of_bases;
	uint8_t		*quality_scores; // * number_of_bases;

	char		*flow_chars;
	char		*key_sequence;
	
	// Parse command line arguments
	int argcc = 1;
	while (argcc < argc) {
		if (argv[argcc][0] == '-') {
			switch (argv[argcc][1]) {
				
				case 'd':	// print debug info
					debugflag = true;
				break;
				
				case 'q':	// take quality scores from the list file
					qualflag = true;
				break;
				
				case 'f':	// list of locations to filter
					free (listFileName);	// tolerate a repeated -f
					listFileName = strdup (sffFilterOptionValue (argc, argv, &argcc));
				break;
				
				case 's':	// Offset to apply to quality scores
					qual_offset = atoi (sffFilterOptionValue (argc, argv, &argcc));
					if(qual_offset==0) {
						fprintf (stderr, "-s option should specify a nonzero quality offset\n");
						exit (1);
					}
				break;
				
				case 'o':	// output file name
					free (outFileName);	// tolerate a repeated -o
					outFileName = strdup (sffFilterOptionValue (argc, argv, &argcc));
				break;
				
				default:
					fprintf (stderr, "Unknown option %s\n", argv[argcc]);
					exit (1);
				break;
			}
		}
		else {
			inFileName = argv[argcc];
		}
		argcc++;
	}
	
	if (!inFileName)
		sffFilterUsageAndExit ("No input sff file specified", argv[0]);
	if (!listFileName)
		sffFilterUsageAndExit ("No input list file specified", argv[0]);
	
	//Create output filename from input filename if it wasn't specified.
	//The path is split by hand: dirname() is allowed to modify its argument,
	//which would corrupt inFileName before it is opened below.
	if(outFileName==NULL) {
		const char *slash = strrchr (inFileName, '/');
		const char *baseName = slash ? slash + 1 : inFileName;
		const char *dirName = slash ? inFileName : ".";
		const size_t dirLen = slash ? (size_t) (slash - inFileName) : 1;
		const size_t outLen = dirLen + strlen (baseName) + 16;	// "/filtered_" + NUL fit in 16
		outFileName = (char *) sffFilterAlloc (outLen);
		snprintf (outFileName, outLen, "%.*s/filtered_%s", (int) dirLen, dirName, baseName);
	}
	
	//Open the SFF file
	inSFF = fopen(inFileName, "rb");
	if (!inSFF) {
		perror (inFileName);
		exit (1);
	}
	//Open the outputSFF file
	outSFF = fopen(outFileName, "wb");
	if (!outSFF) {
		perror (outFileName);
		exit (1);
	}
	//Open the list file
	listFP = fopen(listFileName, "rb");
	if (!listFP) {
		perror (listFileName);
		exit (1);
	}
	
	//Read the list of locations into buffer
	got = GetNumLines(listFileName);
	if (got <= 0) {
		fprintf (stderr, "Did not read any pixel coordinates; does the file exist?  Is it formatted correctly?\n");
		exit (1);
	}
	else {
		fprintf (stdout, "Reading up to %d lines\n", got);
	}
	
	//Dynamic array allocation
	const int numAllocated = got;	// number of quals[] rows to release at the end
	int *rows = (int *) sffFilterAlloc (sizeof(int) * got);
	int *cols = (int *) sffFilterAlloc (sizeof(int) * got);
	int *lengths = (int *) sffFilterAlloc (sizeof(int) * got);
	char **quals = (char **) sffFilterAlloc (sizeof(char*) * got);
	bool *fnds = (bool *) sffFilterAlloc (sizeof(bool) * got);	//tracks reads that were found in SFF file
	for (int i=0;i<got;i++)
	{
		fnds[i] = false;
		quals[i] = (char *) sffFilterAlloc (sizeof(char) * MAX_BASES);
		quals[i][0] = '\0';
	}

	// Limit the %s conversion to the size of a quals[] row.
	char qualFmt[64];
	snprintf (qualFmt, sizeof(qualFmt), "%%d %%d %%d %%%ds\n", (int) MAX_BASES - 1);

	// Never parse more entries than the arrays were sized for.
	int lineCnt = 0;
	while (lineCnt < got && !feof(listFP)) {
		if(qualflag) {
			if(4 != fscanf (listFP, qualFmt, &rows[lineCnt], &cols[lineCnt], &lengths[lineCnt], quals[lineCnt])) {
				fprintf(stderr,"%s: bad format in line %d of %s - expected 3 ints and a char string.\n",argv[0],1+lineCnt,listFileName);
				exit(EXIT_FAILURE);
			} else if(strlen(quals[lineCnt]) < (unsigned int) lengths[lineCnt]) {
				fprintf(stderr,"%s: warning: line %d of %s - quality string is shorter than requested length.\n",argv[0],1+lineCnt,listFileName);
			}
		} else {
			if(3 != fscanf (listFP, "%d %d %d\n", &rows[lineCnt], &cols[lineCnt], &lengths[lineCnt])) {
				fprintf(stderr,"%s: bad format in line %d of %s - expected 3 ints.\n",argv[0],1+lineCnt,listFileName);
				exit(EXIT_FAILURE);
			}
		}
		lineCnt++;
	}
	fclose (listFP);
	// Only entries that were actually parsed take part in matching/reporting.
	got = lineCnt;
	
	
	// Read the input file header (fixed 31-byte part)
	CommonHeader h;
	sffFilterReadBytes (&h, 31, inSFF, "common header");
	
	//Copy the header (still in file byte order) to write the output file
	CommonHeader ch_out;
	ch_out.magic_number = h.magic_number;
	ch_out.version[0] = 0;
	ch_out.version[1] = 0;
	ch_out.version[2] = 0;
	ch_out.version[3] = 1;
	ch_out.index_offset = h.index_offset;
	ch_out.index_length = h.index_length;
	ch_out.number_of_reads = h.number_of_reads;
	ch_out.header_length = h.header_length;
	ch_out.key_length = h.key_length;
	ch_out.number_of_flows_per_read = h.number_of_flows_per_read;
	ch_out.flowgram_format_code = h.flowgram_format_code;
	
	// Convert the fields used below from file byte order to host byte order
	ByteSwap8(h.index_offset);
	ByteSwap4(h.index_length);
	ByteSwap4(h.number_of_reads);
	ByteSwap2(h.header_length);
	ByteSwap2(h.key_length);
	ByteSwap2(h.number_of_flows_per_read);
	flow_chars = (char *) sffFilterAlloc (h.number_of_flows_per_read);
	key_sequence = (char *) sffFilterAlloc (h.key_length);
	sffFilterReadBytes (flow_chars, h.number_of_flows_per_read, inSFF, "flow characters");
	sffFilterReadBytes (key_sequence, h.key_length, inSFF, "key sequence");
	// The header is padded to a multiple of 8 bytes; an already aligned
	// header carries no padding at all (hence the trailing "& 7").
	int padBytes = (8-((31 + h.number_of_flows_per_read + h.key_length) & 7)) & 7;
	char padData[8];
	sffFilterReadBytes (padData, padBytes, inSFF, "header padding");
	
	if (debugflag) {
		//DEBUG
		printf("Magic:	%u	%s\n", h.magic_number, (h.magic_number == MAGIC ? "Yes" : "No"));
		printf("Header length: %hu\n", h.header_length);
		printf("Version: %d%d%d%d\n", h.version[0], h.version[1], h.version[2], h.version[3]);
		printf("Index offset: %lu  length: %u\n", h.index_offset, h.index_length);
		printf("Number of reads: %u\n", h.number_of_reads);
		printf("Key length: %u\n", h.key_length);
		printf("Flows per read: %hu\n", h.number_of_flows_per_read);
		printf("Flowgram format: %hhu\n", h.flowgram_format_code);
		printf ("End of Header\n\n");
	}
		
	// Write the header of the output SFF. Flow characters and key are copied
	// with their real lengths so the layout matches the header_length,
	// key_length and number_of_flows_per_read fields copied above.
	char pad[8];
	memset(pad, 0, sizeof(pad));
	int bytes = 31;
	fwrite (&ch_out, bytes, 1, outSFF);
	fwrite (flow_chars, 1, h.number_of_flows_per_read, outSFF);
	bytes += h.number_of_flows_per_read;
	fwrite (key_sequence, 1, h.key_length, outSFF);
	bytes += h.key_length;

	padBytes = (8 - (bytes & 0x7)) & 0x7;
	if (padBytes > 0)
		fwrite(pad, 1, padBytes, outSFF);

	// Prepare to process all the reads
	numReads = h.number_of_reads;
	
	flowgram_values = (uint16_t *) sffFilterAlloc (sizeof(uint16_t) * h.number_of_flows_per_read);
	int maxBases = h.number_of_flows_per_read * 100; // upper bound on bases per read accepted below
	flow_index_per_base = (uint8_t *) sffFilterAlloc (sizeof(uint8_t) * maxBases);
	bases = (char *) sffFilterAlloc (maxBases + 1);	// +1 for the terminating NUL
	quality_scores = (uint8_t *) sffFilterAlloc (sizeof(uint8_t) * maxBases);
	
	//Loop thru the reads
	for (int i=0;i<numReads;i++) {

		// Read read header (fixed 16-byte part)
		ReadHeader r;
		sffFilterReadBytes (&r, 16, inSFF, "read header");

		ByteSwap2(r.read_header_length);
		ByteSwap4(r.number_of_bases);
		ByteSwap2(r.name_length);
		ByteSwap2(r.clip_qual_left);
		ByteSwap2(r.clip_qual_right);
		ByteSwap2(r.clip_adapter_left);
		ByteSwap2(r.clip_adapter_right);
		
		// The name must fit the local buffer including its terminator.
		if (r.name_length >= sizeof(name)) {
			fprintf (stderr, "Read %d: name length %u exceeds the supported maximum of %u\n",
							i, (unsigned int) r.name_length, (unsigned int) (sizeof(name) - 1));
			exit (1);
		}
		sffFilterReadBytes (name, r.name_length, inSFF, "read name");
		name[r.name_length] = '\0';	// also clears a stale name when name_length is 0
		
		int readPadLength = (8 - ((16 + r.name_length) & 7)) & 7;
		sffFilterReadBytes (padData, readPadLength, inSFF, "read header padding");

		// The per-base buffers hold at most maxBases entries.
		if ((unsigned long) r.number_of_bases > (unsigned long) maxBases) {
			fprintf (stderr, "Read '%s': %lu bases exceed the supported maximum of %d\n",
							name, (unsigned long) r.number_of_bases, maxBases);
			exit (1);
		}

		sffFilterReadBytes (flowgram_values, sizeof(uint16_t) * h.number_of_flows_per_read, inSFF, "flowgram values");
		sffFilterReadBytes (flow_index_per_base, r.number_of_bases, inSFF, "flow index per base");
		sffFilterReadBytes (bases, r.number_of_bases, inSFF, "bases");
		bases[r.number_of_bases] = '\0';
		sffFilterReadBytes (quality_scores, r.number_of_bases, inSFF, "quality scores");

		int bytesRead = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * r.number_of_bases;
		readPadLength = (8 - (bytesRead & 7)) & 7;
		sffFilterReadBytes (padData, readPadLength, inSFF, "read data padding");
		
		// Flowgram values to host byte order; swapped back before writing.
		int f;
		for(f=0;f<h.number_of_flows_per_read;f++)
			ByteSwap2(flowgram_values[f]);

		if (debugflag) {
			// DEBUG
			printf("Read: %s has %d bases\n",
							(r.name_length > 0 ? name : "NONAME"),
							r.number_of_bases);
			printf("Clip left: %d qual: %d right: %d qual: %d\n",
						r.clip_adapter_left, r.clip_qual_left,
						r.clip_adapter_right, r.clip_qual_right);
			printf("Flowgram bases:\n");
			for(f=0;f<h.number_of_flows_per_read;f++)
				printf("%d ", (int) floor (flowgram_values[f]/100.0 + 0.5));
			printf("\n");
			printf("Bases called:\n");
			for(unsigned int b=0;b<r.number_of_bases;b++)
				printf("%c", bases[b]);
		}
		
		//Get the row column for this read
		int row;
		int col;
		if(1 != ion_readname_to_rowcol(name, &row, &col)) {
			fprintf (stderr, "Error parsing read name: '%s'\n", name);
			continue;
		}
		
		//Look for matching row column in the list
		listMatch = false;
		int readMatch=0;
		for (;readMatch<got;readMatch++) {
			if (row == rows[readMatch] && col == cols[readMatch]) {
				listMatch = true;
				fnds[readMatch] = true;
				matchCnt++;
				break;
			}
		}
		
		if (!listMatch)
			continue;	//Skip this read

		//
		//	Update the output file
		//
		int nameLen = r.name_length;
		int numBasesCalled = r.number_of_bases;

		// Requested length, limited to what this read actually contains so
		// that none of the per-base loops below can index out of bounds.
		int keepLen = lengths[readMatch];
		if (keepLen < 0)
			keepLen = 0;
		if (keepLen > numBasesCalled)
			keepLen = numBasesCalled;

		if(r.clip_qual_right == 0 || r.clip_qual_right > keepLen)
			r.clip_qual_right = keepLen;

		// write the header (back in file byte order)
		ByteSwap2(r.read_header_length);
		ByteSwap4(r.number_of_bases);
		ByteSwap2(r.name_length);
		ByteSwap2(r.clip_qual_left);
		ByteSwap2(r.clip_qual_right);
		ByteSwap2(r.clip_adapter_left);
		ByteSwap2(r.clip_adapter_right);
		fwrite (&r, 16, 1, outSFF);

		fwrite(name, 1, nameLen, outSFF);
		int writePadLength = (8 - (nameLen & 7)) & 7;
		if (writePadLength)
			fwrite(pad, 1, writePadLength, outSFF);	// zero padding

		if(qualflag) {
			// Use no more quality characters than the list actually supplied;
			// bases beyond a short quality string keep their original score.
			int qualLen = (int) strlen (quals[readMatch]);
			if (qualLen > keepLen)
				qualLen = keepLen;
			for(int iBase=0; iBase < qualLen; iBase++) {
				quality_scores[iBase] = (uint8_t) quals[readMatch][iBase] + qual_offset;
			}
		}
		// Mask everything past the requested length
		for(int iBase=keepLen; iBase < numBasesCalled; iBase++) {
			flow_index_per_base[iBase] = 0;
			bases[iBase] = 'N';
			quality_scores[iBase] = 0;
		}
		for (f=0;f<h.number_of_flows_per_read;f++)
			ByteSwap2(flowgram_values[f]);
		fwrite(flowgram_values, sizeof(uint16_t), h.number_of_flows_per_read, outSFF);

		fwrite(flow_index_per_base, sizeof(uint8_t), numBasesCalled, outSFF);

		fwrite(bases, 1, numBasesCalled, outSFF);

		fwrite(quality_scores, sizeof(uint8_t), numBasesCalled, outSFF);

		int bytesWritten = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * numBasesCalled;
		writePadLength = (8 - (bytesWritten & 7)) & 7;
		if (writePadLength)
			fwrite(pad, 1, writePadLength, outSFF);	// zero padding
	}
	
	//Update Read Count in output SFF file
	ch_out.number_of_reads = BYTE_SWAP_4(matchCnt);
	fseek (outSFF, 0, SEEK_SET);
	bytes=31;
	fwrite(&ch_out, bytes, 1, outSFF);

	// Surface any write error before claiming success
	if (fflush (outSFF) != 0 || ferror (outSFF)) {
		perror (outFileName);
		exit (1);
	}
	
	//User message
	fprintf (stdout, "Created file: %s\n", outFileName);
	
	//
	//Write out report on unfound reads
	//
	bool printErrorLog = false;
	for (int i=0;i<got;i++)
	{
		if (fnds[i] == false) {
			printErrorLog = true;
			break;
		}
	}
	if (printErrorLog)
	{
		fprintf (stdout, "There are reads that were not found.  See %s\n", errFileName);
		FILE *fpErr = fopen (errFileName, "wb");
		if (fpErr) {
			fprintf (fpErr, "# SFF file: %s\n", inFileName);
			fprintf (fpErr, "# Read positions source: %s\n", listFileName);
			fprintf (fpErr, "# Reads not found in SFF:\n");
			fprintf (fpErr, "# Row Column\n");
			for (int i=0;i<got;i++)
			{
				if (fnds[i] == false) {
					fprintf (fpErr, "%d %d\n", rows[i], cols[i]);
				}
			}
			fclose (fpErr);
		}
		
	}
	//Cleanup
	fclose (inSFF);
	fclose (outSFF);
	for (int i=0;i<numAllocated;i++)
		free (quals[i]);
	free (quals);
	free (lengths);
	free (rows);
	free (cols);
	free (fnds);
	free (flow_chars);
	free (key_sequence);
	free (flowgram_values);
	free (flow_index_per_base);
	free (bases);
	free (quality_scores);
	free (listFileName);
	free (outFileName);
	
	return 0;
}
Beispiel #3
0
// Returns the value that follows option argv[*idx] and advances *idx onto it.
// Exits with status 1 when the option is the last word on the command line.
static char *sffTrimOptionValue(int argc, char *argv[], int *idx)
{
	if (*idx + 1 >= argc) {
		fprintf (stderr, "Option %s requires an argument\n", argv[*idx]);
		exit (1);
	}
	*idx += 1;
	return argv[*idx];
}

// malloc() wrapper that never returns NULL: exits with status 1 on failure.
// A zero-byte request is rounded up to one byte so the result is always a
// valid pointer that can be passed to free().
static void *sffTrimAlloc(size_t bytes)
{
	void *mem = malloc (bytes > 0 ? bytes : 1);
	if (!mem) {
		fprintf (stderr, "Out of memory\n");
		exit (1);
	}
	return mem;
}

// Reads exactly `len` bytes from `fp` into `dst`. A zero-length request is a
// no-op. On a short read the program exits with status 1, naming `what`.
static void sffTrimReadBytes(void *dst, size_t len, FILE *fp, const char *what)
{
	if (len == 0)
		return;
	if (fread (dst, 1, len, fp) != len) {
		fprintf (stderr, "Unexpected end of file or read error while reading %s\n", what);
		exit (1);
	}
}

/*
 * Barcode trimmer: copies an SFF file to "AT_<name>" in the same directory,
 * setting clip_qual_left of every read just past a barcode found near the key.
 *
 * Command line:
 *   -s <string>  barcode to search for (default "CTTCCTTC")
 *   -f <value>   acceptance threshold, fraction of matching bases (default 0.85)
 *   -d           print per-read debug information
 *
 * For each read the barcode is compared against the bases at every start
 * index in [sPos, endPos). The index with the most matching bases wins; if
 * its matching fraction reaches the threshold, the clip point is placed
 * behind the barcode, otherwise directly behind index 4. Reads with fewer
 * than endPos bases are dropped from the output. A histogram of the best
 * match counts is printed at the end.
 *
 * Returns 0 on success; all error paths call exit() with a non-zero status.
 */
int main(int argc, char *argv[])
{
	FILE *inSFF = NULL;
	FILE *outSFF = NULL;
	char *inFileName = NULL;
	char outFileName[512] = {"\0"};
	int numReads;
	bool debugflag = false;
	char *primer = NULL;
	double fom = 0.85;
	unsigned int sPos = 4;	// start searching after 4 base key
	unsigned int endPos = 7;	// end searching at 3rd base after key
	
	char		name[256];
	uint16_t	*flowgram_values; // [NUMBER_OF_FLOWS_PER_READ];
	uint8_t		*flow_index_per_base; // * number_of_bases;
	char		*bases; // * number_of_bases;
	uint8_t		*quality_scores; // * number_of_bases;

	char		*flow_chars;
	char		*key_sequence;
	
	// Parse command line arguments
	int argcc = 1;
	while (argcc < argc) {
		if (argv[argcc][0] == '-') {
			switch (argv[argcc][1]) {
				
				case 's':	// define barcode string
					free (primer);	// tolerate a repeated -s
					primer = strdup (sffTrimOptionValue (argc, argv, &argcc));
				break;
				
				case 'd':	// print debug info
					debugflag = true;
				break;
				case 'f':	// acceptance threshold
					fom = atof (sffTrimOptionValue (argc, argv, &argcc));
				break;
			}
		}
		else {
			inFileName = argv[argcc];
		}
		argcc++;
	}
	
	if (!inFileName) {
		fprintf (stdout, "No input file specified\n");
		fprintf (stdout, "Usage: %s [-s barcode][-f #][-d] sff-filename\n", argv[0]);
		fprintf (stdout, "\t-s Specify barcode string (CTTCCTTC).\n");
		fprintf (stdout, "\t-f Specify acceptance threshold (0.85).\n");
		fprintf (stdout, "\t-d Prints debug information.\n");
		exit (1);
	}
	
	// No barcode passed in from command line so set up a default
	if (primer == NULL) {
		primer = strdup ("CTTCCTTC");
	}
	if (primer == NULL) {
		fprintf (stderr, "Out of memory\n");
		exit (1);
	}
	const int primerLen = (int) strlen (primer);
	if (primerLen == 0) {
		// An empty barcode would make every match fraction 0/0.
		fprintf (stderr, "-s option should specify a non-empty barcode string\n");
		exit (1);
	}
	
	//Create output filename from input filename.
	//The path is split by hand: dirname() is allowed to modify its argument,
	//which would corrupt inFileName before it is opened below.
	{
		const char *slash = strrchr (inFileName, '/');
		const char *baseName = slash ? slash + 1 : inFileName;
		const char *dirName = slash ? inFileName : ".";
		const int dirLen = slash ? (int) (slash - inFileName) : 1;
		const int needed = snprintf (outFileName, sizeof(outFileName), "%.*s/AT_%s", dirLen, dirName, baseName);
		if (needed < 0 || (size_t) needed >= sizeof(outFileName)) {
			fprintf (stderr, "Output file name for %s is too long\n", inFileName);
			exit (1);
		}
	}
	
	//Open the SFF file
	inSFF = fopen(inFileName, "rb");
	if (!inSFF) {
		perror (inFileName);
		exit (1);
	}
	//Open the outputSFF file
	outSFF = fopen(outFileName, "wb");
	if (!outSFF) {
		perror (outFileName);
		exit (1);
	}
	
	// Read the input file header (fixed 31-byte part)
	CommonHeader h;
	sffTrimReadBytes (&h, 31, inSFF, "common header");
	
	//Copy the header (still in file byte order) to write the output file
	CommonHeader ch_out;
	ch_out.magic_number = h.magic_number;
	ch_out.version[0] = 0;
	ch_out.version[1] = 0;
	ch_out.version[2] = 0;
	ch_out.version[3] = 1;
	ch_out.index_offset = h.index_offset;
	ch_out.index_length = h.index_length;
	ch_out.number_of_reads = h.number_of_reads;
	ch_out.header_length = h.header_length;
	ch_out.key_length = h.key_length;
	ch_out.number_of_flows_per_read = h.number_of_flows_per_read;
	// Without this the last header byte written would be uninitialized.
	ch_out.flowgram_format_code = h.flowgram_format_code;
	
	// Convert the fields used below from file byte order to host byte order
	ByteSwap8(h.index_offset);
	ByteSwap4(h.index_length);
	ByteSwap4(h.number_of_reads);
	ByteSwap2(h.header_length);
	ByteSwap2(h.key_length);
	ByteSwap2(h.number_of_flows_per_read);
	flow_chars = (char *) sffTrimAlloc (h.number_of_flows_per_read);
	key_sequence = (char *) sffTrimAlloc (h.key_length);
	sffTrimReadBytes (flow_chars, h.number_of_flows_per_read, inSFF, "flow characters");
	sffTrimReadBytes (key_sequence, h.key_length, inSFF, "key sequence");
	// The header is padded to a multiple of 8 bytes; an already aligned
	// header carries no padding at all (hence the trailing "& 7").
	int padBytes = (8-((31 + h.number_of_flows_per_read + h.key_length) & 7)) & 7;
	char padData[8];
	sffTrimReadBytes (padData, padBytes, inSFF, "header padding");
		
	// Write the header of the output SFF. Flow characters and key are copied
	// with their real lengths so the layout matches the header_length,
	// key_length and number_of_flows_per_read fields copied above.
	char pad[8];
	memset(pad, 0, sizeof(pad));
	int bytes = 31;
	fwrite (&ch_out, bytes, 1, outSFF);
	fwrite (flow_chars, 1, h.number_of_flows_per_read, outSFF);
	bytes += h.number_of_flows_per_read;
	fwrite (key_sequence, 1, h.key_length, outSFF);
	bytes += h.key_length;

	padBytes = (8 - (bytes & 0x7)) & 0x7;
	if (padBytes > 0)
		fwrite(pad, 1, padBytes, outSFF);

	// Prepare to process all the reads
	numReads = h.number_of_reads;
	// Statistics variables
	int numFoundSeq = 0;	// number of reads that were searched for the barcode
	int numPassFOM = 0;		// number of reads above acceptance threshold
	int numWritten = 0;		// number of reads copied to the output file
	int *corr_histo = (int *) sffTrimAlloc (sizeof(int) * (primerLen+1));
	for (int j=0;j<primerLen+1;j++)
		corr_histo[j] = 0;
	
	flowgram_values = (uint16_t *) sffTrimAlloc (sizeof(uint16_t) * h.number_of_flows_per_read);
	int maxBases = h.number_of_flows_per_read * 100; // upper bound on bases per read accepted below
	flow_index_per_base = (uint8_t *) sffTrimAlloc (sizeof(uint8_t) * maxBases);
	bases = (char *) sffTrimAlloc (maxBases + 1);	// +1 for the terminating NUL
	quality_scores = (uint8_t *) sffTrimAlloc (sizeof(uint8_t) * maxBases);
	// One score per candidate start index; reused for every read
	double *correctness = (double *) sffTrimAlloc (sizeof(double) * (endPos - sPos));
	
	//Loop thru the reads
	for (int nr=0;nr<numReads;nr++) {
		// Read read header (fixed 16-byte part)
		ReadHeader r;
		sffTrimReadBytes (&r, 16, inSFF, "read header");

		ByteSwap2(r.read_header_length);
		ByteSwap4(r.number_of_bases);
		ByteSwap2(r.name_length);
		ByteSwap2(r.clip_qual_left);
		ByteSwap2(r.clip_qual_right);
		ByteSwap2(r.clip_adapter_left);
		ByteSwap2(r.clip_adapter_right);
		
		// The name must fit the local buffer including its terminator.
		if (r.name_length >= sizeof(name)) {
			fprintf (stderr, "Read %d: name length %u exceeds the supported maximum of %u\n",
							nr, (unsigned int) r.name_length, (unsigned int) (sizeof(name) - 1));
			exit (1);
		}
		sffTrimReadBytes (name, r.name_length, inSFF, "read name");
		name[r.name_length] = '\0';
		
		int readPadLength = (8 - ((16 + r.name_length) & 7)) & 7;
		sffTrimReadBytes (padData, readPadLength, inSFF, "read header padding");

		// The per-base buffers hold at most maxBases entries.
		if ((unsigned long) r.number_of_bases > (unsigned long) maxBases) {
			fprintf (stderr, "Read '%s': %lu bases exceed the supported maximum of %d\n",
							name, (unsigned long) r.number_of_bases, maxBases);
			exit (1);
		}

		// The flowgram is only passed through, so it stays in file byte order.
		sffTrimReadBytes (flowgram_values, sizeof(uint16_t) * h.number_of_flows_per_read, inSFF, "flowgram values");
		sffTrimReadBytes (flow_index_per_base, r.number_of_bases, inSFF, "flow index per base");
		sffTrimReadBytes (bases, r.number_of_bases, inSFF, "bases");
		bases[r.number_of_bases] = '\0';
		sffTrimReadBytes (quality_scores, r.number_of_bases, inSFF, "quality scores");

		int bytesRead = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * r.number_of_bases;
		readPadLength = (8 - (bytesRead & 7)) & 7;
		sffTrimReadBytes (padData, readPadLength, inSFF, "read data padding");
		
		/*	Trimming based on finding the actual barcode 'near' the key	*/
		// Reads too short to be searched are dropped from the output; the
		// read count in the output header is corrected after the loop.
		if (r.number_of_bases < endPos)
			continue;

		// Score every candidate start position, going left to right
		unsigned int i;
		for (i=sPos;i<endPos;i++) {
			int correct = 0;
			// Compare only against bases this read really has
			for (int flow=0;flow<primerLen && flow+i<r.number_of_bases;flow++) {
				if (primer[flow] == bases[flow+i]) {
					correct++;
				}
			}
			correctness[i-sPos] = (double) correct;
		}
		
		// Pick the first position with the highest score
		double max = correctness[0];
		unsigned int matchingIndex = sPos;
		for (i=sPos+1;i<endPos;i++) {
			if (correctness[i-sPos] > max) {
				max = correctness[i-sPos];
				matchingIndex = i;
			}
		}
		
		corr_histo[(int)max]++;
		
		numFoundSeq++;
		
		if ((double)(max/primerLen) >= fom) {
			numPassFOM++;
			if (debugflag) {
				fprintf (stdout, "%s\n", name);
				fprintf (stdout, "Matching Index = %d (%0.2lf)\n", matchingIndex, (double)(max/primerLen));
				for (i=0;i<matchingIndex;i++)
					fprintf (stdout, " ");
				fprintf (stdout, "%s\n%s\n", primer, bases);
			}
			matchingIndex = matchingIndex + primerLen;
			if (debugflag)
				fprintf (stdout, "Trim point is %d\n", matchingIndex+1);
		}
		else {
			matchingIndex = 4;
		}
		r.clip_qual_left = matchingIndex+1;
		/*	End Trimming barcode near the key							*/
		

		//
		//	Update the output file
		//
		int nameLen = r.name_length;
		int numBasesCalled = r.number_of_bases;
		
		// write the header (back in file byte order)
		ByteSwap2(r.read_header_length);
		ByteSwap4(r.number_of_bases);
		ByteSwap2(r.name_length);
		ByteSwap2(r.clip_qual_left);
		ByteSwap2(r.clip_qual_right);
		ByteSwap2(r.clip_adapter_left);
		ByteSwap2(r.clip_adapter_right);
		fwrite (&r, 16, 1, outSFF);
		
		fwrite(name, 1, nameLen, outSFF);
		int writePadLength = (8 - (nameLen & 7)) & 7;
		if (writePadLength)
			fwrite(pad, 1, writePadLength, outSFF);	// zero padding

		fwrite(flowgram_values, sizeof(uint16_t), h.number_of_flows_per_read, outSFF);

		fwrite(flow_index_per_base, sizeof(uint8_t), numBasesCalled, outSFF);

		fwrite(bases, 1, numBasesCalled, outSFF);

		fwrite(quality_scores, sizeof(uint8_t), numBasesCalled, outSFF);

		int bytesWritten = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * numBasesCalled;
		writePadLength = (8 - (bytesWritten & 7)) & 7;
		if (writePadLength)
			fwrite(pad, 1, writePadLength, outSFF);	// zero padding

		numWritten++;
	}

	// Rewrite the header so its read count matches the reads actually written
	uint32_t writtenReads = (uint32_t) numWritten;
	ByteSwap4(writtenReads);
	ch_out.number_of_reads = writtenReads;
	fseek (outSFF, 0, SEEK_SET);
	fwrite (&ch_out, 31, 1, outSFF);

	// Surface any write error before reporting statistics
	if (fflush (outSFF) != 0 || ferror (outSFF)) {
		perror (outFileName);
		exit (1);
	}
	
	// Print statistics to stdout
	fprintf (stdout, "\n=====================================================\n");
	fprintf (stdout, "Barcode String: %s\n", primer);
	fprintf (stdout, "%15s: %10d\n", "Total Reads", numReads);
	fprintf (stdout, "%15s: %10d\n", "with Adapter", numFoundSeq);
	fprintf (stdout, "%15s: %10d\n", "passing FOM", numPassFOM);
	fprintf (stdout, "Acceptance threshold: %0.2lf%%\n", fom);
	fprintf (stdout, "Search indices: %d thru %d\n", sPos, endPos-1);
	for (int i = 0; i <= primerLen;i++) {
		// Report 0% instead of NaN when no read was searched
		const double share = numFoundSeq > 0 ? ((double)corr_histo[i]/(double)numFoundSeq)*100.0 : 0.0;
		fprintf (stdout, "[%2d/%2d %6.1lf%%] %5d/%d %6.2lf%%\n", i, primerLen, ((double)i/(double)primerLen)*100.0,corr_histo[i], numFoundSeq, share);
	}
	
	
	//Cleanup
	fclose (inSFF);
	fclose (outSFF);
	free (correctness);
	free (flow_index_per_base);
	free (flowgram_values);
	free (bases);
	free (quality_scores);
	free (flow_chars);
	free (key_sequence);
	free (primer);
	free (corr_histo);
	
	return 0;
}
Beispiel #4
0
int main(int argc, char *argv[])
{
  char	*fastqFileName = NULL;
  char	*sffFileName = NULL;
  bool	forceClip = false;
  bool	keyPass = false;
  bool	allReads = false;
  int	minReadLen = 8; // min read length after key-pass that we will write out to fastq file
  int	readCol = -1;
  int	readRow = -1;
  bool	findRead = false;
  int row, col;

  // process command-line args
  int argcc = 1;
  while (argcc < argc) {
      if (argv[argcc][0] == '-') {
          switch (argv[argcc][1]) {
            case 'a': // output all reads
              allReads = true;
              break;

            case 'R': // report read at row & column
              argcc++;	
              readRow = atoi(argv[argcc]);
              break;

            case 'C': // report read at row & column
              argcc++;	
              readCol = atoi(argv[argcc]);
              break;

            case 'q': // convert to fastq
              argcc++;
              fastqFileName = argv[argcc];
              break;

            case 'c': // force qual clip left to 5
              forceClip = true;
              break;

            case 'k': // force keypass
              keyPass = true;
              argcc++;
              hackkey = argv[argcc];
              hackkeylen = strlen(hackkey);
              break;

            case 'l': // set min readlength for fastq file output filter
              argcc++;
              minReadLen = atoi(argv[argcc]);
              break;

            default:
              //sffFileName = argv[argcc];
              break;
          }
      }
      else {
          sffFileName = argv[argcc];
      }
      argcc++;
  }

  if (!sffFileName) {
      printf("Usage: SFFRead [args] sffFile.sff\n");
      exit(0);
  }

  if (readCol > -1 && readRow > -1) {
      findRead = true;
      allReads = true;// makes it search all reads
  }

  FILE *fp;
  fp = fopen(sffFileName, "r+");
  if (fp) {
      if (!findRead && !fastqFileName)
        printf("Reading file: %s\n", sffFileName);
      CommonHeader h;

      // Fix the flow_format_code problem: make sure it is set to 1
      fpos_t p, start;

      fgetpos (fp, &p);
      fgetpos (fp, &start);
      start = p;
      int elements_read = fread(&h, 31, 1, fp);
      assert(elements_read == 1);
      h.flowgram_format_code = 1;
      fsetpos (fp, &p);
      fwrite (&h, 31, 1, fp);
      fsetpos (fp, &p);

      elements_read = fread(&h, 31, 1, fp);
      assert(elements_read == 1);
      ByteSwap8(h.index_offset);
      ByteSwap4(h.index_length);
      ByteSwap4(h.number_of_reads);
      ByteSwap2(h.header_length);
      ByteSwap2(h.key_length);
      ByteSwap2(h.number_of_flows_per_read);
      if (!findRead && !fastqFileName) {
          printf("Magic:	%u	%s\n", h.magic_number, (h.magic_number == MAGIC ? "Yes" : "No"));
          printf("Version: %d%d%d%d\n", h.version[0], h.version[1], h.version[2], h.version[3]);
          printf("Index offset: %lu  length: %u\n", h.index_offset, h.index_length);
          printf("Number of reads: %u\n", h.number_of_reads);
          printf("Header length: %hu\n", h.header_length);
          printf("Key length: %u\n", h.key_length);
          printf("Flows per read: %hu\n", h.number_of_flows_per_read);
          printf("Flowgram format: %hhu\n", h.flowgram_format_code);
      }

      flow_chars = (char *)malloc(h.number_of_flows_per_read);
      key_sequence = (char *)malloc(h.key_length);
      elements_read = fread(flow_chars, h.number_of_flows_per_read, 1, fp);
      assert(elements_read == 1);
      elements_read = fread(key_sequence, h.key_length, 1, fp);
      assert(elements_read == 1);
      int i;
      if (!findRead && !fastqFileName) {
          printf("Key sequence: ");
          for(i=0;i<h.key_length;i++)
            printf("%c", key_sequence[i]);

          printf("\nFlow chars:\n");
          for(i=0;i<h.number_of_flows_per_read;i++)
            printf("%c", flow_chars[i]);
          printf("\n");
      }
      int padBytes = (8-((31 + h.number_of_flows_per_read + h.key_length) & 7));
      char padData[8];
      //		fprintf (stdout, "Pad Bytes = %d\n", padBytes);
      elements_read = fread(padData, padBytes, 1, fp);
      assert(elements_read == 1);

      fgetpos(fp, &p);
      //		fprintf (stdout, "We are at %ld\n", (p.__pos - start.__pos));

      // -- read the reads
      int numReads = h.number_of_reads;

      // pre-allocate space so we be fast
      flowgram_values = (uint16_t *)malloc(sizeof(uint16_t) * h.number_of_flows_per_read);
      int maxBases = h.number_of_flows_per_read * 10; // problems if ever a 10-mer hits every flow!
      flow_index_per_base = (uint8_t *)malloc(sizeof(uint8_t) * maxBases);
      bases = (char *)malloc(maxBases);
      quality_scores = (uint8_t *)malloc(sizeof(uint8_t) * maxBases);

      for(i=0;i<numReads;i++) {
          ReadHeader r;

#define FIXIT
#ifdef FIXIT
          fpos_t pos;
          // Get position ready-to-read header
          fgetpos (fp, &pos);
          // Read header
          elements_read = fread(&r, 16,  1, fp);
          assert(elements_read == 1);
          // byte swap
          ByteSwap2(r.read_header_length);
          ByteSwap2(r.name_length);
          //			fprintf (stdout, "Old read header length = %d\n", r.read_header_length);
          // Fix the read_header_length
          // read_header_length is "16 + name_length" rounded up to nearest divisible by 8
          r.read_header_length = 16 + r.name_length;
          r.read_header_length += (8 - (r.read_header_length & 0x7)) & 0x7;
          //			fprintf (stdout, "New read header length = %d\n", r.read_header_length);
          // Byte swap
          ByteSwap2(r.read_header_length);
          ByteSwap2(r.name_length);
          // Rewind file pointer
          fsetpos (fp, &pos);
          // Write header out again
          fwrite (&r, 16, 1, fp);
          // Rewind again
          fsetpos (fp, &pos);
#endif

          // Read it in and continue
          fpos_t readStart;
          fgetpos(fp, &readStart);
          elements_read = fread(&r, 16,  1, fp);
          assert(elements_read == 1);

          //ByteSwap2(r.read_header_length);
          //ByteSwap2(r.name_length);
          ByteSwap2(r.clip_qual_left);
          ByteSwap2(r.clip_qual_right);
          ByteSwap2(r.clip_adapter_left);
          ByteSwap2(r.clip_adapter_right);

          // Fix clipping values
          r.clip_qual_left = 5;
          r.clip_adapter_left = 0;
          r.clip_qual_right = 0;
          r.clip_adapter_right = 0;
          ByteSwap2(r.clip_qual_left);
          ByteSwap2(r.clip_qual_right);
          ByteSwap2(r.clip_adapter_left);
          ByteSwap2(r.clip_adapter_right);

          // Rewind file pointer to beginning of read header
          fsetpos (fp, &pos);
          // Write read header
          fwrite (&r, 16, 1, fp);
          // Rewind file pointer to beginning of read header
          fsetpos (fp, &pos);
          // Read corrected header
          elements_read = fread(&r, 16,  1, fp);
          assert(elements_read == 1);

          ByteSwap2(r.read_header_length);
          ByteSwap4(r.number_of_bases);
          ByteSwap2(r.name_length);
          ByteSwap2(r.clip_qual_left);
          ByteSwap2(r.clip_qual_right);
          ByteSwap2(r.clip_adapter_left);
          ByteSwap2(r.clip_adapter_right);

          //printf("Read header length: %d\n", r.read_header_length);
          //printf("Read name length: %d\n", r.name_length);

          /*
             flow_index_per_base = (uint8_t *)malloc(sizeof(uint8_t) * r.number_of_bases);
             bases = (char *)malloc(r.number_of_bases);
             quality_scores = (uint8_t *)malloc(sizeof(uint8_t) * r.number_of_bases);
             */

          if (r.name_length > 0) {
              elements_read = fread(name, r.name_length, 1, fp);
              assert(elements_read == 1);
              name[r.name_length] = 0; // so we can easily print it
          }
          if(1 != ion_readname_to_rowcol(name, &row, &col)) {
              fprintf (stderr, "Error parsing read name: '%s'\n", name);
              continue;
          }

          int readPadLength = ((8 - ((16 + r.name_length) & 7)))%8;
          elements_read = fread(padData, readPadLength, 1, fp);
          assert(elements_read == 1);
          /*
             printf("Read: %s (r%d|c%d) has %d bases\n",
             (r.name_length > 0 ? name : "NONAME"),
             row, col,
             r.number_of_bases);
             printf("Clip left: %d qual: %d right: %d qual: %d\n",
             r.clip_adapter_left, r.clip_qual_left,
             r.clip_adapter_right, r.clip_qual_right);
             printf("Flowgram values:\n");
             */
          elements_read = fread(flowgram_values, h.number_of_flows_per_read, sizeof(uint16_t), fp);
          assert(elements_read == sizeof(uint16_t));
          elements_read = fread(flow_index_per_base, r.number_of_bases, sizeof(uint8_t), fp);
          assert(elements_read == sizeof(uint8_t));
          elements_read = fread(bases, r.number_of_bases, 1, fp);
          assert(elements_read == 1);
          elements_read = fread(quality_scores, r.number_of_bases, sizeof(uint8_t), fp);

          int bytesRead = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * r.number_of_bases;
          readPadLength = (8 - (bytesRead & 7))%8;
          elements_read = fread(padData, readPadLength, 1, fp);
          assert(elements_read == 1);
          fpos_t readEnd;
          fgetpos(fp, &readEnd);
          //			fprintf (stdout, "At end of read. Size: %ld\n", readEnd.__pos-readStart.__pos);
          //if ((readEnd.__pos-readStart.__pos) != r.read_header_length) {
          //	fprintf (stdout, "mismatch in read_header_length\n");
          //	exit (1);
          //}

          // parse the name to get the row & col, if matched, print out read
          if(1 != ion_readname_to_rowcol(name, &row, &col)) {
              fprintf (stderr, "Error parsing read name: '%s'\n", name);
              continue;
          }

          if (row == readRow && col == readCol) {
              //printf("Ionogram: ");
              int i;
              for(i=0;i<h.number_of_flows_per_read;i++) {
                  printf("%.2lf ", (double)(ByteSwap2(flowgram_values[i]))/100.0);
              }
              printf("\n");
          }
          /*
             int f;
             for(f=0;f<h.number_of_flows_per_read;f++)
             printf("%d ", ByteSwap2(flowgram_values[f]));
             printf("\nFlow index per base:\n");
             unsigned int b;
             for(b=0;b<r.number_of_bases;b++)
             printf("%d ", flow_index_per_base[b]);
             printf("\nBases called:\n");
             for(b=0;b<r.number_of_bases;b++)
             printf("%c", bases[b]);
             printf("\nQuality scores:\n");
             for(b=0;b<r.number_of_bases;b++)
             printf("%d ", quality_scores[b]);
             printf("\nDone with this read\n\n");
             */

          /*
             if (name) free(name);
             free(flowgram_values);
             free(flow_index_per_base);
             free(bases);
             free(quality_scores);
             */
      }
      free(flowgram_values);
      free(flow_index_per_base);
      free(bases);
      free(quality_scores);


      fclose(fp);
  }

  return 0;
}
/*
 *	Compose the portions of the reply packet specific to the
 *	EAP-TNC protocol, in the EAP reply typedata.
 *
 *	For reply codes below 3 the type data is laid out as
 *
 *	    flags_ver | data_length (only when non-zero, byte-swapped) | data
 *
 *	and eap_ds->request->type.length is set to the number of bytes
 *	written.  For any other code no type data is attached and
 *	type.length is set to 0.  In both cases the request code is copied
 *	from reply->code.
 *
 *	Returns 1 on success, 0 when reply->length is too small for the
 *	fields that have to be written or when the buffer cannot be
 *	allocated.
 */
int eaptnc_compose(EAP_DS *eap_ds, TNC_PACKET *reply)
{
	uint8_t *ptr;

	if (reply->code < 3) {
		/* the data_length field is only emitted when it is non-zero */
		const int hasDataLength = (reply->data_length != 0);
		/* number of bytes that precede the TNCCS data inside the type data */
		const size_t headerLength = TNC_FLAGS_VERSION_LENGTH +
			(hasDataLength ? TNC_DATA_LENGTH_LENGTH : 0);
		uint16_t thisDataLength = 0;

		//fill: EAP-Type (0x888e)
		eap_ds->request->type.type = PW_EAP_TNC;
		DEBUG2("TYPE: EAP-TNC set\n");
		rad_assert(reply->length > 0);

		/*
		 *	Work out how many payload bytes follow the header and
		 *	reject lengths that would wrap the unsigned subtraction
		 *	(and thereby overrun the buffer in the memcpy below).
		 */
		if (reply->data != NULL) {
			const size_t overhead = hasDataLength ?
				TNC_PACKET_LENGTH : TNC_PACKET_LENGTH_WITHOUT_DATA_LENGTH;

			if ((size_t)reply->length < overhead) {
				radlog(L_ERR, "rlm_eap_tnc: packet length too short for TNCCS data");
				return 0;
			}
			thisDataLength = (uint16_t)(reply->length - overhead);
		}
		/* everything written below must fit into reply->length bytes */
		if ((size_t)reply->length < headerLength + thisDataLength) {
			radlog(L_ERR, "rlm_eap_tnc: packet length too short for TNC header");
			return 0;
		}

		//alloc enough space for whole TNC-Packet (from Code on)
		eap_ds->request->type.data = calloc(reply->length, sizeof(uint8_t));
		DEBUG2("Malloc %d bytes for packet\n", reply->length);
		if (eap_ds->request->type.data == NULL) {
			radlog(L_ERR, "rlm_eap_tnc: out of memory");
			return 0;
		}

		//put pointer at position where data starts (behind Type)
		ptr = eap_ds->request->type.data;
		*ptr = reply->flags_ver;
		DEBUG2("Set Flags/Version: %d\n", *ptr);

		if (hasDataLength) {
			/*
			 *	Exactly 32 bits wide so that the copy below takes
			 *	the whole value regardless of the size or byte
			 *	order of "unsigned long" on the host.
			 */
			uint32_t swappedDataLength;

			DEBUG2("Set data-length: %d\n", reply->data_length);
			DEBUG2("Set data-length: %x\n", reply->data_length);
			DEBUG2("Set data-length (swapped): %x\n", ByteSwap2(reply->data_length));
			swappedDataLength = ByteSwap2(reply->data_length);
			//data_length sits directly behind flags_ver
			memcpy(ptr + TNC_FLAGS_VERSION_LENGTH, &swappedDataLength,
			       sizeof(swappedDataLength));
		}

		if (reply->data != NULL) {
			DEBUG2("Adding TNCCS-Data ");
			if (hasDataLength) {
				DEBUG2("with Fragmentation\n");
			} else {
				DEBUG2("without Fragmentation\n");
			}
			DEBUG2("TNCCS-Datalength: %d\n", thisDataLength);
			//data starts behind flags_ver and (if present) data_length
			memcpy(ptr + headerLength, reply->data, thisDataLength);
		} else {
			DEBUG2("No TNCCS-Data present");
		}

		//the length of the TNC-packet (behind Type)
		eap_ds->request->type.length = headerLength + thisDataLength;
		DEBUG2("Packet built\n");

	} else {
		eap_ds->request->type.length = 0;
	}
	eap_ds->request->code = reply->code;

	return 1;
}
/*
 *	Extract an EAP-TNC packet from the response stored in eap_ds.
 *
 *	We expect only RESPONSE for which REQUEST, SUCCESS or FAILURE is sent back.
 *
 *	The returned packet carries code, id and length of the response,
 *	the flags/version byte, the byte-swapped data_length (0 when the
 *	flags say it is not included) and a freshly allocated copy of the
 *	payload that follows those header fields.
 *
 *	Returns NULL when the response is missing or malformed, when its
 *	length is too small for the header fields, or on allocation failure.
 */
TNC_PACKET *eaptnc_extract(EAP_DS *eap_ds)
{
	tnc_packet_t	*data;
	TNC_PACKET	*packet;
	unsigned char	*ptr;
	int		thisDataLength;
	int		dataStart;

	/*
	 *	We need a response, of type EAP-TNC
	 */
	if (!eap_ds 					 ||
	    !eap_ds->response 				 ||
	    (eap_ds->response->code != PW_TNC_RESPONSE)	 ||
	    eap_ds->response->type.type != PW_EAP_TNC	 ||
	    !eap_ds->response->type.data 		 ||
	    (eap_ds->response->length <= TNC_HEADER_LEN) ||
	    (eap_ds->response->type.data[0] <= 0)) {
		radlog(L_ERR, "rlm_eap_tnc: corrupted data");
		return NULL;
	}
	packet = eaptnc_alloc();
	if (!packet) return NULL;

	packet->code = eap_ds->response->code;
	packet->id = eap_ds->response->id;
	packet->length = eap_ds->response->length;

	data = (tnc_packet_t *)eap_ds->response->type.data;
	ptr = (unsigned char *)data;

	packet->flags_ver = data->flags_ver;
	DEBUG2("Flags/Ver: %x\n", packet->flags_ver);

	if (TNC_LENGTH_INCLUDED(packet->flags_ver)) {
		DEBUG2("data_length included\n");
		/* data_length sits directly behind the flags/version byte */
		memcpy(&packet->data_length, &ptr[1], TNC_DATA_LENGTH_LENGTH);
		DEBUG2("data_length: %x\n", packet->data_length);
		DEBUG2("data_length: %d\n", packet->data_length);
		DEBUG2("data_length: %x\n", ByteSwap2(packet->data_length));
		DEBUG2("data_length: %d\n", ByteSwap2(packet->data_length));
		packet->data_length = ByteSwap2(packet->data_length);
		thisDataLength = packet->length - TNC_PACKET_LENGTH;
		dataStart = TNC_DATA_LENGTH_LENGTH + TNC_FLAGS_VERSION_LENGTH;
	} else {
		DEBUG2("no data_length included\n");
		packet->data_length = 0;
		thisDataLength = packet->length - TNC_PACKET_LENGTH_WITHOUT_DATA_LENGTH;
		dataStart = TNC_FLAGS_VERSION_LENGTH;
	}

	/*
	 *	The check at the top only guarantees length > TNC_HEADER_LEN,
	 *	so the subtraction above can still go negative.  A negative
	 *	value would be converted to a huge size_t by malloc/memcpy,
	 *	so reject it explicitly.
	 */
	if (thisDataLength < 0) {
		radlog(L_ERR, "rlm_eap_tnc: corrupted data");
		eaptnc_free(&packet);
		return NULL;
	}

	/*
	 *	Allocate room for the data, and copy over the data.
	 */
	packet->data = malloc(thisDataLength);
	if (packet->data == NULL) {
		radlog(L_ERR, "rlm_eap_tnc: out of memory");
		eaptnc_free(&packet);
		return NULL;
	}

	memcpy(packet->data, &(eap_ds->response->type.data[dataStart]), thisDataLength);

	return packet;
}