Esempio n. 1
0
/*
 * Reads every SFF entry from fp_in, re-orders the entries when necessary and
 * writes them all to fp_out.
 *
 * The sort key of an entry is the (row, col) pair parsed from its read name by
 * ion_readname_to_rowcol(); the ordering itself is whatever __sff_sort_lt and
 * the "sff_sort" instantiation of ion_sort_introsort define.  All entries are
 * held in memory at the same time.  The sort is skipped when the input was
 * already in order according to __sff_sort_lt.
 *
 * fp_in:  SFF file to read entries from (read until sff_read() returns NULL).
 * fp_out: SFF file the entries are written to with sff_write().
 *
 * Errors (unparsable read name, failed write) are reported via ion_error().
 */
void
sff_sort(sff_file_t *fp_in, sff_file_t *fp_out)
{
  int32_t i, row, col;
  sff_t *sff;
  int32_t requires_sort = 0;
  sff_sort_t *sffs = NULL;
  int32_t sffs_mem = 0, sffs_len = 0;

  // initialize memory
  sffs_mem = 1024;
  sffs = ion_malloc(sizeof(sff_sort_t) * sffs_mem, __func__, "sffs");

  // go through the input file
  while(NULL != (sff = sff_read(fp_in))) {
      // get the row/col co-ordinates
      if(0 == ion_readname_to_rowcol(sff->rheader->name->s, &row, &col)) {
          ion_error(__func__, "could not understand the read name", Exit, OutOfRange);
      }
      // grow the array (doubling) until there is room for one more entry
      while(sffs_mem <= sffs_len) {
          sffs_mem <<= 1; // double
          sffs = ion_realloc(sffs, sizeof(sff_sort_t) * sffs_mem, __func__, "sffs");
      }
      // the array takes over ownership of the entry
      sffs[sffs_len].row = row;
      sffs[sffs_len].col = col;
      sffs[sffs_len].sff = sff;
      sff = NULL;

      // check if we need to sort, for later: one out-of-order neighbour pair
      // is enough to require it
      if(0 < sffs_len && __sff_sort_lt(sffs[sffs_len], sffs[sffs_len-1])) {
          requires_sort = 1;
      }

      sffs_len++;
  }

  // Shrink to the number of entries actually read.  This is skipped for an
  // empty input: a zero-byte realloc is implementation-defined and may return
  // NULL, which an allocation wrapper is likely to report as a failure.
  if(0 < sffs_len) {
      sffs_mem = sffs_len;
      sffs = ion_realloc(sffs, sizeof(sff_sort_t) * sffs_mem, __func__, "sffs");
  }

  if(1 == requires_sort) {
      // sort
      ion_sort_introsort(sff_sort, sffs_len, sffs);
  }

  // write
  for(i=0;i<sffs_len;i++) {
      if(0 == sff_write(fp_out, sffs[i].sff)) {
          ion_error(__func__, "sff_write", Exit, WriteFileError);
      }
  }

  // destroy
  for(i=0;i<sffs_len;i++) {
      sff_destroy(sffs[i].sff);
  }
  free(sffs);
}
Esempio n. 2
0
/**
 * SFFFilter entry point: copies selected reads of an SFF file to a new SFF file.
 *
 * Command line: [-f listfile] [-o outfile] [-d] [-q] [-s offset] sff-filename
 *   -f  list file; each line holds "row col length" (plus a quality string
 *       as 4th field when -q is given).  Required.
 *   -o  output SFF file name; defaults to "filtered_<name>" in the directory
 *       of the input file.
 *   -d  print debug information about the header and every read.
 *   -q  replace quality scores with the ones from the list file.
 *   -s  offset added to the list-file quality characters (must be nonzero).
 *
 * A read is copied when the row/column parsed from its name matches a list
 * entry.  Bases from index "length" onward are overwritten with 'N' and
 * quality 0, and clip_qual_right is lowered to "length" where needed.  List
 * entries that matched no read are reported in ./SFFFilter_err.txt.
 *
 * Returns 0 on success; calls exit(1)/exit(EXIT_FAILURE) on usage, file or
 * format errors.
 */
int main(int argc, char *argv[])
{
	FILE *inSFF = NULL;
	FILE *outSFF = NULL;
	FILE *listFP = NULL;
	int n = 0; // number of elements returned by the most recent fread
	char *inFileName = NULL;
	char *outFileName = NULL;
	char *listFileName = NULL;
	const char *errFileName = "./SFFFilter_err.txt";
	int numReads;
	int matchCnt = 0;
	int got = 0;
	bool debugflag = false;
	bool qualflag = false;
	int qual_offset = 33;
	bool listMatch = false;

	char		name[256];
	uint16_t	*flowgram_values; // one value per flow
	uint8_t		*flow_index_per_base; // one value per base
	char		*bases; // one character per base
	uint8_t		*quality_scores; // one value per base

	char		*flow_chars;
	char		*key_sequence;

	// Parse command line arguments
	int argcc = 1;
	while (argcc < argc) {
		if (argv[argcc][0] == '-') {
			char opt = argv[argcc][1];

			// -f, -s and -o consume the next argument; refuse to read past argv
			if ((opt == 'f' || opt == 's' || opt == 'o') && argcc + 1 >= argc) {
				fprintf (stderr, "Option %s requires an argument\n", argv[argcc]);
				exit (1);
			}

			switch (opt) {

				case 'd':	// print debug info
					debugflag = true;
				break;

				case 'q':	// take quality scores from the list file
					qualflag = true;
				break;

				case 'f':	// list of locations to filter
					argcc++;
					free (listFileName);	// in case -f was given more than once
					listFileName = strdup (argv[argcc]);
				break;

				case 's':	// Offset to apply to quality scores
					argcc++;
					qual_offset = atoi(argv[argcc]);
					if(qual_offset==0) {
						fprintf (stderr, "-s option should specify a nonzero quality offset\n");
						exit (1);
					}
				break;

				case 'o':	// output file name
					argcc++;
					free (outFileName);	// in case -o was given more than once
					outFileName = strdup(argv[argcc]);
				break;

				default:
					fprintf (stderr, "Unknown option %s\n", argv[argcc]);
					exit (1);
				break;
			}
		}
		else {
			inFileName = argv[argcc];
		}
		argcc++;
	}

	// Both the SFF file and the list file are mandatory
	if (!inFileName || !listFileName) {
		if (!inFileName)
			fprintf (stdout, "No input sff file specified\n");
		else
			fprintf (stdout, "No input list file specified\n");
		fprintf (stdout, "Usage: %s [-f filename] [-d] sff-filename\n", argv[0]);
		fprintf (stdout, "\t-f Specify input file list.\n");
		fprintf (stdout, "\t-o Specify output sff file name.\n");
		fprintf (stdout, "\t-d Prints debug information.\n");
		fprintf (stdout, "\t-q Take qualities from 4th field of file specified by -f.\n");
		fprintf (stdout, "\t-s To use in conjunction with -q option, specifies an offset to be applied to quality scores.\n");
		exit (1);
	}

	//Create output filename from input filename if it wasn't specified.
	//The path is split by hand: dirname() may modify its argument, which
	//would corrupt inFileName before the input file is opened.
	if(outFileName==NULL) {
		const char *slash = strrchr(inFileName, '/');
		const char *baseName = (slash != NULL) ? slash + 1 : inFileName;
		outFileName = (char *) malloc (sizeof(char) * (strlen(inFileName) + 50));
		if (!outFileName) {
			perror ("malloc");
			exit (1);
		}
		if (slash != NULL)
			sprintf (outFileName, "%.*s/filtered_%s", (int)(slash - inFileName), inFileName, baseName);
		else
			sprintf (outFileName, "./filtered_%s", baseName);
	}

	//Open the SFF file
	inSFF = fopen(inFileName, "rb");
	if (!inSFF) {
		perror (inFileName);
		exit (1);
	}
	//Open the outputSFF file
	outSFF = fopen(outFileName, "wb");
	if (!outSFF) {
		perror (outFileName);
		exit (1);
	}
	//Open the list file
	listFP = fopen(listFileName, "rb");
	if (!listFP) {
		perror (listFileName);
		exit (1);
	}

	//Read the list of locations into buffer
	got = GetNumLines(listFileName);
	if (got <= 0) {
		fprintf (stderr, "Did not read any pixel coordinates; does the file exist?  Is it formatted correctly?\n");
		exit (1);
	}
	else {
		fprintf (stdout, "Reading up to %d lines\n", got);
	}

	//Dynamic array allocation
	int *rows = (int *) malloc (sizeof(int) * got);
	int *cols = (int *) malloc (sizeof(int) * got);
	int *lengths = (int *) malloc (sizeof(int) * got);
	char **quals = (char **) malloc (sizeof(char*) * got);
	bool *fnds = (bool *) malloc (sizeof(bool) * got);	//tracks reads that were found in SFF file
	if (!rows || !cols || !lengths || !quals || !fnds) {
		fprintf (stderr, "%s: out of memory\n", argv[0]);
		exit (1);
	}
	for (int i=0;i<got;i++)
	{
		fnds[i] = false;
		quals[i] = (char *) malloc (sizeof(char) * MAX_BASES);
		if (!quals[i]) {
			fprintf (stderr, "%s: out of memory\n", argv[0]);
			exit (1);
		}
		quals[i][0] = '\0';
	}

	// Limit the quality string to the size of its buffer (MAX_BASES incl. NUL)
	char qualFormat[64];
	snprintf (qualFormat, sizeof(qualFormat), "%%d %%d %%d %%%ds\n", (int)(MAX_BASES) - 1);

	// Never store more entries than were allocated, whatever the file contains
	int lineCnt = 0;
	while (lineCnt < got && !feof(listFP)) {
		if(qualflag) {
			if(4 != fscanf (listFP, qualFormat, &rows[lineCnt], &cols[lineCnt], &lengths[lineCnt], quals[lineCnt])) {
				fprintf(stderr,"%s: bad format in line %d of %s - expected 3 ints and a char string.\n",argv[0],1+lineCnt,listFileName);
				exit(EXIT_FAILURE);
			} else if(strlen(quals[lineCnt]) < (unsigned int) lengths[lineCnt]) {
				fprintf(stderr,"%s: warning: line %d of %s - quality string is shorter than requested length.\n",argv[0],1+lineCnt,listFileName);
			}
			lineCnt++;
		} else {
			if(3 != fscanf (listFP, "%d %d %d\n", &rows[lineCnt], &cols[lineCnt], &lengths[lineCnt])) {
				fprintf(stderr,"%s: bad format in line %d of %s - expected 3 ints.\n",argv[0],1+lineCnt,listFileName);
				exit(EXIT_FAILURE);
			} else {
				lineCnt++;
			}
		}
	}
	fclose (listFP);


	// Read the input file header (31 bytes of fixed-size fields)
	CommonHeader h;
	n = fread(&h, 31, 1, inSFF);
	assert(n==1);

	//Copy the header to write the output file; fields are copied before the
	//byte swaps below, i.e. still in file byte order
	CommonHeader ch_out;
	ch_out.magic_number = h.magic_number;
	ch_out.version[0] = 0;
	ch_out.version[1] = 0;
	ch_out.version[2] = 0;
	ch_out.version[3] = 1;
	ch_out.index_offset = h.index_offset;
	ch_out.index_length = h.index_length;
	ch_out.number_of_reads = h.number_of_reads;
	ch_out.header_length = h.header_length;
	ch_out.key_length = h.key_length;
	ch_out.number_of_flows_per_read = h.number_of_flows_per_read;
	ch_out.flowgram_format_code = h.flowgram_format_code;

	ByteSwap8(h.index_offset);
	ByteSwap4(h.index_length);
	ByteSwap4(h.number_of_reads);
	ByteSwap2(h.header_length);
	ByteSwap2(h.key_length);
	ByteSwap2(h.number_of_flows_per_read);
	flow_chars = (char *)malloc(h.number_of_flows_per_read);
	key_sequence = (char *)malloc(h.key_length);
	n = fread(flow_chars, h.number_of_flows_per_read, 1, inSFF);
	assert(n==1);
	n = fread(key_sequence, h.key_length, 1, inSFF);
	assert(n==1);
	// Padding up to the next multiple of 8; zero when already aligned
	int padBytes = (8-((31 + h.number_of_flows_per_read + h.key_length) & 7)) & 7;
	char padData[8];
	if (padBytes > 0) {
		n = fread(padData, padBytes, 1, inSFF);
		assert(n==1);
	}

	if (debugflag) {
		//DEBUG
		printf("Magic:	%u	%s\n", h.magic_number, (h.magic_number == MAGIC ? "Yes" : "No"));
		printf("Header length: %hu\n", h.header_length);
		printf("Version: %d%d%d%d\n", h.version[0], h.version[1], h.version[2], h.version[3]);
		printf("Index offset: %llu  length: %u\n", (unsigned long long) h.index_offset, h.index_length);
		printf("Number of reads: %u\n", h.number_of_reads);
		printf("Key length: %u\n", h.key_length);
		printf("Flows per read: %hu\n", h.number_of_flows_per_read);
		printf("Flowgram format: %hhu\n", h.flowgram_format_code);
		printf ("End of Header\n\n");
	}

	// Write the header of the output SFF
	char pad[8];
	memset(pad, 0, sizeof(pad));
	int bytes = 31;
	fwrite (&ch_out, bytes, 1, outSFF);
	// NOTE(review): only the first four flow characters are used, repeated
	// cyclically - confirm this is intended for inputs with longer flow orders
	for(int i=0;i<h.number_of_flows_per_read;i++) {
		fwrite(&flow_chars[i%4], 1, 1, outSFF);
		bytes++;
	}

	// Write exactly the key length announced in the copied header
	if (h.key_length > 0)
		fwrite(key_sequence, 1, h.key_length, outSFF);
	bytes += h.key_length;

	padBytes = (8 - (bytes & 0x7)) & 0x7;
	if (padBytes > 0)
		fwrite(pad, padBytes, 1, outSFF);

	// Prepare to process all the reads
	numReads = h.number_of_reads;

	flowgram_values = (uint16_t *)malloc(sizeof(uint16_t) * h.number_of_flows_per_read);
	int maxBases = h.number_of_flows_per_read * 100; // capacity of the per-base buffers
	flow_index_per_base = (uint8_t *)malloc(sizeof(uint8_t) * maxBases);
	bases = (char *)malloc(maxBases);
	quality_scores = (uint8_t *)malloc(sizeof(uint8_t) * maxBases);

	//Loop thru the reads
	for (int i=0;i<numReads;i++) {

		// Read read header
		ReadHeader r;
		n = fread(&r, 16,  1, inSFF);
		assert(n==1);

		ByteSwap2(r.read_header_length);
		ByteSwap4(r.number_of_bases);
		ByteSwap2(r.name_length);
		ByteSwap2(r.clip_qual_left);
		ByteSwap2(r.clip_qual_right);
		ByteSwap2(r.clip_adapter_left);
		ByteSwap2(r.clip_adapter_right);

		// The name and per-base buffers have fixed capacity (each needs room
		// for a trailing NUL); refuse records that would overflow them
		if ((unsigned long) r.name_length >= (unsigned long) sizeof(name)) {
			fprintf (stderr, "Read %d: name length %lu is too long\n", i, (unsigned long) r.name_length);
			exit (1);
		}
		if ((unsigned long) r.number_of_bases >= (unsigned long) maxBases) {
			fprintf (stderr, "Read %d: number of bases %lu is too large\n", i, (unsigned long) r.number_of_bases);
			exit (1);
		}

		if (r.name_length > 0) {
			n = fread(name, r.name_length, 1, inSFF);
			assert(n==1);
			name[r.name_length] = '\0';
		}

		int readPadLength = ((8 - ((16 + r.name_length) & 7)))%8;
		if (readPadLength > 0) {
			n = fread(padData, readPadLength, 1, inSFF);
			assert(n==1);
		}

		n = fread(flowgram_values, h.number_of_flows_per_read, sizeof(uint16_t), inSFF);
		assert(n==sizeof(uint16_t));
		// A zero-length fread returns 0, so only read the per-base arrays
		// when the read actually has bases
		if (r.number_of_bases > 0) {
			n = fread(flow_index_per_base, r.number_of_bases, sizeof(uint8_t), inSFF);
			assert(n==sizeof(uint8_t));
			n = fread(bases, r.number_of_bases, 1, inSFF);
			assert(n==1);
			n = fread(quality_scores, r.number_of_bases, sizeof(uint8_t), inSFF);
			assert(n==sizeof(uint8_t));
		}
		bases[r.number_of_bases] = '\0';

		int bytesRead = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * r.number_of_bases;
		readPadLength = (8 - (bytesRead & 7))%8;
		if (readPadLength > 0) {
			n = fread(padData, readPadLength, 1, inSFF);
			assert(n==1);
		}

		// Both branches byte-swap the flowgram values in place
		int f;
		if (debugflag) {
			// DEBUG
			printf("Read: %s has %d bases\n",
							(r.name_length > 0 ? name : "NONAME"),
							r.number_of_bases);
			printf("Clip left: %d qual: %d right: %d qual: %d\n",
						r.clip_adapter_left, r.clip_qual_left,
						r.clip_adapter_right, r.clip_qual_right);
			printf("Flowgram bases:\n");
			for(f=0;f<h.number_of_flows_per_read;f++)
				printf("%d ", (int) floor (ByteSwap2(flowgram_values[f])/100.0 + 0.5));
			printf("\n");
			unsigned int b;
			printf("Bases called:\n");
			for(b=0;b<r.number_of_bases;b++)
				printf("%c", bases[b]);
		} else {
			for(f=0;f<h.number_of_flows_per_read;f++)
				ByteSwap2(flowgram_values[f]);
		}

		//Get the row column for this read
		int row;
		int col;
		if(1 != ion_readname_to_rowcol(name, &row, &col)) {
			fprintf (stderr, "Error parsing read name: '%s'\n", name);
			continue;
		}

		//Look for matching row column among the entries actually read from the list
		listMatch = false;
		int readMatch=0;
		for (;readMatch<lineCnt;readMatch++) {
			if (row == rows[readMatch] && col == cols[readMatch]) {
				listMatch = true;
				fnds[readMatch] = true;
				matchCnt++;
				break;
			}
		}

		if (listMatch) {
			//
			//	Update the output file
			//
			int nameLen = r.name_length;
			int numBasesCalled = r.number_of_bases;

			// Number of leading bases to keep, clamped to what the read has
			int keepLen = lengths[readMatch];
			if (keepLen < 0)
				keepLen = 0;
			if (keepLen > numBasesCalled)
				keepLen = numBasesCalled;

			if(r.clip_qual_right == 0 || r.clip_qual_right > keepLen)
				r.clip_qual_right = keepLen;

			// write the header (swapped back to file byte order)
			ByteSwap2(r.read_header_length);
			ByteSwap4(r.number_of_bases);
			ByteSwap2(r.name_length);
			ByteSwap2(r.clip_qual_left);
			ByteSwap2(r.clip_qual_right);
			ByteSwap2(r.clip_adapter_left);
			ByteSwap2(r.clip_adapter_right);
			fwrite (&r, 16, 1, outSFF);

			fwrite(name, nameLen, 1, outSFF);
			int writePadLength = (8 - (nameLen & 7)) & 7;
			if (writePadLength)
				fwrite(pad, writePadLength, 1, outSFF);	// zero padding

			if(qualflag) {
				// Only use characters that exist in the quality string
				int qualLen = (int) strlen(quals[readMatch]);
				int numQuals = (keepLen < qualLen) ? keepLen : qualLen;
				for(int iBase=0; iBase < numQuals; iBase++) {
					quality_scores[iBase] = (uint8_t) quals[readMatch][iBase] + qual_offset;
				}
			}
			// Blank out everything beyond the kept prefix
			for(int iBase=keepLen; iBase < numBasesCalled; iBase++) {
				flow_index_per_base[iBase] = 0;
				bases[iBase] = 'N';
				quality_scores[iBase] = 0;
			}
			// Swap the flowgram values back to file byte order
			for (f=0;f<h.number_of_flows_per_read;f++)
				ByteSwap2(flowgram_values[f]);
			fwrite(flowgram_values, h.number_of_flows_per_read, sizeof(uint16_t), outSFF);

			fwrite(flow_index_per_base, numBasesCalled, sizeof(uint8_t), outSFF);

			fwrite(bases, numBasesCalled, 1, outSFF);

			fwrite(quality_scores, numBasesCalled, sizeof(uint8_t), outSFF);

			int bytesWritten = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * numBasesCalled;
			writePadLength = (8 - (bytesWritten & 7)) & 7;
			if (writePadLength)
				fwrite(pad, writePadLength, 1, outSFF);	// zero padding
		}
	}

	//Update Read Count in output SFF file by rewriting the common header
	ch_out.number_of_reads = BYTE_SWAP_4(matchCnt);
	fseek (outSFF, 0, SEEK_SET);
	bytes=31;
	fwrite(&ch_out, bytes, 1, outSFF);

	//User message
	fprintf (stdout, "Created file: %s\n", outFileName);

	//
	//Write out report on unfound reads
	//
	bool printErrorLog = false;
	for (int i=0;i<lineCnt;i++)
	{
		if (fnds[i] == false) {
			printErrorLog = true;
			break;
		}
	}
	if (printErrorLog)
	{
		fprintf (stdout, "There are reads that were not found.  See %s\n", errFileName);
		FILE *fpErr = fopen (errFileName, "wb");
		if (fpErr) {
			fprintf (fpErr, "# SFF file: %s\n", inFileName);
			fprintf (fpErr, "# Read positions source: %s\n", listFileName);
			fprintf (fpErr, "# Reads not found in SFF:\n");
			fprintf (fpErr, "# Row Column\n");
			for (int i=0;i<lineCnt;i++)
			{
				if (fnds[i] == false) {
					fprintf (fpErr, "%d %d\n", rows[i], cols[i]);
				}
			}
			fclose (fpErr);
		}

	}
	//Cleanup
	fclose (inSFF);
	if (fclose (outSFF) != 0)
		perror (outFileName);
	for (int i=0;i<got;i++)
		free (quals[i]);
	free (quals);
	free (lengths);
	free (rows);
	free (cols);
	free (fnds);
	free (flow_chars);
	free (key_sequence);
	free (flowgram_values);
	free (flow_index_per_base);
	free (bases);
	free (quality_scores);
	free (listFileName);
	free (outFileName);

	return 0;
}
Esempio n. 3
0
File: FIXSFF.cpp Progetto: alecw/TS
/**
 * FIXSFF entry point: repairs header fields of an SFF file in place.
 *
 * The file named on the command line is opened for update and
 *   - flowgram_format_code in the common header is set to 1,
 *   - read_header_length of every read is recomputed as 16 + name_length
 *     rounded up to a multiple of 8,
 *   - the clip values of every read are reset (clip_qual_left = 5, the other
 *     three = 0).
 * When both -R <row> and -C <col> are given, the flowgram of the read at that
 * position is printed; otherwise (and without -q) the header is printed.
 *
 * Options -a, -c and -l are accepted but have no effect here; -q only
 * suppresses the header print-out; -k stores its argument in the file-level
 * hackkey/hackkeylen variables.
 *
 * Returns 0 on success, exits with 0 after printing usage when no file is
 * named, and exits with 1 when the file cannot be opened or a record does not
 * fit the pre-allocated buffers.
 */
int main(int argc, char *argv[])
{
  char	*fastqFileName = NULL;
  char	*sffFileName = NULL;
  int	readCol = -1;
  int	readRow = -1;
  bool	findRead = false;
  int row, col;

  // process command-line args
  int argcc = 1;
  while (argcc < argc) {
      if (argv[argcc][0] == '-') {
          char opt = argv[argcc][1];

          // these options consume the next argument; refuse to read past argv
          if ((opt == 'R' || opt == 'C' || opt == 'q' || opt == 'k' || opt == 'l')
              && argcc + 1 >= argc) {
              fprintf (stderr, "Option %s requires an argument\n", argv[argcc]);
              exit (1);
          }

          switch (opt) {
            case 'a': // output all reads (no effect in this tool)
            case 'c': // force qual clip left to 5 (clips are always reset below)
              break;

            case 'R': // report read at row & column
              argcc++;
              readRow = atoi(argv[argcc]);
              break;

            case 'C': // report read at row & column
              argcc++;
              readCol = atoi(argv[argcc]);
              break;

            case 'q': // fastq file name (only suppresses the header print-out)
              argcc++;
              fastqFileName = argv[argcc];
              break;

            case 'k': // force keypass
              argcc++;
              hackkey = argv[argcc];
              hackkeylen = strlen(hackkey);
              break;

            case 'l': // min read length: value is consumed but not used here
              argcc++;
              break;

            default:
              break;
          }
      }
      else {
          sffFileName = argv[argcc];
      }
      argcc++;
  }

  if (!sffFileName) {
      printf("Usage: SFFRead [args] sffFile.sff\n");
      exit(0);
  }

  if (readCol > -1 && readRow > -1) {
      findRead = true;
  }

  // The file is binary and is modified in place
  FILE *fp = fopen(sffFileName, "rb+");
  if (!fp) {
      perror (sffFileName);
      exit (1);
  }

  if (!findRead && !fastqFileName)
    printf("Reading file: %s\n", sffFileName);
  CommonHeader h;

  // Fix the flow_format_code problem: make sure it is set to 1.
  // Every switch between writing and reading goes through fsetpos, as
  // required for streams opened for update.
  fpos_t headerStart;
  fgetpos (fp, &headerStart);
  int elements_read = fread(&h, 31, 1, fp);
  assert(elements_read == 1);
  h.flowgram_format_code = 1;
  fsetpos (fp, &headerStart);
  fwrite (&h, 31, 1, fp);
  fsetpos (fp, &headerStart);

  elements_read = fread(&h, 31, 1, fp);
  assert(elements_read == 1);
  ByteSwap8(h.index_offset);
  ByteSwap4(h.index_length);
  ByteSwap4(h.number_of_reads);
  ByteSwap2(h.header_length);
  ByteSwap2(h.key_length);
  ByteSwap2(h.number_of_flows_per_read);
  if (!findRead && !fastqFileName) {
      printf("Magic:	%u	%s\n", h.magic_number, (h.magic_number == MAGIC ? "Yes" : "No"));
      printf("Version: %d%d%d%d\n", h.version[0], h.version[1], h.version[2], h.version[3]);
      printf("Index offset: %llu  length: %u\n", (unsigned long long) h.index_offset, h.index_length);
      printf("Number of reads: %u\n", h.number_of_reads);
      printf("Header length: %hu\n", h.header_length);
      printf("Key length: %u\n", h.key_length);
      printf("Flows per read: %hu\n", h.number_of_flows_per_read);
      printf("Flowgram format: %hhu\n", h.flowgram_format_code);
  }

  flow_chars = (char *)malloc(h.number_of_flows_per_read);
  key_sequence = (char *)malloc(h.key_length);
  elements_read = fread(flow_chars, h.number_of_flows_per_read, 1, fp);
  assert(elements_read == 1);
  elements_read = fread(key_sequence, h.key_length, 1, fp);
  assert(elements_read == 1);
  int i;
  if (!findRead && !fastqFileName) {
      printf("Key sequence: ");
      for(i=0;i<h.key_length;i++)
        printf("%c", key_sequence[i]);

      printf("\nFlow chars:\n");
      for(i=0;i<h.number_of_flows_per_read;i++)
        printf("%c", flow_chars[i]);
      printf("\n");
  }

  // Padding up to the next multiple of 8; zero when already aligned.  A
  // zero-length fread returns 0, so padding is only read when there is some.
  int padBytes = (8-((31 + h.number_of_flows_per_read + h.key_length) & 7)) & 7;
  char padData[8];
  if (padBytes > 0) {
      elements_read = fread(padData, padBytes, 1, fp);
      assert(elements_read == 1);
  }

  // -- read the reads
  int numReads = h.number_of_reads;

  // pre-allocate space so we be fast
  flowgram_values = (uint16_t *)malloc(sizeof(uint16_t) * h.number_of_flows_per_read);
  int maxBases = h.number_of_flows_per_read * 10; // capacity of the per-base buffers
  flow_index_per_base = (uint8_t *)malloc(sizeof(uint8_t) * maxBases);
  bases = (char *)malloc(maxBases);
  quality_scores = (uint8_t *)malloc(sizeof(uint8_t) * maxBases);

  for(i=0;i<numReads;i++) {
      ReadHeader r;
      fpos_t readStart;

      // Remember where this read header begins; it is rewritten in place
      fgetpos (fp, &readStart);

#define FIXIT
#ifdef FIXIT
      // Pass 1: recompute read_header_length
      elements_read = fread(&r, 16,  1, fp);
      assert(elements_read == 1);
      ByteSwap2(r.read_header_length);
      ByteSwap2(r.name_length);
      // read_header_length is "16 + name_length" rounded up to nearest divisible by 8
      r.read_header_length = 16 + r.name_length;
      r.read_header_length += (8 - (r.read_header_length & 0x7)) & 0x7;
      // back to file byte order
      ByteSwap2(r.read_header_length);
      ByteSwap2(r.name_length);
      fsetpos (fp, &readStart);
      fwrite (&r, 16, 1, fp);
      fsetpos (fp, &readStart);
#endif

      // Pass 2: reset the clipping values
      elements_read = fread(&r, 16,  1, fp);
      assert(elements_read == 1);

      r.clip_qual_left = 5;
      r.clip_adapter_left = 0;
      r.clip_qual_right = 0;
      r.clip_adapter_right = 0;
      // to file byte order
      ByteSwap2(r.clip_qual_left);
      ByteSwap2(r.clip_qual_right);
      ByteSwap2(r.clip_adapter_left);
      ByteSwap2(r.clip_adapter_right);

      fsetpos (fp, &readStart);
      fwrite (&r, 16, 1, fp);
      fsetpos (fp, &readStart);

      // Read the corrected header and convert it for use in this program
      elements_read = fread(&r, 16,  1, fp);
      assert(elements_read == 1);

      ByteSwap2(r.read_header_length);
      ByteSwap4(r.number_of_bases);
      ByteSwap2(r.name_length);
      ByteSwap2(r.clip_qual_left);
      ByteSwap2(r.clip_qual_right);
      ByteSwap2(r.clip_adapter_left);
      ByteSwap2(r.clip_adapter_right);

      // The per-base buffers have fixed capacity; refuse to overflow them
      if ((unsigned long) r.number_of_bases > (unsigned long) maxBases) {
          fprintf (stderr, "Read %d: number of bases %lu is too large\n", i, (unsigned long) r.number_of_bases);
          exit (1);
      }

      // NOTE(review): 'name' is declared outside this function; its capacity
      // is not visible here, so name_length is not checked against it
      if (r.name_length > 0) {
          elements_read = fread(name, r.name_length, 1, fp);
          assert(elements_read == 1);
          name[r.name_length] = 0; // so we can easily print it
      }

      int readPadLength = ((8 - ((16 + r.name_length) & 7)))%8;
      if (readPadLength > 0) {
          elements_read = fread(padData, readPadLength, 1, fp);
          assert(elements_read == 1);
      }

      elements_read = fread(flowgram_values, h.number_of_flows_per_read, sizeof(uint16_t), fp);
      assert(elements_read == sizeof(uint16_t));
      if (r.number_of_bases > 0) {
          elements_read = fread(flow_index_per_base, r.number_of_bases, sizeof(uint8_t), fp);
          assert(elements_read == sizeof(uint8_t));
          elements_read = fread(bases, r.number_of_bases, 1, fp);
          assert(elements_read == 1);
          elements_read = fread(quality_scores, r.number_of_bases, sizeof(uint8_t), fp);
          assert(elements_read == sizeof(uint8_t));
      }

      int bytesRead = h.number_of_flows_per_read * sizeof(uint16_t) + 3 * r.number_of_bases;
      readPadLength = (8 - (bytesRead & 7))%8;
      if (readPadLength > 0) {
          elements_read = fread(padData, readPadLength, 1, fp);
          assert(elements_read == 1);
      }

      // Parse the name only now that the whole record has been consumed, so
      // that skipping a read leaves the stream at the start of the next one
      if(1 != ion_readname_to_rowcol(name, &row, &col)) {
          fprintf (stderr, "Error parsing read name: '%s'\n", name);
          continue;
      }

      // if matched, print out the read's flowgram
      if (row == readRow && col == readCol) {
          int f;
          for(f=0;f<h.number_of_flows_per_read;f++) {
              printf("%.2lf ", (double)(ByteSwap2(flowgram_values[f]))/100.0);
          }
          printf("\n");
      }
  }
  free(flowgram_values);
  free(flow_index_per_base);
  free(bases);
  free(quality_scores);
  free(flow_chars);
  free(key_sequence);

  fclose(fp);

  return 0;
}
Esempio n. 4
0
/**
 * SFFRead entry point: prints the contents of an SFF file or converts it to
 * FASTQ.
 *
 * Modes, selected by the command line:
 *   - default: print the header and the first 10 reads (all reads with -a);
 *   - -R <row> -C <col>: print only the flowgram of the read at that position;
 *   - -q <file>: write every read that passes the filters to a FASTQ file.
 * Other options: -c force-clip the key, -s <offset> quality offset (nonzero),
 * -k <key> require reads to start with <key>, -L write "r<row>|c<col>" style
 * FASTQ names, -d debug output, -u ignore adapter clipping, -b ignore the left
 * quality clip, -h help, -v version.
 *
 * Returns 0 on success; exits with 1 when a file cannot be opened or an
 * option is missing its value.
 */
int main(int argc, char *argv[])
{
	char	*fastqFileName = NULL;
	char	*sffFileName = NULL;
	bool	forceClip = false;
	bool	keyPass = false;
	bool	allReads = false;
	int	readCol = -1;
	int	readRow = -1;
	bool	findRead = false;
	int row, col;
	int numKeypassedReads = 0;
	int qual_offset = DEFAULT_QUAL_OFFSET;
	bool legacyFASTQName = false;	// enable if you want r10|c100 format name in fastq file
	bool debug = false;
	bool legacyReadName = false;
	bool adapterTrim = true;
	bool ignoreLeftQualTrim = false;

	// process command-line args
	int argcc = 1;
	while (argcc < argc) {
		if (argv[argcc][0] == '-') {
			char opt = argv[argcc][1];

			// these options consume the next argument; refuse to read past argv
			if ((opt == 'R' || opt == 'C' || opt == 'q' || opt == 's' || opt == 'k')
				&& argcc + 1 >= argc) {
				fprintf (stderr, "Option %s requires an argument\n", argv[argcc]);
				exit (1);
			}

			switch (opt) {
				case 'a': // output all reads
					allReads = true;
				break;

				case 'R': // report read at row & column
					argcc++;
					readRow = atoi(argv[argcc]);
				break;

				case 'C': // report read at row & column
					argcc++;
					readCol = atoi(argv[argcc]);
				break;

				case 'q': // convert to fastq
					argcc++;
					fastqFileName = argv[argcc];
				break;

				case 'c': // force qual clip left to 5
					forceClip = true;
				break;

				case 's':	// Offset to apply to quality scores
					argcc++;
					qual_offset = atoi(argv[argcc]);
					if(qual_offset==0) {
						fprintf (stderr, "-s option should specify a nonzero quality offset\n");
						exit (1);
					}
				break;

				case 'k': // force keypass
					keyPass = true;
					argcc++;
					hackkey = argv[argcc];
					hackkeylen = strlen(hackkey);
				break;

				case 'L':	// use legacy r<row>|c<col> names in the fastq file
					legacyFASTQName = true;
				break;

				case 'd':	// enable debug print outs
					debug = true;
				break;

				case 'h':	// help info
					printHelp ();
					exit (0);
				break;

				case 'u':	// prevent read clipping
					adapterTrim = false;
				break;

				case 'b':	// ignore barcodes (ok really its ignoring the left qual trim)
					ignoreLeftQualTrim = true;
				break;

				case 'v':	// version info
					fprintf (stdout, "%s", IonVersion::GetFullVersion("SFFRead").c_str());
					exit (0);
				break;

				default:
					break;
			}
		}
		else {
			sffFileName = argv[argcc];
		}
		argcc++;
	}

	if (!sffFileName) {
		printHelp();
		exit(0);
	}

	if (readCol > -1 && readRow > -1) {
		findRead = true;
		allReads = true;// makes it search all reads
	}

	sff_file_t* sff_file_in = sff_fopen(sffFileName, "rb", NULL, NULL);
	if (!sff_file_in) {
		perror (sffFileName);
		exit (1);
	}

	if (!findRead && !fastqFileName) {
		printf("Reading file: %s\n", sffFileName);
		sff_header_print(stdout, sff_file_in->header);
	}

	// -- read the reads: all of them for -a / -R -C / -q, otherwise at most 10
	int numReads;
	if (allReads) {
		numReads = sff_file_in->header->n_reads;
	}
	else {
		numReads = (sff_file_in->header->n_reads < 10 ? sff_file_in->header->n_reads:10);
	}
	FILE *fpq = NULL;
	if (fastqFileName) {
		numReads = sff_file_in->header->n_reads;
		fpq = fopen(fastqFileName, "w");
		if (!fpq){
			perror (fastqFileName);
			exit (1);
		}
	}

	for(int i=0;i<numReads;i++) {
		// Stop cleanly if the file ends before the announced number of reads
		sff_read_header_t* rh = sff_read_header_read(sff_file_in->fp);
		if (!rh) {
			fprintf (stderr, "Could not read header of read %d\n", i);
			break;
		}
		sff_read_t* rr = sff_read_read(sff_file_in->fp, sff_file_in->header, rh);
		if (!rr) {
			fprintf (stderr, "Could not read data of read %d\n", i);
			sff_read_header_destroy(rh);
			break;
		}

		// optional - ignore the left & right adapter clipping by simply setting these values to 0
		if (!adapterTrim) {
			rh->clip_adapter_left = 0;
			rh->clip_adapter_right = 0;
		}

		if (!fpq && !findRead) {
			printf("Read header length: %d\n", rh->rheader_length);
			printf("Read name length: %d\n", rh->name_length);
		}

		// Extract the row and column position info for this read
		if (1 != ion_readname_to_rowcol(rh->name->s, &row, &col)) {
			fprintf (stderr, "Error parsing read name: '%s'\n", rh->name->s);
			// release this read before skipping it
			sff_read_header_destroy(rh);
			sff_read_destroy(rr);
			continue;
		}
		legacyReadName = (1 == ion_readname_legacy(rh->name->s));

		if (!fpq && !findRead) {
			printf("Read: %s (r%05d|c%05d) has %d bases\n",
					(rh->name_length > 0 ? rh->name->s : "NONAME"),
					row, col,
					rh->n_bases);
			printf("Clip left: %d qual: %d right: %d qual: %d\n",
				rh->clip_adapter_left, rh->clip_qual_left,
				rh->clip_adapter_right, rh->clip_qual_right);
			printf("Flowgram values:\n");
		}

		if (findRead) {
			if (row == readRow && col == readCol) {
				// print the flowgram of the requested read
				for(int f=0;f<sff_file_in->header->flow_length;f++) {
					printf("%.2lf ", (double)(rr->flowgram[f])/100.0);
				}
				printf("\n");
			}
		}
		else if (fpq) {
			bool ok = true;
			if (keyPass) {
				// the read must be longer than the key and start with it
				if ((int)rh->n_bases > hackkeylen) {
					int b;
					for(b=0;b<hackkeylen;b++) {
						if (hackkey[b] != rr->bases->s[b]) {
							ok = false;
							break;
						}
					}
				} else
					ok = false; // not long enough
			}

			int clip_left_index = 0;
			int clip_right_index = 0;
			if (ok) {
				// If force-clip option is set, we want to ensure the key gets trimmed
				if (forceClip && rh->clip_adapter_left < 4)
					rh->clip_adapter_left = hackkeylen+1;

				// clip indices are 1-based; a stored 0 means "not set"
				if (ignoreLeftQualTrim)
					clip_left_index = max (1, rh->clip_adapter_left);
				else
					clip_left_index = max (1, max (rh->clip_qual_left, rh->clip_adapter_left));
				clip_right_index = min ((rh->clip_qual_right == 0 ? rh->n_bases:rh->clip_qual_right),
										(rh->clip_adapter_right == 0 ? rh->n_bases:rh->clip_adapter_right));
				// never index past the bases this read actually has
				if (clip_right_index > (int) rh->n_bases)
					clip_right_index = (int) rh->n_bases;
				if (debug)
					fprintf (stdout, "debug clip: left = %d right = %d\n", clip_left_index, clip_right_index);
				numKeypassedReads++;
				if (clip_left_index > clip_right_index)
					// Suppress output of zero-mer reads (left > right)
					ok = false;
			}
			if (ok) {
				//print id string
				if (legacyFASTQName) {
					fprintf (fpq, "@r%d|c%d\n", row, col);
				}
				else {
					if (legacyReadName){
						//Override legacy name: five characters starting at
						//offset 7 of the name are used as the run id
						char runId[6] = {'\0'};
						strncpy (runId, &rh->name->s[7], 5);
						fprintf (fpq, "@%s:%d:%d\n", runId, row, col);
					}
					else {
						//Copy name verbatim
						fprintf (fpq, "@%s\n", rh->name->s);
					}
				}

				//print bases
				for (int b=clip_left_index-1;b<clip_right_index;b++)
					fprintf(fpq, "%c", rr->bases->s[b]);
				fprintf(fpq, "\n");
				//print '+'
				fprintf(fpq, "+\n");
				//print quality scores
				for (int b=clip_left_index-1;b<clip_right_index;b++)
					fprintf(fpq, "%c", QualToFastQ((int)(rr->quality->s[b]),qual_offset));
				fprintf(fpq, "\n");
			}
		}
		else {
			int f;
			for(f=0;f<sff_file_in->header->flow_length;f++)
				printf("%d ", rr->flowgram[f]);
			printf("\nFlow index per base:\n");
			unsigned int b;
			for(b=0;b<rh->n_bases;b++)
				printf("%d ", rr->flow_index[b]);
			printf("\nBases called:\n");
			for(b=0;b<rh->n_bases;b++)
				printf("%c", rr->bases->s[b]);
			printf("\nQuality scores:\n");
			for(b=0;b<rh->n_bases;b++)
				printf("%d ", rr->quality->s[b]);
			printf("\nDone with this read\n\n");
		}

		sff_read_header_destroy(rh);
		sff_read_destroy(rr);
	}

	//	debug print - keypass reads written to the fastq file
	if (fpq) {
		const char *printkey = keyPass ? hackkey : "All";
		// avoid 0/0 for a file without reads
		float percentage = (numReads > 0)
			? ((float) numKeypassedReads/ (float) numReads) * 100.0
			: 0.0;
		fprintf (stdout, "Keypass Reads(%s) = %d\n", printkey, numKeypassedReads);
		fprintf (stdout, "Total Reads = %d\n", numReads);
		fprintf (stdout, "Percentage = %.2f%%\n", percentage);
	}
	sff_fclose(sff_file_in);
	if (fpq)
		fclose(fpq);

	return 0;
}
Esempio n. 5
0
// TODO: should we change the header:
// - must track index_length
// - assumes row-major order
//
// Builds an index of read positions for the SFF entries read from fp_in.
//
// Every read from the current position of fp_in->fp onwards is consumed via
// sff_read() until it returns NULL.  The row/col of each read is parsed from
// its name, and the reads must appear in row-major order (row ascending, then
// col ascending); anything else is reported through ion_error().
//
// Index layout (idx->offset):
//  - SFF_INDEX_ROW_ONLY: 1 + num_rows entries; entry r holds the position of
//    the first read present in row r.
//  - SFF_INDEX_ALL: 1 + num_rows*num_cols entries, one per well, in row-major
//    order.
//  - Entries with no read hold UINT64_MAX.
//  - The final entry holds the stream position just past the last read.
//
// Parameters:
//  fp_in          input SFF file; its stream is not rewound here.
//  fp_out_header  header to update: index_offset is set to the position where
//                 the reads started, index_length to sff_index_length(idx).
//  num_rows       number of rows on the chip.
//  num_cols       number of columns on the chip.
//  type           SFF_INDEX_ROW_ONLY or SFF_INDEX_ALL.
//
// Returns the newly created index.  All non-empty offsets are shifted by the
// index length, because the index is to be inserted between the header and the
// SFF entries.
// NOTE(review): errors are reported via ion_error(..., Exit, ...), which is
// presumed not to return -- confirm against its definition.
sff_index_t*
sff_index_create(sff_file_t *fp_in, sff_header_t *fp_out_header, int32_t num_rows, int32_t num_cols, int32_t type)
{
  int64_t len = 0;
  int64_t i; // 64-bit, since it is compared against (and may reach) len
  int32_t prev_row, prev_col, row, col;
  sff_index_t *idx;
  sff_t *sff;
  uint64_t fp_in_start, prev_pos;
  long pos;

  idx = sff_index_init();

  idx->num_rows = num_rows;
  idx->num_cols = num_cols;
  idx->type = type;

  // alloc
  switch(type) {
    case SFF_INDEX_ROW_ONLY:
      len = 1 + (int64_t)idx->num_rows;
      idx->offset = ion_malloc(len * sizeof(uint64_t), __func__, "idx->offset");
      break;
    case SFF_INDEX_ALL:
      // multiply in 64 bits: rows*cols can exceed INT32_MAX on large chips
      len = 1 + ((int64_t)idx->num_rows * (int64_t)idx->num_cols);
      idx->offset = ion_malloc(len * sizeof(uint64_t), __func__, "idx->offset");
      break;
    default:
      ion_error(__func__, "this index type is currently not supported", Exit, OutOfRange);
  }

  // Mark every entry as empty up front.  The loop below only writes entries up
  // to the last read seen, so without this the entries for trailing empty
  // wells/rows (or all of them, for an empty input) would stay uninitialized
  // and then be read by the offset adjustment at the end.
  for(i=0;i<len;i++) {
      idx->offset[i] = UINT64_MAX;
  }

  // save where the sff entries started
  pos = ftell(fp_in->fp);
  if(pos < 0) { // ftell signals failure with -1L
      ion_error(__func__, "ftell", Exit, ReadFileError);
  }
  prev_pos = fp_in_start = (uint64_t)pos;

  // go through the input file
  i = 0;
  prev_row = prev_col = 0; // (prev_row, prev_col) is the next well we expect
  while(NULL != (sff = sff_read(fp_in))) {
      // out of range: the last slot is reserved for the end position
      if(len-1 <= i) {
          ion_error(__func__, "bug encountered", Exit, OutOfRange);
      }

      // get the row/col co-ordinates
      if(0 == ion_readname_to_rowcol(sff->rheader->name->s, &row, &col)) {
          ion_error(__func__, "could not understand the read name", Exit, OutOfRange);
      }

      // assumes row-major order, skips over reads that are not present
      if(row < prev_row || (row == prev_row && col < prev_col)) {
          ion_error(__func__, "SFF file was not sorted in row-major order", Exit, OutOfRange);
      }
      // walk the expected well forward until it reaches this read's well
      while(row != prev_row || col != prev_col) {
          // add in empty entry
          switch(type) {
            case SFF_INDEX_ROW_ONLY:
              if(0 == prev_col) { // first column
                  idx->offset[i] = UINT64_MAX;
                  // do not increment i, since we only do this when moving to a new row
              }
              break;
            case SFF_INDEX_ALL:
              // all rows and columns
              idx->offset[i] = UINT64_MAX;
              i++;
              break;
            default:
              ion_error(__func__, "this index type is currently not supported", Exit, OutOfRange);
          }
          if(len-1 <= i) {
              ion_error(__func__, "x/y was out of range", Exit, OutOfRange);
          }

          prev_col++;
          if(prev_col == idx->num_cols) {
              // new row
              prev_col = 0;
              prev_row++;
              if(SFF_INDEX_ROW_ONLY == type) {
                  i++;
              }
          }
      }

      // add to the index
      switch(type) {
        case SFF_INDEX_ROW_ONLY:
          if(0 == col) { // first column
              idx->offset[i] = prev_pos;
          }
          else if(0 < col && UINT64_MAX == idx->offset[i]) {
              // first read present in this row, although not in column zero
              idx->offset[i] = prev_pos;
              // do not move onto the next
          }
          break;
        case SFF_INDEX_ALL:
          // all rows and columns
          idx->offset[i] = prev_pos;
          i++;
          break;
        default:
          ion_error(__func__, "this index type is currently not supported", Exit, OutOfRange);
      }
      prev_row = row;
      prev_col = col;

      // destroy
      sff_destroy(sff);

      // next
      prev_col++;
      if(prev_col == idx->num_cols) {
          // new row
          prev_col = 0;
          prev_row++;
          if(SFF_INDEX_ROW_ONLY == type) {
              i++;
          }
      }

      // position of the next read (or of the end of the entries)
      pos = ftell(fp_in->fp);
      if(pos < 0) {
          ion_error(__func__, "ftell", Exit, ReadFileError);
      }
      prev_pos = (uint64_t)pos;
  }
  // get the last offset
  idx->offset[len-1] = prev_pos;

  // update the index offset in the header
  fp_out_header->index_offset = fp_in_start; // insert between the header and sff entries
  // update the index length in the header
  fp_out_header->index_length = sff_index_length(idx);
  // update the offsets based on the index length
  for(i=0;i<len;i++) {
      if(UINT64_MAX != idx->offset[i]) { // empty entries keep their marker
          idx->offset[i] += fp_out_header->index_length;
      }
  }

  return idx;
}
Esempio n. 6
0
// Unpacks the per-read information stored in rai->alignment into the fields
// of *rai.
//
// Steps, in order: strand, well row/col and run id (from the read name), flow
// order index, normalized measurements (ZM tag), phasing parameters (ZP tag),
// read bases in sequencing orientation, alignment details, start flow (ZF tag)
// and flow index, read group (RG tag), and finally the hard-clipped prefix
// ([key][ZT][ZE]) with its prefix flow.
//
// Outcomes:
//  - Returns immediately if the read is already filtered.
//  - Sets rai->filtered = true and returns (with a WARNING on cerr) when the
//    run id cannot be determined, no flow order is registered for the run id,
//    the read has no bases, or the ZF start flow is zero.
//  - Prints an ERROR on cerr and calls exit(1) when the ZM, ZP or ZF tag is
//    missing or malformed, or when the start flow is outside [0, num_flows).
//
// rai            read to unpack; modified in place.
// global_context per-run lookup tables (flow orders, flow counts, keys) and
//                the resolve_clipped_bases option.
void UnpackOnLoad(Alignment *rai, const InputStructures &global_context)
{
  // No need to waste time if the read is filtered
  if (rai->filtered)
    return;

  rai->is_reverse_strand = rai->alignment.IsReverseStrand();

  // Parse read name, run id & flow order index

  rai->runid.clear();
  if (not rai->alignment.Name.empty()) {
    rai->well_rowcol.resize(2);
    // NOTE(review): the return value is ignored here, so an unparsable name
    // is not detected at this point -- confirm this is intentional.
    ion_readname_to_rowcol(rai->alignment.Name.c_str(), &rai->well_rowcol[0], &rai->well_rowcol[1]);
    // extract runid while we are at it: everything before the first ':'
    // (the whole name if there is no ':')
    rai->runid  = rai->alignment.Name.substr(0,rai->alignment.Name.find(":"));
  }

  if (rai->runid.empty()){
    cerr << "WARNING: Unable to determine run id of read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }

  std::map<string,int>::const_iterator fo_it = global_context.flow_order_index_by_run_id.find(rai->runid);
  if (fo_it == global_context.flow_order_index_by_run_id.end()){
    cerr << "WARNING: No matching flow oder found for read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }
  rai->flow_order_index = fo_it->second;
  const ion::FlowOrder & flow_order = global_context.flow_order_vector.at(rai->flow_order_index);

  // Retrieve measurements from ZM tag

  vector<int16_t> quantized_measurements;
  if (not rai->alignment.GetTag("ZM", quantized_measurements)) {
    cerr << "ERROR: Normalized measurements ZM:tag is not present in read " << rai->alignment.Name << endl;
    exit(1);
  }

  // Number of flows for this run; looked up once and reused below.
  // (global_context is const, so the value cannot change during this call.)
  const int num_flows = global_context.num_flows_by_run_id.at(rai->runid);

  if ((int)quantized_measurements.size() > num_flows) {
    cerr << "ERROR: Normalized measurements ZM:tag length " << quantized_measurements.size()
         << " exceeds flow order length " << num_flows
         <<" in read " << rai->alignment.Name << endl;
    exit(1);
  }
  // Measurements are padded with zeros up to num_flows; the tag stores them
  // scaled by 256.
  rai->measurements.assign(num_flows, 0.0);
  for (size_t counter = 0; counter < quantized_measurements.size(); ++counter)
    rai->measurements[counter] = (float)quantized_measurements[counter]/256;
  rai->measurements_length = quantized_measurements.size();

  // Retrieve phasing parameters from ZP tag

  if (not rai->alignment.GetTag("ZP", rai->phase_params)) {
    cerr << "ERROR: Phasing Parameters ZP:tag is not present in read " << rai->alignment.Name << endl;
    exit(1);
  }
  if (rai->phase_params.size() != 3) {
    cerr << "ERROR: Phasing Parameters ZP:tag does not have 3 phase parameters in read " << rai->alignment.Name << endl;
    exit(1);
  }
  if (rai->phase_params[0] < 0 or rai->phase_params[0] > 1 or rai->phase_params[1] < 0 or rai->phase_params[1] > 1
      or rai->phase_params[2] < 0 or rai->phase_params[2] > 1) {
    cerr << "ERROR: Phasing Parameters ZP:tag outside of [0,1] range in read " << rai->alignment.Name << endl;
    exit(1);
  }
  rai->phase_params[2] = 0.0f;   // ad-hoc corrector: zero droop

  // Populate read_bases (bases without rev-comp on reverse-mapped reads) and flow_index

  rai->read_bases = rai->alignment.QueryBases;
  if (rai->is_reverse_strand)
    RevComplementInPlace(rai->read_bases);
  if (rai->read_bases.empty()){
    cerr << "WARNING: Ignoring length zero read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }

  // Unpack alignment

  rai->pretty_aln.reserve(num_flows);
  UnpackAlignmentInfo(rai);
  // The soft clip at the start of the read (in sequencing orientation) is the
  // right one for reverse-strand reads and the left one otherwise.
  if (rai->is_reverse_strand)
    rai->start_sc = rai->right_sc;
  else
    rai->start_sc = rai->left_sc;

  // Generate flow index

  rai->start_flow = 0;
  if (not rai->alignment.GetTag("ZF", rai->start_flow)) {
    // Retry reading the tag as a single byte before giving up
    uint8_t start_flow_byte = 0;
    if (not rai->alignment.GetTag("ZF", start_flow_byte)) {
      cerr << "ERROR: Start Flow ZF:tag not found in read " << rai->alignment.Name << endl;
      exit(1);
    }
    rai->start_flow = (int)start_flow_byte;
  }
  if (rai->start_flow == 0) {
    cerr << "WARNING: Start Flow ZF:tag has zero value in read " << rai->alignment.Name << endl;
    rai->filtered = true;
    return;
  }
  CreateFlowIndex(rai, flow_order);

  if (global_context.resolve_clipped_bases) {
    // Increment start flow to first aligned base
    rai->start_flow = rai->flow_index[rai->start_sc];
  }

  // Check validity of input arguments
  if (rai->start_flow < 0 or rai->start_flow >= num_flows) {
    cerr << "ERROR: Start flow outside of [0,num_flows) range in read " << rai->alignment.Name << endl;
    // NOTE(review): the count printed here comes from the flow order, while
    // the check above uses num_flows_by_run_id -- confirm they always agree.
    cerr << "Start flow: " << rai->start_flow << " Number of flows: " << global_context.flow_order_vector.at(rai->flow_order_index).num_flows() << endl;
    exit(1);
  }

  // Retrieve read group name & generate prefix flow

  if (not rai->alignment.GetTag("RG",rai->read_group)) {
    cerr << "WARNING: No read group found in read " << rai->alignment.Name << endl;
    // No big problem, we'll just have to solve the prefix like it's 2013!
    rai->read_group.clear();
  }

  // Get read prefix - hard clipped start of the read: [KS][ZT][ZE]
  rai->prefix_flow = -1;
  std::map<string,string>::const_iterator key_it = global_context.key_by_read_group.find(rai->read_group);
  if (key_it != global_context.key_by_read_group.end()) {
    rai->prefix_bases = key_it->second;

    string temp_zt, temp_ze;
    if (rai->alignment.GetTag("ZT", temp_zt))
      rai->prefix_bases += temp_zt;
    if (rai->alignment.GetTag("ZE", temp_ze))
      rai->prefix_bases += temp_ze;

    if (not rai->prefix_bases.empty())
      GetPrefixFlow(rai, rai->prefix_bases, flow_order);
  }

  // Check consistency of prefix_flow and start_flow - maybe we don't have all info about hard clipped bases
  if (rai->prefix_flow >= 0) {
    // Advance from the prefix flow to the first flow whose nucleotide matches
    // the first read base; that flow must be the start flow.
    int check_start_flow = rai->prefix_flow;
    while (check_start_flow < flow_order.num_flows() and  flow_order.nuc_at(check_start_flow) != rai->read_bases.at(0))
      check_start_flow++;
    if (check_start_flow != rai->start_flow) {
      // Inconsistent: discard the prefix information
      rai->prefix_flow = -1;
      rai->prefix_bases.clear();
    }
  }

}