/*
 * Reads a read header (including variable length components) from an mFILE.
 *
 * The 16-byte fixed part is read and decoded first, then name_len bytes of
 * read name are loaded into the buffer allocated by decode_sff_read_header().
 * Finally the stream position is rounded up to the next multiple of 8.
 *
 * Returns a pointer to the header on success (release it with
 *         free_sff_read_header())
 *         NULL on failure
 */
sff_read_header *read_sff_read_header(mFILE *mf) {
    sff_read_header *h;
    unsigned char rhdr[16];

    if (16 != mfread(rhdr, 1, 16, mf))
        return NULL;

    /* decode_sff_read_header() returns NULL when an allocation fails. */
    if (NULL == (h = decode_sff_read_header(rhdr)))
        return NULL;

    if (h->name_len != mfread(h->name, 1, h->name_len, mf))
        return free_sff_read_header(h), NULL;

    /* Pad to 8 chars */
    mfseek(mf, (mftell(mf) + 7) & ~7, SEEK_SET);

    return h;
}
/*
 * Override the sff.c functions to use FILE pointers instead. This means
 * we don't have to load the entire archive into memory, which is optimal when
 * dealing with a single file (ie in sff/sff.c), but not when indexing it.
 *
 * Reads the 16-byte fixed read header and the name_len bytes of read name
 * that follow it, then moves the stream to the next multiple of 8.
 *
 * Returns a pointer to the header on success (release it with
 *         free_sff_read_header())
 *         NULL on a short read, allocation failure or seek failure
 */
static sff_read_header *fread_sff_read_header(FILE *fp) {
    sff_read_header *h;
    unsigned char rhdr[16];
    long pos;

    if (16 != fread(rhdr, 1, 16, fp))
        return NULL;

    /* decode_sff_read_header() returns NULL when an allocation fails. */
    if (NULL == (h = decode_sff_read_header(rhdr)))
        return NULL;

    if (h->name_len != fread(h->name, 1, h->name_len, fp))
        return free_sff_read_header(h), NULL;

    /*
     * Pad to 8 chars. ftell() reports failure as -1, which the rounding
     * below would turn into offset 0, so check it before seeking.
     */
    if ((pos = ftell(fp)) < 0)
        return free_sff_read_header(h), NULL;
    if (0 != fseek(fp, (pos + 7) & ~7L, SEEK_SET))
        return free_sff_read_header(h), NULL;

    return h;
}
/* * Unpacks the 16-byte fixed size part of the SFF read header. * It allocates memory for this and for the base calls, but does not * unpack these. * * Returns sff_read_header* on success * NULL on failure */ sff_read_header *decode_sff_read_header(unsigned char *buf) { sff_read_header *h; if (NULL == (h = (sff_read_header *)xcalloc(1, sizeof(*h)))) return NULL; h->header_len = be_int2(*(uint16_t *)(buf+0)); h->name_len = be_int2(*(uint16_t *)(buf+2)); h->nbases = be_int4(*(uint32_t *)(buf+4)); h->clip_qual_left = be_int2(*(uint16_t *)(buf+8)); h->clip_qual_right = be_int2(*(uint16_t *)(buf+10)); h->clip_adapter_left = be_int2(*(uint16_t *)(buf+12)); h->clip_adapter_right = be_int2(*(uint16_t *)(buf+14)); if (NULL == (h->name = (char *)xmalloc(h->name_len))) return free_sff_read_header(h), NULL; return h; }
/* F U N C T I O N S *********************************************************/ void process_sff_to_fastq(char *sff_file, int trim_flag) { sff_read_header rh; sff_read_data rd; FILE *sff_fp, *fastq_fp; if ( (sff_fp = fopen(sff_file, "r")) == NULL ) { fprintf(stderr, "[err] Could not open file '%s' for reading.\n", sff_file); exit(1); } get_sff_file_size(sff_fp); read_sff_common_header(sff_fp, &h); verify_sff_common_header(&h); if ( keep_fastq_orig == true ) { vector<string> tmp_rep; split_str(string(sff_file), tmp_rep, "//"); if ( ( fastq_fp = fopen( (tmp_rep[tmp_rep.size() - 1].substr(0,tmp_rep[tmp_rep.size() - 1].length()-4) + ".fastq").c_str(), "w") ) == NULL ) { fprintf(stderr, "[err] Could not open file '%s' for writing.\n", ""); exit(1); } } int left_clip = 0, right_clip = 0, nbases = 0; char *name; char *bases; uint8_t *quality; //register int i; unsigned int numreads = h.nreads; for (int i = 0; i < numreads; i++) { //cout << i << " " << numreads << endl; read_sff_read_header(sff_fp, &rh); read_sff_read_data(sff_fp, &rd, h.flow_len, rh.nbases); //rheaders.push_back(rh); // get clipping points get_clip_values(rh, trim_flag, &left_clip, &right_clip); nbases = right_clip - left_clip; // create bases string bases = get_read_bases(rd, left_clip, right_clip); // create quality array quality = get_read_quality_values(rd, left_clip, right_clip); //Create new read Read *read = new Read(); read->initial_length = nbases; read->read = string(bases); uint8_t quality_char; read->quality = (uint8_t*)malloc(sizeof(uint8_t)*nbases); for (int j = 0; j < nbases; j++) { quality_char = (quality[j] <= 93 ? 
quality[j] : 93) + 33; read->quality[j] = quality_char; } //read->rd = rd; read->flowgram = new uint16_t[h.flow_len]; for(int j=0; j<h.flow_len; j++) { read->flowgram[j] = rd.flowgram[j]; //cout << rd.flowgram[j] << " " << endl; } read->flow_index = (uint8_t*)malloc(sizeof(uint8_t)*nbases); for(int j=0; j<nbases; j++) { read->flow_index[j] = rd.flow_index[j]; } read->roche_left_clip = (int) max(1, max(rh.clip_qual_left, rh.clip_adapter_left)) - 1; read->roche_right_clip = (int) min( (rh.clip_qual_right == 0 ? rh.nbases : rh.clip_qual_right ), (rh.clip_adapter_right == 0 ? rh.nbases : rh.clip_adapter_right) ); reads.push_back(read); string tstr = string(rh.name) + " " + string(itoa(rh.clip_adapter_left,new char[5],10)) + " " + string(itoa(rh.clip_adapter_right,new char[5],10))+ " " + string(itoa(rh.clip_qual_left,new char[5],10)) + " " + string(itoa(rh.clip_qual_right,new char[5],10)) + " " + string(itoa(rh.clip_qual_right,new char[5],10)); int t_len = tstr.length(); // create read name string int name_length = (int) t_len + 1; // account for NULL termination name = (char *) malloc( name_length * sizeof(char) ); if (!name) { fprintf(stderr, "Out of memory! For read name string!\n"); exit(1); } memset(name, '\0', (size_t) name_length); read->readID = (char *) malloc( rh.name_len * sizeof(char) ); //read->readID = rh.name; memcpy( read->readID, rh.name, (size_t) rh.name_len ); //strncpy(name, rh.name, (size_t) rh.name_len); strncpy(name, tstr.c_str(), (size_t)t_len); if ( keep_fastq_orig == true ) construct_fastq_entry(fastq_fp, name, bases, quality, nbases); //printf("%d\n",rh.name_len); free(name); free(bases); free(quality); free_sff_read_header(&rh); free_sff_read_data(&rd); } read_manifest(sff_fp); //free_sff_common_header(&h); if ( keep_fastq_orig == true ) fclose(fastq_fp); fclose(sff_fp); }
/*
 * Reads an SFF file from an mFILE and decodes it to a Read struct.
 *
 * Only the first read following the common header is decoded. The flow
 * order string and the base calls are moved (not copied) from the SFF
 * structures into the returned Read; flow values are the stored flowgram
 * values divided by 100. Each base's quality is stored in the prob_X array
 * matching the called base, the other three staying 0.
 *
 * Returns Read* on success
 *         NULL on failure (read error or out of memory)
 */
Read *mfread_sff(mFILE *mf) {
    int i, bpos;
    Read *r;
    sff_common_header *ch;
    sff_read_header *rh;
    sff_read_data *rd;
    float *flow = NULL;
    uint_2 *base_pos = NULL;
    char *prob_A = NULL, *prob_C = NULL, *prob_G = NULL, *prob_T = NULL;
    size_t nflows, nbases;

    /* Load the SFF contents */
    if (NULL == (ch = read_sff_common_header(mf)))
        return NULL;
    if (NULL == (rh = read_sff_read_header(mf))) {
        free_sff_common_header(ch);
        return NULL;
    }
    if (NULL == (rd = read_sff_read_data(mf, ch->flow_len, rh->nbases))) {
        free_sff_common_header(ch);
        free_sff_read_header(rh);
        return NULL;
    }

    /*
     * Allocate every output array up front so that a failure can be
     * handled before anything has been attached to the Read.
     */
    nflows = ch->flow_len;
    nbases = rh->nbases;

    flow     = (float *)malloc(nflows * sizeof(float));
    base_pos = (uint_2 *)calloc(nbases, sizeof(*base_pos));
    prob_A   = (char *)calloc(nbases, 1);
    prob_C   = (char *)calloc(nbases, 1);
    prob_G   = (char *)calloc(nbases, 1);
    prob_T   = (char *)calloc(nbases, 1);

    /* A NULL result is only an error for a non-zero sized request. */
    if ((nflows > 0 && NULL == flow) ||
        (nbases > 0 && (NULL == base_pos || NULL == prob_A ||
                        NULL == prob_C   || NULL == prob_G ||
                        NULL == prob_T)))
        goto fail;

    /* Convert to Read struct */
    if (NULL == (r = read_allocate(0,0)))
        goto fail;

    /* Discard whatever read_allocate() set up for the arrays we replace. */
    free(r->basePos);
    free(r->base);
    free(r->prob_A);
    free(r->prob_C);
    free(r->prob_G);
    free(r->prob_T);

    r->nflows = ch->flow_len;
    r->flow_order = ch->flow;
    ch->flow = NULL;            /* ownership moved to r */
    r->flow_raw = NULL;
    r->flow = flow;
    for (i = 0; i < r->nflows; i++) {
        r->flow[i] = rd->flowgram[i] / 100.0;
    }

    r->NBases = rh->nbases;
    r->basePos = base_pos;
    r->base = rd->bases;
    rd->bases = NULL;           /* ownership moved to r */
    r->prob_A = prob_A;
    r->prob_C = prob_C;
    r->prob_G = prob_G;
    r->prob_T = prob_T;

    /* prob_* start zeroed (calloc); basePos accumulates the flow indices. */
    bpos = 0;
    for (i = 0; i < r->NBases; i++) {
        switch (r->base[i]) {
        case 'A':
        case 'a':
            r->prob_A[i] = rd->quality[i];
            break;
        case 'C':
        case 'c':
            r->prob_C[i] = rd->quality[i];
            break;
        case 'G':
        case 'g':
            r->prob_G[i] = rd->quality[i];
            break;
        case 'T':
        case 't':
            r->prob_T[i] = rd->quality[i];
            break;
        default:
            break;
        }

        bpos += rd->flow_index[i];
        r->basePos[i] = bpos;
    }

    /* A right clip value of 0 is replaced by NBases+1. */
    r->leftCutoff = MAX(rh->clip_qual_left, rh->clip_adapter_left);
    r->rightCutoff = MIN(rh->clip_qual_right
                         ? rh->clip_qual_right
                         : r->NBases+1,
                         rh->clip_adapter_right
                         ? rh->clip_adapter_right
                         : r->NBases+1);

    free_sff_common_header(ch);
    free_sff_read_header(rh);
    free_sff_read_data(rd);

    return r;

 fail:
    free(flow);
    free(base_pos);
    free(prob_A);
    free(prob_C);
    free(prob_G);
    free(prob_T);
    free_sff_common_header(ch);
    free_sff_read_header(rh);
    free_sff_read_data(rd);

    return NULL;
}
void process_sff_to_fastq(char *sff_file, char *fastq_file, int trim_flag) { sff_common_header h; sff_read_header rh; sff_read_data rd; FILE *sff_fp, *fastq_fp; if ( !strlen(sff_file) ) { sff_fp = stdin; } else if ( (sff_fp = fopen(sff_file, "r")) == NULL ) { fprintf(stderr, "[err] Could not open file '%s' for reading.\n", sff_file); exit(1); } read_sff_common_header(sff_fp, &h); verify_sff_common_header(PRG_NAME, VERSION, &h); // printf("size of header: %d \n", sizeof(sff_common_header)); // printf("\tmagic : 0x%x\n" , h.magic); // printf("\tindex_offset : 0x%llx\n", h.index_offset); // printf("\tindex_len : 0x%x\n" , h.index_len); // printf("\tnumreads : 0x%x\n" , h.nreads); // printf("\theader_len : 0x%x\n" , h.header_len); // printf("\tkey_len : 0x%x\n" , h.key_len); // printf("\tflow_len : 0x%x\n" , h.flow_len); // printf("\tflowgram_fmt : 0x%x\n" , h.flowgram_format); // printf("\tflow : %s\n " , h.flow); // printf("\tkey : %s\n " , h.key); // printf("\n\n"); if ( !strlen(fastq_file) ) { fastq_fp = stdout; } else { if ( (fastq_fp = fopen(fastq_file, "w")) == NULL ) { fprintf(stderr, "[err] Could not open file '%s' for writing.\n", fastq_file); exit(1); } } int left_clip = 0, right_clip = 0, nbases = 0; char *name; char *bases; uint8_t *quality; register int i; int numreads = (int) h.nreads; for (i = 0; i < numreads; i++) { read_sff_read_header(sff_fp, &rh); read_sff_read_data(sff_fp, &rd, h.flow_len, rh.nbases); /* get clipping points */ get_clip_values(rh, trim_flag, &left_clip, &right_clip); nbases = right_clip - left_clip; /* create bases string */ bases = get_read_bases(rd, left_clip, right_clip); /* create quality array */ quality = get_read_quality_values(rd, left_clip, right_clip); /* create read name string */ int name_length = (int) rh.name_len + 1; // account for NULL termination name = (char *) malloc( name_length * sizeof(char) ); if (!name) { fprintf(stderr, "Out of memory! 
For read name string!\n"); exit(1); } memset(name, '\0', (size_t) name_length); strncpy(name, rh.name, (size_t) rh.name_len); construct_fastq_entry(fastq_fp, name, bases, quality, nbases); free(name); free(bases); free(quality); free_sff_read_header(&rh); free_sff_read_data(&rd); } free_sff_common_header(&h); fclose(fastq_fp); fclose(sff_fp); }