예제 #1
0
/*
 * Reads a read header (including variable length components) from an mFILE.
 *
 * The 16-byte fixed-size part is read and decoded first, then name_len
 * bytes of read name are loaded into h->name. Finally the stream position
 * is rounded up to the next multiple of 8.
 *
 * Returns a pointer to the header on success
 *         NULL on failure (short read or failed allocation)
 */
sff_read_header *read_sff_read_header(mFILE *mf) {
    sff_read_header *h;
    unsigned char rhdr[16];

    if (16 != mfread(rhdr, 1, 16, mf))
	return NULL;

    /* decode_sff_read_header() yields NULL when it cannot allocate memory */
    if (NULL == (h = decode_sff_read_header(rhdr)))
	return NULL;

    if (h->name_len != mfread(h->name, 1, h->name_len, mf))
	return free_sff_read_header(h), NULL;

    /* Pad to 8 chars */
    mfseek(mf, (mftell(mf) + 7)& ~7, SEEK_SET);

    return h;
}
예제 #2
0
/*
 * Override the sff.c functions to use FILE pointers instead. This means
 * we don't have to load the entire archive into memory, which is optimal when
 * dealing with a single file (ie in sff/sff.c), but not when indexing it.
 *
 * Reads the 16-byte fixed-size read header, then name_len bytes of read
 * name, and finally moves the file position up to the next multiple of 8.
 *
 * Returns a pointer to the header on success
 *         NULL on failure (short read, failed allocation or failed seek)
 */
static sff_read_header *fread_sff_read_header(FILE *fp) {
    sff_read_header *h;
    unsigned char rhdr[16];
    long pos;

    if (16 != fread(rhdr, 1, 16, fp))
	return NULL;

    /* decode_sff_read_header() yields NULL when it cannot allocate memory */
    if (NULL == (h = decode_sff_read_header(rhdr)))
	return NULL;

    if (h->name_len != fread(h->name, 1, h->name_len, fp))
	return free_sff_read_header(h), NULL;

    /*
     * Pad to 8 chars. ftell() reports failure as -1, which would round to
     * offset 0 and silently rewind the file, so treat it as an error.
     */
    pos = ftell(fp);
    if (pos < 0 || 0 != fseek(fp, (pos + 7) & ~7L, SEEK_SET))
	return free_sff_read_header(h), NULL;

    return h;
}
예제 #3
0
/*
 * Assembles a 16-bit value from two bytes stored most significant first.
 * Works on any byte address, so the source buffer needs no alignment.
 */
static uint16_t sff_get_be16(const unsigned char *p) {
    return (uint16_t)(((unsigned int)p[0] << 8) | (unsigned int)p[1]);
}

/*
 * Assembles a 32-bit value from four bytes stored most significant first.
 * Works on any byte address, so the source buffer needs no alignment.
 */
static uint32_t sff_get_be32(const unsigned char *p) {
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	   ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
}

/*
 * Unpacks the 16-byte fixed size part of the SFF read header.
 * It allocates memory for the header and for the read name (name_len
 * bytes, left unfilled and without room for a terminating NUL).
 *
 * The fields are decoded byte by byte as big-endian values rather than by
 * casting buf to uint16_t* / uint32_t*: buf is a plain byte buffer with no
 * alignment guarantee, so dereferencing such casts is undefined behaviour.
 * NOTE(review): this assumes be_int2()/be_int4(), used previously, convert
 * big-endian data to host order as their names indicate.
 *
 * Returns sff_read_header* on success
 *         NULL on failure
 */
sff_read_header *decode_sff_read_header(unsigned char *buf) {
    sff_read_header *h;

    if (NULL == (h = (sff_read_header *)xcalloc(1, sizeof(*h))))
	return NULL;

    h->header_len         = sff_get_be16(buf+0);
    h->name_len           = sff_get_be16(buf+2);
    h->nbases             = sff_get_be32(buf+4);
    h->clip_qual_left     = sff_get_be16(buf+8);
    h->clip_qual_right    = sff_get_be16(buf+10);
    h->clip_adapter_left  = sff_get_be16(buf+12);
    h->clip_adapter_right = sff_get_be16(buf+14);

    if (NULL == (h->name  = (char *)xmalloc(h->name_len)))
	return free_sff_read_header(h), NULL;

    return h;
}
예제 #4
0
/* F U N C T I O N S *********************************************************/
void process_sff_to_fastq(char *sff_file, int trim_flag) {
    sff_read_header rh;
    sff_read_data rd;
    FILE *sff_fp, *fastq_fp;

    if ( (sff_fp = fopen(sff_file, "r")) == NULL ) {
        fprintf(stderr,
                "[err] Could not open file '%s' for reading.\n", sff_file);
        exit(1);
    }
    
    get_sff_file_size(sff_fp);
    
    read_sff_common_header(sff_fp, &h);
    verify_sff_common_header(&h);


    if ( keep_fastq_orig == true ) {
        vector<string> tmp_rep;
        split_str(string(sff_file), tmp_rep, "//");
    
        if ( ( fastq_fp = fopen( (tmp_rep[tmp_rep.size() - 1].substr(0,tmp_rep[tmp_rep.size() - 1].length()-4) + ".fastq").c_str(), "w") ) == NULL ) {
            fprintf(stderr,
                    "[err] Could not open file '%s' for writing.\n",
                    "");
            exit(1);
        }
    }

    int left_clip = 0, right_clip = 0, nbases = 0;
    char *name;
    char *bases;
    uint8_t *quality;
    //register int i;
    
    unsigned int numreads = h.nreads;
    
    for (int i = 0; i < numreads; i++) { //cout << i << " " << numreads << endl;
        read_sff_read_header(sff_fp, &rh);
        read_sff_read_data(sff_fp, &rd, h.flow_len, rh.nbases);
        
        //rheaders.push_back(rh);
        // get clipping points 
        get_clip_values(rh, trim_flag, &left_clip, &right_clip);
        nbases = right_clip - left_clip;

        // create bases string 
        bases = get_read_bases(rd, left_clip, right_clip);

        // create quality array 
        quality = get_read_quality_values(rd, left_clip, right_clip);

        //Create new read
        Read *read = new Read();
        
        read->initial_length = nbases;
        read->read = string(bases);
        uint8_t quality_char;
        read->quality = (uint8_t*)malloc(sizeof(uint8_t)*nbases);
        for (int j = 0; j < nbases; j++) 
        {
           quality_char = (quality[j] <= 93 ? quality[j] : 93) + 33;
           read->quality[j] = quality_char;
        }
       
        //read->rd = rd;
        read->flowgram = new uint16_t[h.flow_len];
        for(int j=0; j<h.flow_len; j++) {
                read->flowgram[j] = rd.flowgram[j];
                //cout << rd.flowgram[j] << " " << endl;
                
        }
        
        read->flow_index = (uint8_t*)malloc(sizeof(uint8_t)*nbases);
        for(int j=0; j<nbases; j++) {
                read->flow_index[j] = rd.flow_index[j];
                
        }
        
        read->roche_left_clip = (int) max(1, max(rh.clip_qual_left, rh.clip_adapter_left)) - 1;
        read->roche_right_clip = (int) min( (rh.clip_qual_right    == 0 ? rh.nbases : rh.clip_qual_right   ), (rh.clip_adapter_right == 0 ? rh.nbases : rh.clip_adapter_right) );
        
        reads.push_back(read);
        
        
        string tstr = string(rh.name) + " " + string(itoa(rh.clip_adapter_left,new char[5],10)) +  " " + string(itoa(rh.clip_adapter_right,new char[5],10))+  " " + string(itoa(rh.clip_qual_left,new char[5],10))  +   " " + string(itoa(rh.clip_qual_right,new char[5],10)) + " " + string(itoa(rh.clip_qual_right,new char[5],10)); 
        int t_len = tstr.length();
        
        
        // create read name string 
        int name_length = (int) t_len + 1; // account for NULL termination
        name = (char *) malloc( name_length * sizeof(char) );
        if (!name) {
            fprintf(stderr, "Out of memory! For read name string!\n");
            exit(1);
        }
        memset(name, '\0', (size_t) name_length);
        
        read->readID = (char *) malloc( rh.name_len * sizeof(char) );
        //read->readID = rh.name;
        memcpy( read->readID, rh.name, (size_t) rh.name_len );
        
        //strncpy(name, rh.name, (size_t) rh.name_len);
        strncpy(name, tstr.c_str(), (size_t)t_len);
        
        if ( keep_fastq_orig == true )
            construct_fastq_entry(fastq_fp, name, bases, quality, nbases);
        //printf("%d\n",rh.name_len);
        free(name);
        free(bases);
        free(quality);
        free_sff_read_header(&rh);
        free_sff_read_data(&rd);
        
        
    }
    
    read_manifest(sff_fp);

    //free_sff_common_header(&h);
    if ( keep_fastq_orig == true )
        fclose(fastq_fp);
    
    fclose(sff_fp);
}
예제 #5
0
/*
 * Reads an SFF file from an mFILE and decodes it to a Read struct.
 *
 * The common header, one read header and one block of read data are loaded
 * from mf. The flowgram values are divided by 100 to give r->flow, the base
 * calls and flow order are handed over to the Read (not copied), and each
 * base's quality is stored in the prob_* array matching the called base.
 * basePos[i] is the running sum of the flow index values up to base i.
 *
 * Every allocation is made and checked before the Read is created, so a
 * failure at any point releases everything through a single cleanup path.
 *
 * Returns Read* on success
 *         NULL on failure (read error or out of memory)
 */
Read *mfread_sff(mFILE *mf) {
    int i, bpos;
    Read *r;
    sff_common_header *ch;
    sff_read_header *rh = NULL;
    sff_read_data *rd = NULL;
    float  *flow = NULL;
    uint_2 *base_pos = NULL;
    char *prob_A = NULL, *prob_C = NULL, *prob_G = NULL, *prob_T = NULL;
    size_t nflows, nbases;

    /* Load the SFF contents */
    if (NULL == (ch = read_sff_common_header(mf)))
	return NULL;
    if (NULL == (rh = read_sff_read_header(mf)))
	goto fail;
    if (NULL == (rd = read_sff_read_data(mf, ch->flow_len, rh->nbases)))
	goto fail;

    /* Allocate the per-flow and per-base arrays up front */
    nflows = ch->flow_len;
    nbases = rh->nbases;

    flow     = (float  *)malloc(nflows * sizeof(float));
    base_pos = (uint_2 *)calloc(nbases, sizeof(uint_2));
    prob_A   = (char *)calloc(nbases, 1);
    prob_C   = (char *)calloc(nbases, 1);
    prob_G   = (char *)calloc(nbases, 1);
    prob_T   = (char *)calloc(nbases, 1);

    /*
     * A zero-sized request may legitimately return NULL, so a NULL result
     * only counts as a failure when something was actually asked for.
     */
    if (nflows && !flow)
	goto fail;
    if (nbases && (!base_pos || !prob_A || !prob_C || !prob_G || !prob_T))
	goto fail;

    /* Convert to Read struct */
    if (NULL == (r = read_allocate(0,0)))
	goto fail;

    /* Drop whatever read_allocate() provided; free(NULL) is a no-op */
    free(r->basePos);
    free(r->base);
    free(r->prob_A);
    free(r->prob_C);
    free(r->prob_G);
    free(r->prob_T);

    r->nflows = ch->flow_len;
    r->flow_order = ch->flow; ch->flow = NULL;	/* ownership moves to r */
    r->flow_raw = NULL;
    r->flow = flow;
    for (i = 0; i < r->nflows; i++) {
	r->flow[i] = rd->flowgram[i] / 100.0;
    }

    r->NBases  = rh->nbases;
    r->basePos = base_pos;
    r->base    = rd->bases; rd->bases = NULL;	/* ownership moves to r */
    r->prob_A  = prob_A;
    r->prob_C  = prob_C;
    r->prob_G  = prob_G;
    r->prob_T  = prob_T;

    /* The prob_* arrays come zeroed from calloc(); only the called base's
     * entry is filled in. */
    bpos = 0;
    for (i=0; i < r->NBases; i++) {
	switch (r->base[i]) {
	case 'A':
	case 'a':
	    r->prob_A[i] = rd->quality[i];
	    break;
	case 'C':
	case 'c':
	    r->prob_C[i] = rd->quality[i];
	    break;
	case 'G':
	case 'g':
	    r->prob_G[i] = rd->quality[i];
	    break;
	case 'T':
	case 't':
	    r->prob_T[i] = rd->quality[i];
	    break;
	default:
	    /* any other call leaves all four probabilities at zero */
	    break;
	}

	bpos += rd->flow_index[i];
	r->basePos[i] = bpos;
    }

    /* A right clip value of 0 is replaced by NBases+1 */
    r->leftCutoff = MAX(rh->clip_qual_left, rh->clip_adapter_left);
    r->rightCutoff = MIN(rh->clip_qual_right
			 ? rh->clip_qual_right
			 : r->NBases+1,
			 rh->clip_adapter_right
			 ? rh->clip_adapter_right
			 : r->NBases+1);

    free_sff_common_header(ch);
    free_sff_read_header(rh);
    free_sff_read_data(rd);

    return r;

 fail:
    free(flow);
    free(base_pos);
    free(prob_A);
    free(prob_C);
    free(prob_G);
    free(prob_T);
    if (rd)
	free_sff_read_data(rd);
    if (rh)
	free_sff_read_header(rh);
    free_sff_common_header(ch);

    return NULL;
}
예제 #6
0
파일: main.c 프로젝트: b4winckler/sff2fastq
/*
 * Converts every read of an SFF file into a FASTQ entry.
 *
 * sff_file   - path of the SFF input; an empty string selects stdin
 * fastq_file - path of the FASTQ output; an empty string selects stdout
 * trim_flag  - forwarded to get_clip_values() to select the clipping points
 *
 * For each read the clipped bases and quality values are extracted and
 * passed, together with a NUL-terminated copy of the read name, to
 * construct_fastq_entry(). Any failure to open a file, allocate memory or
 * finish writing the output terminates the program with exit(1).
 */
void
process_sff_to_fastq(char *sff_file, char *fastq_file, int trim_flag) {
    sff_common_header h;
    sff_read_header rh;
    sff_read_data rd;
    FILE *sff_fp, *fastq_fp;

    if ( !strlen(sff_file) ) {
        sff_fp = stdin;
    }
    /* SFF is a binary format, so open it in binary mode */
    else if ( (sff_fp = fopen(sff_file, "rb")) == NULL ) {
        fprintf(stderr,
                "[err] Could not open file '%s' for reading.\n", sff_file);
        exit(1);
    }

    read_sff_common_header(sff_fp, &h);
    verify_sff_common_header(PRG_NAME, VERSION, &h);

    if ( !strlen(fastq_file) ) {
        fastq_fp = stdout;
    }
    else {
        if ( (fastq_fp = fopen(fastq_file, "w")) == NULL ) {
            fprintf(stderr,
                    "[err] Could not open file '%s' for writing.\n",
                    fastq_file);
            exit(1);
        }
    }

    int left_clip = 0, right_clip = 0, nbases = 0;
    char *name;
    char *bases;
    uint8_t *quality;
    uint32_t i;
    /* Kept unsigned: casting a large read count to int would make it
     * negative and skip the loop entirely. */
    const uint32_t numreads = (uint32_t) h.nreads;

    for (i = 0; i < numreads; i++) {
        read_sff_read_header(sff_fp, &rh);
        read_sff_read_data(sff_fp, &rd, h.flow_len, rh.nbases);

        /* get clipping points */
        get_clip_values(rh, trim_flag, &left_clip, &right_clip);
        nbases = right_clip - left_clip;

        /* create bases string */
        bases = get_read_bases(rd, left_clip, right_clip);

        /* create quality array */
        quality = get_read_quality_values(rd, left_clip, right_clip);

        /* create read name string: name_len bytes plus a terminating NUL,
         * which the zero-filled buffer from calloc() already provides */
        size_t name_length = (size_t) rh.name_len + 1;
        name = (char *) calloc( name_length, sizeof(char) );
        if (!name) {
            fprintf(stderr, "Out of memory! For read name string!\n");
            exit(1);
        }
        memcpy(name, rh.name, (size_t) rh.name_len);

        construct_fastq_entry(fastq_fp, name, bases, quality, nbases);

        free(name);
        free(bases);
        free(quality);
        free_sff_read_header(&rh);
        free_sff_read_data(&rd);
    }

    free_sff_common_header(&h);

    /* fclose() flushes buffered output; a failure here means lost data */
    if ( fclose(fastq_fp) != 0 ) {
        fprintf(stderr, "[err] Could not finish writing FASTQ output.\n");
        exit(1);
    }
    fclose(sff_fp);
}