示例#1
0
static
void HelicosFileInfo_init(HelicosFileInfo* file)
{
    /* empty every text buffer of the record ... */
    pstring_clear(&file->quality);
    pstring_clear(&file->sequence);
    pstring_clear(&file->name);
    /* ... and mark it as not ready */
    file->ready = false;
}
示例#2
0
/* Resets a FileReadData record.
 * name and barcode are always cleared and the embedded read is passed to
 * IlluminaRead_Init with the same name_only flag; the ready flag is reset
 * only for a full (name_only == false) initialisation.
 */
static
void FileReadData_init(FileReadData* read, bool name_only)
{
    if( !name_only ) {
        read->ready = false;
    }
    IlluminaRead_Init(&read->read, name_only);
    pstring_clear(&read->barcode);
    pstring_clear(&read->name);
}
示例#3
0
/* Resets an IlluminaRead.
 * The read id is always set to ILLUMINAWRITER_READID_NONE; when name_only is
 * false the data columns, the quality type and the read filter are reset too.
 */
void IlluminaRead_Init(IlluminaRead* read, bool name_only)
{
    assert(read != NULL);

    read->read_id = ILLUMINAWRITER_READID_NONE;
    if( name_only ) {
        /* caller asked for the name-related part only */
        return;
    }

    read->qual_type = ILLUMINAWRITER_COLMASK_NOTSET;
    read->filter = SRA_READ_FILTER_PASS;

    pstring_clear(&read->seq);
    pstring_clear(&read->qual);
    pstring_clear(&read->noise);
    pstring_clear(&read->intensity);
    pstring_clear(&read->signal);
}
示例#4
0
/* Splits an optional read label off the end of readId and rebuilds the
 * spot name in readId from self->name_prefix plus the remaining suffix.
 *
 * type and label receive what set_label_type() reports for the text after
 * the last '_'; without any '_' the label is cleared and type is set to
 * eAbisolidReadType_SPOT.
 * Returns 0 on success, otherwise the first failing rc.
 */
static
rc_t fe_new_read(fe_context_t *self, pstring *readId, EAbisolidReadType* type, pstring* label)
{
    rc_t rc = 0;
    pstring name_suffix;
    const char* underscore;

    assert(self && readId && type && label);
    DEBUG_MSG(3, ("READ_LABEL: '%s'\n", readId->data));

    /* spot name suffix may end with '_(F|R).+' */
    underscore = strrchr(readId->data, '_');
    if( underscore == NULL ) {
        pstring_clear(label);
        *type = eAbisolidReadType_SPOT;
    } else {
        rc = set_label_type(underscore + 1, label, type);
        if( rc != 0 ) {
            return rc;
        }
        if( *type > eAbisolidReadType_SPOT ) {
            /* cut label together with its leading '_' */
            readId->len -= label->len + 1;
        }
    }

    rc = pstring_copy(&name_suffix, readId);
    if( rc != 0 ) {
        return rc;
    }
    return SRAWriteAbsolid_MakeName(&self->name_prefix, &name_suffix, readId);
}
示例#5
0
/* Converts the textual quality values in qstr, in place, to one byte per value.
 *
 * qstr    - in: quality text; out: converted values (replaced only on success)
 * enc     - how the text is encoded; for eExperimentQualityEncoding_Undefined
 *           the encoding is guessed: Decimal if the text contains a space or
 *           a tab, Ascii otherwise
 * offset  - subtracted from each character in the Ascii encoding
 * min,max - inclusive range every converted value must fall into
 *
 * Returns 0 on success; an rcParam/rcInvalid code for a NULL qstr or
 * min > max; rcData/rcUnrecognized for an unknown encoding and
 * rcData/rcOutofrange for a value outside [min, max] or a strtol error.
 */
rc_t pstring_quality_convert(pstring* qstr, ExperimentQualityEncoding enc, const uint8_t offset, const int8_t min, const int8_t max)
{
    rc_t rc = 0;
    char* c, *end, *next;
    pstring qbin;

    if( qstr == NULL || min > max ) {
        /* must return here: qstr is dereferenced right below */
        return RC(rcSRA, rcFormatter, rcReading, rcParam, rcInvalid);
    }
    errno = 0;
    c = qstr->data;
    end = qstr->data + qstr->len;
    pstring_clear(&qbin);
    if( enc == eExperimentQualityEncoding_Undefined ) {
        if( memchr(c, ' ', qstr->len) != NULL || memchr(c, '\t', qstr->len) != NULL ) {
            enc = eExperimentQualityEncoding_Decimal;
        } else {
            enc = eExperimentQualityEncoding_Ascii;
        }
    }

    while( rc == 0 && c < end ) {
        long q;
        switch(enc) {
            case eExperimentQualityEncoding_Decimal:
            case eExperimentQualityEncoding_Hexadecimal:
                /* spaced numbers form */
                errno = 0;
                q = strtol(c, &next, enc == eExperimentQualityEncoding_Decimal ? 10 : 16);
                if( q == 0 && c == next ) {
                    /* no more digits in line; leave the while loop as well */
                    goto DONE;
                }
                c = next;
                break;
            case eExperimentQualityEncoding_Ascii:
                /* textual form with offset */
                q = (long)(*c++) - offset;
                break;
            default:
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
                break;
        }
        if( rc == 0 ) {
            if( errno != 0 || q < min || q > max ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcOutofrange);
            } else {
                rc = pstring_append_chr(&qbin, (int8_t)q, 1);
            }
        }
    }
DONE:
    if( rc == 0 ) {
        /* the input is overwritten only when every value converted cleanly */
        rc = pstring_copy(qstr, &qbin);
    }
    return rc;
}
示例#6
0
/* Resets the per-spot state of an IlluminaFileInfo: embedded read (full
 * init), coordinates, spot name and the ready flag.
 */
static
void IlluminaFileInfo_init(IlluminaFileInfo* file)
{
    assert(file);

    IlluminaRead_Init(&file->read, false);
    memset(file->coord, 0, sizeof(file->coord));
    pstring_clear(&file->name);
    file->ready = false;
}
示例#7
0
/* Builds a spot from the current read (self->read) and writes it out.
 *
 * readId is split at the first '#': the text after it becomes the spot
 * group, the text before it the read id (readId->len is shortened).
 * The spot name is self->name_prefix followed by the read id, with a ':'
 * inserted when the prefix ends in a digit; without a prefix the read id
 * alone is used.
 * flags are handed to SRF_set_read_filter() to set the read filter.
 *
 * Returns 0 on success, otherwise the rc of the failing step.
 */
static
rc_t fe_new_read(fe_context_t *self, int flags, pstring *readId )
{
    rc_t rc;
    char *suffix;
    pstring readName, spotGroup;
    static IlluminaSpot spot;

    /* look for spot group */
    suffix = strchr(readId->data, '#');
    if( suffix != NULL ) {
        readId->len = suffix++ - readId->data;
        if( (rc = pstring_assign(&spotGroup, suffix, strlen(suffix))) != 0 ) {
            SRALoaderFile_LOG(self->ctx.file, klogInt, rc,
                "extracting barcode from spot '$(spotname)'", "spotname=%s", readId->data);
            return rc;
        }
    } else {
        pstring_clear(&spotGroup);
    }

    /* build the read name from prefix (self->name_prefix) and read id */
    if( self->name_prefix.len > 0 ) {
        if( (rc = pstring_copy(&readName, &self->name_prefix)) == 0 ) {
            /* <ctype.h> functions need an unsigned char value; a plain
             * (possibly negative) char is undefined behaviour */
            if( isdigit((unsigned char)readName.data[readName.len - 1]) ) {
                rc = pstring_append(&readName, ":", 1);
            }
            if( rc == 0 ) {
                rc = pstring_concat(&readName, readId);
            }
        }
    } else {
        rc = pstring_copy(&readName, readId);
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc,
            "preparing spot name $(spotname)", "spotname=%s", readId->data);
        return rc;
    }
    SRF_set_read_filter(&self->read.filter, flags);

    IlluminaSpot_Init(&spot);
    if( (rc = IlluminaSpot_Add(&spot, &readName, &spotGroup, &self->read)) == 0 ) {
        rc = SRAWriterIllumina_Write(self->writer, self->ctx.file, &spot);
    }
    return rc;
}
示例#8
0
/* Resets every field of an AbsolidRead; a NULL pointer is silently ignored. */
void AbsolidRead_Init(AbsolidRead* read)
{
    if( !read ) {
        return;
    }

    /* scalar fields */
    read->cs_key = 0;
    read->filter = 0;
    read->fs_type = eAbisolidFSignalType_NotSet;

    /* text / data buffers */
    pstring_clear(&read->label);
    pstring_clear(&read->seq);
    pstring_clear(&read->qual);
    pstring_clear(&read->fxx);
    pstring_clear(&read->cy3);
    pstring_clear(&read->txr);
    pstring_clear(&read->cy5);
}
示例#9
0
/* Prepares an AbsolidSpot for a new spot.
 * Stores the spot_name pointer (not a copy), copies nreads segment
 * descriptors from read_seg, zeroes the first nreads entries of the
 * per-read arrays and clears all data buffers.
 */
static
void AbsolidSpot_Init(AbsolidSpot* spot, const pstring* spot_name,
                      const uint16_t nreads, const SRASegment* read_seg)
{
    assert(spot != NULL);
    assert(spot_name != NULL);
    assert(nreads != 0);

    spot->spot_name = spot_name;
    spot->fs_type = eAbisolidFSignalType_NotSet;

    /* per-read arrays: only the first nreads entries are touched */
    memmove(spot->read_seg, read_seg, nreads * sizeof(spot->read_seg[0]));
    memset(spot->label_start, 0, nreads * sizeof(spot->label_start[0]));
    memset(spot->label_len, 0, nreads * sizeof(spot->label_len[0]));
    memset(spot->cs_key, 0, nreads * sizeof(spot->cs_key[0]));
    memset(spot->filter, 0, nreads * sizeof(spot->filter[0]));

    /* spot-wide buffers */
    pstring_clear(&spot->label);
    pstring_clear(&spot->seq);
    pstring_clear(&spot->qual);
    pstring_clear(&spot->fxx);
    pstring_clear(&spot->cy3);
    pstring_clear(&spot->txr);
    pstring_clear(&spot->cy5);
}
示例#10
0
/* Reads the data for a single spot from a file; the data may be partial.
 *
 * Does nothing while the previously read spot (file->spot[0]) is still
 * marked ready. Otherwise both read slots are reset, one line is read and
 * interpreted as one of:
 *   - a single-line form (fields separated by ':' or ' '),
 *   - a '@' record (4 lines, optionally followed by a second '@' record
 *     with the same name and barcode but a different read id),
 *   - a '>' record (defline followed by sequence or quality lines).
 * Finally the quality of every ready read is converted; the quality
 * offset is detected on first use (33 is tried first, then 64) and
 * remembered in file->qualOffset.
 *
 * Returns 0 on success and on end of file (file->line == NULL), otherwise
 * the value returned by SRALoaderFile_LOG for the failing step
 * (presumably the rc passed to it -- confirm).
 */
static
rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;
        /* find and parse spot name on defline: try word 1, 2, ... until a
         * word is not found (score 0) and keep the best scoring one */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }

        } while(score != 0);
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }
        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* NOTE(review): rc is 0 here, so this logs an error but most
                 * likely reports success to the caller -- confirm intent */
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not sequence text: the data is quality, move it over.
                 * A failed copy used to be swallowed (the function ended in
                 * "return 0") while the spot was still marked ready. */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }
    if( rc == 0 ) {
        int k;
        for(k = 0; k < 2; k++) {
            FileReadData* rd = &file->spot[k];
            if( rd->ready && rd->read.qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
                if( file->qualOffset == 0 ) {
                    /* detect and remember: try offset 33 first and fall
                     * back to offset 64 when a value is out of range */
                    file->qualOffset = 33;
                    file->qualMax = 94;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    if( GetRCState(rc) == rcOutofrange ) {
                        file->qualOffset = 64;
                        file->qualMax = 61;
                        rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    }
                } else {
                    if( file->qualOffset == 33 ) {
                        file->qualMax = 94;
                    }
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
                }
            }
        }
    }
    return rc;
}
示例#11
0
/* Parses the spot name found in the word_number-th (1-based) space-separated
 * word of str (len bytes); looks for name(#barcode)?([\/.]\d)?
 *
 * When the word exists, the name-related fields of spot are re-initialised
 * and filled with whatever was recognised: spot name, barcode and read id.
 *
 * Returns a score counting the recognised parts;
 * score == 0: word not found (or a part could not be stored).
 */
static
uint8_t parse_spot_name(const SRALoaderFile* file, FileReadData* spot, const char* str, size_t len, uint8_t word_number)
{
    uint8_t w, score = 0;
    const char* name, *name_end;

    name = name_end = str;
    /* set name_end to end of word_number-th word */
    for(w = 1; w <= word_number; w++ ) {
        /* skip consecutive spaces; check the bound BEFORE reading the byte */
        while( name_end != &str[len] && *name_end == ' ' ) {
            name_end++;
        }
        name = name_end;
        name_end = memchr(name, ' ', len - (name_end - str));
        if( name_end == NULL ) {
            if( w == word_number ) {
                name_end = &str[len];
            }
            break;
        }
    }
    if( name != name_end && name_end != NULL ) {
        char* x;
        rc_t rc;
        bool has_prev;

        /* init only name portion */
        FileReadData_init(spot, true);
        --name_end; /* goto last char */
        /* name_end[-1] belongs to the word only if it has more than one char */
        has_prev = name_end > name;
        if( has_prev && isdigit((unsigned char)name_end[0]) &&
            (name_end[-1] == '\\' || name_end[-1] == '/') ) {
            score++;
            spot->read.read_id = name_end[0] - '0';
            name_end -= 2;
        } else if( has_prev && isdigit((unsigned char)*name_end) && name_end[-1] == '.' ) {
            int q = 0;
            if( memrchr(name, '#', name_end - name) != NULL ) {
                /* have barcode -> this is read id */
                q = 4;
            } else {
                /* may be a read id, check to see if 4 coords precede it */
                const char* end = name_end - 1;
                while( --end >= name ) {
                    if( strchr(":|_", *end) != NULL ) {
                        q++;
                    } else if( !isdigit((unsigned char)*end) ) {
                        break;
                    }
                }
            }
            if( q == 4 ) {
                score++;
                spot->read.read_id = name_end[0] - '0';
                name_end -= 2;
            }
        }
        if( (x = memrchr(name, '#', name_end - name)) != NULL ) {
            score++;
            if( (rc = pstring_assign(&spot->barcode, x + 1, name_end - x)) != 0 ) {
                SRALoaderFile_LOG(file, klogErr, rc, "barcode $(b)", "b=%.*s", name_end - x, x + 1);
                return 0;
            }
            if( pstring_strcmp(&spot->barcode, "0") == 0 ) {
                pstring_clear(&spot->barcode);
            } else if( spot->barcode.len >= 4 &&
                       (strncmp(spot->barcode.data, "0/1_", 4) == 0 || strncmp(spot->barcode.data, "0/2_", 4) == 0) ) {
                /* barcode of the form 0/<read id>_<barcode> */
                spot->read.read_id = spot->barcode.data[2] - '0';
                pstring_assign(&spot->barcode, &spot->barcode.data[4], spot->barcode.len - 4);
            }
            name_end = --x;
        }
        score++;
        if( (rc = pstring_assign(&spot->name, name, name_end - name + 1)) != 0 ) {
            SRALoaderFile_LOG(file, klogErr, rc, "spot name $(n)", "n=%.*s", name_end - name + 1, name);
            return 0;
        }
        /* search for _R\d\D in name and use it as read id, remove from name or spot won't assemble */
        x = spot->name.data;
        while( (x = strrchr(x, 'R')) != NULL ) {
            if( x != spot->name.data && *(x - 1) == '_' &&
                isdigit((unsigned char)*(x + 1)) && !isalnum((unsigned char)*(x + 2)) ) {
                score++;
                /* NOTE(review): -1 is presumably ILLUMINAWRITER_READID_NONE -- confirm */
                if( spot->read.read_id == -1 ) {
                    spot->read.read_id = *(x + 1) - '0';
                }
                /* source and destination overlap, which strcpy does not
                 * allow; memmove moves the tail including its terminator */
                memmove(x - 1, x + 2, strlen(x + 2) + 1);
                /* NOTE(review): only 3 chars ("_R" + digit) were removed
                 * above while len drops by 4 -- verify which is intended */
                spot->name.len -= 4;
                break;
            }
            x++;
        }
        /* find last '=' and use only whatever is to the left of it */
        if( (x = memrchr(spot->name.data, '=', spot->name.len)) != NULL ) {
            rc = pstring_assign(&spot->name, spot->name.data, (x - spot->name.data) );
        }
    }
    return score;
}
示例#12
0
/* Processes one file group: repeatedly reads the next spot from every file
 * of the group, merges the parts into data->spot and writes the spot out.
 *
 * n - list node, used as the FGroup to process
 * d - FGroup_Parse_data carrying the loader (self), the spot being
 *     assembled and the result code (rc)
 *
 * Returns true when processing ended with data->rc != 0, false otherwise
 * (presumably the "stop iterating" convention of the list walker that
 * calls this -- confirm against the caller).
 */
bool FGroup_Parse( SLNode *n, void *d )
{
    FGroup_Parse_data* data = (FGroup_Parse_data*)d;
    FGroup* g = (FGroup*)n;
    bool done;
    const SRALoaderFile* data_block_ref = NULL;

    data->rc = 0;
    do {
        IlluminaFileInfo* file = g->files;
        /* stays true only if no file delivers a spot in this round */
        done = true;
        while( data->rc == 0 && file != NULL ) {
            if( (data->rc = read_next_spot(g->blk_pfx, file)) == 0 && file->ready ) {
                done = false;
            }
            file = file->next;
        }
        if( data->rc != 0 || done ) {
            break;
        }
        /* collect spot reads, matching by spot name
         * spot data may be split across multiple files
         */
        IlluminaSpot_Init(&data->spot);
        file = g->files;
        while( data->rc == 0 && file != NULL ) {
            if( file->ready ) {
                /* data of a type the loader was told to skip is dropped */
                if( (file->type == eIlluminaNativeFileTypeNoise && data->self->skip_noise) ||
                    (file->type == eIlluminaNativeFileTypeIntensity && data->self->skip_intensity) ||
                    (file->type == eIlluminaNativeFileTypeSignal && data->self->skip_signal) ) {
                    file->ready = false;
                } else {
                    /* remember a file that contributed; passed to the writer below */
                    data_block_ref = file->file;
                    if( file->type == eIlluminaNativeFileTypeQSeq && (g->mask & eIlluminaNativeFileTypeQuality4) ) {
                        /* drop quality1 from qseq data */
                        pstring_clear(&file->read.qual);
                    } else if( file->type == eIlluminaNativeFileTypeQuality4 ) {
                        /* NOTE(review): neib is NULL when the group holds a single
                         * file (no next and no prev) and is dereferenced below */
                        IlluminaFileInfo* neib = file->next ? file->next : file->prev;
                        /* need to fix spotname to be same cause prb do not have any name in it */
                        if( (data->rc = pstring_copy(&file->name, &neib->name)) != 0 ) {
                            SRALoaderFile_LOG(file->file, klogErr, data->rc, "$(msg) '$(n)'", "msg=syncing prb spot name,n=%s", neib->name.data);
                        }
                    }
                    if( data->rc == 0 ) {
                        data->rc = IlluminaSpot_Add(&data->spot, &file->name, &file->barcode, &file->read);
                        if( data->rc == 0 ) {
                            /* consumed: the next round reads a new spot from this file */
                            file->ready = false;
                        } else {
                            if( GetRCState(data->rc) == rcIgnored ) {
                                SRALoaderFile_LOG(file->file, klogWarn, data->rc, "$(msg) '$(s1)' <> '$(s2)'",
                                                "msg=spot name mismatch,s1=%.*s,s2=%.*s",
                                                data->spot.name->len, data->spot.name->data, file->name.len, file->name.data);
                                data->self->spots_bad_count++;
                                /* skip spot for all files in a group */
                                file = g->files;
                                while( file != NULL ) {
                                    file->ready = false;
                                    SRALoaderFile_LOG(file->file, klogWarn, data->rc,
                                                      "$(msg) '$(n)'", "msg=skipped spot,n=%s", file->name.data);
                                    file = file->next;
                                }
                                /* a negative spots_bad_allowed disables the limit */
                                if( data->self->spots_bad_allowed >= 0 &&
                                    data->self->spots_bad_count > data->self->spots_bad_allowed ) {
                                    data->rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
                                }
                                /* leaves the collecting loop; file is NULL at this point */
                                break;
                            }
                        }
                    }
                }
            }
            file = file->next;
        }
        /* still rcIgnored: the bad-spot limit was not hit, so the spot is
         * dropped and the next one is processed */
        if( GetRCState(data->rc) == rcIgnored ) {
            data->rc = 0;
            continue;
        }
        if( data->rc == 0 ) {
            data->rc = SRAWriterIllumina_Write(data->self->writer, data_block_ref, &data->spot);
        }
    } while( data->rc == 0 );
    return data->rc != 0;
}
示例#13
0
/* Reads the data section of one SFF read from the file buffer into self.
 *
 * The section consists of num_flows_per_read 16-bit signal values followed
 * by number_of_bases bytes each of position, read and quality data, padded
 * to a multiple of 8 bytes. Unless self->skip_signal is set, the signal is
 * copied (byte-swapped on little-endian hosts) and the positions are
 * stored as running sums of the per-base values; read and quality are
 * always copied.
 *
 * Returns 0 on success; on failure the rc of the failing step (copy
 * failures are also logged against file).
 */
static
rc_t SFFLoaderFmtReadData(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;
    uint32_t i;

    /* calc signal chunk size */
    size_t signal_sz = self->header.num_flows_per_read * sizeof(uint16_t);
    /* plus position, read, quality */
    size_t sz = signal_sz + self->read_header.number_of_bases * 3;
    /* + padding */
    sz += (sz % 8) ? (8 - (sz % 8)) : 0;

    /* adjust the buffer window to full data block size */
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, sz, "read data", false)) != 0 ) {
        return rc;
    }
    self->file_advance = sz;

    if( !self->skip_signal ) {
        rc = pstring_assign(&self->signal, self->file_buf, signal_sz);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        if( rc == 0 ) {
            uint16_t* sig = (uint16_t*)self->signal.data;
            for(i = 0; i < self->header.num_flows_per_read; i++) {
                sig[i] = bswap_16(sig[i]);
            }
        }
#endif
    }

    if( rc == 0 ) {
        const uint8_t* pos = self->file_buf + signal_sz;

        if( !self->skip_signal ) {
            INSDC_coord_one *p;
            /* reset buffer to proper size */
            pstring_clear(&self->position);
            rc = pstring_append_chr(&self->position, 0, self->read_header.number_of_bases * sizeof(*p));
            /* fill the buffer only if it was really sized and there is at
             * least one base; otherwise p[0] / pos[0] must not be touched */
            if( rc == 0 && self->read_header.number_of_bases > 0 ) {
                p = (INSDC_coord_one*)&self->position.data[0];
                /* stored values are increments; accumulate them into
                 * running positions */
                p[0] = pos[0];
                for(i = 1; i < self->read_header.number_of_bases; i++) {
                    p[i] = p[i - 1] + pos[i];
                }
            }
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->read, pos, self->read_header.number_of_bases);
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->quality, pos, self->read_header.number_of_bases);
        }
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "copying read data", NULL);
    }
    return rc;
}